# Week 22: Configure the project workspace and access the dataset specific to the project topic.

In [23]:
pip install datasets




In [24]:
from datasets import load_dataset

In [25]:
ds = load_dataset('SetFit/20_newsgroups')

Repo card metadata block was not found. Setting CardData to empty.


In [26]:
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 11314
    })
    test: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 7532
    })
})

In [27]:
ds['train'][0]

{'text': 'I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.',
 'label': 7,
 'label_text': 'rec.autos'}

In [28]:
ds['train'].features

{'text': Value(dtype='string', id=None),
 'label': Value(dtype='int64', id=None),
 'label_text': Value(dtype='string', id=None)}

In [29]:
import pandas as pd
ds_train = pd.DataFrame(ds['train'])
ds_test = pd.DataFrame(ds['test'])

In [30]:
ds_train.head()
ds_test.head()

Unnamed: 0,text,label,label_text
0,I am a little confused on all of the models of...,7,rec.autos
1,I'm not familiar at all with the format of the...,5,comp.windows.x
2,"\nIn a word, yes.\n",0,alt.atheism
3,\nThey were attacking the Iraqis to drive them...,17,talk.politics.mideast
4,\nI've just spent two solid months arguing tha...,19,talk.religion.misc


In [31]:
import pandas as pd
import re

# Define a cleaning function

def clean_text(text):

    # Remove hyperlinks (URLs)

    text = re.sub(r'http\S+', '', text)
    
    # Remove special characters and digits

    text = re.sub(r'[^A-Za-z\s]+', '', text)
    
    # Remove extra whitespaces

    text = ' '.join(text.split())
    
    return text

# Apply cleaning function to 'text' column in ds_train

ds_train['cleaned_text'] = ds_train['text'].apply(clean_text)

# Apply cleaning function to 'text' column in ds_test

ds_test['cleaned_text'] = ds_test['text'].apply(clean_text)

# Print cleaned datasets to verify

print("Cleaned ds_train:")
print(ds_train[['text', 'cleaned_text', 'label', 'label_text']])

print("\nCleaned ds_test:")
print(ds_test[['text', 'cleaned_text', 'label', 'label_text']])


Cleaned ds_train:
                                                    text  \
0      I was wondering if anyone out there could enli...   
1      A fair number of brave souls who upgraded thei...   
2      well folks, my mac plus finally gave up the gh...   
3      \nDo you have Weitek's address/phone number?  ...   
4      From article <C5owCB.n3p@world.std.com>, by to...   
...                                                  ...   
11309  DN> From: nyeda@cnsvax.uwec.edu (David Nye)\nD...   
11310  I have a (very old) Mac 512k and a Mac Plus, b...   
11311  I just installed a DX2-66 CPU in a clone mothe...   
11312  \nWouldn't this require a hyper-sphere.  In 3-...   
11313  Stolen from Pasadena between 4:30 and 6:30 pm ...   

                                            cleaned_text  label  \
0      I was wondering if anyone out there could enli...      7   
1      A fair number of brave souls who upgraded thei...      4   
2      well folks my mac plus finally gave up the gho...    

In [32]:
from datasets import Dataset, DatasetDict

# assign the splits

train = Dataset.from_pandas(ds_train)
test = Dataset.from_pandas(ds_test)

# reconstruct both datasets into a Dataset Dict object

new_ds = DatasetDict(
    {
        'train': train,
        'test': test
    }
)
# view the resulting dataset dict object

new_ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text', 'cleaned_text'],
        num_rows: 11314
    })
    test: Dataset({
        features: ['text', 'label', 'label_text', 'cleaned_text'],
        num_rows: 7532
    })
})

In [33]:
%store new_ds

Stored 'new_ds' (DatasetDict)


In [34]:
# Save as csv

train_df = new_ds['train'].to_pandas()
test_df = new_ds['test'].to_pandas()

train_df.to_csv('train.csv', index=False)
test_df.to_csv('test.csv', index=False)