# Loading Datasets

## Local Files Loading


In [None]:
!wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-train.json.gz
!wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-test.json.gz
!gzip -dkv SQuAD_it-*.json.gz

In [None]:
!pip install datasets

In [8]:
from datasets import load_dataset

data_files = {"train": "SQuAD_it-train.json", "test": "SQuAD_it-test.json"}
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")
squad_it_dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'paragraphs'],
        num_rows: 48
    })
})

## Remote Loading

In [9]:
url = "https://github.com/crux82/squad-it/raw/master/"
data_files = {
    "train": url + "SQuAD_it-train.json.gz",
    "test": url + "SQuAD_it-test.json.gz",
}
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/7.73M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [10]:
squad_it_dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'paragraphs'],
        num_rows: 442
    })
    test: Dataset({
        features: ['title', 'paragraphs'],
        num_rows: 48
    })
})

# Manipulate Datasets

In [None]:
!wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip"
!unzip -y drugsCom_raw.zip

In [22]:
from datasets import load_dataset

data_split_files = {"train": "drugsComTrain_raw.tsv", "test": "drugsComTest_raw.tsv"}
dataset = load_dataset("csv", data_files=data_split_files, delimiter="\t")

In [17]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})


## Shuffle and slice

In [19]:
#shuffle and select a range of the dataset
samples = dataset['train'].shuffle(seed=42).select(range(10))
print(samples)

Dataset({
    features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
    num_rows: 10
})


## Rename

In [23]:
# rename a column
dataset = dataset.rename_column(
    original_column_name="Unnamed: 0", new_column_name="patient_id"
)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})


## Filter NaNs

In [30]:
print(f'length dataset before = {len(dataset["train"])+len(dataset["test"])}')
dataset = dataset.filter(lambda x: x['condition'] is not None)
print(f'length dataset after = {len(dataset["train"])+ len(dataset["test"])}')


length dataset before = 215063


Filter:   0%|          | 0/161297 [00:00<?, ? examples/s]

Filter:   0%|          | 0/53766 [00:00<?, ? examples/s]

length dataset after = 213869


## Map functions

In [31]:
def lowercase_condition(sample):
  return {"condition": sample['condition'].lower()}

print(dataset['train'][0])
dataset.map(lowercase_condition)
print(dataset['train'][0])


{'patient_id': 206461, 'drugName': 'Valsartan', 'condition': 'Left Ventricular Dysfunction', 'review': '"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"', 'rating': 9.0, 'date': 'May 20, 2012', 'usefulCount': 27}


Map:   0%|          | 0/160398 [00:00<?, ? examples/s]

Map:   0%|          | 0/53471 [00:00<?, ? examples/s]

{'patient_id': 206461, 'drugName': 'Valsartan', 'condition': 'Left Ventricular Dysfunction', 'review': '"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"', 'rating': 9.0, 'date': 'May 20, 2012', 'usefulCount': 27}


## Add a column

In [33]:
def review_length(sample):
    return {"review_length": len(sample["review"].split())}

dataset = dataset.map(review_length)
print(dataset['train'])


Map:   0%|          | 0/160398 [00:00<?, ? examples/s]

Map:   0%|          | 0/53471 [00:00<?, ? examples/s]

Dataset({
    features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
    num_rows: 160398
})


## Sort values

In [34]:
dataset["train"].sort("review_length")[:3]

{'patient_id': [111469, 13653, 53602],
 'drugName': ['Ledipasvir / sofosbuvir',
  'Amphetamine / dextroamphetamine',
  'Alesse'],
 'condition': ['Hepatitis C', 'ADHD', 'Birth Control'],
 'review': ['"Headache"', '"Great"', '"Awesome"'],
 'rating': [10.0, 10.0, 10.0],
 'date': ['February 3, 2015', 'October 20, 2009', 'November 23, 2015'],
 'usefulCount': [41, 3, 0],
 'review_length': [1, 1, 1]}

## Deal with HTML characters

In [35]:
import html

text = "I&#039;m a transformer called BERT"
html.unescape(text)

"I'm a transformer called BERT"

In [46]:
%%timeit
new_dataset = dataset.map(lambda sample: {"review": html.unescape(sample["review"])}, batched=True)

14.4 ms ± 3.45 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [47]:
%%timeit
new_dataset = dataset.map(
    lambda x: {"review": [html.unescape(o) for o in x["review"]]}, batched=True
)

15.8 ms ± 1.78 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


## Tokenizers and Batching

In [None]:
!pip install transformers

In [54]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=True)

def tokenize_function(sample):
  return tokenizer(sample['review'], truncation=True)

In [55]:
%time dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/160398 [00:00<?, ? examples/s]

CPU times: user 1min 35s, sys: 854 ms, total: 1min 36s
Wall time: 1min 8s


DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 160398
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 53471
    })
})

# Datasets and Pandas

In [56]:
dataset.set_format('pandas')

In [59]:
dataset['train'][:3]

Unnamed: 0,patient_id,drugName,condition,review,rating,date,usefulCount,review_length
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9.0,"May 20, 2012",27,17
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192,141
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17,134


In [60]:
train_df = dataset['train'][:]
frequencies = (
    train_df["condition"]
    .value_counts()
    .to_frame()
    .reset_index()
    .rename(columns={"index": "condition", "condition": "frequency"})
)
frequencies.head()

Unnamed: 0,condition,frequency
0,Birth Control,28788
1,Depression,9069
2,Pain,6145
3,Anxiety,5904
4,Acne,5588


In [61]:
from datasets import Dataset

freq_dataset = Dataset.from_pandas(frequencies)
freq_dataset

Dataset({
    features: ['condition', 'frequency'],
    num_rows: 884
})

In [None]:
# reset output format of dataset
dataset.reset_format()


# Train-Test Split

In [63]:
drug_dataset_clean = dataset["train"].train_test_split(train_size=0.8, seed=42)
# Rename the default "test" split to "validation"
drug_dataset_clean["validation"] = drug_dataset_clean.pop("test")
# Add the "test" set to our `DatasetDict`
drug_dataset_clean["test"] = dataset["test"]
drug_dataset_clean

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 128318
    })
    validation: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 32080
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 53471
    })
})

# Save Dataset

In [None]:
# Example arrow format
drug_dataset_clean.save_to_disk("drug-reviews")

# Example json format
for split, dataset in drug_dataset_clean.items():
    dataset.to_json(f"drug-reviews-{split}.jsonl")

# Loading saved dataset
drug_dataset_reloaded = load_from_disk("drug-reviews")
