## Using datasets library to manipulate data

In [1]:
from datasets import load_dataset

# Map each split name to its raw file.
data_files = dict(
    train="drugsCom/drugsComTrain_raw.tsv",
    test="drugsCom/drugsComTest_raw.tsv",
)
# The .tsv files are parsed with the CSV loader, using the tab character as field delimiter.
drug_dataset = load_dataset("csv", delimiter="\t", data_files=data_files)

Using custom data configuration default-97c6b25da183d6dd
Found cached dataset csv (/home/jovyan/.cache/huggingface/datasets/csv/default-97c6b25da183d6dd/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
# Draw a reproducible sample: shuffle the training split with a fixed seed,
# then keep the first 1000 rows.
drug_sample = (
    drug_dataset["train"]
    .shuffle(seed=42)
    .select(range(1000))
)
# Peek at the first three sampled examples
drug_sample[:3]

Loading cached shuffled indices for dataset at /home/jovyan/.cache/huggingface/datasets/csv/default-97c6b25da183d6dd/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-0b007d0bb412a255.arrow


{'Unnamed: 0': [87571, 178045, 80482],
 'drugName': ['Naproxen', 'Duloxetine', 'Mobic'],
 'condition': ['Gout, Acute', 'ibromyalgia', 'Inflammatory Conditions'],
 'review': ['"like the previous person mention, I&#039;m a strong believer of aleve, it works faster for my gout than the prescription meds I take. No more going to the doctor for refills.....Aleve works!"',
  '"I have taken Cymbalta for about a year and a half for fibromyalgia pain. It is great\r\nas a pain reducer and an anti-depressant, however, the side effects outweighed \r\nany benefit I got from it. I had trouble with restlessness, being tired constantly,\r\ndizziness, dry mouth, numbness and tingling in my feet, and horrible sweating. I am\r\nbeing weaned off of it now. Went from 60 mg to 30mg and now to 15 mg. I will be\r\noff completely in about a week. The fibro pain is coming back, but I would rather deal with it than the side effects."',
  '"I have been taking Mobic for over a year with no side effects other than 

The sample above reveals a few problems that need cleaning:
- The `Unnamed: 0` column seems to be an anonymized ID for each patient
- The `condition` column has a mix of uppercase and lowercase labels
- The `review` entries have varying lengths and contain a mix of Python line separators (`\r\n`) and HTML character codes (e.g. `&#039;`)

## Fixing with Datasets

In [3]:
# If every value of `Unnamed: 0` is distinct within each split, the column can
# act as an identifier -> possible patient ID column
assert all(
    len(split_data) == len(split_data.unique("Unnamed: 0"))
    for split_data in drug_dataset.values()
), "Column doesn't contain unique values"

In [4]:
# Give the `Unnamed: 0` column a descriptive name
drug_dataset = drug_dataset.rename_column("Unnamed: 0", "patient_id")

drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

### Using Lambda functions
`lambda <arguments> : <expression>`

To apply a lambda to an input, wrap the lambda in parentheses and pass the input as an argument:
`(lambda x: x * x)(3)`

In [5]:
# Drop every example whose condition is missing (None)
drug_dataset = drug_dataset.filter(lambda example: example["condition"] is not None)

# Make all condition labels to lowercase
def lowercase_condition(example):
    """Return a one-key update holding the example's condition label in lowercase."""
    lowered = example["condition"].lower()
    return {"condition": lowered}

# Apply the lowercasing function to every example; the returned dict
# overwrites the existing `condition` column.
drug_dataset = drug_dataset.map(lowercase_condition)

## Alternatively, the same transformation written as a lambda function
# drug_dataset = drug_dataset.map(lambda x: {"condition": x["condition"].lower()})

Loading cached processed dataset at /home/jovyan/.cache/huggingface/datasets/csv/default-97c6b25da183d6dd/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-111556d3cd4c29ff.arrow
Loading cached processed dataset at /home/jovyan/.cache/huggingface/datasets/csv/default-97c6b25da183d6dd/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-bd7e353c72f7fc1c.arrow
Loading cached processed dataset at /home/jovyan/.cache/huggingface/datasets/csv/default-97c6b25da183d6dd/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-20b33e358e20ba05.arrow
Loading cached processed dataset at /home/jovyan/.cache/huggingface/datasets/csv/default-97c6b25da183d6dd/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-a8055dc50a65a597.arrow


In [6]:
# Confirm the lowercasing worked by looking at the first three training labels
drug_dataset["train"][:3]["condition"]

['left ventricular dysfunction', 'adhd', 'birth control']

## Creating new columns

In [7]:
# Create a new column stating the length of reviews
def compute_review_length(example):
    """Return the number of whitespace-separated words in the example's review."""
    words = example["review"].split()
    return {"review_length": len(words)}

# Add a `review_length` column (word count of the review) to every example
drug_dataset = drug_dataset.map(compute_review_length)

# Inspect first training sample
drug_dataset["train"][0]

Loading cached processed dataset at /home/jovyan/.cache/huggingface/datasets/csv/default-97c6b25da183d6dd/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-0f45d2efa82cde38.arrow
Loading cached processed dataset at /home/jovyan/.cache/huggingface/datasets/csv/default-97c6b25da183d6dd/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-aee987a42e792552.arrow


{'patient_id': 206461,
 'drugName': 'Valsartan',
 'condition': 'left ventricular dysfunction',
 'review': '"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"',
 'rating': 9.0,
 'date': 'May 20, 2012',
 'usefulCount': 27,
 'review_length': 17}

In [8]:
# Sorting based on review length and showing the three shortest reviews
# (the sorted result is only displayed here, not assigned back to drug_dataset)
drug_dataset["train"].sort("review_length")[:3]

Loading cached sorted indices for dataset at /home/jovyan/.cache/huggingface/datasets/csv/default-97c6b25da183d6dd/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-11c0886302407ee3.arrow


{'patient_id': [103488, 23627, 20558],
 'drugName': ['Loestrin 21 1 / 20', 'Chlorzoxazone', 'Nucynta'],
 'condition': ['birth control', 'muscle spasm', 'pain'],
 'review': ['"Excellent."', '"useless"', '"ok"'],
 'rating': [10.0, 1.0, 6.0],
 'date': ['November 4, 2008', 'March 24, 2017', 'August 20, 2016'],
 'usefulCount': [5, 2, 10],
 'review_length': [1, 1, 1]}

Alternatively, `Dataset.add_column()` can be used to add a new column, where the column is provided as a Python list or NumPy array

In [9]:
# Keep only reviews with more than 30 words
# (reviews of exactly 30 words or fewer are removed)
drug_dataset = drug_dataset.filter(lambda x: x["review_length"] > 30)
print(drug_dataset.num_rows)

Loading cached processed dataset at /home/jovyan/.cache/huggingface/datasets/csv/default-97c6b25da183d6dd/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-7148d432cb1e963d.arrow
Loading cached processed dataset at /home/jovyan/.cache/huggingface/datasets/csv/default-97c6b25da183d6dd/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-efdd282431f12076.arrow


{'train': 138514, 'test': 46108}


## Removing HTML characters

In [10]:
import html

# Decode HTML character references in each review (e.g. `&#039;` becomes `'`)
drug_dataset = drug_dataset.map(lambda x: {"review": html.unescape(x["review"])})

Loading cached processed dataset at /home/jovyan/.cache/huggingface/datasets/csv/default-97c6b25da183d6dd/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-0a04daef21bc05bc.arrow
Loading cached processed dataset at /home/jovyan/.cache/huggingface/datasets/csv/default-97c6b25da183d6dd/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-e223015b6aca6b70.arrow


## Further .map applications and optimizations

If we use `batched=True` in `Dataset.map()`, a batch of examples is sent to the map function at once (1,000 examples per batch by default). <br>
When using `batched=True`, the function receives a dictionary with the fields of the dataset, but each value is a list of values instead of a single value. <br>
Hence, we use a list comprehension to apply the same `unescape` functionality to every review in the batch.

In [11]:
%%time

# With batched=True each field arrives as a list, so unescape every review in the batch
new_drug_dataset = drug_dataset.map(
    lambda batch: {"review": list(map(html.unescape, batch["review"]))},
    batched=True,
)

Loading cached processed dataset at /home/jovyan/.cache/huggingface/datasets/csv/default-97c6b25da183d6dd/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-4ea74fc943684a38.arrow
Loading cached processed dataset at /home/jovyan/.cache/huggingface/datasets/csv/default-97c6b25da183d6dd/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-49ee338eb27110c2.arrow


CPU times: user 9.06 ms, sys: 367 µs, total: 9.42 ms
Wall time: 8.5 ms


`batched=True` is also what lets "fast" tokenizers show their speed, since they can tokenize a whole batch of texts at once

In [12]:
%%time

from transformers import AutoTokenizer

# Tokenizer for the bert-base-cased checkpoint, loaded with AutoTokenizer's default settings
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    """Tokenize the "review" field of `examples`, truncating over-long inputs."""
    reviews = examples["review"]
    return tokenizer(reviews, truncation=True)

CPU times: user 222 ms, sys: 96.3 ms, total: 319 ms
Wall time: 1.98 s


Now, we try multiprocessing with tokenizers

In [13]:
%%time

# Same checkpoint, but explicitly requesting the non-fast tokenizer implementation
slow_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False)


def slow_tokenize_function(examples):
    """Tokenize the "review" field with the slow tokenizer, truncating over-long inputs."""
    reviews = examples["review"]
    return slow_tokenizer(reviews, truncation=True)


# Spread the batched tokenization over 8 worker processes
tokenized_dataset = drug_dataset.map(
    slow_tokenize_function,
    batched=True,
    num_proc=8,
)

 

Loading cached processed dataset at /home/jovyan/.cache/huggingface/datasets/csv/default-97c6b25da183d6dd/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-5045070b19151322.arrow


 

Loading cached processed dataset at /home/jovyan/.cache/huggingface/datasets/csv/default-97c6b25da183d6dd/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-181f4f6f4f724840.arrow


 

Loading cached processed dataset at /home/jovyan/.cache/huggingface/datasets/csv/default-97c6b25da183d6dd/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-9242f68f534e3485.arrow


 

Loading cached processed dataset at /home/jovyan/.cache/huggingface/datasets/csv/default-97c6b25da183d6dd/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-2d3380c34091e2dd.arrow


 

Loading cached processed dataset at /home/jovyan/.cache/huggingface/datasets/csv/default-97c6b25da183d6dd/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-e45bf40bbe9f540e.arrow


 

Loading cached processed dataset at /home/jovyan/.cache/huggingface/datasets/csv/default-97c6b25da183d6dd/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-a8d7592365a18617.arrow


 

Loading cached processed dataset at /home/jovyan/.cache/huggingface/datasets/csv/default-97c6b25da183d6dd/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-3ec7fd9e4d192d78.arrow


 

Loading cached processed dataset at /home/jovyan/.cache/huggingface/datasets/csv/default-97c6b25da183d6dd/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-9a81e070acd219d3.arrow


 

Loading cached processed dataset at /home/jovyan/.cache/huggingface/datasets/csv/default-97c6b25da183d6dd/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-bcfece771939d008.arrow


 

Loading cached processed dataset at /home/jovyan/.cache/huggingface/datasets/csv/default-97c6b25da183d6dd/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-1133aa0efb8d2121.arrow


 

Loading cached processed dataset at /home/jovyan/.cache/huggingface/datasets/csv/default-97c6b25da183d6dd/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-96c1585161000564.arrow


 

Loading cached processed dataset at /home/jovyan/.cache/huggingface/datasets/csv/default-97c6b25da183d6dd/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-6fbba5be04c1f45d.arrow


 

Loading cached processed dataset at /home/jovyan/.cache/huggingface/datasets/csv/default-97c6b25da183d6dd/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-26f4bc61446e6a24.arrow


 

Loading cached processed dataset at /home/jovyan/.cache/huggingface/datasets/csv/default-97c6b25da183d6dd/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-8aa3078d8ca45459.arrow


 

Loading cached processed dataset at /home/jovyan/.cache/huggingface/datasets/csv/default-97c6b25da183d6dd/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-351e23cc8b30beab.arrow


 

Loading cached processed dataset at /home/jovyan/.cache/huggingface/datasets/csv/default-97c6b25da183d6dd/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-c4a4e3fe9f8bf354.arrow


CPU times: user 4.22 s, sys: 0 ns, total: 4.22 s
Wall time: 5.2 s


`Dataset.map()` with `batched=True` can also be used to change the number of elements in the dataset. (This is useful when we want to create several features from one example.)

In [14]:
def tokenize(examples):
    """Tokenize the "review" field of `examples` with truncation enabled."""
    reviews = examples["review"]
    return tokenizer(reviews, truncation=True)


# Number of tokens produced for the first training review
result = tokenize(drug_dataset["train"][0])
print(len(result["input_ids"]))

175


In [15]:
# With max_length=128 and return_overflowing_tokens=True, a review that is too long
# comes back as several sequences instead of a single truncated one.
def tokenize_and_split(examples):
    """Tokenize the "review" field into sequences of at most 128 tokens, keeping the overflow."""
    options = dict(truncation=True, max_length=128, return_overflowing_tokens=True)
    return tokenizer(examples["review"], **options)


result = tokenize_and_split(drug_dataset["train"][0])
list(map(len, result["input_ids"]))

[128, 49]

In [16]:
## Applying to whole dataset (this will result in an error due to the splitting:
## the tokenizer output has more rows than the original columns of the batch)
# tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True)

# We need to either remove columns from old dataset, or make them the same size as they are in the new dataset.
# METHOD 1: To remove columns, we can use remove_columns
# (here every original column is dropped, so only the tokenizer outputs remain)

tokenized_dataset_method_1 = drug_dataset.map(
    tokenize_and_split, batched=True, remove_columns=drug_dataset["train"].column_names
)

Loading cached processed dataset at /home/jovyan/.cache/huggingface/datasets/csv/default-97c6b25da183d6dd/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-0c40f73da428c1ce.arrow
Loading cached processed dataset at /home/jovyan/.cache/huggingface/datasets/csv/default-97c6b25da183d6dd/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-afef10f91e566986.arrow


In [17]:
tokenized_dataset_method_1

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'overflow_to_sample_mapping'],
        num_rows: 206772
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'overflow_to_sample_mapping'],
        num_rows: 68876
    })
})

In [18]:
# METHOD 2: Make old columns the same size as the new ones using overflow_to_sample_mapping

# NOTE: this redefines `tokenize_and_split`, replacing the earlier definition of the
# same name; any cell run after this one will use this version.
def tokenize_and_split(examples):
    """Tokenize reviews into sequences of at most 128 tokens and repeat the
    original column values for every sequence produced from an example."""
    encoded = tokenizer(
        examples["review"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )

    # Mapping between new and old indices: one entry per output sequence,
    # used below to index into the original batch columns.
    source_index = encoded.pop("overflow_to_sample_mapping")
    for column, values in examples.items():
        encoded[column] = [values[i] for i in source_index]
    return encoded


tokenized_dataset_method_2 = drug_dataset.map(tokenize_and_split, batched=True)

Loading cached processed dataset at /home/jovyan/.cache/huggingface/datasets/csv/default-97c6b25da183d6dd/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-0e334cd67c9c2859.arrow
Loading cached processed dataset at /home/jovyan/.cache/huggingface/datasets/csv/default-97c6b25da183d6dd/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-1c57ecc473d0bb53.arrow


In [19]:
tokenized_dataset_method_2

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 206772
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 68876
    })
})

## Using Datasets with DataFrames

In [27]:
# Converting to pandas dataframe format
# (set_format is called without reassignment, so it changes how `drug_dataset`
# itself returns rows; it is undone later with reset_format)
drug_dataset.set_format("pandas")
drug_dataset["train"][:3]

Unnamed: 0,patient_id,drugName,condition,review,rating,date,usefulCount,review_length
0,95260,Guanfacine,adhd,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192,141
1,92703,Lybrel,birth control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17,134
2,138000,Ortho Evra,birth control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10,89


In [28]:
# Slice the whole training split; with the pandas format active, the result is
# used below as a DataFrame (value_counts / groupby)
train_df = drug_dataset["train"][:]

In [30]:
# Count how many reviews there are for each condition.
# `rename_axis` + `reset_index(name=...)` name both output columns explicitly, so the
# result is always ["condition", "frequency"]. The previous `rename` mapping relied on
# the labels produced by older pandas; on pandas 2.x `value_counts()` names the Series
# "count" and the index "condition", which left the columns mislabeled.
frequencies = (
    train_df["condition"]
    .value_counts()
    .rename_axis("condition")
    .reset_index(name="frequency")
)
frequencies.head()

Unnamed: 0,condition,frequency
0,birth control,27655
1,depression,8023
2,acne,5209
3,anxiety,4991
4,pain,4744


In [46]:
# Average rating per drug, as a two-column frame: drugName, mean_rating
mean_rating = (
    train_df
    .groupby("drugName")["rating"]
    .mean()
    .rename("mean_rating")
    .reset_index()
)
mean_rating.head()

Unnamed: 0,drugName,mean_rating
0,A + D Cracked Skin Relief,10.000000
1,A / B Otic,10.000000
2,Abacavir / dolutegravir / lamivudine,7.953488
3,Abacavir / lamivudine / zidovudine,9.000000
4,Abatacept,7.312500
...,...,...
3047,Zyvox,9.200000
3048,ZzzQuil,4.000000
3049,depo-subQ provera 104,1.000000
3050,ella,6.847826


In [48]:
# Converting back to Apache Arrow format
from datasets import Dataset

# Wrap the condition-frequency DataFrame in a Dataset
freq_dataset = Dataset.from_pandas(frequencies)
freq_dataset

Dataset({
    features: ['condition', 'frequency'],
    num_rows: 819
})

In [49]:
# Wrap the per-drug mean-rating DataFrame in a Dataset
mean_rating_dataset = Dataset.from_pandas(mean_rating)
mean_rating_dataset

Dataset({
    features: ['drugName', 'mean_rating'],
    num_rows: 3052
})

In [50]:
# Converting back to Apache Arrow -> for modelling purposes
# (undoes the earlier set_format("pandas") call on drug_dataset)
drug_dataset.reset_format()

## Creating validation set

Using `Dataset.train_test_split()` we can split the training set into training and validation

In [51]:
# Split the original training data 80/20 with a fixed seed so the split is reproducible;
# train_test_split names the two resulting splits "train" and "test"
drug_dataset_clean = drug_dataset["train"].train_test_split(train_size=0.8, seed=42)
# Rename default "test" split to "validation"
drug_dataset_clean["validation"] = drug_dataset_clean.pop("test")
# Add actual "test" set to `DatasetDict`
# (must come after the pop above, otherwise the real test set would be the one renamed)
drug_dataset_clean["test"] = drug_dataset["test"]
drug_dataset_clean

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 110811
    })
    validation: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 27703
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 46108
    })
})

## Saving processed datasets

After cleaning the raw datasets, we may wish to save them

In [52]:
# Saving in Arrow: writes all three splits (train / validation / test) under the "drug-reviews" path
drug_dataset_clean.save_to_disk("drug-reviews")

Flattening the indices:   0%|          | 0/111 [00:00<?, ?ba/s]

Saving the dataset (0/1 shards):   0%|          | 0/110811 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/28 [00:00<?, ?ba/s]

Saving the dataset (0/1 shards):   0%|          | 0/27703 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/46108 [00:00<?, ? examples/s]

In [53]:
# Loading in Arrow
from datasets import load_from_disk

# Read back what save_to_disk wrote under the same "drug-reviews" path
drug_dataset_reloaded = load_from_disk("drug-reviews")
drug_dataset_reloaded



DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 110811
    })
    validation: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 27703
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 46108
    })
})

### .csv and .json

For .csv and .json, we save each split as a separate file. This can be done as shown below

In [54]:
# Saving to json: one .jsonl file per split, named after the split

for split in drug_dataset_clean:
    drug_dataset_clean[split].to_json(f"drug-reviews-{split}.jsonl")

Creating json from Arrow format:   0%|          | 0/111 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/28 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/47 [00:00<?, ?ba/s]

In [55]:
# Loading from json: point each split name at the file that was written for it

data_files = dict(
    train="drug-reviews-train.jsonl",
    validation="drug-reviews-validation.jsonl",
    test="drug-reviews-test.jsonl",
)
drug_dataset_reloaded = load_dataset("json", data_files=data_files)

Using custom data configuration default-78827673c9a84218


Downloading and preparing dataset json/default to /home/jovyan/.cache/huggingface/datasets/json/default-78827673c9a84218/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /home/jovyan/.cache/huggingface/datasets/json/default-78827673c9a84218/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]