# Datasets
How to work with local and remote datasets 

In [1]:
# download json datasets (large-scale dataset for question answering in Italian)
!curl -0 https://github.com/crux82/squad-it/raw/master/SQuAD_it-train.json.gz
!curl -0 https://github.com/crux82/squad-it/raw/master/SQuAD_it-test.json.gz

# !gzip -dkv SQuAD_it-*.json.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0


In [2]:
from datasets import load_dataset

# can load just a single file
squad_it_dataset = load_dataset("json", data_files="data/SQuAD_it-train.json", field="data")

# or many files using a dict
data_files = {
    "train": "data/SQuAD_it-train.json", 
    "test": "data/SQuAD_it-test.json"
}
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")
squad_it_dataset


DatasetDict({
    train: Dataset({
        features: ['title', 'paragraphs'],
        num_rows: 442
    })
    test: Dataset({
        features: ['title', 'paragraphs'],
        num_rows: 48
    })
})

In [3]:
# or even decompress files (damn)
data_files = {
    "train": "data/SQuAD_it-train.json.gz", 
    "test": "data/SQuAD_it-test.json.gz"
}
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")
squad_it_dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'paragraphs'],
        num_rows: 442
    })
    test: Dataset({
        features: ['title', 'paragraphs'],
        num_rows: 48
    })
})

In [4]:
# or even a remote dataset (DAMN!)
url = "https://github.com/crux82/squad-it/raw/master/"
data_files = {
    "train": url + "SQuAD_it-train.json.gz",
    "test": url + "SQuAD_it-test.json.gz",
}
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")
squad_it_dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'paragraphs'],
        num_rows: 442
    })
    test: Dataset({
        features: ['title', 'paragraphs'],
        num_rows: 48
    })
})

# Manipulating data

In [5]:
# Download the Drug Review Dataset
# !wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip"
# !unzip drugsCom_raw.zip
# !mv drugs* data/

In [6]:
data_files = {"train": "data/drugsComTrain_raw.tsv", "test": "data/drugsComTest_raw.tsv"}
# \t is the tab character in Python
drug_dataset = load_dataset("csv", data_files=data_files, delimiter="\t")

In [7]:
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [8]:
# take a look at a small sample of them
drug_sample = drug_dataset["train"].shuffle(seed=42).select(range(1000))
drug_sample[:3]

{'Unnamed: 0': [87571, 178045, 80482],
 'drugName': ['Naproxen', 'Duloxetine', 'Mobic'],
 'condition': ['Gout, Acute', 'ibromyalgia', 'Inflammatory Conditions'],
 'review': ['"like the previous person mention, I&#039;m a strong believer of aleve, it works faster for my gout than the prescription meds I take. No more going to the doctor for refills.....Aleve works!"',
  '"I have taken Cymbalta for about a year and a half for fibromyalgia pain. It is great\r\nas a pain reducer and an anti-depressant, however, the side effects outweighed \r\nany benefit I got from it. I had trouble with restlessness, being tired constantly,\r\ndizziness, dry mouth, numbness and tingling in my feet, and horrible sweating. I am\r\nbeing weaned off of it now. Went from 60 mg to 30mg and now to 15 mg. I will be\r\noff completely in about a week. The fibro pain is coming back, but I would rather deal with it than the side effects."',
  '"I have been taking Mobic for over a year with no side effects other than 

In [9]:
# the "Unnamed: 0" column seems to be a unique patient id so check this
for split in drug_dataset.keys():
    assert len(drug_dataset[split]) == len(drug_dataset[split].unique("Unnamed: 0"))

In [10]:
# rename "Unnamed: 0" to "patient_id"
drug_dataset = drug_dataset.rename_column(
    original_column_name="Unnamed: 0", new_column_name="patient_id"
)
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [11]:
# convert all `condotion` entries to lowercase by making a function to map onto dataset
# Returns a dict with the new value of the key to replace
def lowercase_condition(example):
    return {"condition": example["condition"].lower()}


# # try to map it but uh-oh there are some null values
# drug_dataset.map(lowercase_condition)

In [12]:
# so filter out null values with lambda function
drug_dataset = drug_dataset.filter(lambda x: x["condition"] is not None)

# now we can lowercase
drug_dataset = drug_dataset.map(lowercase_condition)

drug_dataset["train"][0]

{'patient_id': 206461,
 'drugName': 'Valsartan',
 'condition': 'left ventricular dysfunction',
 'review': '"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"',
 'rating': 9.0,
 'date': 'May 20, 2012',
 'usefulCount': 27}

# Creating new columns (features)

In [13]:
# create function to compute new column value
# Returns a dict with the new key:value pair to add to each example
def compute_review_length(example):
    return {"review_length": len(example["review"].split())}

drug_dataset = drug_dataset.map(compute_review_length)
drug_dataset["train"][0]

{'patient_id': 206461,
 'drugName': 'Valsartan',
 'condition': 'left ventricular dysfunction',
 'review': '"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"',
 'rating': 9.0,
 'date': 'May 20, 2012',
 'usefulCount': 27,
 'review_length': 17}

# Sorting
Allows you to see extreme values

In [14]:
drug_dataset["train"].sort("review_length")[:3]

{'patient_id': [111469, 13653, 53602],
 'drugName': ['Ledipasvir / sofosbuvir',
  'Amphetamine / dextroamphetamine',
  'Alesse'],
 'condition': ['hepatitis c', 'adhd', 'birth control'],
 'review': ['"Headache"', '"Great"', '"Awesome"'],
 'rating': [10.0, 10.0, 10.0],
 'date': ['February 3, 2015', 'October 20, 2009', 'November 23, 2015'],
 'usefulCount': [41, 3, 0],
 'review_length': [1, 1, 1]}

In [15]:
# review that are short are not helpful so let's filter them out
drug_dataset = drug_dataset.filter(lambda x: x["review_length"] > 30)
drug_dataset.num_rows

{'train': 138514, 'test': 46108}

# Escaping HTML characters
Use python `html` module's `unescape()` function

In [16]:
import html

text = "I&#039;m a giant tool"
html.unescape(text)

"I'm a giant tool"

In [17]:
drug_dataset = drug_dataset.map(lambda x: {"review": html.unescape(x["review"])})

# Speeding up `.map()` calls
Using the `batched=True` argument makes it run faster 
(because it operates on a list of values at the same time, not just one

In [18]:
new_drug_dataset = drug_dataset.map(lambda x: {"review": [html.unescape(rev) for rev in x["review"]]}, batched=True)

## Putting it all together
Here we will tokenize our examples and truncate them to a maximum length of 128, but we will ask the tokenizer to return all the chunks of the texts instead of just the first one. 

This can be done with `return_overflowing_tokens=True`

In [19]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_and_split(examples):
    return tokenizer(
        examples["review"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True, # causes truncated tokens to be returned as another example
    )
    

In [20]:
# Try in out
result = tokenize_and_split(drug_dataset["train"][0])
[len(inp) for inp in result["input_ids"]]

[128, 49]

In [21]:
# # THIS WILL ERRROR
# tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True)

This produced an error because the .map() function deals with 1000 in a batch but the `return_overflowing_tokens=True` caused there to be 1463. However these extra 463 have no values for the other columns.

`ArrowInvalid: Column 8 named input_ids expected length 1000 but got length 1463`

We need to either 
1. remove the columns from the old dataset. We can do the former with the `remove_columns` argument
2. make them the same size as they are in the new dataset. Use the `overflow_to_sample_mapping` field

In [22]:
# 1. remove the columns from the old dataset
tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True, 
                                     remove_columns=drug_dataset["train"].column_names)

len(tokenized_dataset["train"]), len(drug_dataset["train"])

(206772, 138514)

In [23]:
# 2. make them the same size as they are in the new dataset
def tokenize_and_split(examples):
    result = tokenizer(
        examples["review"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )
    # Extract mapping between new and old indices
    sample_map = result.pop("overflow_to_sample_mapping")
    for key, values in examples.items():
        result[key] = [values[i] for i in sample_map]
    return result

In [24]:
# This works! it preserves all the data
tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True)
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 206772
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 68876
    })
})

# Converting betweed DataSet and Pandas, etc
Under the hood, datasets are Apache Arrow, but we can change the display format, eg:
```python
drug_dataset.set_format("pandas")
```
This changes the return format for the dataset’s `__getitem__()` dunder method

In [25]:
type(drug_dataset["train"])

datasets.arrow_dataset.Dataset

In [26]:
# change output format so it returns a dataframe, (even though it is still a dataset)
drug_dataset.set_format("pandas")
drug_dataset["train"][:3]

Unnamed: 0,patient_id,drugName,condition,review,rating,date,usefulCount,review_length
0,95260,Guanfacine,adhd,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192,141
1,92703,Lybrel,birth control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17,134
2,138000,Ortho Evra,birth control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10,89


In [27]:
# create a pandas.DataFrame for the whole training set by selecting all the elements of drug_dataset["train"]
train_df = drug_dataset["train"][:]
type(drug_dataset["train"][:])

pandas.core.frame.DataFrame

In [28]:
# Now we can do fancy pandas analysis
frequencies = (
    train_df["condition"]
    .value_counts()
    .to_frame()
    .reset_index()
    .rename(columns={"index": "condition", "condition": "frequency"})
)
frequencies.head()

Unnamed: 0,frequency,count
0,birth control,27655
1,depression,8023
2,acne,5209
3,anxiety,4991
4,pain,4744


In [29]:
#### Convert from pandas df into dataset

from datasets import Dataset

freq_dataset = Dataset.from_pandas(frequencies)
freq_dataset

Dataset({
    features: ['frequency', 'count'],
    num_rows: 819
})

In [30]:
# change format back to dataset
drug_dataset.reset_format()

# Train test split

In [31]:
# make a 20% validation set
drug_dataset_clean = drug_dataset["train"].train_test_split(train_size=0.8, seed=42)

# Rename the default "test" split to "validation"
drug_dataset_clean["validation"] = drug_dataset_clean.pop("test")

# Add the "test" set to our `DatasetDict`
drug_dataset_clean["test"] = drug_dataset["test"]
drug_dataset_clean

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 110811
    })
    validation: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 27703
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 46108
    })
})

# Saving Datasets
Can be saved in arrow(default), csv, json, parquet
- Arrow:	`Dataset.save_to_disk()` 
- CSV	`Dataset.to_csv()`
- JSON	`Dataset.to_json`
EG save cleaned dataset in the Arrow format

```python
drug_dataset_clean.save_to_disk("drug-reiews")
```

```tree
This will create a directory with the followingtx```treeopied
drug-reviews/
├── dataset_dict.json
├── test
│   ├── dataset.arrow
│   ├── dataset_info.json
│   └── state.json
├── train
│   ├── dataset.arrow
│   ├── dataset_info.json
│   ├── indices.arrow
│   └── state.json
└── validation
    ├── dataset.arrow
    ├── dataset_info.json
    ├── indices.arrow
    └── state.json
```

In [32]:
# arrow
from datasets import load_from_disk

drug_dataset_clean.save_to_disk("drug-reviews")
drug_dataset_reloaded = load_from_disk("drug-reviews")
drug_dataset_reloaded

Saving the dataset (0/1 shards):   0%|          | 0/110811 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/27703 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/46108 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 110811
    })
    validation: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 27703
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 46108
    })
})

In [37]:
# json (uses jsonlines `.jsonl` format (like a json array but uses `\n` instead of `,` as separator

# save
folder = "drug-reviews-json"
for split, dataset in drug_dataset_clean.items():
    dataset.to_json(f"{folder}/drug-reviews-{split}.jsonl")
# load
data_files = {
    "train": f"{folder}/drug-reviews-train.jsonl",
    "validation": f"{folder}/drug-reviews-validation.jsonl",
    "test": f"{folder}/drug-reviews-test.jsonl",
}
drug_dataset_reloaded = load_dataset("json", data_files=data_files)

Creating json from Arrow format:   0%|          | 0/111 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/28 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/47 [00:00<?, ?ba/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

# Dealing with very large datasets
Uses loading and streaming so data doesn't have to all fit in memory. Arrow uses memory mapping to map parts of the dataset onto virtual memory locations so they can be accessed in smaller pieces.

**Iterable Dataset** is used when the dataset is so huge that even downloading it is hard. So is it downloaded and used in pieces. Used by setting the `streamoing=True` argument in `load_dataset()`

Example: The Pile (825GB) is a huge NLP corpus, available in 14GB chunks

In [38]:
from datasets import load_dataset
# look at the PubMed Abstracts section (This takes a few minutes to run)
# THIS DATASET WAS TAKEN DOWN :(
data_files = "https://the-eye.eu/public/AI/pile_preliminary_components/PUBMED_title_abstracts_2019_baseline.jsonl.zst"
pubmed_dataset = load_dataset("json", data_files=data_files, split="train")
pubmed_dataset

FileNotFoundError: Unable to find 'https://the-eye.eu/public/AI/pile_preliminary_components/PUBMED_title_abstracts_2019_baseline.jsonl.zst'

# Assembling a dataset
ake dataset of github issues associated with the HF Datasets Repository

In [46]:
import requests
from dotenv import load_dotenv
import os
load_dotenv()  # take environment variables from .env

True

In [50]:
# retrieve the first issue on the first page
url = "https://api.github.com/repos/huggingface/datasets/issues?page=1&per_page=1"
headers = {"Authorization": f'token {os.getenv("GITHUB_TOKEN")}'}
response = requests.get(url, headers=headers)

In [51]:
response.json()

[{'url': 'https://api.github.com/repos/huggingface/datasets/issues/6640',
  'repository_url': 'https://api.github.com/repos/huggingface/datasets',
  'labels_url': 'https://api.github.com/repos/huggingface/datasets/issues/6640/labels{/name}',
  'comments_url': 'https://api.github.com/repos/huggingface/datasets/issues/6640/comments',
  'events_url': 'https://api.github.com/repos/huggingface/datasets/issues/6640/events',
  'html_url': 'https://github.com/huggingface/datasets/issues/6640',
  'id': 2115864531,
  'node_id': 'I_kwDODunzps5-HYfT',
  'number': 6640,
  'title': 'Sign Language Support',
  'user': {'login': 'Merterm',
   'id': 6684795,
   'node_id': 'MDQ6VXNlcjY2ODQ3OTU=',
   'avatar_url': 'https://avatars.githubusercontent.com/u/6684795?v=4',
   'gravatar_id': '',
   'url': 'https://api.github.com/users/Merterm',
   'html_url': 'https://github.com/Merterm',
   'followers_url': 'https://api.github.com/users/Merterm/followers',
   'following_url': 'https://api.github.com/users/Mert

In [52]:
import time
import math
from pathlib import Path
import pandas as pd
from tqdm.notebook import tqdm


def fetch_issues(
    owner="huggingface",
    repo="datasets",
    num_issues=10_000,
    rate_limit=5_000,
    issues_path=Path("."),
):
    if not issues_path.is_dir():
        issues_path.mkdir(exist_ok=True)

    batch = []
    all_issues = []
    per_page = 100  # Number of issues to return per page
    num_pages = math.ceil(num_issues / per_page)
    base_url = "https://api.github.com/repos"

    for page in tqdm(range(num_pages)):
        # Query with state=all to get both open and closed issues
        query = f"issues?page={page}&per_page={per_page}&state=all"
        issues = requests.get(f"{base_url}/{owner}/{repo}/{query}", headers=headers)
        batch.extend(issues.json())

        if len(batch) > rate_limit and len(all_issues) < num_issues:
            all_issues.extend(batch)
            batch = []  # Flush batch for next time period
            print(f"Reached GitHub rate limit. Sleeping for one hour ...")
            time.sleep(60 * 60 + 1)

    all_issues.extend(batch)
    df = pd.DataFrame.from_records(all_issues)
    df.to_json(f"{issues_path}/{repo}-issues.jsonl", orient="records", lines=True)
    print(
        f"Downloaded all the issues for {repo}! Dataset stored at {issues_path}/{repo}-issues.jsonl"
    )


In [None]:
# This takes a long time as we max ou t the api hourly limit
# we'll skip this part and use a prepared dataset :(
fetch_issues(issues_path=Path("./issues"))

  0%|          | 0/100 [00:00<?, ?it/s]

Reached GitHub rate limit. Sleeping for one hour ...


# Semantic Search
We will apply embeddings to this dataset to identify semantically similar issues. 


In [None]:
from datasets import load_dataset

issues_dataset = load_dataset("lewtun/github-issues", split="train")
issues_dataset