# 5. Datasets Library


In [3]:
# Load a nested JSON file: `field="data"` names the top-level key
# that holds the records.
from datasets import load_dataset

squad_it_dataset = load_dataset(
    "json",
    data_files="./datafolder/SQuAD/SQuAD_it-train.json",
    field="data",
)

Generating train split: 0 examples [00:00, ? examples/s]

In [4]:
# Display the loaded object (splits, features and row counts).
squad_it_dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'paragraphs'],
        num_rows: 442
    })
})

In [5]:
# Inspect a single record by indexing into the train split.
squad_it_dataset["train"][3]


{'title': 'A cappella',
 'paragraphs': [{'context': 'A cappella[un kap? p. p. lla] (italiano per "alla maniera della cappella") la musica è specificamente gruppale o solista che canta senza accompagnamento strumentale, o un brano destinato ad essere eseguito in questo modo. Contrasta con la cantata, che è accompagnata dal canto. Il termine "a cappella" aveva originariamente lo scopo di differenziare la polifonia rinascimentale dallo stile concertato barocco.',
   'qas': []},
  {'context': "Originariamente la musica cappella era usata nella musica religiosa, in particolare la musica della chiesa, nonché anasheed e zemirot. Il canto gregoriano è un esempio di canto a cappella, così come la maggior parte della musica vocale secolare rinascimentale. Il madrigale, fino al suo sviluppo nel primo barocco in forma strumentalmente accompagnata, è di solito anche a cappella. La musica ebraica e cristiana era in origine una cappella, e questa pratica è continuata sia in queste due religioni che n

In [7]:
# Map each split name to its file and pass that mapping as `data_files`
# to load several splits in one call.
data_files = dict(
    train="./datafolder/SQuAD/SQuAD_it-train.json",
    test="./datafolder/SQuAD/SQuAD_it-test.json",
)
squad_it_dataset = load_dataset("json", field="data", data_files=data_files)
squad_it_dataset

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['title', 'paragraphs'],
        num_rows: 442
    })
    test: Dataset({
        features: ['title', 'paragraphs'],
        num_rows: 48
    })
})

## 5-2. Slicing & Dicing

In [8]:
# One-time download of the raw archive (left commented out):
# !wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip"
# !unzip drugsCom_raw.zip
from datasets import load_dataset

# The files are tab-separated, so use the CSV loader with a tab delimiter.
data_files = dict(
    train="./datafolder/DrugReview/drugsComTrain_raw.tsv",
    test="./datafolder/DrugReview/drugsComTest_raw.tsv",
)
drug_dataset = load_dataset("csv", delimiter="\t", data_files=data_files)

In [9]:
# Draw a reproducible random sample: shuffle the train split with a fixed
# seed, then keep the first 1000 rows.
drug_sample = (
    drug_dataset["train"]
    .shuffle(seed=42)
    .select(range(1000))
)
drug_sample[:3]

{'Unnamed: 0': [87571, 178045, 80482],
 'drugName': ['Naproxen', 'Duloxetine', 'Mobic'],
 'condition': ['Gout, Acute', 'ibromyalgia', 'Inflammatory Conditions'],
 'review': ['"like the previous person mention, I&#039;m a strong believer of aleve, it works faster for my gout than the prescription meds I take. No more going to the doctor for refills.....Aleve works!"',
  '"I have taken Cymbalta for about a year and a half for fibromyalgia pain. It is great\r\nas a pain reducer and an anti-depressant, however, the side effects outweighed \r\nany benefit I got from it. I had trouble with restlessness, being tired constantly,\r\ndizziness, dry mouth, numbness and tingling in my feet, and horrible sweating. I am\r\nbeing weaned off of it now. Went from 60 mg to 30mg and now to 15 mg. I will be\r\noff completely in about a week. The fibro pain is coming back, but I would rather deal with it than the side effects."',
  '"I have been taking Mobic for over a year with no side effects other than 

In [10]:
# Data cleaning: check that "Unnamed: 0" holds a distinct value for every
# row of each split, i.e. that it can serve as a row identifier.
for split, split_ds in drug_dataset.items():
    unique_ids = split_ds.unique("Unnamed: 0")
    # Name the offending split and both counts so a failure is actionable.
    assert len(split_ds) == len(unique_ids), (
        f"{split}: {len(split_ds)} rows but only {len(unique_ids)} "
        'distinct "Unnamed: 0" values'
    )

In [11]:
# The uniqueness check above raised no error, so "Unnamed: 0" behaves like
# an ID column; give it the descriptive name "patient_id".
drug_dataset = drug_dataset.rename_column("Unnamed: 0", "patient_id")
drug_dataset



DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [15]:
def lowercase_condition(ex):
    """Return a column update holding the example's "condition" in lowercase."""
    lowered = ex["condition"].lower()
    return {"condition": lowered}
# drug_dataset.map(lowercase_condition)  <- expected to raise an error here
# (presumably because some "condition" values are None at this point).

In [13]:
# Keep only the rows whose "condition" is not None.
# Equivalent named-function form:
# def filter_nones(x):
#     return x["condition"] is not None
# drug_dataset = drug_dataset.filter(filter_nones)
# or, inline:
drug_dataset = drug_dataset.filter(lambda x: x["condition"] is not None)

In [14]:
def lowercase_condition(ex):
    """Build the update dict that replaces "condition" with its lowercase form."""
    condition_lower = ex["condition"].lower()
    return {"condition": condition_lower}
# Apply the lowercasing function across the dataset and keep the result.
drug_dataset = drug_dataset.map(lowercase_condition)

In [17]:
# Spot-check the first three training conditions after lowercasing.
drug_dataset["train"]["condition"][:3]

['left ventricular dysfunction', 'adhd', 'birth control']

In [18]:
def compute_review_length(ex):
    """Return a "review_length" entry: the number of whitespace-separated tokens in the review."""
    words = ex["review"].split()
    return {"review_length": len(words)}

In [19]:
# Add the "review_length" column, then inspect the first training example.
drug_dataset = drug_dataset.map(compute_review_length)
drug_dataset["train"][0]

{'patient_id': 206461,
 'drugName': 'Valsartan',
 'condition': 'left ventricular dysfunction',
 'review': '"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"',
 'rating': 9.0,
 'date': 'May 20, 2012',
 'usefulCount': 27,
 'review_length': 17}

In [20]:
# Sort the training split by "review_length" and view the first three rows
# (the shortest reviews).
drug_dataset["train"].sort("review_length")[:3]

{'patient_id': [111469, 13653, 53602],
 'drugName': ['Ledipasvir / sofosbuvir',
  'Amphetamine / dextroamphetamine',
  'Alesse'],
 'condition': ['hepatitis c', 'adhd', 'birth control'],
 'review': ['"Headache"', '"Great"', '"Awesome"'],
 'rating': [10.0, 10.0, 10.0],
 'date': ['February 3, 2015', 'October 20, 2009', 'November 23, 2015'],
 'usefulCount': [41, 3, 0],
 'review_length': [1, 1, 1]}