### 데이터 불러오기

In [2]:
from datasets import load_dataset

data_files = {
    "train" : "./data/drugsCom_raw/drugsComTrain_raw.tsv",
    "test" : "./data/drugsCom_raw/drugsComTest_raw.tsv",
}
drug_dataset = load_dataset("csv", data_files=data_files, delimiter='\t')

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [3]:
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

### 데이터 확인

In [5]:
drug_sample = drug_dataset["train"].shuffle(seed=42).select(range(1000))
drug_sample[:3]

{'Unnamed: 0': [87571, 178045, 80482],
 'drugName': ['Naproxen', 'Duloxetine', 'Mobic'],
 'condition': ['Gout, Acute', 'ibromyalgia', 'Inflammatory Conditions'],
 'review': ['"like the previous person mention, I&#039;m a strong believer of aleve, it works faster for my gout than the prescription meds I take. No more going to the doctor for refills.....Aleve works!"',
  '"I have taken Cymbalta for about a year and a half for fibromyalgia pain. It is great\r\nas a pain reducer and an anti-depressant, however, the side effects outweighed \r\nany benefit I got from it. I had trouble with restlessness, being tired constantly,\r\ndizziness, dry mouth, numbness and tingling in my feet, and horrible sweating. I am\r\nbeing weaned off of it now. Went from 60 mg to 30mg and now to 15 mg. I will be\r\noff completely in about a week. The fibro pain is coming back, but I would rather deal with it than the side effects."',
  '"I have been taking Mobic for over a year with no side effects other than 

### 유니크 체크

In [6]:
for split in drug_dataset.keys():
    assert len(drug_dataset[split]) == len(drug_dataset[split].unique("Unnamed: 0"))

### 컬럼 이름 변경

In [7]:
drug_dataset = drug_dataset.rename_column(
    original_column_name="Unnamed: 0", new_column_name="patient_id"
)
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

### filter와 map

In [8]:
def filter_nones(x):
    return x["condition"] is not None

def lowercase_condition(x):
    return {"condition" : x["condition"].lower()}

In [9]:
drug_dataset = drug_dataset.filter(filter_nones)
drug_dataset = drug_dataset.map(lowercase_condition)
drug_dataset["train"]["condition"][:3]

Filter:   0%|          | 0/161297 [00:00<?, ? examples/s]

Filter:   0%|          | 0/53766 [00:00<?, ? examples/s]

Map:   0%|          | 0/160398 [00:00<?, ? examples/s]

Map:   0%|          | 0/53471 [00:00<?, ? examples/s]

['left ventricular dysfunction', 'adhd', 'birth control']

### 새로운 컬럼 만들기

In [10]:
def compute_review_length(example):
    return {"review_length": len(example["review"].split())}

In [11]:
drug_dataset = drug_dataset.map(compute_review_length)
drug_dataset["train"][0]

Map:   0%|          | 0/160398 [00:00<?, ? examples/s]

Map:   0%|          | 0/53471 [00:00<?, ? examples/s]

{'patient_id': 206461,
 'drugName': 'Valsartan',
 'condition': 'left ventricular dysfunction',
 'review': '"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"',
 'rating': 9.0,
 'date': 'May 20, 2012',
 'usefulCount': 27,
 'review_length': 17}

In [12]:
drug_dataset["train"].sort("review_length")[:3]

{'patient_id': [111469, 13653, 53602],
 'drugName': ['Ledipasvir / sofosbuvir',
  'Amphetamine / dextroamphetamine',
  'Alesse'],
 'condition': ['hepatitis c', 'adhd', 'birth control'],
 'review': ['"Headache"', '"Great"', '"Awesome"'],
 'rating': [10.0, 10.0, 10.0],
 'date': ['February 3, 2015', 'October 20, 2009', 'November 23, 2015'],
 'usefulCount': [41, 3, 0],
 'review_length': [1, 1, 1]}

In [13]:
drug_dataset = drug_dataset.filter(lambda x:x["review_length"] > 30)
print(drug_dataset.num_rows)

Filter:   0%|          | 0/160398 [00:00<?, ? examples/s]

Filter:   0%|          | 0/53471 [00:00<?, ? examples/s]

{'train': 138514, 'test': 46108}


In [14]:
import html

text = "I&#039;m a transformer called BERT"
html.unescape(text)

"I'm a transformer called BERT"

In [15]:
drug_dataset = drug_dataset.map(lambda x: {"review": html.unescape(x["review"])})

Map:   0%|          | 0/138514 [00:00<?, ? examples/s]

Map:   0%|          | 0/46108 [00:00<?, ? examples/s]

### map() 메서드의 대단한 능력

In [16]:
new_drug_dataset = drug_dataset.map(
    lambda x: {"review": [html.unescape(o) for o in x["review"]]}, batched=True
)

Map:   0%|          | 0/138514 [00:00<?, ? examples/s]

Map:   0%|          | 0/46108 [00:00<?, ? examples/s]

In [20]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_function(examples):
    return tokenizer(examples["review"], truncation=True)

In [21]:
tokenized_dataset = drug_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/138514 [00:00<?, ? examples/s]

Map:   0%|          | 0/46108 [00:00<?, ? examples/s]

In [23]:
def tokenize_and_split(examples):
    return tokenizer(
        examples["review"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )


In [24]:
result = tokenize_and_split(drug_dataset["train"][0])
[len(inp) for inp in result["input_ids"]]
result["input_ids"]


[[101,
  107,
  1422,
  1488,
  1110,
  9079,
  1194,
  1117,
  2223,
  1989,
  1104,
  1130,
  19972,
  11083,
  119,
  1284,
  1245,
  4264,
  1165,
  1119,
  1310,
  1142,
  1314,
  1989,
  117,
  1165,
  1119,
  1408,
  1781,
  1103,
  2439,
  13753,
  1119,
  1209,
  1129,
  1113,
  119,
  1370,
  1160,
  1552,
  117,
  1119,
  1180,
  6374,
  1243,
  1149,
  1104,
  1908,
  117,
  1108,
  1304,
  172,
  14687,
  1183,
  117,
  1105,
  7362,
  1111,
  2212,
  129,
  2005,
  1113,
  170,
  2797,
  1313,
  1121,
  1278,
  12020,
  113,
  1304,
  5283,
  1111,
  1140,
  119,
  114,
  146,
  1270,
  1117,
  3995,
  1113,
  6356,
  2106,
  1105,
  1131,
  1163,
  1106,
  6166,
  1122,
  1149,
  170,
  1374,
  1552,
  119,
  3969,
  1293,
  1119,
  1225,
  1120,
  1278,
  117,
  1105,
  1114,
  2033,
  1146,
  1107,
  1103,
  2106,
  119,
  1109,
  1314,
  1160,
  1552,
  1138,
  1151,
  2463,
  1714,
  119,
  1124,
  1110,
  150,
  21986,
  3048,
  1167,
  5340,
  1895,
  1190,
  1518,

In [26]:
tokenized_dataset = drug_dataset.map(
    tokenize_and_split, batched=True, remove_columns=drug_dataset["train"].column_names
)

Map:   0%|          | 0/138514 [00:00<?, ? examples/s]

Map:   0%|          | 0/46108 [00:00<?, ? examples/s]

In [27]:
len(tokenized_dataset["train"]), len(drug_dataset["train"])


(206772, 138514)

In [54]:
def tokenize_and_split(examples):
    result = tokenizer(
        examples["review"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )
    # 신규 인덱스와 이전 인덱스와의 매핑 추출
    sample_map = result.pop("overflow_to_sample_mapping")
    for key, values in examples.items():
        result[key] = [values[i] for i in sample_map]
    return result

# tokenize_and_split(drug_dataset['train'][0:1])#["patient_id"]

In [55]:
tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True)
tokenized_dataset


Map:   0%|          | 0/138514 [00:00<?, ? examples/s]

Map:   0%|          | 0/46108 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 206772
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 68876
    })
})