In [None]:
!pip install transformers



In [None]:
import transformers

In [None]:
!pip install transformers[sentencepiece]



2. Using transformers.

---
Behind the pipelines


In [None]:
from transformers import pipeline

classifier = pipeline("sentiment-analysis")
classifier(
    [
        "I've been waiting for a HuggingFace course my whole life.",
        "I hate this so much!",
        "Great place to work!"
    ]
)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


[{'label': 'POSITIVE', 'score': 0.9598049521446228},
 {'label': 'NEGATIVE', 'score': 0.9994558691978455},
 {'label': 'POSITIVE', 'score': 0.9998779296875}]

In [None]:
from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")
print("input IDs and attention masks: \n ", inputs)



input IDs and attention masks: 
  {'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  1045,  5223,  2023,  2061,  2172,   999,   102,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}


In [None]:
from transformers import AutoModel

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModel.from_pretrained(checkpoint)
outputs = model(**inputs)
print("Hidden size: The vector dimension of each model input: \n", outputs.last_hidden_state.shape)

Hidden size: The vector dimension of each model input: 
 torch.Size([2, 16, 768])


In [None]:
from transformers import AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
outputs = model(**inputs)
print("tensore shape: \n", outputs.logits.shape)
print("unnormalized scores or logits: \n", outputs.logits)

tensore shape: 
 torch.Size([2, 2])
unnormalized scores or logits: 
 tensor([[-1.5607,  1.6123],
        [ 4.1692, -3.3464]], grad_fn=<AddmmBackward0>)


In [None]:
import torch

predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print("predictions (-02 means zeros before first digit 0.0): \n", predictions)
print("labels: \n", model.config.id2label)
# model.config.id2label

predictions (-02 means zeros before first digit 0.0): 
 tensor([[4.0195e-02, 9.5980e-01],
        [9.9946e-01, 5.4418e-04]], grad_fn=<SoftmaxBackward0>)
labels: 
 {0: 'NEGATIVE', 1: 'POSITIVE'}


---
Tokenizers


In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

sequence = "Using a Transformer network is simple"#["I’ve been waiting for a HuggingFace course my whole life.", "I hate this so much!"]
tokens = tokenizer.tokenize(sequence)

print(tokens)

ids = tokenizer.convert_tokens_to_ids(tokens)

print(ids)

['Using', 'a', 'Trans', '##former', 'network', 'is', 'simple']
[7993, 170, 13809, 23763, 2443, 1110, 3014]


---
Handling multiple sequences

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)

# input_ids = torch.tensor([ids])
# print("Input IDs:", input_ids)

batched_ids = [ids, ids]
batched_ids = torch.tensor(batched_ids)
print("Input IDs:", batched_ids)


# output = model(input_ids)
# print("Logits:", output.logits)

output = model(batched_ids)
print("Logits:", output.logits)

# obtain the same logits as before (but twice):

Input IDs: tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012],
        [ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012]])
Logits: tensor([[-2.7276,  2.8789],
        [-2.7276,  2.8789]], grad_fn=<AddmmBackward0>)


In [None]:
"""Apply the tokenization manually on the two sentences used in section 2
(“I’ve been waiting for a HuggingFace course my whole life.” and “I hate this so much!”).
Pass them through the model and check that you get the same logits as in section 2.

Now batch them together using the padding token,
then create the proper attention mask.
Check that you obtain the same results when going through the model!"""

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."
sequence2 = "I hate this so much!"
tokens = tokenizer.tokenize(sequence)
sequence_ids = tokenizer.convert_tokens_to_ids(tokens)
tokens2 = tokenizer.tokenize(sequence2)
sequence2_ids = tokenizer.convert_tokens_to_ids(tokens2)
print("sequence: ", model(torch.tensor([sequence_ids])).logits)
print("sequence2: ", model(torch.tensor([sequence2_ids])).logits)

# Corrected batched_ids and padding
max_length = max(len(sequence_ids), len(sequence2_ids))
batched_ids = [
    sequence_ids + [tokenizer.pad_token_id] * (max_length - len(sequence_ids)),
    sequence2_ids + [tokenizer.pad_token_id] * (max_length - len(sequence2_ids)),
]

# Create attention mask
attention_mask = [
    [1] * len(sequence_ids) + [0] * (max_length - len(sequence_ids)),
    [1] * len(sequence2_ids) + [0] * (max_length - len(sequence2_ids)),
]
print("attention_mask:", attention_mask)

# Pass batched_ids and attention mask to the model
inputs = {"input_ids": torch.tensor(batched_ids), "attention_mask": torch.tensor(attention_mask)}
print("batch:", model(**inputs).logits)


sequence:  tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward0>)
sequence2:  tensor([[ 3.1931, -2.6685]], grad_fn=<AddmmBackward0>)
attention_mask: [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]
batch: tensor([[-2.7276,  2.8789],
        [ 3.1931, -2.6685]], grad_fn=<AddmmBackward0>)


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
print(tokens)
output = model(**tokens)
print(output)

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  2061,  2031,  1045,   999,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}
SequenceClassifierOutput(loss=None, logits=tensor([[-1.5607,  1.6123],
        [-3.6183,  3.9137]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


3. Finetuning pretrained model
---
Processing the data


In [None]:
from datasets import load_dataset

raw_datasets = load_dataset("glue", "mrpc")
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [None]:
raw_train_dataset = raw_datasets["train"]
raw_train_dataset[15]

{'sentence1': 'Rudder was most recently senior vice president for the Developer & Platform Evangelism Business .',
 'sentence2': 'Senior Vice President Eric Rudder , formerly head of the Developer and Platform Evangelism unit , will lead the new entity .',
 'label': 0,
 'idx': 16}

In [None]:
raw_val_dataset = raw_datasets["validation"]
raw_val_dataset[87]

{'sentence1': 'However , EPA officials would not confirm the 20 percent figure .',
 'sentence2': 'Only in the past few weeks have officials settled on the 20 percent figure .',
 'label': 0,
 'idx': 812}

In [None]:
from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenized_sentences_1 = tokenizer(raw_train_dataset["sentence1"][0])
tokenized_sentences_2 = tokenizer(raw_train_dataset["sentence2"][0])
tokenized_sentences_1, tokenized_sentences_2


({'input_ids': [101, 2572, 3217, 5831, 5496, 2010, 2567, 1010, 3183, 2002, 2170, 1000, 1996, 7409, 1000, 1010, 1997, 9969, 4487, 23809, 3436, 2010, 3350, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]},
 {'input_ids': [101, 7727, 2000, 2032, 2004, 2069, 1000, 1996, 7409, 1000, 1010, 2572, 3217, 5831, 5496, 2010, 2567, 1997, 9969, 4487, 23809, 3436, 2010, 3350, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]})

In [None]:
inputs = tokenizer("Rudder was most recently senior vice president for the Developer & Platform Evangelism Business .", "Senior Vice President Eric Rudder , formerly head of the Developer and Platform Evangelism unit , will lead the new entity .")
inputs

{'input_ids': [101, 24049, 2001, 2087, 3728, 3026, 3580, 2343, 2005, 1996, 9722, 1004, 4132, 9340, 12439, 2964, 2449, 1012, 102, 3026, 3580, 2343, 4388, 24049, 1010, 3839, 2132, 1997, 1996, 9722, 1998, 4132, 9340, 12439, 2964, 3131, 1010, 2097, 2599, 1996, 2047, 9178, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

Replicate the preprocessing on the GLUE SST-2 dataset. It’s a little bit different since it’s composed of single sentences instead of pairs, but the rest of what we did should look the same. For a harder challenge, try to write a preprocessing function that works on any of the GLUE tasks.

In [None]:
from datasets import load_dataset, DownloadConfig, disable_progress_bar

disable_progress_bar()
download_config = DownloadConfig(disable_tqdm=True)

raw_datasets = load_dataset("glue", "sst2", download_config=download_config)

In [3]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [None]:
raw_train_dataset = raw_datasets["train"]
raw_train_dataset[15]

{'sentence': 'the greatest musicians ', 'label': 1, 'idx': 15}

In [None]:
from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenized_sentences_1 = tokenizer(raw_train_dataset["sentence"][0])
tokenized_sentences_1


{'input_ids': [101, 5342, 2047, 3595, 8496, 2013, 1996, 18643, 3197, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
tokenized_sentences_2 = tokenizer(raw_train_dataset["sentence"][1])
tokenized_sentences_2

{'input_ids': [101, 3397, 2053, 15966, 1010, 2069, 4450, 2098, 18201, 2015, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
def tokenize_function(example):
    return tokenizer(example["sentence"], truncation=True)

  # changed the output from 'return tokenizer(example["sentence1"], example["sentence2"], truncation=True)'
  # as dataset has only one sentence, no pairs

In [None]:
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [None]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

samples = tokenized_datasets["train"][:8]
samples = {k: v for k, v in samples.items() if k not in ["idx", "sentence"]}
[len(x) for x in samples["input_ids"]]

[10, 11, 15, 10, 22, 13, 29, 6]

In [None]:
batch = data_collator(samples)
{k: v.shape for k, v in batch.items()}
#  8 samples with padding of 29

{'input_ids': torch.Size([8, 29]),
 'token_type_ids': torch.Size([8, 29]),
 'attention_mask': torch.Size([8, 29]),
 'labels': torch.Size([8])}

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [44]:
from transformers import TrainingArguments
# argument you have to provide is a directory where the trained model will be saved
training_args = TrainingArguments("test-trainer")

In [45]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [46]:
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    processing_class=tokenizer,
)

In [None]:
trainer.train()

In [48]:
# Evaluation
predictions = trainer.predict(tokenized_datasets["validation"])
print(predictions.predictions.shape, predictions.label_ids.shape)

(408, 2) (408,)


In [49]:
import numpy as np

preds = np.argmax(predictions.predictions, axis=-1)

In [None]:
# !pip install evaluate

In [50]:
import evaluate

metric = evaluate.load("glue", "mrpc")

In [51]:
metric.compute(predictions=preds, references=predictions.label_ids)

{'accuracy': 0.8504901960784313, 'f1': 0.8964346349745331}

In [None]:
def compute_metrics(eval_preds):
    metric = evaluate.load("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [None]:
training_args = TrainingArguments("test-trainer", eval_strategy="epoch")
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
# Full training and eval loop from scratch with modern PyTorch best practices

# data loading
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from tqdm.auto import tqdm # Import tqdm

raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# preprocess data before training
tokenized_datasets = tokenized_datasets.remove_columns(["sentence1", "sentence2", "idx"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names

# define data loaders
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=8, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=8, collate_fn=data_collator
)

# training with accelerator
from accelerate import Accelerator
from torch.optim import AdamW
from transformers import AutoModelForSequenceClassification, get_scheduler

accelerator = Accelerator()

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
optimizer = AdamW(model.parameters(), lr=3e-5)

train_dl, eval_dl, model, optimizer = accelerator.prepare(
    train_dataloader, eval_dataloader, model, optimizer
)

# scheduler
num_epochs = 3
num_training_steps = num_epochs * len(train_dl)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# accessing GPU/TPU
# import torch # This import is already at the top

# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# model.to(device) # This is handled by accelerator.prepare
# device

# train
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dl:
        # batch = {k: v.to(device) for k, v in batch.items()} # This is handled by accelerator.prepare
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)



In [5]:
# evaluate
import evaluate
import torch

metric = evaluate.load("glue", "mrpc")
model.eval()
for batch in eval_dl: # Use eval_dl prepared by accelerator
    batch = {k: v.to(accelerator.device) for k, v in batch.items()} # Use accelerator.device
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

{'accuracy': 0.8529411764705882, 'f1': 0.8951048951048951}

4 module - about pushing models and creating repos on Hugginface.
---
Not interested trying.


5 module
---
Downloading datasets from anywhere


In [None]:
from datasets import load_dataset

url = "https://github.com/crux82/squad-it/raw/master/"
data_files = {
    "train": url + "SQuAD_it-train.json.gz",
    "test": url + "SQuAD_it-test.json.gz",
}
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")

In [10]:
!wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip"
!unzip drugsCom_raw.zip

from datasets import load_dataset

data_files = {"train": "drugsComTrain_raw.tsv", "test": "drugsComTest_raw.tsv"}
# \t is the tab character in Python
drug_dataset = load_dataset("csv", data_files=data_files, delimiter="\t")

drug_sample = drug_dataset["train"].shuffle(seed=42).select(range(1000))
# Peek at the first few examples


--2026-01-20 17:36:27--  https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘drugsCom_raw.zip.1’

drugsCom_raw.zip.1      [   <=>              ]  41.00M  87.8MB/s    in 0.5s    

2026-01-20 17:36:28 (87.8 MB/s) - ‘drugsCom_raw.zip.1’ saved [42989872]

Archive:  drugsCom_raw.zip
replace drugsComTest_raw.tsv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: drugsComTest_raw.tsv    
replace drugsComTrain_raw.tsv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: drugsComTrain_raw.tsv   


In [11]:
drug_sample[:3]

{'Unnamed: 0': [87571, 178045, 80482],
 'drugName': ['Naproxen', 'Duloxetine', 'Mobic'],
 'condition': ['Gout, Acute', 'ibromyalgia', 'Inflammatory Conditions'],
 'review': ['"like the previous person mention, I&#039;m a strong believer of aleve, it works faster for my gout than the prescription meds I take. No more going to the doctor for refills.....Aleve works!"',
  '"I have taken Cymbalta for about a year and a half for fibromyalgia pain. It is great\r\nas a pain reducer and an anti-depressant, however, the side effects outweighed \r\nany benefit I got from it. I had trouble with restlessness, being tired constantly,\r\ndizziness, dry mouth, numbness and tingling in my feet, and horrible sweating. I am\r\nbeing weaned off of it now. Went from 60 mg to 30mg and now to 15 mg. I will be\r\noff completely in about a week. The fibro pain is coming back, but I would rather deal with it than the side effects."',
  '"I have been taking Mobic for over a year with no side effects other than 

In [None]:
for split in drug_dataset.keys():
    assert len(drug_dataset[split]) == len(drug_dataset[split].unique("drugName"))

In [13]:
print("Unique drug names in training set:", len(drug_dataset["train"].unique("drugName")))
print("Unique drug names in test set:", len(drug_dataset["test"].unique("drugName")))

Unique drug names in training set: 3436
Unique drug names in test set: 2637


In [14]:
for split in drug_dataset.keys():
    print(len(drug_dataset["train"].unique("condition")))

885
885


In [15]:
drug_dataset = drug_dataset.rename_column(
    original_column_name="Unnamed: 0", new_column_name="patient_id"
)
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [None]:
# def filter_nones(x):
#     return x["condition"] is not None
# drug_dataset.filter(filter_nones)

# or

drug_dataset = drug_dataset.filter(lambda x: x["condition"] is not None)

In [18]:
def lowercase_condition(example):
    return {"condition": example["condition"].lower()}


drug_dataset.map(lowercase_condition)

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 160398
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53471
    })
})

In [None]:
def compute_review_length(example):
    return {"review_length": len(example["review"].split())}
drug_dataset = drug_dataset.map(compute_review_length)


# or alternative way to add new columns to a dataset is with the Dataset.add_column() function.

In [21]:
# Inspect the first training example
drug_dataset["train"][0]

{'patient_id': 206461,
 'drugName': 'Valsartan',
 'condition': 'Left Ventricular Dysfunction',
 'review': '"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"',
 'rating': 9.0,
 'date': 'May 20, 2012',
 'usefulCount': 27,
 'review_length': 17}

In [22]:
drug_dataset["train"].sort("review_length")[:3]

{'patient_id': [111469, 13653, 53602],
 'drugName': ['Ledipasvir / sofosbuvir',
  'Amphetamine / dextroamphetamine',
  'Alesse'],
 'condition': ['Hepatitis C', 'ADHD', 'Birth Control'],
 'review': ['"Headache"', '"Great"', '"Awesome"'],
 'rating': [10.0, 10.0, 10.0],
 'date': ['February 3, 2015', 'October 20, 2009', 'November 23, 2015'],
 'usefulCount': [41, 3, 0],
 'review_length': [1, 1, 1]}

In [None]:
drug_dataset = drug_dataset.filter(lambda x: x["review_length"] > 30) # max length?


In [25]:
print(drug_dataset.num_rows)

{'train': 138514, 'test': 46108}


In [None]:
import html

text = "I&#039;m a transformer called BERT"
html.unescape(text)
drug_dataset = drug_dataset.map(lambda x: {"review": html.unescape(x["review"])})

# or batched
new_drug_dataset = drug_dataset.map(
    lambda x: {"review": [html.unescape(o) for o in x["review"]]}, batched=True
)

Execute the same instruction with and without batched=True, then try it with a slow tokenizer (add use_fast=False in the AutoTokenizer.from_pretrained() method) so you can see what numbers you get on your hardware.

In [None]:
new_drug_dataset = drug_dataset.map(
    lambda x: {"review": [html.unescape(o) for o in x["review"]]}, batched=False
)

In [29]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    return tokenizer(examples["review"], truncation=True)

In [33]:
%time tokenized_dataset = drug_dataset.map(tokenize_function, batched=True)

CPU times: user 28.4 ms, sys: 3.93 ms, total: 32.3 ms
Wall time: 70.9 ms


In [35]:
%time tokenized_dataset = drug_dataset.map(tokenize_function, batched=False)

CPU times: user 22.5 ms, sys: 1.97 ms, total: 24.4 ms
Wall time: 25.4 ms


In [36]:
# try with slow tokenizer
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False)


def tokenize_function(examples):
    return tokenizer(examples["review"], truncation=True)

In [38]:
%time tokenized_dataset = drug_dataset.map(tokenize_function, batched=True)

CPU times: user 588 ms, sys: 3.02 ms, total: 591 ms
Wall time: 610 ms


In [40]:
%time tokenized_dataset = drug_dataset.map(tokenize_function, batched=False)

CPU times: user 381 ms, sys: 4 ms, total: 385 ms
Wall time: 385 ms


In [None]:
slow_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False)


def slow_tokenize_function(examples):
    return slow_tokenizer(examples["review"], truncation=True)


tokenized_dataset = drug_dataset.map(slow_tokenize_function, batched=True, num_proc=8)

In [None]:
%time tokenized_dataset = drug_dataset.map(tokenize_function, batched=True)

CPU times: user 918 ms, sys: 16.6 ms, total: 935 ms
Wall time: 1.52 s


Compute the average rating per drug and store the result in a new Dataset.

In [None]:
drug_dataset.set_format("pandas")
train_df = drug_dataset["train"][:]

In [None]:
average_rating = (
    train_df.groupby("drugName")["rating"]
    .mean()
    .to_frame()
    .reset_index()
    .rename(columns={"rating": "average_rating"})
)
display(average_rating.head())

Unnamed: 0,drugName,average_rating
0,A + D Cracked Skin Relief,10.0
1,A / B Otic,10.0
2,Abacavir / dolutegravir / lamivudine,8.211538
3,Abacavir / lamivudine / zidovudine,9.0
4,Abatacept,7.157895


In [None]:
from datasets import Dataset

avrating_dataset = Dataset.from_pandas(average_rating)
avrating_dataset

Dataset({
    features: ['drugName', 'average_rating'],
    num_rows: 3431
})

In [None]:
drug_dataset.reset_format()

1. Use the techniques from Chapter 3 to train a classifier that can predict the patient condition based on the drug review.
2. Use the summarization pipeline from Chapter 1 to generate summaries of the reviews.

In [None]:
https://colab.research.google.com/drive/1Zr0_bBFcAcKtQoVNQJk6OD7Gs30N89sQ