# Chapter 5 assignments



1.   Use the techniques from Chapter 3 to train a classifier that can predict the patient condition based on the drug review.

2.   Use the summarization pipeline from Chapter 1 to generate summaries of the reviews.



In [1]:
# from tqdm.auto import tqdm - removed as causing issues for Github to display notebooks.

In [2]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [3]:
!wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip"
!unzip drugsCom_raw.zip

--2026-01-19 13:55:26--  https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘drugsCom_raw.zip’

drugsCom_raw.zip        [     <=>            ]  41.00M  25.5MB/s    in 1.6s    

2026-01-19 13:55:28 (25.5 MB/s) - ‘drugsCom_raw.zip’ saved [42989872]

Archive:  drugsCom_raw.zip
  inflating: drugsComTest_raw.tsv    
  inflating: drugsComTrain_raw.tsv   


In [6]:
import html
from datasets import load_dataset, DownloadConfig, disable_progress_bar
from transformers import AutoTokenizer, DataCollatorWithPadding

disable_progress_bar()

download_config = DownloadConfig(disable_tqdm=True)

data_files = {"train": "drugsComTrain_raw.tsv", "test": "drugsComTest_raw.tsv"}
drug_dataset = load_dataset("csv", data_files=data_files, delimiter="\t", download_config=download_config)
drug_dataset = drug_dataset.rename_column(
    original_column_name="Unnamed: 0", new_column_name="patient_id")
drug_dataset = drug_dataset.filter(lambda x: x["condition"] is not None)

def lowercase_condition(example):
    return {"condition": example["condition"].lower()}
drug_dataset = drug_dataset.map(lowercase_condition)

# Applying HTML unescaping to the Review column
drug_dataset = drug_dataset.map(
    lambda x: {"review": [html.unescape(o) for o in x["review"]]}, batched=True)

# Collecting all unique conditions
all_conditions = set()
for split_name in drug_dataset.keys():
    all_conditions.update(drug_dataset[split_name].unique("condition"))

# Sprting a list to ensure consistent label assignments
unique_conditions = sorted(list(all_conditions))

label_to_id = {condition: i for i, condition in enumerate(unique_conditions)}
id_to_label = {i: condition for i, condition in enumerate(unique_conditions)}

def convert_condition_to_labels(example):
    example["labels"] = label_to_id[example["condition"]]
    return example

# Applying mapping to create the 'labels' column and then removing the original 'condition' column
drug_dataset = drug_dataset.map(convert_condition_to_labels)
drug_dataset = drug_dataset.remove_columns(["condition"])

# Reducing the number of samples for each split (5% of original)
for split_name in drug_dataset.keys():
    original_size = len(drug_dataset[split_name])
    sample_size = int(original_size * 0.05)
    if sample_size == 0 and original_size > 0:
        sample_size = 1
    # Randomly sample the dataset.
    drug_dataset[split_name] = drug_dataset[split_name].shuffle(seed=42).select(range(sample_size))
print(f"Dataset reduced to 5%. New sizes: {{train: {len(drug_dataset['train'])}, test: {len(drug_dataset['test'])}}}")

# Verifying the mapping and new column
print(f"Number of unique labels: {len(unique_conditions)}")
print(f"Example label mapping: {list(label_to_id.items())[:5]}")
print(drug_dataset["train"].features)

# Tokenizing the dataset after all other transformations
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
def tokenize_function(examples):
    return tokenizer(examples["review"], truncation=True)
tokenized_dataset = drug_dataset.map(tokenize_function, batched=True)

# Creating validation split
if "validation" not in tokenized_dataset.keys():
    train_test_split_dataset = tokenized_dataset["train"].train_test_split(test_size=0.1, seed=42)
    tokenized_dataset["train"] = train_test_split_dataset["train"]
    tokenized_dataset["validation"] = train_test_split_dataset["test"]

# removing extraneous columns that are not model inputs or labels from the tokenized_dataset
columns_to_remove = [col for col in tokenized_dataset["train"].column_names if col not in ["input_ids", "attention_mask", "token_type_ids", "labels"]]
tokenized_dataset = tokenized_dataset.remove_columns(columns_to_remove)

# Setting the format to 'torch' after all necessary columns are removed
tokenized_dataset.set_format("torch")

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Create DataLoaders
from torch.utils.data import DataLoader
train_dataloader = DataLoader(
    tokenized_dataset["train"], shuffle=True, batch_size=8, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_dataset["validation"], batch_size=8, collate_fn=data_collator
)

Dataset reduced to 5%. New sizes: {train: 8019, test: 2673}
Number of unique labels: 916
Example label mapping: [('0</span> users found this comment helpful.', 0), ('100</span> users found this comment helpful.', 1), ('105</span> users found this comment helpful.', 2), ('10</span> users found this comment helpful.', 3), ('110</span> users found this comment helpful.', 4)]
{'patient_id': Value('int64'), 'drugName': Value('string'), 'review': Value('string'), 'rating': Value('float64'), 'date': Value('string'), 'usefulCount': Value('int64'), 'labels': Value('int64')}


In [12]:
import os
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"

from accelerate import Accelerator
from transformers import AutoModelForSequenceClassification, get_scheduler
from torch.optim import AdamW

accelerator = Accelerator()

checkpoint = "bert-base-cased"

# Dynamically determining num_labels from the unique_conditions found earlier
# Ensuring id_to_label and label_to_id are available from previous steps
num_labels = len(id_to_label)

model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint, num_labels=num_labels, id2label=id_to_label, label2id=label_to_id
)
optimizer = AdamW(model.parameters(), lr=3e-5)

train_dl, eval_dl, model, optimizer = accelerator.prepare(
    train_dataloader, eval_dataloader, model, optimizer
)

num_epochs = 3
num_training_steps = num_epochs * len(train_dl)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

model.train()
for epoch in range(num_epochs):
    for batch in train_dl:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
!pip install evaluate



In [14]:
import evaluate
import numpy as np

# Initialize the metric for multi-class classification
# Using 'f1' with 'average="weighted"' for multi-class problem.
metric = evaluate.load("f1")

model.eval()
for batch in eval_dataloader:
    # Move batch to device
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)

    # batch of predictions and references to the metric
    metric.add_batch(predictions=predictions, references=batch["labels"])

# Compute the final metric
metric.compute(average="weighted")


{'f1': 0.41851862592985506}

on a 5% subset of dataset, model could obtain almost 50% F1 score => result is good enough.

  

---



Summarization part

In [29]:
import os
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"

In [24]:
!wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip"
!unzip drugsCom_raw.zip

--2026-01-19 14:49:18--  https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘drugsCom_raw.zip.3’

drugsCom_raw.zip.3      [       <=>          ]  41.00M  20.4MB/s    in 2.0s    

2026-01-19 14:49:20 (20.4 MB/s) - ‘drugsCom_raw.zip.3’ saved [42989872]

Archive:  drugsCom_raw.zip
replace drugsComTest_raw.tsv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: drugsComTest_raw.tsv    
replace drugsComTrain_raw.tsv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: drugsComTrain_raw.tsv   


In [25]:
import html
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding


data_files = {"train": "drugsComTrain_raw.tsv", "test": "drugsComTest_raw.tsv"}
drug_dataset = load_dataset("csv", data_files=data_files, delimiter="\t")
drug_dataset = drug_dataset.rename_column(
    original_column_name="Unnamed: 0", new_column_name="patient_id")
drug_dataset = drug_dataset.filter(lambda x: x["condition"] is not None)

# Applying HTML unescaping directly to the drug_dataset's review column
drug_dataset = drug_dataset.map(
    lambda x: {"review": [html.unescape(o) for o in x["review"]]}, batched=True)


In [26]:
# creating dataset of only reviews
drug_dataset_reviews = drug_dataset["train"]["review"]

In [27]:
# checking review text
drug_dataset_reviews[1]

'"My son is halfway through his fourth week of Intuniv. We became concerned when he began this last week, when he started taking the highest dose he will be on. For two days, he could hardly get out of bed, was very cranky, and slept for nearly 8 hours on a drive home from school vacation (very unusual for him.) I called his doctor on Monday morning and she said to stick it out a few days. See how he did at school, and with getting up in the morning. The last two days have been problem free. He is MUCH more agreeable than ever. He is less emotional (a good thing), less cranky. He is remembering all the things he should. Overall his behavior is better. \r\nWe have tried many different medications and so far this is the most effective."'

In [31]:
import os
# checking pipeline
from transformers import pipeline

summarizer = pipeline("summarization")
synthesized_review = summarizer(drug_dataset_reviews[1])

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0


In [32]:
# running pipeline on 5 entries
summaries = summarizer(drug_dataset_reviews[:5])
for i, summary in enumerate(summaries):
    print(f"--- Review {i+1} ---")
    print(f"Original: {drug_dataset_reviews[i]}")
    print(f"Summary: {summary['summary_text']}\n")

Your max_length is set to 142, but your input_length is only 25. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=12)
Your max_length is set to 142, but your input_length is only 110. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=55)


--- Review 1 ---
Original: "It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"
Summary:  "It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil" "It's a combination of bystolic . 5mg and fish oil," she says of her anti-depressants . "I take it with no side effects," she said of taking it with fish oil .

--- Review 2 ---
Original: "My son is halfway through his fourth week of Intuniv. We became concerned when he began this last week, when he started taking the highest dose he will be on. For two days, he could hardly get out of bed, was very cranky, and slept for nearly 8 hours on a drive home from school vacation (very unusual for him.) I called his doctor on Monday morning and she said to stick it out a few days. See how he did at school, and with getting up in the morning. The last two days have been problem free. He is MUCH more agreeable than ever. He is less emotional (a good thing), less cranky. He is remembering all the things