# Exercise: Full fine-tuning BERT

In this exercise, you will create a spam classifier by fully fine-tuning DistilBERT (a smaller, distilled version of BERT) with the [Hugging Face Transformers](https://huggingface.co/transformers/) library. You will use the [SMS Spam dataset](https://huggingface.co/datasets/sms_spam) to train and evaluate your model.

The SMS Spam dataset contains text messages that are labeled as either spam or not spam. The dataset is published with a single `train` split, so this notebook holds out 20% of it as a test set and trains on the remaining 80%.


In [None]:
# Install the Python packages this notebook needs (-q keeps pip's output short).
# Packages that are already installed are not reinstalled.
! pip install -q \
    scikit-learn \
    evaluate \
    datasets \
    "transformers[torch]" \
    ipywidgets

In [None]:
# Training is intended to run on a GPU, so check that one is available and
# has enough free memory before loading the model.
! nvidia-smi

In [None]:
# Load the sms_spam dataset and hold out part of it for evaluation.
# See: https://huggingface.co/datasets/sms_spam

from datasets import load_dataset

# Names of the two splits created below; later cells loop over this list.
splits = ["train", "test"]

# sms_spam is published with a train split only, so carve the test set out of
# it: 20% of the rows, shuffled with a fixed seed so the split is reproducible.
dataset = load_dataset("sms_spam", split="train").train_test_split(
    seed=23, shuffle=True, test_size=0.2
)

# Display the training split (last expression, so the notebook renders it)
dataset["train"]

In [None]:
# Inspect the first example of the training split
dataset["train"][0]

In [None]:
# Inspect another example (index 2 of the training split)
print(dataset["train"][2])

In [None]:
from transformers import AutoTokenizer


# Tokenizer that matches the checkpoint fine-tuned later in this notebook.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def preprocess_function(examples):
    """Tokenize a batch of SMS messages.

    Args:
        examples: A batch of dataset rows, as handed over by ``Dataset.map``
            with ``batched=True``. The message texts are read from its
            ``"sms"`` column.

    Returns:
        The tokenizer's output for the batch. Over-long messages are
        truncated (``truncation=True``); no padding is applied here.
    """
    return tokenizer(examples["sms"], truncation=True)


# Tokenize every split and keep the results in a plain dict keyed by split name.
tokenized_dataset = {}
for split in splits:
    tokenized_dataset[split] = dataset[split].map(preprocess_function, batched=True)

In [None]:
# Display the tokenized training split produced by the map() call above.
tokenized_dataset["train"]
# Uncomment to look at the token ids of the first example:
# print(tokenized_dataset["train"][0]["input_ids"])

In [None]:
import evaluate
import numpy as np
from transformers import AutoModelForSequenceClassification

# Accuracy metric from the `evaluate` library, loaded once and reused on
# every evaluation pass.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A pair ``(predictions, labels)``. The predictions hold one
            row of per-class scores per example; the labels are the
            reference class ids.

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted class ids.
    """
    scores, references = eval_pred
    # The predicted class is the index of the highest score in each row.
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)


# Human-readable names for the two class ids. label2id is derived from
# id2label so the two mappings cannot drift apart.
id2label = {0: "not spam", 1: "spam"}
label2id = {name: idx for idx, name in id2label.items()}


# Load the pretrained checkpoint with a sequence-classification head sized for
# our labels.
# https://huggingface.co/transformers/v3.0.2/model_doc/auto.html#automodelforsequenceclassification
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

# Full fine-tuning: make sure every parameter of the model is trainable.
# Note: the learning rate is not a property of the parameters. It is set
# through TrainingArguments(learning_rate=...) when the Trainer is created.
for param in model.parameters():
    param.requires_grad = True

In [None]:
# list(model.named_parameters())

In [None]:
# Print the model's layer structure
print(model)

In [None]:
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments

# The HuggingFace Trainer class handles the training and eval loop for PyTorch for us.
# Read more about it here https://huggingface.co/docs/transformers/main_classes/trainer
trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="./data/sentiment_analysis",
        learning_rate=2e-5,
        # Reduce the batch size if you don't have enough memory
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=2,
        weight_decay=0.01,
        # Evaluate and checkpoint once per epoch. The two strategies are kept
        # identical because load_best_model_at_end needs a checkpoint for
        # every evaluation it compares.
        # NOTE(review): recent transformers releases rename this argument to
        # `eval_strategy` - confirm against the installed version.
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
    ),
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    # The messages were tokenized without padding, so pad each batch here.
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
# Hand-pick a few rows of the tokenized test split to look at by eye.
items_for_manual_review = tokenized_dataset["test"].select([0, 1, 22, 31])

# Print the raw message text of each selected row
for example in items_for_manual_review:
    print(example["sms"])

In [None]:
# Run the trained model on the hand-picked examples. The returned object
# carries the raw model outputs (.predictions) and the reference labels
# (.label_ids), both of which are used in the cells below.
results = trainer.predict(items_for_manual_review)
results

In [None]:
# Raw per-class scores from the model, one row per reviewed example
results.predictions

In [None]:
# Predicted class id per example: the index of the highest score in each row
# (0 = "not spam", 1 = "spam", per id2label)
results.predictions.argmax(axis=1)

In [None]:
# Make a dataframe with the predictions and the text and the labels,
# so each message can be compared with its predicted and true class.

import pandas as pd

df = pd.DataFrame(
    {
        "sms": [item["sms"] for item in items_for_manual_review],
        # Predicted class id: index of the highest score in each row
        "predictions": results.predictions.argmax(axis=1),
        # Reference class id from the dataset
        "labels": results.label_ids,
    }
)
df