In [None]:
!pip install transformers datasets accelerate scikit-learn

# Part 3: Transformer-based embeddings

## Preliminaries

### Global Settings

In [None]:
# Seed passed to the dataset shuffles so the sampled subsets are reproducible.
SEED = 42
# Fraction of each split that is kept. We don't have much time and you are
# probably working on a laptop, so keeping this value is recommended. Set it to
# 1.0 to see results on the full dataset.
SUBSET_RATIO = 0.01
# Device the model is moved to. Without an NVIDIA GPU, use "mps" or "cpu"
# depending on your config. NOTE(review): "rocm" is not a torch device string;
# ROCm builds of PyTorch expose AMD GPUs as "cuda" -- confirm on your setup.
DEVICE = "cuda"
# Per-device batch size used for training and evaluation.
# Suggested value for 10GB of VRAM/RAM.
BATCH_SIZE = 16
# Checkpoint used for the tokenizer and the models.
# You can try out other models if you have time.
MODEL_NAME = "bert-base-uncased"


### Loading IMDB Dataset

In [None]:
from datasets import load_dataset


def _take_seeded_subset(split):
    """Shuffle `split` with SEED and keep its first SUBSET_RATIO fraction."""
    n_kept = int(len(split) * SUBSET_RATIO)
    return split.shuffle(seed=SEED).select(range(n_kept))


dataset = load_dataset("imdb")

# Both splits are subsampled the same way: seeded shuffle, then truncation.
train_dataset = _take_seeded_subset(dataset["train"])
test_dataset = _take_seeded_subset(dataset["test"])

### Task: Create a function that given embeddings and labels, trains a classifier and returns the predictions on the test set.

Reuse the function from Part 1 so that the results can be compared.

In [None]:
import numpy.typing as npt
import numpy as np

def fit_predict(train_embeddings, train_labels, test_embeddings) -> npt.NDArray[float]:
  """Train a classifier on the training embeddings and predict on the test set.

  Exercise placeholder: the body is intentionally empty and currently returns
  ``None``. The task asks for a function that, given embeddings and labels,
  trains a classifier and returns its predictions on the test embeddings.

  Args:
    train_embeddings: Embeddings of the training documents.
    train_labels: Labels aligned with ``train_embeddings``.
    test_embeddings: Embeddings of the documents to predict on.

  Returns:
    The predictions for ``test_embeddings`` (once implemented).
  """
  pass

## Experiment 1: Using Pre-trained BERT Embeddings

The goal is to use a pretrained language model to generate embeddings for our corpus. We use the LM as is, without any additional training (as we did with GloVe in the previous part).

In [None]:
from transformers import (
    AutoTokenizer,
    AutoModel,
    PreTrainedModel,
    PreTrainedTokenizer,
)

# The tokenizer and the model are loaded from the same checkpoint (MODEL_NAME).
tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Load the base model first, then move it to the configured device.
model = AutoModel.from_pretrained(MODEL_NAME)
model = model.to(DEVICE)

### Task: Create a function that generates BERT embeddings for a given text dataset. You can either use the embedding of the [CLS] token or use any pooling method on the embeddings of all tokens.


In [None]:
from typing import Iterable

def embed_documents(model: PreTrainedModel, texts: Iterable[str]) -> npt.NDArray[np.floating]:
  """Generate one embedding vector per text using ``model``.

  Exercise placeholder: the body is intentionally empty and currently returns
  ``None``. The task allows either the embedding of the [CLS] token or any
  pooling of the embeddings of all tokens.

  Args:
    model: The pre-trained model used to produce the embeddings.
    texts: The documents to embed.

  Returns:
    Once implemented, a 2-D array with one row per text; the check that
    follows this cell expects 768 columns for the default model.

  NOTE(review): the signature takes no tokenizer, so the implementation
  presumably relies on the module-level ``tokenizer`` -- confirm.
  """
  pass

### Check that you correctly implemented `embed_documents`

In [None]:
train_embeddings = embed_documents(model, train_dataset["text"])
test_embeddings = embed_documents(model, test_dataset["text"])

# Expect one row per document and one column per hidden unit. The width is read
# from the model config instead of hard-coding 768, so the check still holds
# when MODEL_NAME is switched to a model with a different hidden size.
embedding_dim = model.config.hidden_size

assert train_embeddings.shape == (len(train_dataset), embedding_dim), (
    f"unexpected train embeddings shape: {train_embeddings.shape}"
)
assert test_embeddings.shape == (len(test_dataset), embedding_dim), (
    f"unexpected test embeddings shape: {test_embeddings.shape}"
)

### Task: Train a Random Forest classifier on the BERT embeddings and check relevant metrics.


### Analysis
- How does it compare to previous methods?


## Experiment 2: Fine-tuning BERT using the unsupervised MLM objective


### Task: Create a function that tokenizes a batch of texts and use it to tokenize the train and test datasets.

You should avoid returning tensors at this stage. This will be handled later, together with the padding, by a data collator.


In [None]:
from transformers import BatchEncoding

def tokenize_batch(batch) -> BatchEncoding:
  """Tokenize a batch of texts.

  Exercise placeholder: the body is intentionally empty and currently returns
  ``None``. The function is passed to ``Dataset.map(..., batched=True)`` on
  the train and test datasets. Per the task, it should not return tensors;
  padding is left to a data collator.

  Args:
    batch: The batch handed over by ``Dataset.map``.
      NOTE(review): presumably a mapping with a "text" entry holding the raw
      documents -- confirm against the dataset columns.

  Returns:
    The tokenizer output for the batch (once implemented).
  """
  pass

In [None]:
# Tokenize both splits the same way, then drop the label column because it
# confuses the training script.
tokenized_train_dataset, tokenized_test_dataset = (
    split.map(tokenize_batch, batched=True).remove_columns(["label"])
    for split in (train_dataset, test_dataset)
)

### Task: Create a data collator that masks the input tokens with a probability of 15%. See [DataCollatorForLanguageModeling](https://huggingface.co/docs/transformers/en/main_classes/data_collator#transformers.DataCollatorForLanguageModeling)



In [None]:
from transformers import DataCollatorForLanguageModeling

# Exercise placeholder: replace None with a DataCollatorForLanguageModeling
# that masks input tokens with a probability of 15% (see the task above).
# This object is passed to the Trainer as `data_collator`.
data_collator = None # Fill me!

### Task: Train the BERT model with a masked language modeling head on the IMDB dataset.


In [None]:
from transformers import (
    BertForMaskedLM,
    Trainer,
    TrainingArguments,
)


import math  # math.exp is used for the perplexity below and was never imported

# Load the MLM model from the same checkpoint as the tokenizer (MODEL_NAME) so
# the vocabulary ids produced by tokenize_batch match the model's embeddings.
model_mlm = BertForMaskedLM.from_pretrained(MODEL_NAME)

training_args = TrainingArguments(
    output_dir="./mlm_model",
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    prediction_loss_only=True,
    logging_steps=10,
    logging_strategy="steps",
    seed=SEED,  # tie the training seed to the notebook-wide seed
)

trainer = Trainer(
    model=model_mlm,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
)

# Baseline: perplexity (exp of the evaluation loss) before any fine-tuning.
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

trainer.train()

# Same measurement after fine-tuning, for comparison with the baseline.
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")


### Task: As before, generate BERT embeddings for the train and test datasets using the fine-tuned model and compare the performance of the classifier.
Tip: You can access the encoder using `model_mlm.base_model`


### Analysis: Compare the performance of the classifier using the fine-tuned BERT model with the previous experiment.


## Experiment 3: End-to-End BERT Classification and Embeddings
The objective is to finetune the BERT model for sequence classification and use the embeddings from the finetuned model to train a classifier.


### Task: Tokenize the train and test datasets using the previous function. Split the train dataset into train and validation sets.

In [None]:
# Tokenize both splits; the raw text is dropped, the label column is kept
# because the classifier needs it.
tokenized_train_dataset = train_dataset.map(
    tokenize_batch, batched=True, remove_columns=["text"]
)
tokenized_test_dataset = test_dataset.map(
    tokenize_batch, batched=True, remove_columns=["text"]
)

# Hold out 20% of the training data for validation. The split is seeded so the
# same examples land in each part on every run.
tokenized_train_eval_dataset = tokenized_train_dataset.train_test_split(
    test_size=0.2, seed=SEED
)

tokenized_train_dataset = tokenized_train_eval_dataset["train"]
tokenized_eval_dataset = tokenized_train_eval_dataset["test"]

### Task: Load the BERT model for sequence classification



In [None]:
from transformers import BertForSequenceClassification


# Sequence-classification model loaded from MODEL_NAME, configured with two labels.
model_classifier = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)


### Task: Create a data collator that pads the input sequences. Why do we need padding for sequence classification?

In [None]:
from transformers import DataCollatorWithPadding

# Collator that pads the input sequences, built from the notebook's tokenizer.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
)

### Task: Create a function that computes relevant metrics for the classification task.

In [None]:
from transformers import EvalPrediction

def compute_metrics(eval_pred: EvalPrediction) -> dict[str, float]:
  """Compute relevant metrics for the classification task.

  Exercise placeholder: the body is intentionally empty and currently returns
  ``None``. The function is passed to the ``Trainer`` as ``compute_metrics``.

  Args:
    eval_pred: The evaluation predictions handed over by the ``Trainer``.

  Returns:
    A mapping from metric name to its value (once implemented).
  """
  pass

### Task: Train the classifier

In [None]:
training_args = TrainingArguments(
    output_dir="./classifier_model",
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    logging_steps=10,
    logging_strategy="steps",
    # Evaluate once per epoch. `eval_steps` only applies to the "steps"
    # strategy, so the former `eval_steps=1` was a no-op and has been dropped.
    eval_strategy="epoch",
    seed=SEED,  # tie the training seed to the notebook-wide seed
)


trainer = Trainer(
    model=model_classifier,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

trainer.train()

### Task: Evaluate the classifier on the test set.

### Task: Use the classifier to generate embeddings for the train and test datasets and evaluate the performance of the generic classifier.


### Analysis

How do these last two methods compare to the rest? Can you think of ways to further improve what we did?