In [1]:
%%capture

import sys

!{sys.executable} -m pip install --upgrade evaluate

In [2]:
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)

from datasets import load_dataset, DatasetDict, Dataset

from peft import (
    PeftModel,
    PeftConfig,
    get_peft_model,
    LoraConfig,
)

import evaluate
import torch
import numpy as np
from tqdm import tqdm

In [3]:
model_name = "distilbert-base-uncased"

# define labels
id2label = {0: "Negative", 1: "Positive"}
label2id = {v: k for k, v in id2label.items()}

# build classification model
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

model

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [4]:
dataset = load_dataset("imdb")

dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [5]:
next(iter(dataset["train"]))

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

### Validation split

In [6]:
# Split the train set into train and validation sets
train_val_split = dataset['train'].train_test_split(test_size=0.2, seed=42)

# Combine the splits into one dataset object
dataset = DatasetDict({
    "train": train_val_split["train"],
    "val": train_val_split["test"],
    "test": dataset["test"]
})

# Check the size of each split
print(f"Train set size: {len(dataset['train'])}")
print(f"Validation set size: {len(dataset['val'])}")
print(f"Test set size: {len(dataset['test'])}")


Train set size: 20000
Validation set size: 5000
Test set size: 25000


In [7]:
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    add_prefix_space=True,
)

tokenizer

DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [8]:
# define tokenizer helper function

def tokenize_function(examples):
    tokenizer.truncation_side = "left"
    return tokenizer(
        examples["text"],
        return_tensors="np",
        padding="max_length",
        truncation=True,
        max_length=512,
    )

# add special token for padding
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    # resize model embedding
    model.resize_token_embeddings(len(tokenizer))

# tokenize dataset
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True
)
next(iter(tokenized_dataset["train"]))

{'text': 'Stage adaptations often have a major fault. They often come out looking like a film camera was simply placed on the stage (Such as "Night Mother"). Sidney Lumet\'s direction keeps the film alive, which is especially difficult since the picture offered him no real challenge. Still, it\'s nice to look at for what it is. The chemistry between Michael Caine and Christopher Reeve is quite brilliant. The dynamics of their relationship are surprising. Caine is fantastic as always, and Reeve gets one of his few chances to really act.<br /><br />I confess that I\'ve never seen Ira Levin\'s play, but I hear that Jay Presson Allen\'s adaptation is faithful. The script is incredibly convoluted, and keeps you guessing. "Deathtrap" is an enormously entertaining film, and is recommended for nearly all fans of stage and screen.<br /><br />7.4 out of 10',
 'label': 1,
 'input_ids': [101,
  2754,
  17241,
  2411,
  2031,
  1037,
  2350,
  6346,
  1012,
  2027,
  2411,
  2272,
  2041,
  2559,
 

In [9]:
# define data collator
data_collator = DataCollatorWithPadding(tokenizer)

In [10]:
accuracy = evaluate.load("accuracy")

# define the evaluation helper function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy.compute(
        predictions=predictions,
        references=labels
    )

# testing the evaluation helper function
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
preds, labels = [], []
for sample in tokenized_dataset["test"]:
    input_ids = torch.LongTensor(sample["input_ids"]).to(device)
    label = sample["label"]
    with torch.no_grad():
        output = model(input_ids.unsqueeze(0))
    preds.append(output.logits)
    labels.append(label)

preds = torch.cat(preds, dim=0).cpu().numpy()
print(preds.shape)
compute_metrics((preds, labels))

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


(25000, 2)


{'accuracy': 0.49436}

## Finetuning with LoRA

In [12]:
lora_config = LoraConfig(
    task_type="SEQ_CLS",  # sequence classification
    r=4,  # rank
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_lin"]
)

lora_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='SEQ_CLS', inference_mode=False, r=4, target_modules={'q_lin'}, lora_alpha=32, lora_dropout=0.1, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False))

In [13]:
lora_model = get_peft_model(
    model, lora_config
)

print(lora_model.print_trainable_parameters())

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9307
None


In [14]:
# hyperparameters
lr = 1e-4
batch_size = 8
num_epochs = 10

# define training arguments
training_args = TrainingArguments(
    output_dir=f"{model_name}-lora-imdb",
    learning_rate=lr,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)



In [15]:
trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["val"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [16]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2489,0.26695,0.906
2,0.3057,0.247952,0.9082
3,0.3134,0.279027,0.9124
4,0.1416,0.271469,0.9116
5,0.2549,0.281473,0.9148
6,0.1769,0.299814,0.9152
7,0.158,0.293799,0.9162
8,0.1844,0.308202,0.9162
9,0.4301,0.310992,0.9154
10,0.1093,0.313311,0.916


TrainOutput(global_step=25000, training_loss=0.22755524102151395, metrics={'train_runtime': 1706.9079, 'train_samples_per_second': 117.171, 'train_steps_per_second': 14.646, 'total_flos': 2.68799336448e+16, 'train_loss': 0.22755524102151395, 'epoch': 10.0})

### Performance after finetuning

In [17]:
preds, labels = [], []
for sample in tokenized_dataset["test"]:
    input_ids = torch.LongTensor(sample["input_ids"]).to(device)
    label = sample["label"]
    with torch.no_grad():
        output = lora_model(input_ids.unsqueeze(0))
    preds.append(output.logits)
    labels.append(label)

preds = torch.cat(preds, dim=0).to("cpu")
print(preds.shape)
compute_metrics((preds, labels))

torch.Size([25000, 2])


{'accuracy': 0.87576}