# Finetuning a DistilBERT Classifier using Lightning

In [None]:
!pip install -r requirements.txt

In [None]:
import torch
# Report which PyTorch build is active and where it was imported from.
print("Torch version:", torch.__version__)
print("Torch file location:", torch.__file__)

# Report the CUDA version recorded for this build, then the cuDNN version
# (or a placeholder when cuDNN reports itself as unavailable).
print("CUDA compiled version:", torch.version.cuda)
if torch.backends.cudnn.is_available():
    print("cuDNN version:", torch.backends.cudnn.version())
else:
    print("cuDNN version:", "Not available")

![](figures/finetuning-ii.png)

# 1 Loading the dataset into DataFrames

In [None]:
import os.path as op

from datasets import load_dataset

import lightning as L
from lightning.pytorch.loggers import CSVLogger
from lightning.pytorch.callbacks import ModelCheckpoint

import numpy as np
import pandas as pd
import torch

from sklearn.feature_extraction.text import CountVectorizer

from local_dataset_utilities import download_dataset, load_dataset_into_to_dataframe, partition_dataset
from local_dataset_utilities import IMDBDataset

In [None]:
# Prepare the raw data with the helpers from local_dataset_utilities.
# NOTE(review): judging by their names, these download the data, load it into
# one DataFrame and split it; presumably partition_dataset writes the
# train.csv / val.csv / test.csv files read in the next cell — confirm.
download_dataset()

df = load_dataset_into_to_dataframe()
partition_dataset(df)

In [None]:
# Load the three CSV splits into DataFrames.
df_train = pd.read_csv("train.csv")
df_val = pd.read_csv("val.csv")
df_test = pd.read_csv("test.csv")

In [None]:
# Show the text of the training row labelled 0 as a quick sanity check.
df_train['text'][0]

# 2 Tokenization and Numericalization

**Load the dataset via `load_dataset`**

In [None]:
# Map each split name to its CSV file; the keys are the partition names
# used later to select a split.
split_files = {
    "train": "train.csv",
    "validation": "val.csv",
    "test": "test.csv",
}

# Read all three splits with the "csv" loader.
imdb_dataset = load_dataset("csv", data_files=split_files)

print(imdb_dataset)

**Tokenize the dataset**

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that matches the DistilBERT checkpoint, keeping the
# downloaded files under ./models.
tokenizer = AutoTokenizer.from_pretrained(
    "distilbert-base-uncased",
    cache_dir="./models",
)

# Report the tokenizer's maximum input length and vocabulary size.
print("Tokenizer input max length:", tokenizer.model_max_length)
print("Tokenizer vocabulary size:", tokenizer.vocab_size)

In [None]:
def tokenize_text(batch):
    """Tokenize the ``"text"`` entries of ``batch`` with the notebook-level tokenizer.

    Truncation and padding are both enabled. Returns whatever the tokenizer
    call returns.
    """
    texts = batch["text"]
    return tokenizer(texts, padding=True, truncation=True)

In [None]:
# Tokenize every split with tokenize_text.
# NOTE(review): batched=True with batch_size=None presumably hands each split
# to tokenize_text as one batch — confirm against the datasets `map` docs.
imdb_tokenized = imdb_dataset.map(tokenize_text, batched=True, batch_size=None)

In [None]:
# Drop the untokenized dataset; the following cells only use imdb_tokenized.
del imdb_dataset

In [None]:
# Switch the output format to "torch" and expose only the columns that the
# training / evaluation steps read.
imdb_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [None]:
import os
# Turn off the tokenizers library's parallelism flag.
# NOTE(review): presumably to avoid conflicts with the DataLoader worker
# processes created below — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# 3 Set Up DataLoaders

In [None]:
from torch.utils.data import DataLoader, Dataset


class IMDBDataset(Dataset):
    """Map-style dataset that exposes a single partition of a dict of splits."""

    def __init__(self, dataset_dict, partition_key="train"):
        """Keep the split stored under ``partition_key`` in ``dataset_dict``."""
        self.partition = dataset_dict[partition_key]

    def __len__(self):
        """Return the row count reported by the partition's ``num_rows``."""
        partition = self.partition
        return partition.num_rows

    def __getitem__(self, index):
        """Return the partition's item at ``index`` unchanged."""
        partition = self.partition
        return partition[index]

In [None]:
# One IMDBDataset per split of the tokenized data.
train_dataset, val_dataset, test_dataset = (
    IMDBDataset(imdb_tokenized, partition_key=split)
    for split in ("train", "validation", "test")
)

# Settings shared by all three loaders.
loader_options = {"batch_size": 12, "num_workers": 4}

# Only the training loader shuffles its data.
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, **loader_options)
test_loader = DataLoader(test_dataset, **loader_options)

# 4 Initializing DistilBERT

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained DistilBERT checkpoint for sequence classification with
# two labels, keeping the downloaded files under ./models.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2, cache_dir="./models")

In [None]:
# Display the model's module structure.
model

In [None]:
from peft import LoraConfig, get_peft_model, TaskType
# LoRA settings. Adapting the query and value projections is the common
# choice; the key/output projections or the feed-forward layers lin1/lin2
# could be added to target_modules as well.
lora_cfg = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    target_modules=["q_lin", "v_lin"],  # DistilBERT layers that receive LoRA adapters
    r=8,  # LoRA r
    lora_alpha=16,  # scaling factor
    lora_dropout=0.1,
)

# Wrap the base model as a LoRA model and report how many parameters train.
model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()

# 5 Finetuning

**Wrap in LightningModule for Training**

In [None]:
import lightning as L
import torch
import torchmetrics


class LightningModel(L.LightningModule):
    """LightningModule that fine-tunes a wrapped sequence-classification model.

    The wrapped model is called with ``input_ids``, ``attention_mask`` and
    ``labels``; its output is indexed with ``"loss"`` and ``"logits"``.
    Validation and test accuracy are tracked with separate two-class
    torchmetrics ``Accuracy`` objects.

    Args:
        model: The model to train.
        learning_rate: Learning rate handed to the Adam optimizer.
    """

    def __init__(self, model, learning_rate=5e-5):
        super().__init__()

        self.learning_rate = learning_rate
        self.model = model

        # Separate metric objects so validation and test state never mix.
        self.val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2)
        self.test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model and return its output.

        ``labels`` is optional so the module can also be called for plain
        inference. NOTE(review): this relies on the wrapped model accepting
        ``labels=None`` (Hugging Face sequence-classification models do).
        """
        return self.model(input_ids, attention_mask=attention_mask, labels=labels)

    def _shared_step(self, batch):
        """Run a forward pass on a batch dict with input_ids / attention_mask / label."""
        return self(
            batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=batch["label"],
        )

    def _update_accuracy(self, metric, outputs, batch):
        """Update ``metric`` with the argmax predictions for ``batch``."""
        predicted_labels = torch.argmax(outputs["logits"], 1)
        metric(predicted_labels, batch["label"])

    def training_step(self, batch, batch_idx):
        """Log the training loss and return it."""
        outputs = self._shared_step(batch)
        self.log("train_loss", outputs["loss"])
        return outputs["loss"]  # this is passed to the optimizer for training

    def validation_step(self, batch, batch_idx):
        """Log the validation loss and validation accuracy for one batch."""
        outputs = self._shared_step(batch)
        self.log("val_loss", outputs["loss"], prog_bar=True)

        self._update_accuracy(self.val_acc, outputs, batch)
        self.log("val_acc", self.val_acc, prog_bar=True)

    def test_step(self, batch, batch_idx):
        """Log the test accuracy (under the key ``"accuracy"``) for one batch."""
        outputs = self._shared_step(batch)

        self._update_accuracy(self.test_acc, outputs, batch)
        self.log("accuracy", self.test_acc, prog_bar=True)

    def configure_optimizers(self):
        """Return an Adam optimizer over this module's parameters."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        return optimizer


# Wrap the notebook-level model for training with the default learning rate.
lightning_model = LightningModel(model)

In [None]:
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.loggers import CSVLogger


# Keep only the single checkpoint with the highest "val_acc".
best_checkpoint = ModelCheckpoint(monitor="val_acc", mode="max", save_top_k=1)
callbacks = [best_checkpoint]

# CSV logger rooted at logs/ with the experiment name "my-model".
logger = CSVLogger(save_dir="logs/", name="my-model")

In [None]:
# Train for 3 epochs on one GPU with 16-bit mixed precision, using the
# callbacks and logger configured above; metrics are logged every 10 steps.
trainer = L.Trainer(
    accelerator="gpu",
    devices=1,
    precision="16-mixed",
    max_epochs=3,
    callbacks=callbacks,
    logger=logger,
    log_every_n_steps=10,
)

# Fit on the training loader and validate on the validation loader.
trainer.fit(
    model=lightning_model,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader,
)

In [None]:
# Evaluate the "best" checkpoint on the training split.
# NOTE(review): "best" is presumably the checkpoint selected by the
# ModelCheckpoint callback that monitors val_acc — confirm.
trainer.test(lightning_model, dataloaders=train_loader, ckpt_path="best")

In [None]:
# Evaluate the "best" checkpoint on the validation split.
trainer.test(lightning_model, dataloaders=val_loader, ckpt_path="best")

In [None]:
# Evaluate the "best" checkpoint on the held-out test split.
trainer.test(lightning_model, dataloaders=test_loader, ckpt_path="best")

In [None]:
# Directory that will hold the merged model and the tokenizer.
local_merged_model_path = "./merged-model"

# Fold the LoRA weights into the base model.
# `model` is the PEFT-wrapped model returned by get_peft_model earlier.
merged_model = model.merge_and_unload()

# Save the merged model.
merged_model.save_pretrained(local_merged_model_path)

# Save the tokenizer into the same directory.
tokenizer.save_pretrained(local_merged_model_path)


In [None]:
pip install huggingface_hub #Deploy the model in Huggingface


In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub; enter your Hugging Face access token
# (API key) when asked.
notebook_login()

In [None]:
# Target repository on the Hugging Face Hub.
hub_repo_id_merged = "Qndhm/distilled-bert-imdb-lora-merged"

# Upload the merged model and the tokenizer to the same repository.
merged_model.push_to_hub(hub_repo_id_merged)
tokenizer.push_to_hub(hub_repo_id_merged)

print(f"the merged model has been uploaded to: https://huggingface.co/{hub_repo_id_merged}")

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Reload the published model from the Hugging Face Hub to test it.
# Repository id of the uploaded merged model.
hf_repo_id = "Qndhm/distilled-bert-imdb-lora-merged"

print(f"Load the model and tokenizer from: {hf_repo_id}")
# Load the model and tokenizer; this rebinds the notebook-level `model` and
# `tokenizer` names to the versions downloaded from the Hub.
model = AutoModelForSequenceClassification.from_pretrained(hf_repo_id)
tokenizer = AutoTokenizer.from_pretrained(hf_repo_id)

print("\n Loading completed")

In [None]:
# --- Inference test ---
text = "This movie was not good at all."

# Truncate over-long inputs, matching the truncation used when tokenizing the
# training data.
inputs = tokenizer(text, return_tensors="pt", truncation=True)

# Gradients are not needed for inference, so skip building the autograd graph.
with torch.no_grad():
    outputs = model(**inputs)

# Take the argmax over the class dimension rather than over the flattened
# logits; .item() still requires a single input example.
predicted_class_id = outputs.logits.argmax(dim=-1).item()
print(f"Prediction ID: {predicted_class_id}")  # 0: negative; 1: positive

In [None]:
# Test accuracy of the model reloaded from the Hub, wrapped in a fresh
# LightningModel.
# NOTE(review): expected to match the earlier test-set result — confirm.
trainer.test(LightningModel(model), dataloaders=test_loader)