# Finetuning a DistilBERT Classifier in Lightning

In [None]:
# Install the packages listed in requirements.txt (runs pip through the shell).
!pip install -r requirements.txt

In [None]:
import torch

# Report which PyTorch build is active and where it was imported from.
print("Torch version:", torch.__version__)
print("Torch file location:", torch.__file__)

# Check whether this build carries CUDA / cuDNN compilation info.
print("CUDA compiled version:", torch.version.cuda)
if torch.backends.cudnn.is_available():
    print("cuDNN version:", torch.backends.cudnn.version())
else:
    print("cuDNN version:", "Not available")

![](figures/finetuning-ii.png)

# 1 Loading the dataset into DataFrames

In [None]:
import os.path as op

from datasets import load_dataset

import lightning as L
from lightning.pytorch.loggers import CSVLogger
from lightning.pytorch.callbacks import ModelCheckpoint

import numpy as np
import pandas as pd
import torch

from sklearn.feature_extraction.text import CountVectorizer

from local_dataset_utilities import download_dataset, load_dataset_into_to_dataframe, partition_dataset
from local_dataset_utilities import IMDBDataset

In [None]:
# Fetch the raw data using the helper from local_dataset_utilities.
download_dataset()

# Load the data into a DataFrame and split it.
# NOTE(review): partition_dataset presumably writes the train.csv / val.csv /
# test.csv files that the following cells read — confirm in the helper module.
df = load_dataset_into_to_dataframe()
partition_dataset(df)

In [None]:
# Read the three split files from the working directory into DataFrames.
df_train, df_val, df_test = (
    pd.read_csv(csv_path) for csv_path in ("train.csv", "val.csv", "test.csv")
)

In [None]:
# Show the first entry of the training split's "text" column as a sanity check.
df_train['text'][0]

# 2 Tokenization and Numericalization

**Load the dataset via `load_dataset`**

In [None]:
# Build a dataset with "train", "validation" and "test" splits from the
# three CSV files, using the generic "csv" loader.
imdb_dataset = load_dataset(
    "csv",
    data_files={
        "train": "train.csv",
        "validation": "val.csv",
        "test": "test.csv",
    },
)

# Print the split overview.
print(imdb_dataset)

**Tokenize the dataset**

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer for "distilbert-base-uncased", caching files under ./models.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", cache_dir="./models")
# Report the tokenizer's maximum input length and vocabulary size.
print("Tokenizer input max length:", tokenizer.model_max_length)
print("Tokenizer vocabulary size:", tokenizer.vocab_size)

In [None]:
def tokenize_text(batch):
    """Tokenize the ``"text"`` column of ``batch`` with the notebook's tokenizer.

    Truncation and padding are both enabled; the tokenizer's output is
    returned unchanged.
    """
    texts = batch["text"]
    return tokenizer(texts, padding=True, truncation=True)

In [None]:
# Apply tokenize_text to every split in batched mode.
# NOTE(review): batch_size=None should hand each split to tokenize_text as a
# single batch, so padding=True pads to the longest text in that split —
# confirm against the datasets `map` documentation.
imdb_tokenized = imdb_dataset.map(tokenize_text, batched=True, batch_size=None)

In [None]:
# The untokenized splits are not used again; drop the reference.
del imdb_dataset

In [None]:
# Expose only the model inputs and the label column, formatted as "torch".
imdb_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [None]:
import os
# Turn off the tokenizers library's own parallelism.
# NOTE(review): presumably to avoid fork-related warnings once the
# multi-worker DataLoaders below start — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# 3 Set Up DataLoaders

In [None]:
from torch.utils.data import DataLoader, Dataset


class IMDBDataset(Dataset):
    """Map-style dataset exposing one split of a dict of splits.

    ``dataset_dict[partition_key]`` is stored as ``self.partition``; it must
    support integer indexing and provide a ``num_rows`` attribute.
    """

    def __init__(self, dataset_dict, partition_key="train"):
        # Keep a reference to the selected split only.
        self.partition = dataset_dict[partition_key]

    def __len__(self):
        """Return the number of rows in the selected split."""
        row_count = self.partition.num_rows
        return row_count

    def __getitem__(self, index):
        """Return the example stored at ``index`` in the selected split."""
        example = self.partition[index]
        return example

In [None]:
# One dataset + loader pair per split. Every loader uses batches of 12 and
# four worker processes; only the training loader shuffles.
train_dataset = IMDBDataset(imdb_tokenized, partition_key="train")
train_loader = DataLoader(train_dataset, batch_size=12, shuffle=True, num_workers=4)

val_dataset = IMDBDataset(imdb_tokenized, partition_key="validation")
val_loader = DataLoader(val_dataset, batch_size=12, num_workers=4)

test_dataset = IMDBDataset(imdb_tokenized, partition_key="test")
test_loader = DataLoader(test_dataset, batch_size=12, num_workers=4)

# 4 Initializing DistilBERT

In [None]:
from transformers import AutoModelForSequenceClassification

# Load "distilbert-base-uncased" with a two-label sequence-classification
# head, caching the downloaded files under ./models.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2, cache_dir="./models")

In [None]:
# Display the model's module structure in the notebook output.
model

In [None]:
from peft import LoraConfig, get_peft_model, TaskType

# LoRA settings for sequence classification. Adapters are attached to the
# "q_lin" and "v_lin" modules.
# NOTE(review): per the original note, the pre_classifier + classifier layers
# are tuned by default as well — confirm against the printed parameter summary.
lora_cfg = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    target_modules=["q_lin", "v_lin"],
    r=8,  # LoRA rank (r)
    lora_alpha=16,  # LoRA alpha
    lora_dropout=0.1,
)

# Wrap the base model with the adapter and report trainable parameters.
model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()

# 5 Finetuning

**Wrap in LightningModule for Training**

In [None]:
import lightning as L
import torch
import torchmetrics


class LightningModel(L.LightningModule):
    """LightningModule wrapper around a sequence-classification model.

    The wrapped ``model`` is called with ``input_ids``, ``attention_mask`` and
    ``labels``; its output is indexed with ``"loss"`` and ``"logits"``.
    Validation and test accuracy are tracked with separate two-class
    torchmetrics ``Accuracy`` objects.

    Args:
        model: The model to train.
        learning_rate: Learning rate passed to the Adam optimizer.
    """

    def __init__(self, model, learning_rate=5e-5):
        super().__init__()

        self.learning_rate = learning_rate
        self.model = model

        # Separate metric objects so validation and test state never mix.
        self.val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2)
        self.test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2)

    def forward(self, input_ids, attention_mask, labels):
        """Call the wrapped model and return its output unchanged."""
        return self.model(
            input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )

    def _run_batch(self, batch):
        """Feed one batch dict (input_ids, attention_mask, label) through forward."""
        return self(
            batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=batch["label"],
        )

    def training_step(self, batch, batch_idx):
        """Log and return the training loss for one batch."""
        loss = self._run_batch(batch)["loss"]
        self.log("train_loss", loss)
        return loss  # Lightning backpropagates the returned loss

    def validation_step(self, batch, batch_idx):
        """Log validation loss and update/log validation accuracy."""
        outputs = self._run_batch(batch)
        self.log("val_loss", outputs["loss"], prog_bar=True)

        # Predicted class = index of the largest logit along dimension 1.
        predicted_labels = outputs["logits"].argmax(dim=1)
        self.val_acc(predicted_labels, batch["label"])
        self.log("val_acc", self.val_acc, prog_bar=True)

    def test_step(self, batch, batch_idx):
        """Update and log test accuracy under the name ``"accuracy"``."""
        outputs = self._run_batch(batch)

        predicted_labels = outputs["logits"].argmax(dim=1)
        self.test_acc(predicted_labels, batch["label"])
        self.log("accuracy", self.test_acc, prog_bar=True)

    def configure_optimizers(self):
        """Return an Adam optimizer over this module's parameters."""
        return torch.optim.Adam(self.parameters(), lr=self.learning_rate)


# Wrap the (LoRA-adapted) model for training with Lightning.
lightning_model = LightningModel(model)

In [None]:
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.loggers import CSVLogger


# Keep only the single checkpoint with the highest logged "val_acc".
callbacks = [ModelCheckpoint(monitor="val_acc", mode="max", save_top_k=1)]

# Write metrics as CSV files under logs/, in the "my-model" run directory.
logger = CSVLogger(save_dir="logs/", name="my-model")

In [None]:
# Train for three epochs on a single GPU with 16-bit mixed precision,
# logging every 10 steps through the CSV logger and checkpoint callback.
trainer = L.Trainer(
    accelerator="gpu",
    devices=1,
    precision="16-mixed",
    max_epochs=3,
    callbacks=callbacks,
    logger=logger,
    log_every_n_steps=10,
)

# Fit on the training loader, validating on the validation loader.
trainer.fit(
    model=lightning_model,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader,
)

In [None]:
# Score the "best" checkpoint on the training split.
trainer.test(lightning_model, dataloaders=train_loader, ckpt_path="best")

In [None]:
# Score the "best" checkpoint on the validation split.
trainer.test(lightning_model, dataloaders=val_loader, ckpt_path="best")

In [None]:
# Score the "best" checkpoint on the held-out test split.
trainer.test(lightning_model, dataloaders=test_loader, ckpt_path="best")

In [None]:
# Take the adapted model back out of the Lightning wrapper.
model = lightning_model.model
# Local directory that receives the adapter and tokenizer files.
local_adapter_path = "./my-new-awesome-lora-adapter"

# Save the LoRA adapter (weights and biases) to that directory.
model.save_pretrained(local_adapter_path)

# Save the tokenizer next to the adapter so both can be loaded together.
tokenizer.save_pretrained(local_adapter_path)


In [None]:
pip install huggingface_hub #Deploy the model in Huggingface


In [None]:
from huggingface_hub import notebook_login
# Log in to the Hugging Face Hub from inside the notebook (needed before pushing).
notebook_login()

In [None]:
# Upload the adapter model and the tokenizer to the Hugging Face Hub.
# Replace "username" with your own Hugging Face username.
hub_repo_id_adapter = "username/distilled-bert-imdb-lora-adapter"

model.push_to_hub(hub_repo_id_adapter)
tokenizer.push_to_hub(hub_repo_id_adapter)

In [None]:
# Load the adapter back from the Hugging Face Hub and run inference.
from safetensors.torch import load_file
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from peft import get_peft_model, LoraConfig
from huggingface_hub import hf_hub_download
import os

import torch


def _to_peft_state_key(key):
    """Rename one key from the adapter file to the name used by the live PEFT model.

    ``.weight`` becomes ``.default.weight``. Keys that contain "classifier"
    (this also matches "pre_classifier") additionally get the
    ``modules_to_save`` segment, for both their weight and bias entries.
    """
    new_key = key.replace(".weight", ".default.weight")
    if "classifier" in new_key:
        if new_key.endswith(".bias"):
            # ...classifier.bias -> ...classifier.modules_to_save.default.bias
            new_key = new_key.replace(".bias", ".modules_to_save.default.bias")
        elif new_key.endswith(".weight"):
            # ...classifier.default.weight -> ...classifier.modules_to_save.default.weight
            new_key = new_key.replace(".default.weight", ".modules_to_save.default.weight")
    return new_key


# Base checkpoint and the Hub repository holding the trained adapter.
base_model_name = "distilbert-base-uncased"
hf_repo_id = hub_repo_id_adapter

# --- load base model and tokenizer ---
print(f"loading base model: {base_model_name}")
model = AutoModelForSequenceClassification.from_pretrained(
    base_model_name,
    num_labels=2
)
tokenizer = AutoTokenizer.from_pretrained(hf_repo_id)


# --- recreate the adapter configuration used for training ---
peft_config = LoraConfig(
    task_type="SEQ_CLS",
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=["q_lin", "v_lin"],
    modules_to_save=["pre_classifier", "classifier"]  # to be used in comparison
)

# Attach a fresh (untrained) LoRA adapter; trained weights are loaded below.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()


# Download the adapter weights from the Hub.
print(f"\n downloading weights from hub: {hf_repo_id}")
weights_path = hf_hub_download(repo_id=hf_repo_id, filename="adapter_model.safetensors")

adapter_weights = load_file(weights_path)


# Compare the key names in the adapter file with the model's trainable keys.
print("Hub keys of the adapter:", list(adapter_weights.keys()))

model_trainable_keys = [k for k, v in model.named_parameters() if v.requires_grad]
print("base model keys:", model_trainable_keys)

# Rename the adapter keys so they line up with the model's parameter names.
new_state_dict = {_to_peft_state_key(k): v for k, v in adapter_weights.items()}

print("New keys:", list(new_state_dict.keys()))

print("\n Load weights with new keys")
# strict=False is needed because only adapter/head weights are supplied, but it
# also hides renaming mistakes, so report any key that matched nothing.
load_result = model.load_state_dict(new_state_dict, strict=False)
if load_result.unexpected_keys:
    print("WARNING: adapter keys not found in the model:", load_result.unexpected_keys)

# Switch every submodule (including dropout layers) to inference mode so the
# prediction is deterministic.
model.eval()

# The sample review is negative, so it is named and labelled accordingly.
text_neg = "I do not like this movie, it was bad!"
inputs_neg = tokenizer(text_neg, return_tensors="pt")
with torch.no_grad():
    outputs_neg = model(**inputs_neg)
# Take the argmax over the class dimension (one example in the batch).
predicted_class_id_neg = outputs_neg.logits.argmax(dim=-1).item()
print(f"negative: '{text_neg}' --> prediction: {predicted_class_id_neg}")

In [None]:
# Evaluate the model rebuilt from the Hub adapter on the test split; the
# accuracy is expected to match the originally trained model.
trainer.test(LightningModel(model), dataloaders=test_loader)