<a href="https://colab.research.google.com/github/tranmanhcuong253/Parameter-Efficient-Fine-Tuning-Using-LoRA-with-DistilBERT/blob/main/LoRa_DistilBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#II.Text Classification

##1.Import Basic Package

In [1]:
!pip install peft



In [2]:
import torch
import torch.nn as nn
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
from transformers import get_linear_schedule_with_warmup, AdamW
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
import time, datetime, random, re
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import matplotlib.pyplot as plt
import seaborn as sns
import tqdm
import math

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AdamW,
    get_linear_schedule_with_warmup,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    AutoModelForCausalLM
)

from peft import (
    LoraConfig,
    get_peft_model,
    TaskType,
    PeftModel
)


##2.Load and Preprocess Datasets

In [3]:
from sklearn.datasets import fetch_20newsgroups

In [4]:
data=fetch_20newsgroups(
            subset='all',
            categories=['sci.crypt', 'sci.electronics', 'sci.med', 'sci.space'],
            shuffle=False,
            remove=('headers', 'footers', 'quotes'))

In [5]:
target_names = data.target_names
target_names

['sci.crypt', 'sci.electronics', 'sci.med', 'sci.space']

In [6]:
len(data['data'])


3952

In [7]:
df = pd.DataFrame([data.data, data.target.tolist()]).T
df.columns = ['text', 'target']

In [8]:
df.head()

Unnamed: 0,text,target
0,Archive-name: cryptography-faq/part10\nLast-mo...,0
1,Does anyone on this newsgroup happen to know W...,2
2,"Hi,\n\tI am looking for some help in choosing ...",1
3,Does anyone know of a non-word password genera...,0
4,"The system, or 'family', key would appear to b...",0


In [9]:
# remove excess white spaces
df['text'] = df['text'].apply(lambda x: " ".join(x.split()))

# remove excess spaces near punctuation
df['text'] = df['text'].apply(lambda x: re.sub(r'\s([?.!"](?:\s|$))', r'\1', x))

##3.Split and Convert Datasets

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
X_train, X_val, y_train, y_val = train_test_split(df["text"],
                                                  df["target"],
                                                  test_size=0.2,
                                                  stratify=df["target"],
                                                  random_state=42)
X_train.reset_index(drop = True, inplace = True)
y_train.reset_index(drop = True, inplace = True)
X_val.reset_index(drop = True, inplace = True)
y_val.reset_index(drop = True, inplace = True)

In [12]:
# instantiate BERT tokenizer with upper + lower case
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [13]:
# Tokenize function
def tokenize_func(data):
    return tokenizer(
            data['texts'],
            max_length=512,
            padding='max_length',
            return_attention_mask=True,
            truncation=True
        )

In [14]:
!pip install datasets
import datasets



In [15]:
train_dataset = datasets.Dataset.from_pandas(pd.DataFrame({"texts":X_train,"labels":y_train}))
train_dataset = train_dataset.map(
    tokenize_func,
    batched=True,
    remove_columns=["texts"]
)
train_dataset

Map:   0%|          | 0/3161 [00:00<?, ? examples/s]

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 3161
})

In [16]:
# Tokenize the Validation Data
val_dataset = datasets.Dataset.from_pandas(pd.DataFrame({"texts":X_val,"labels":y_val}))
val_dataset = val_dataset.map(
    tokenize_func,
    batched=True,
    remove_columns=["texts"]
)

val_dataset

Map:   0%|          | 0/791 [00:00<?, ? examples/s]

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 791
})

In [17]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

##4.Load the pretrain DistilBert and LoRA model

In [18]:
# Define a function that can print the trainable parameters
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

In [19]:
# Load DistilBERT with a single a single linear classification layer
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-cased",
    num_labels=4,return_dict=True).to(device)

model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
print(print_number_of_trainable_model_parameters(model))

trainable model parameters: 65784580
all model parameters: 65784580
percentage of trainable model parameters: 100.00%


In [21]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [22]:
# Define the LoRA Configuration
lora_config = LoraConfig(
    r=8, # Rank Number
    lora_alpha=32, # Alpha (Scaling Factor)
    lora_dropout=0.05, # Dropout Prob for Lora
    target_modules=["q_lin", "k_lin","v_lin"], # Which layer to apply LoRA, usually only apply on MultiHead Attention Layer
    bias='none',
    task_type=TaskType.SEQ_CLS # Seqence to Classification Task
)

In [23]:
# Get our LoRA-enabled model
peft_model = get_peft_model(model,
                            lora_config)

# Reduced trainble parameters
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 814852
all model parameters: 66599432
percentage of trainable model parameters: 1.22%


In [24]:
peft_model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): DistilBertForSequenceClassification(
      (distilbert): DistilBertModel(
        (embeddings): Embeddings(
          (word_embeddings): Embedding(28996, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (transformer): Transformer(
          (layer): ModuleList(
            (0-5): 6 x TransformerBlock(
              (attention): MultiHeadSelfAttention(
                (dropout): Dropout(p=0.1, inplace=False)
                (q_lin): lora.Linear(
                  (base_layer): Linear(in_features=768, out_features=768, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.05, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=76

##5.Training model

In [25]:
# Define Eval Metric
from sklearn.metrics import accuracy_score
def metrics(eval_prediction):
    logits, labels = eval_prediction
    pred = np.argmax(logits, axis=1)
    acc_score = accuracy_score(labels, pred)
    return {"Val-Accuracy": acc_score}


In [26]:
train_batch_size = 32
eval_batch_size = 32


In [27]:
# Define training Args
peft_training_args = TrainingArguments(
    output_dir='./result-distilbert-lora',
    logging_dir='./logs-distilbert-lora',
#     auto_find_batch_size=True,
    learning_rate=1e-4,
    per_device_train_batch_size=train_batch_size, # You can adjust this value base on your available GPU, You may encounter "out of memory" error if this value is too lartge
    per_device_eval_batch_size=eval_batch_size, # You can adjust this value base on your available GPU, You may encounter "out of memory" error if this value is too lartge
    num_train_epochs=5,
    logging_steps=10,
    eval_strategy='epoch',
    eval_steps=1,
    weight_decay=0.01,
    seed=42,
    fp16=True, # Only use with GPU
    report_to='none'
)

In [28]:
# Define Optimzer
optimizer = AdamW(peft_model.parameters(),
                  lr=1e-4,
                  no_deprecation_warning=True)

In [29]:
# Define Scheduler
n_epochs = peft_training_args.num_train_epochs
total_steps = n_epochs * math.ceil(len(train_dataset) / train_batch_size )
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps)

# Data Collator
collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding="longest"
)


# Define Trainer
peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=train_dataset, # Training Data
    eval_dataset=val_dataset, # Evaluation Data
    tokenizer=tokenizer,
    compute_metrics=metrics,
    optimizers=(optimizer,lr_scheduler),
    data_collator=collator
)

print(f"Total Steps: {total_steps}")


# Train the model
peft_trainer.train()

Total Steps: 495


Epoch,Training Loss,Validation Loss,Val-accuracy
1,0.5665,0.425853,0.844501
2,0.4031,0.349921,0.867257
3,0.4049,0.324532,0.879899
4,0.3495,0.319999,0.881163
5,0.3302,0.314807,0.883692


TrainOutput(global_step=495, training_loss=0.48326685524950125, metrics={'train_runtime': 282.8411, 'train_samples_per_second': 55.879, 'train_steps_per_second': 1.75, 'total_flos': 2133285386772480.0, 'train_loss': 0.48326685524950125, 'epoch': 5.0})