<a href="https://colab.research.google.com/github/vitoraugusto1993/Coursera-courses/blob/main/LLM/Hugging%20Face/Fine%20Tuning/IMDB_reviews_PEFT_techniques_comparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install --upgrade pandas numpy transformers datasets evaluate torch accelerate>=0.26.0 psutil bitsandbytes mlflow==3.1.1 mlflow-skinny==3.1.1 nltk pandas==2.2.2 numpy==2.0 --quiet

In [None]:
import re
import torch
from torch.utils.data import DataLoader
import mlflow
import evaluate
import random
import numpy as np
import pandas as pd
import transformers
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from transformers.integrations import MLflowCallback
import datasets
from datasets import Dataset
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import random # Random module for generating random numbers and selections
import nltk
nltk.download('wordnet') # NLTK's WordNet corpus for finding synonyms
from nltk.corpus import wordnet

In [None]:
# Start the MLflow UI as a background shell process on port 5000
# (get_ipython() is only available inside an IPython/Colab session).
get_ipython().system_raw("mlflow ui --port 5000 &")

In [None]:
!pip install pyngrok --quiet

from pyngrok import ngrok
from getpass import getpass

# Terminate open tunnels if any exist
# ngrok.kill()

# Set the authtoken. It is read interactively with getpass so it never has to
# be written into the notebook.
# Get your authtoken from https://dashboard.ngrok.com/auth
# NOTE(review): a token-like string used to be pasted in a comment in this
# cell; if it was a real credential it should be revoked/rotated.
NGROK_AUTH_TOKEN = getpass('Enter the ngrok authtoken: ')
ngrok.set_auth_token(NGROK_AUTH_TOKEN)

# Open an HTTPS tunnel on port 5000 for http://localhost:5000
ngrok_tunnel = ngrok.connect(addr="5000", proto="http", bind_tls=True)
print("MLflow Tracking UI:", ngrok_tunnel.public_url)

## Step 1: Import data set

Import dataset from Hugging Face Hub. This Dataset contains thousands of reviews from IMDB about movies, classified as Negative (0) and Positive (1).

In [None]:
# Load the 'imdb' dataset and keep only its 'train' split, converted to a
# pandas DataFrame for the cleaning steps that follow.
data = datasets.load_dataset('imdb')
dataset = data['train'].to_pandas()
# Show the first rows as the cell output
dataset.head()

## Step 2: Clean the text

This step is cleaning the raw text data to remove unnecessary characters, such as URLs, special symbols, or HTML tags, and to normalize the text by converting it to lowercase.



In [None]:
# Function to clean the text
def clean_text(text):
    """Normalise one raw review before tokenization.

    The text is lowercased, HTML tags and URLs are removed, and runs of
    whitespace are collapsed to single spaces. Punctuation is left untouched.

    Args:
        text: The raw review. Missing values (``None`` or a float NaN) are
            returned unchanged so that a later ``dropna()`` can still detect
            and remove them.

    Returns:
        The cleaned string, or the original missing value.
    """
    # Do not stringify missing values: str(None) / str(nan) would turn them
    # into the literal words "none" / "nan" and hide them from dropna().
    if text is None or (isinstance(text, float) and text != text):
        return text

    text = str(text).lower()  # Lowercase for uniformity
    # Strip tags BEFORE URLs: `http\S+` would otherwise swallow the start of a
    # tag glued to a link ("http://a.com<br />") and leave " />" behind.
    # Tags become a space so the words on either side are not fused together.
    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    # Collapse the whitespace left behind by the removals above
    return re.sub(r'\s+', ' ', text).strip()

# `dataset` is the pandas DataFrame built from the IMDB train split; it has a
# column named 'text'. Apply the cleaning function to each row of that column.
dataset['cleaned_text'] = dataset['text'].apply(clean_text)

# Remove the raw 'text' column; only 'cleaned_text' is used from here on
dataset = dataset.drop(['text'], axis=1)

# Print the first 5 rows of the cleaned text to verify the cleaning process
print(dataset.head())

## Step 3: Handle missing data

Check for empty entries in the column 'cleaned_text'. If there are missing data, we should delete the entire rows.

In [None]:
# Print the count of missing values for each column
print(dataset.isnull().sum())
print('--------------------')

# Drop every row that has a missing value in any column
dataset = dataset.dropna()

# Confirm that no missing values remain
print(dataset.isnull().sum())
print('--------------------')

# Summarise the remaining entries. DataFrame.info() writes its report itself
# and returns None, so it is called directly; wrapping it in print() would
# add a stray "None" line to the output.
dataset.info()

In [None]:
# Number of reviews per label value
dataset.groupby('label')['cleaned_text'].count()

In [None]:
# Histogram of the 'label' column, as a visual check of the class balance
dataset['label'].hist()

## Step 4: Tokenization

After cleaning the text, we tokenize it. Tokenization splits the text into individual words or subwords that can be used by the model. We will use the DistilBERT tokenizer to ensure compatibility with the pretrained model you are fine-tuning.

In [None]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

# Load the DistilBERT tokenizer for the 'distilbert-base-uncased' checkpoint
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Collator used later by the DataLoaders to pad each batch with this tokenizer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Wrap the cleaned DataFrame in a Hugging Face Dataset so it can be mapped
dataset_ds = Dataset.from_pandas(dataset)

def tokenize_function(examples):
    """Tokenize the 'cleaned_text' entries of a batch, truncating long inputs."""
    return tokenizer(examples['cleaned_text'], truncation=True)

# batched=True hands groups of rows to tokenize_function in a single call
tokenized_dataset = dataset_ds.map(tokenize_function, batched=True)

In [None]:
# Drop the raw text, expose the target under the name 'labels', and make the
# dataset return torch tensors.
tokenized_dataset = (
    tokenized_dataset
    .remove_columns(['cleaned_text'])
    .rename_column('label', 'labels')
)
tokenized_dataset.set_format("torch")
# Show the remaining columns as the cell output
tokenized_dataset.column_names

## Step 5: Structure the data for fine-tuning

You can fine-tune your model once the dataset is cleaned and tokenized. The next step is structuring the data for fine-tuning.

First we will split our dataset into three separate datasets: training, validation and test. For this case, we will use 70% of the whole dataset to train the model, 15% to validate and 15% to test. Then, we need to convert our pandas DataFrames back to Dataset objects. With our dataset split and converted, we have to remove an extra column that these steps added ('__index_level_0__').



In [None]:
tokenized_df = tokenized_dataset.to_pandas()

# 70 % train; the remaining 30 % is then halved into validation and test
train_df, val_test_df = train_test_split(tokenized_df, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(val_test_df, test_size=0.5, random_state=42)

# Convert each frame back to a Dataset and drop the '__index_level_0__'
# column that from_pandas creates from the shuffled DataFrame index.
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame).remove_columns(['__index_level_0__'])
    for frame in (train_df, val_df, test_df)
)

When we load a dataset using the `load_dataset` function from the `datasets` library, it is returned as a DatasetDict object. Sometimes the dataset is already split into train, validation and test. So, in the next step, we will create a DatasetDict object just like the ones we get when we use the `load_dataset` function.

In [None]:
from datasets import Dataset, DatasetDict

# Bundle the three splits under the keys "train", "val" and "test".
# NOTE(review): the variable is named `tweet_dataset` although it holds the
# IMDB review splits; the name is kept because later cells refer to it.
tweet_dataset = DatasetDict({"train": train_dataset, "val": val_dataset, "test":test_dataset})
tweet_dataset

Now we can create the dataloaders we will use to iterate over batches. We can easily define them as follows:

In [None]:
# Build one DataLoader per split. All share the batch size and the padding
# collator; only the training loader shuffles its samples.
_loader_kwargs = {"batch_size": 16, "collate_fn": data_collator}
train_dataloader = DataLoader(tweet_dataset["train"], shuffle=True, **_loader_kwargs)
val_dataloader = DataLoader(tweet_dataset["val"], **_loader_kwargs)
test_dataloader = DataLoader(tweet_dataset["test"], **_loader_kwargs)

print("Training, validation, and test sets are prepared with attention masks!")

To quickly check there is no mistake in the data processing, we can inspect a batch like this:

In [None]:
# Take the first batch from the training loader and show each entry's shape
batch = next(iter(train_dataloader))
{name: tensor.shape for name, tensor in batch.items()}

## Step 6: Create the model

### PEFT (training only the last layers before the classification head)

Now that we're completely finished with data preprocessing, let's turn to the model. We instantiate it by referencing the pretrained model we want to fine-tune. In this case we are using a DistilBERT base model. As we want to perform PEFT (parameter-efficient fine-tuning), we need to follow some steps before training. First, we freeze all model layers except the last one, which is the classification head. Optionally, we can unfreeze a few layers before the classification head if we judge it necessary.

In [None]:
from transformers import DistilBertForSequenceClassification

# `model` and `DistilBertModel` are two names for the same object
model = DistilBertModel = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2
)

# Step 1: freeze every parameter of the base model; parameters outside
# `base_model` (the classification head) keep requires_grad=True
model.base_model.requires_grad_(False)

# Additionally fine-tune the last 2 transformer layers by unfreezing them
model.base_model.transformer.layer[-2:].requires_grad_(True)

# Tally total vs. trainable parameter counts
_param_sizes = [(p.numel(), p.requires_grad) for p in model.parameters()]
all_params = sum(size for size, _ in _param_sizes)
trainable_params = sum(size for size, is_trainable in _param_sizes if is_trainable)

# Print the results
print(f"Trainable parameters: {trainable_params} || Total parameters: {all_params} || Trainable percentage: {100 * trainable_params / all_params:.2f}%")

### LoRA

In [None]:
from peft import LoraConfig, get_peft_model
from transformers import DistilBertForSequenceClassification

# LoRA adapters on the attention query/key/value projections
lora_config = LoraConfig(
        r=32,
        lora_alpha=64,
        target_modules=['v_lin','k_lin','q_lin'],
        lora_dropout=0.1,
        bias='none',
        task_type="SEQ_CLS" # Specify the task type
    )

# get_peft_model injects the adapters into the model object it receives.
# `DistilBertModel` is the same object as `model` (the partially-frozen
# baseline), so wrapping it would alter the baseline before it is trained.
# Load an independent copy of the checkpoint for the LoRA experiment instead.
lora_base_model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2
)

lora_model = get_peft_model(lora_base_model, lora_config)
lora_model.print_trainable_parameters() # To see the number of trainable parameters

### QLoRA

In [None]:
import bitsandbytes as bnb
from transformers import BitsAndBytesConfig, DistilBertForSequenceClassification
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# 1. Define the quantization configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4", # NormalFloat 4-bit quantization
    bnb_4bit_use_double_quant=True, # Nested quantization
    bnb_4bit_compute_dtype=torch.bfloat16, # Compute type for faster operations
)

# 2. Load the base model with quantization.
# The config must be passed via `quantization_config`; without it the
# checkpoint is loaded un-quantized and this experiment is plain LoRA.
# device_map='auto' lets the loader place the weights on the available device.
QDistilBertModel = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16, # Data type for the non-quantized weights
)

# 3. Prepare the model for k-bit training (important for QLoRA)
QDistilBertModel = prepare_model_for_kbit_training(QDistilBertModel)

# 4. Same adapter settings as the LoRA experiment, for a like-for-like comparison
qlora_config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=['v_lin','k_lin','q_lin'],
    lora_dropout=0.1,
    bias='none',
    task_type="SEQ_CLS", # Specify the task type
)

qlora_model = get_peft_model(QDistilBertModel, qlora_config)
qlora_model.print_trainable_parameters() # To see the number of trainable parameters

## Step 7: Training loop configuration

### PEFT

Before setting up the training loop, we need to specify two important things: the optimizer and a learning rate scheduler. The optimizer used by the Trainer by default is AdamW, which is the same as Adam, but with a twist for weight decay regularization. In this notebook we use Adafactor instead, with a fixed learning rate (`relative_step=False`):

In [None]:
from transformers import Adafactor

# Adafactor over all of the model's parameters with an explicit learning rate
# of 2e-5; relative_step=False so that this supplied value is the one used.
optimizer = Adafactor(model.parameters(), lr=2e-5, relative_step=False)

Finally, the learning rate scheduler used by default is just a linear decay from the maximum value to 0. To properly define it, we need to know the number of training steps we will take, which is the number of epochs we want to run multiplied by the number of training batches (which is the length of our training dataloader). The Trainer uses three epochs by default; here we train for five:

In [None]:
from transformers import get_scheduler

# Total number of optimizer steps = epochs x batches per epoch
num_epochs = 5
num_training_steps = num_epochs * len(train_dataloader)
# "linear" schedule with no warm-up steps, spanning the whole training run
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

We are now ready to train! To get some sense of when training will be finished, we add a progress bar over our number of training steps, using the tqdm library:

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
# Show the selected device as the cell output
device

In [None]:
# Move the sample batch fetched earlier to the model's device.
# NOTE(review): the result is not reassigned, so this relies on .to() moving
# the batch in place — confirm for the batch type the collator returns.
batch.to(device)

In [None]:
# Sanity check: one forward pass on the sample batch before training
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

In [None]:
from tqdm.auto import tqdm
import mlflow
import evaluate

progress_bar = tqdm(range(num_training_steps))

# One combined metric object is all the loop needs; the separate
# accuracy/precision/recall/f1 modules were loaded but never used.
metrics = evaluate.combine(["precision","f1","recall"])

mlflow.set_experiment("Simple_PEFT_IMDB")
mlflow.pytorch.autolog()
# Global batch counter for the train_loss curve. It is incremented before
# logging, so starting at 0 makes the first logged step 1 (it used to be 2).
step = 0

with mlflow.start_run(log_system_metrics=True) as run:
    for epoch in range(num_epochs):
        # ---- training pass ----
        model.train()
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            step += 1
            mlflow.log_metric("train_loss", loss.item(), step=step)

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

        # ---- validation pass (no gradients) ----
        model.eval()
        for batch in val_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                outputs = model(**batch)

            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
            metrics.add_batch(predictions=predictions, references=batch["labels"])

        # Validation metrics are logged once per epoch, indexed by epoch number
        results = metrics.compute(average="weighted")
        ml_metrics = {"eval_precision": results["precision"],
                      "eval_recall": results["recall"],
                      "eval_f1": results["f1"]}
        mlflow.log_metrics(ml_metrics, step=epoch)
        print(results)

### LoRA

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
lora_model.to(device)
# Show the selected device as the cell output
device

In [None]:
# Sanity check: one forward pass through the LoRA model, reusing whatever
# `batch` currently holds from a previous cell
outputs = lora_model(**batch)
print(outputs.loss, outputs.logits.shape)

In [None]:
# Fresh optimizer and scheduler for the LoRA run; `optimizer`, `num_epochs`,
# `num_training_steps` and `lr_scheduler` are rebound here.
optimizer = Adafactor(lora_model.parameters(), lr=1e-4, relative_step=False)

num_epochs = 5
num_training_steps = num_epochs * len(train_dataloader)
# "linear" schedule with no warm-up steps, spanning the whole training run
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

In [None]:
progress_bar = tqdm(range(num_training_steps))

# One combined metric object is all the loop needs; the separate
# accuracy/precision/recall/f1 modules were loaded but never used.
metrics_lora = evaluate.combine(["precision","f1","recall"])

mlflow.set_experiment("LoRA_PEFT_IMDB")
mlflow.pytorch.autolog()
# Global batch counter for the train_loss curve. It is incremented before
# logging, so starting at 0 makes the first logged step 1 (it used to be 2).
step = 0

with mlflow.start_run(log_system_metrics=True) as run:
    for epoch in range(num_epochs):
        # ---- training pass ----
        lora_model.train()
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = lora_model(**batch)
            loss = outputs.loss
            loss.backward()
            step += 1
            mlflow.log_metric("train_loss", loss.item(), step=step)

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

        # ---- validation pass (no gradients) ----
        lora_model.eval()
        for batch in val_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                outputs = lora_model(**batch)

            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
            metrics_lora.add_batch(predictions=predictions, references=batch["labels"])

        # Validation metrics are logged once per epoch, indexed by epoch number
        results = metrics_lora.compute(average="weighted")
        ml_metrics = {"eval_precision": results["precision"],
                      "eval_recall": results["recall"],
                      "eval_f1": results["f1"]}
        mlflow.log_metrics(ml_metrics, step=epoch)
        print(results)

### QLoRA

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# NOTE(review): the QLoRA base model is loaded with device_map="auto" —
# confirm that this additional .to(device) is needed.
qlora_model.to(device)
# Show the selected device as the cell output
device

In [None]:
# Sanity check: one forward pass through the QLoRA model, reusing whatever
# `batch` currently holds from a previous cell
outputs = qlora_model(**batch)
print(outputs.loss, outputs.logits.shape)

In [None]:
# Fresh optimizer and scheduler for the QLoRA run; `optimizer`, `num_epochs`,
# `num_training_steps` and `lr_scheduler` are rebound here.
optimizer = Adafactor(qlora_model.parameters(), lr=1e-4, relative_step=False)

num_epochs = 5
num_training_steps = num_epochs * len(train_dataloader)
# "linear" schedule with no warm-up steps, spanning the whole training run
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

In [None]:
progress_bar = tqdm(range(num_training_steps))

# One combined metric object is all the loop needs; the separate
# accuracy/precision/recall/f1 modules were loaded but never used.
metrics_qlora = evaluate.combine(["precision","f1","recall"])

mlflow.set_experiment("QLoRA_PEFT_IMDB")
mlflow.pytorch.autolog()
# Global batch counter for the train_loss curve. It is incremented before
# logging, so starting at 0 makes the first logged step 1 (it used to be 2).
step = 0

with mlflow.start_run(log_system_metrics=True) as run:
    for epoch in range(num_epochs):
        # ---- training pass ----
        qlora_model.train()
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = qlora_model(**batch)
            loss = outputs.loss
            loss.backward()
            step += 1
            mlflow.log_metric("train_loss", loss.item(), step=step)

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

        # ---- validation pass (no gradients) ----
        qlora_model.eval()
        for batch in val_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                outputs = qlora_model(**batch)

            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
            metrics_qlora.add_batch(predictions=predictions, references=batch["labels"])

        # Validation metrics are logged once per epoch, indexed by epoch number
        results = metrics_qlora.compute(average="weighted")
        ml_metrics = {"eval_precision": results["precision"],
                      "eval_recall": results["recall"],
                      "eval_f1": results["f1"]}
        mlflow.log_metrics(ml_metrics, step=epoch)
        print(results)

## Step 8: Evaluation

After training the model, we need to use unseen data to evaluate the model's capability at classifying reviews. This step is important to guarantee that the model doesn't just memorize the classes of the training samples. In this example, the IMDB dataset has a test group containing 25k samples of movie reviews; here, however, we evaluate on the 15% test split held out in Step 5.

### PEFT

In [None]:
# Fresh metric accumulator for the test split. The individual
# precision/recall/f1 modules are not loaded separately because only the
# combined object is ever used.
metrics = evaluate.combine(["precision","f1","recall"])

model.eval()
# Inference only: no gradients are needed anywhere in this loop
with torch.no_grad():
    for batch in test_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metrics.add_batch(predictions=predictions, references=batch["labels"])

results = metrics.compute(average="weighted")
print(results)

### LoRA

In [None]:
# Fresh metric accumulator for the test split. The individual
# precision/recall/f1 modules are not loaded separately because only the
# combined object is ever used.
metrics_lora = evaluate.combine(["precision","f1","recall"])

lora_model.eval()
# Inference only: no gradients are needed anywhere in this loop
with torch.no_grad():
    for batch in test_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = lora_model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metrics_lora.add_batch(predictions=predictions, references=batch["labels"])

results = metrics_lora.compute(average="weighted")
print(results)

### QLoRA

In [None]:
# Fresh metric accumulator for the test split. The individual
# precision/recall/f1 modules are not loaded separately because only the
# combined object is ever used.
metrics_qlora = evaluate.combine(["precision","f1","recall"])

qlora_model.eval()
# Inference only: no gradients are needed anywhere in this loop
with torch.no_grad():
    for batch in test_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = qlora_model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metrics_qlora.add_batch(predictions=predictions, references=batch["labels"])

results = metrics_qlora.compute(average="weighted")
print(results)