# Apply Lightweight Fine-Tuning to a Foundation Model

This is a Udacity project for the Generative AI Nanodegree. The objective is to compare the performance of an original pretrained model from Hugging Face with that of its fine-tuned version.

### Loading and Evaluating a Foundation Model (from Hugging Face)

This project uses the following settings:

- task: sentence classification 
- model: DistilBERT (`distilbert-base-uncased`)
- data: emotion data set (https://huggingface.co/datasets/dair-ai/emotion)
- fine-tuning methodology: Freeze and Tune (freeze the base model and train only the classification head)
- performance criteria: classification accuracy

In [2]:
# data loading
from datasets import load_dataset

# Names of the dataset splits to load; they double as the keys of `ds`.
splits = ["train", "test"]

# load_dataset is given the list of split names and its results are paired
# with those names in order.
loaded_splits = load_dataset("emotion", split=splits)
ds = dict(zip(splits, loaded_splits))

print(ds)

# Preview the first two rows of every split.
for split, dataset in ds.items():
    print(dataset[:2])

  from .autonotebook import tqdm as notebook_tqdm


{'train': Dataset({
    features: ['text', 'label'],
    num_rows: 16000
}), 'test': Dataset({
    features: ['text', 'label'],
    num_rows: 2000
})}
{'text': ['i didnt feel humiliated', 'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake'], 'label': [0, 0]}
{'text': ['im feeling rather rotten so im not very ambitious right now', 'im updating my blog because i feel shitty'], 'label': [0, 0]}


In [3]:
#  preprocessing (tokenizing)

from transformers import AutoTokenizer

# Tokenizer for the same checkpoint name that the classification model uses.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def preprocess_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Truncation is enabled and padding is set to ``"max_length"``, so every
    row comes back with the same number of tokens.

    Args:
        examples: Batch handed over by ``Dataset.map(..., batched=True)``;
            only its ``"text"`` entry is read.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(examples["text"], truncation=True, padding="max_length")

# Tokenize every split, letting map() pass whole batches to the function.
tokenized_ds = {
    name: ds[name].map(preprocess_function, batched=True) for name in splits
}

# Sanity check: token ids of the first training example.
print(tokenized_ds["train"][0]["input_ids"])


[101, 1045, 2134, 2102, 2514, 26608, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [4]:
# loading model

from transformers import AutoModelForSequenceClassification

# Class index -> emotion name. The reverse mapping is derived from it so the
# two dictionaries cannot drift apart.
id2label = {0: "sadness", 1: "joy", 2: "love", 3: "anger", 4: "fear", 5: "surprise"}
label2id = {name: index for index, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

# Freeze every parameter of the base model; only parameters that live
# outside `model.base_model` stay trainable.
model.base_model.requires_grad_(False)

print(model)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [5]:
#Inference and evaluate

import numpy as np
import numpy as np
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments


def compute_metrics(eval_pred):
    """Compute classification accuracy for an evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``. The predicted class for
            each row is the argmax of ``predictions`` along axis 1.

    Returns:
        A dict with a single ``"accuracy"`` entry: the fraction of rows whose
        predicted class equals the label.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    hits = predicted_classes == references
    return {"accuracy": hits.mean()}


# Trainer built with only the model and the metric function: no training
# arguments or datasets are supplied at this point.
trainer = Trainer(model=model, compute_metrics=compute_metrics)

In [6]:
# Score the model on the tokenized test split and print one
# "metric: value" line per reported entry.
evaluation_results = trainer.evaluate(tokenized_ds["test"])
formatted_output = "\n".join(
    f"{name}: {score}" for name, score in evaluation_results.items()
)
print(formatted_output)

eval_loss: 1.7785563468933105
eval_accuracy: 0.136
eval_runtime: 26.1294
eval_samples_per_second: 76.542
eval_steps_per_second: 9.568


### Performing Parameter-Efficient Fine-Tuning (PEFT)

In [7]:
# Hyperparameters for the fine-tuning run. Evaluation and checkpoint saving
# are both set to "epoch", and load_best_model_at_end is enabled.
training_args = TrainingArguments(
    output_dir="./data/sentiment_analysis",
    learning_rate=2e-3,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)
padding_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# NOTE(review): the test split doubles as the evaluation set here, so the
# checkpoint picked by load_best_model_at_end is selected on test data;
# consider a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    data_collator=padding_collator,
    compute_metrics=compute_metrics,
)

trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.1524,0.5545
2,1.243200,1.108759,0.575
3,1.243200,1.079269,0.585


Checkpoint destination directory ./data/sentiment_analysis/checkpoint-250 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./data/sentiment_analysis/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./data/sentiment_analysis/checkpoint-750 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=750, training_loss=1.2024945068359374, metrics={'train_runtime': 730.4239, 'train_samples_per_second': 65.715, 'train_steps_per_second': 1.027, 'total_flos': 6358888710144000.0, 'train_loss': 1.2024945068359374, 'epoch': 3.0})

### Performing Inference with a PEFT Model

In [9]:
# Score the current trainer's model on the tokenized test split and print
# one "metric: value" line per reported entry.
evaluation_results = trainer.evaluate(tokenized_ds["test"])
formatted_output = "\n".join(
    f"{name}: {score}" for name, score in evaluation_results.items()
)
print(formatted_output)

eval_loss: 1.0792690515518188
eval_accuracy: 0.585
eval_runtime: 25.5569
eval_samples_per_second: 78.257
eval_steps_per_second: 1.252
epoch: 3.0


### Result

After a short fine-tuning run of 3 epochs, the model's accuracy improved from 0.14 (close to random guessing) to 0.59. This is still not very accurate, but it is a meaningful improvement.