In [59]:
#pip install datasets evaluate torch peft

In [60]:
#pip install transformers

In [61]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [62]:
output_dir = "/content/drive/MyDrive/Cognizant/results"

# Lightweight Fine-Tuning Project

TODO: In this cell, describe your choices for each of the following

* PEFT technique: LoRA
* Model: GPT-2
* Evaluation approach: Evaluate method with a Hugging Face Trainer
* Fine-tuning dataset: climatebert/environmental_claims

## Loading and Evaluating a Foundation Model


TODO: In the cells below, load your chosen pre-trained Hugging Face model and evaluate its performance prior to fine-tuning. This step includes loading an appropriate tokenizer and dataset.

In [63]:
from transformers import GPT2Config, GPT2Tokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset
import numpy as np
import random
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


from peft import LoraConfig, get_peft_model, TaskType, AutoPeftModelForSequenceClassification
random.seed(10)

In [64]:
import json

In [65]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForSequenceClassification.from_pretrained('gpt2',
                                                               num_labels=2,
                                                               id2label={0: "no", 1: "yes"},
                                                               label2id={"no": 0, "yes": 1})
# Model recognizes padding
model.config.pad_token_id = model.config.eos_token_id

# Load the dataset
splits = ["train", "validation"]
dataset = {split: load_dataset("climatebert/environmental_claims", split=split) for split in splits}


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [66]:
#def remove_columns(dataset, columns):
#    return dataset.remove_columns(columns)

#dataset = {split: remove_columns(load_dataset("RuyuanWan/Politeness_Disagreement", split=split), ['disagreement_rate']) for split in splits}



In [67]:
for split in splits:
    print(f"Columns in {split} dataset:", dataset[split].column_names)

Columns in train dataset: ['text', 'label']
Columns in validation dataset: ['text', 'label']


In [68]:
def preprocess_function(examples):

    print("Original text length:", [len(text) for text in examples['text'][:5]])
    tokenized = tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)

    print("Tokenized input_ids length:", [len(ids) for ids in tokenized['input_ids'][:5]])
    return tokenized

encoded_dataset = {split: dataset[split].map(preprocess_function, batched=True) for split in splits}


In [69]:
for split in splits:
    print(f"Lengths of tokenized sequences in {split} dataset:")
    for i in range(5):  # Print lengths for the first 5 examples
        print(len(encoded_dataset[split][i]['input_ids']))


def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds, average='binary', pos_label=1)
    recall = recall_score(labels, preds, average='binary', pos_label=1)
    f1 = f1_score(labels, preds, average='binary', pos_label=1)

    tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
    specificity = tn / (tn + fp)

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,  # Sensitivity
        "specificity": specificity,
        "f1": f1
    }

Lengths of tokenized sequences in train dataset:
128
128
128
128
128
Lengths of tokenized sequences in validation dataset:
128
128
128
128
128


In [70]:

training_args = TrainingArguments(
    output_dir= output_dir,
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    compute_metrics=compute_metrics,
)

# Evaluate the model
eval_result = trainer.evaluate()
print(f"Evaluation result: {eval_result}")

eval_results_file = f"{output_dir}/evaluation_initial_results.json"
with open(eval_results_file, 'w') as f:
    json.dump(eval_result, f)
print(f"Evaluation results saved to: {eval_results_file}")




Evaluation result: {'eval_loss': 0.5194663405418396, 'eval_accuracy': 0.7584905660377359, 'eval_precision': 1.0, 'eval_recall': 0.030303030303030304, 'eval_specificity': 1.0, 'eval_f1': 0.05882352941176471, 'eval_runtime': 2.3459, 'eval_samples_per_second': 112.963, 'eval_steps_per_second': 14.493}
Evaluation results saved to: /content/drive/MyDrive/Cognizant/results/evaluation_initial_results.json


## Performing Parameter-Efficient Fine-Tuning


TODO: In the cells below, create a PEFT model from your loaded model, run a training loop, and save the PEFT model weights.





In [71]:
# Configure LoRA
config = LoraConfig(
    r=10,  # Rank
    lora_alpha=32,
    target_modules=['c_attn', 'c_proj'],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_CLS
)

# Create PEFT model
peft_model = get_peft_model(model, config)
peft_model.print_trainable_parameters()

# Initialize the Trainer with the PEFT model
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    compute_metrics=compute_metrics,
)



trainable params: 1,015,296 || all params: 125,456,640 || trainable%: 0.8093


In [72]:
# Train the model
trainer.train()

# Evaluate the model
eval_result_finetunned = trainer.evaluate()
print(f"Evaluation result: {eval_result_finetunned}")

# Save the PEFT model weights

eval_results_file_finetunned = f"{output_dir}/evaluation_finetunned_results.json"
with open(eval_results_file_finetunned, 'w') as f:
  json.dump(eval_result_finetunned, f)
print(f"Evaluation results saved to: {eval_results_file_finetunned}")

# Save the peft_model to the specified directory in Google Drive
peft_model.save_pretrained(f'{output_dir}/peft_model')
print(f"peft_model saved to: {output_dir}/peft_model")


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,Specificity,F1
1,No log,0.374054,0.837736,0.632184,0.833333,0.839196,0.718954
2,0.386600,0.245449,0.89434,0.771429,0.818182,0.919598,0.794118
3,0.386600,0.223099,0.898113,0.767123,0.848485,0.914573,0.805755


Evaluation result: {'eval_loss': 0.2230994701385498, 'eval_accuracy': 0.8981132075471698, 'eval_precision': 0.7671232876712328, 'eval_recall': 0.8484848484848485, 'eval_specificity': 0.914572864321608, 'eval_f1': 0.8057553956834531, 'eval_runtime': 2.2578, 'eval_samples_per_second': 117.371, 'eval_steps_per_second': 15.059, 'epoch': 3.0}
Evaluation results saved to: /content/drive/MyDrive/Cognizant/results/evaluation_finetunned_results.json
peft_model saved to: /content/drive/MyDrive/Cognizant/results/peft_model


We see an improvement on accuracy, precision and specifity as we add epochs

Also, we have more accuracy and less training loss than pre-fine tunning.

## Performing Inference with a PEFT Model

TODO: In the cells below, load the saved PEFT model weights and evaluate the performance of the trained PEFT model. Be sure to compare the results to the results from prior to fine-tuning.

In [73]:
# Load the tokenizer and the PEFT model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

peft_model = AutoPeftModelForSequenceClassification.from_pretrained('./peft_model',
                                                                num_labels=2,
                                                                id2label={0: "no", 1: "yes"},
                                                                label2id={"no": 0, "yes": 1})
peft_model.config.pad_token_id = peft_model.config.eos_token_id

# Re-setup the Trainer with the PEFT model
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    compute_metrics=compute_metrics,
)

# Evaluate the PEFT model
peft_eval_result = trainer.evaluate()
print(f"PEFT model evaluation result: {peft_eval_result}")

# Compare the results
initial_eval_accuracy = eval_result['eval_accuracy']
finetunned_eval_accuracy = eval_result_finetunned['eval_accuracy']
peft_eval_accuracy = peft_eval_result['eval_accuracy']

print(f"Initial model evaluation accuracy: {initial_eval_accuracy}")
print(f"Model evaluation after fine-tuning accuracy: {finetunned_eval_accuracy}")
print(f"PEFT model evaluation accuracy: {peft_eval_accuracy}")


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PEFT model evaluation result: {'eval_loss': 0.2672160863876343, 'eval_accuracy': 0.8754716981132076, 'eval_precision': 0.6987951807228916, 'eval_recall': 0.8787878787878788, 'eval_specificity': 0.8743718592964824, 'eval_f1': 0.778523489932886, 'eval_runtime': 2.2064, 'eval_samples_per_second': 120.107, 'eval_steps_per_second': 15.41}
Initial model evaluation accuracy: 0.7584905660377359
Model evaluation after fine-tuning accuracy: 0.8981132075471698
PEFT model evaluation accuracy: 0.8754716981132076


we get more accuracy on the model after fine-tunning than on the initial one