In [1]:
# package imports
import torch
import huggingface
from transformers import (AutoModelForSequenceClassification, AutoTokenizer, GPT2Model, GPT2Tokenizer, AutoModelForCausalLM,GPT2LMHeadModel, GPT2Config)
from datasets import load_dataset

In [2]:
#import validation dataset for evaluation. 
dataset = load_dataset('rotten_tomatoes',split='validation') #just for evaluating

In [3]:
#shape of dataset
dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 1066
})

In [4]:
import os

# get the current working directory
current_working_directory = os.getcwd()

# print output to the console
print(current_working_directory)

C:\Users\felix\Downloads


In [5]:
#create tokenizer with padding
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
pad_token = "<PAD>"
tokenizer.pad_token = pad_token
#tokenizer.set_padding(tokenizer.pad_token, pad_to_multiple_of=8)
config = GPT2Config.from_pretrained("gpt2", pad_token_id=tokenizer.pad_token_id)
model = AutoModelForSequenceClassification.from_pretrained("gpt2",config=config)
#config = GPT2Config.from_pretrained("gpt2", pad_token_id=tokenizer.pad_token_id)


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
#tokenize function
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

In [7]:
#create a tokenized dataset for evaluation
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [8]:
#creating input for model
inputs = tokenizer.encode_plus(
    tokenized_datasets['text'],
    add_special_tokens=True,
    max_length=128,  # Maximum sequence length
    padding="max_length",
    truncation=True,
    return_tensors="pt"  # Return PyTorch tensors
)

In [9]:
# Make prediction
with torch.no_grad():
    outputs = model(**inputs).logits
    probabilities = torch.nn.functional.softmax(outputs, dim=1)
    predicted_class = torch.argmax(probabilities)

# Display sentiment result
if predicted_class == 1:
    print(f"Sentiment: Positive ({probabilities[0][1] * 100:.2f}%)")
else:
    print(f"Sentiment: Negative ({probabilities[0][0] * 100:.2f}%)")

Sentiment: Positive (93.17%)


In [10]:
#dataset labels
labels = dataset["label"]

In [11]:
#load full dataset for testing
full_dataset = load_dataset('rotten_tomatoes')

In [12]:
# Create a DataLoader to efficiently process the data
data_loader = torch.utils.data.DataLoader(list(zip(inputs["input_ids"],inputs["attention_mask"], labels)),
batch_size=16, shuffle=False)

In [13]:
#evaluate model performance
from sklearn.metrics import accuracy_score

tokenized_dataset = full_dataset.map(lambda examples: tokenizer(examples["text"], padding=True, truncation=True))
                                     #, batched=True)

# Prepare the data for evaluation
eval_dataset = tokenized_dataset["test"].remove_columns(["text"]).rename_column("label", "labels")
eval_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

model.eval()
predictions = []
for batch in torch.utils.data.DataLoader(eval_dataset):
    with torch.no_grad():
        inputs = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_labels = torch.argmax(logits, dim=1)
        predictions.extend(predicted_labels.cpu().numpy())

true_labels = eval_dataset["labels"].numpy()
accuracy = accuracy_score(true_labels, predictions)
print("The pretrained model accuracy is", round(accuracy*100,2),"%")

The pretrained model accuracy is 50.09 %


In [14]:
# Create a PEFT Config for LoRA
from peft import LoraConfig, TaskType
config = LoraConfig(
r=8, # Rank
lora_alpha=32,
target_modules=['c_attn', 'c_proj'],
lora_dropout=0.1,
bias="none",
task_type=TaskType.SEQ_CLS
)

In [15]:
#creating base peft model
from peft import get_peft_model
lora_model = get_peft_model(model, config)



In [16]:
lora_model.print_trainable_parameters()

trainable params: 812,544 || all params: 125,253,888 || trainable%: 0.6487175871139426


In [17]:
#create a training dataset for PEFT model
new_dataset = full_dataset.map(lambda examples: tokenizer(examples["text"], padding=True, truncation=True))

In [18]:
#data processing
new_dataset = new_dataset.rename_column("label", "labels")
new_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

In [19]:
#unfreeze model
for param in lora_model.parameters():
    param.requires_grad = True

In [20]:
import numpy as np
import torch.nn as nn
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

In [21]:
#training arguments for model
training_args = TrainingArguments(
    output_dir='C:/Users/felix/Documents/Udacity',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-3,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    num_train_epochs=4,
    load_best_model_at_end=True,
    #weight_decay=0.1,
    remove_unused_columns=False,
    #label_names="labels"
)

In [22]:
import random

In [23]:
# reduce size of training data to speed up epochs
n_samples =1500
train_dataset = new_dataset['train']

# Get the number of samples in the dataset
num_samples = len(train_dataset)

# Generate a list of random indices without replacement
random_indices = random.sample(range(num_samples), n_samples)

# Select the samples corresponding to the random indices
random_train_samples = train_dataset.select(indices=random_indices)

In [24]:
#compute metrics function
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = torch.from_numpy(predictions)  # Convert predictions to tensor
    labels = torch.from_numpy(labels).long()  # Convert labels to tensor
    loss = nn.CrossEntropyLoss()(predictions, labels)  # Calculate the evaluation loss
    accuracy = (torch.argmax(predictions, axis=1) == labels).float().mean()  # Calculate the accuracy

    # Print the metrics dictionary for debugging
    metrics = {"eval_loss": loss.item(), "accuracy": accuracy.item()}
    print("Metrics:", metrics)

    return metrics

In [25]:
#train model
trainer = Trainer(
    model=lora_model,
    args = training_args,
    train_dataset = random_train_samples,
    eval_dataset = new_dataset['test'],
    tokenizer=tokenizer,
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics = compute_metrics
)
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.73878,0.5
2,No log,0.698363,0.5
3,No log,0.588487,0.712008
4,0.623600,0.722124,0.734522


Metrics: {'eval_loss': 0.7387796640396118, 'accuracy': 0.5}
Metrics: {'eval_loss': 0.6983628273010254, 'accuracy': 0.5}
Metrics: {'eval_loss': 0.5884869694709778, 'accuracy': 0.7120075225830078}
Metrics: {'eval_loss': 0.7221236824989319, 'accuracy': 0.7345215678215027}


TrainOutput(global_step=500, training_loss=0.6236490478515625, metrics={'train_runtime': 3965.3886, 'train_samples_per_second': 1.513, 'train_steps_per_second': 0.126, 'total_flos': 135987293491200.0, 'train_loss': 0.6236490478515625, 'epoch': 4.0})

In [26]:
#evaluate model
trainer.evaluate()

Metrics: {'eval_loss': 0.5485303401947021, 'accuracy': 0.7363977432250977}


{'eval_loss': 0.5485303401947021,
 'eval_accuracy': 0.7363977432250977,
 'eval_runtime': 153.1914,
 'eval_samples_per_second': 6.959,
 'eval_steps_per_second': 0.581,
 'epoch': 4.0}

In [27]:
#review model
lora_model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): GPT2ForSequenceClassification(
      (transformer): GPT2Model(
        (wte): Embedding(50257, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-11): 12 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2Attention(
              (c_attn): lora.Linear(
                (base_layer): Conv1D()
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict(

In [28]:
#save model
lora_model.save_pretrained('trained_model')

In [29]:
from peft import AutoPeftModelForSequenceClassification

In [30]:
# Specify the path to the saved model directory
#model_dir = 'C:/Users/felix/Documents/Udacity/checkpoint-668/'

# Load the saved PEFT model
loaded_model = AutoPeftModelForSequenceClassification.from_pretrained('trained_model')

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
loaded_model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): GPT2ForSequenceClassification(
      (transformer): GPT2Model(
        (wte): Embedding(50257, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-11): 12 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2Attention(
              (c_attn): lora.Linear(
                (base_layer): Conv1D()
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict(

In [32]:
#import dataset for inference
imdb_dataset = load_dataset('imdb',split='test') #just for evaluating

In [33]:
#make the dataset smaller to reduce latency
# Get the number of samples in the dataset
n_samples = 500
num_samples = len(imdb_dataset)

# Generate a list of random indices without replacement
random_indices = random.sample(range(num_samples), n_samples)

# Select the samples corresponding to the random indices
random_test_imdb = imdb_dataset.select(indices=random_indices)

In [34]:
random_test_imdb

Dataset({
    features: ['text', 'label'],
    num_rows: 500
})

In [35]:
#tokenize imdb test dataset
tokenized_imdb_dataset = random_test_imdb.map(lambda examples: tokenizer(examples["text"], padding=True, truncation=True))
                                     #, batched=True)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [36]:
# Prepare the data for evaluation
eval_imdb_dataset = tokenized_imdb_dataset.remove_columns(["text"]).rename_column("label", "labels")
eval_imdb_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loaded_model = loaded_model.to(device)

#evaluate with my PEFT model
loaded_model.eval()
predictions = []
for batch in torch.utils.data.DataLoader(eval_imdb_dataset):
    with torch.no_grad():
        inputs = {k: v.to(device) for k, v in batch.items()}
        outputs = loaded_model(**inputs)
        logits = outputs.logits
        predicted_labels = torch.argmax(logits, dim=1)
        predictions.extend(predicted_labels.cpu().numpy())

true_labels = eval_imdb_dataset["labels"].numpy()
accuracy = accuracy_score(true_labels, predictions)

print("The PEFT model model accuracy is", round(accuracy*100,2),"%")

The PEFT model model accuracy is 52.6 %


In [37]:
#evaluate the original pretrained model
model = model.to(device)

model.eval()
predictions = []
for batch in torch.utils.data.DataLoader(eval_imdb_dataset):
    with torch.no_grad():
        inputs = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_labels = torch.argmax(logits, dim=1)
        predictions.extend(predicted_labels.cpu().numpy())

true_labels = eval_imdb_dataset["labels"].numpy()
accuracy = accuracy_score(true_labels, predictions)
print("The pretrained model accuracy is", round(accuracy*100,2),"%")

The pretrained model accuracy is 61.8 %


### Summary Thoughts

- The pretrained model came out more accurate on an unseen dataset, when compared to my PEFT model. This is despite the opposite being true on my  original training dataset, which the PEFT model performed much better at. This could imply some overfitting was present.