# Finetuning for classification

## Preparing the dataset

In [129]:
from importlib.metadata import version

# Packages this notebook depends on; their installed versions are printed below
# so that a run can be reproduced with the same environment.
pkgs = [
    "matplotlib",
    "numpy",
    "tiktoken",
    "torch",
    "tensorflow",  # for the OpenAI pretrained weights
    "pandas",
    "polars",
]

for p in pkgs:
    print(p, version(p), sep=": ")

matplotlib: 3.10.5
numpy: 1.26.4
tiktoken: 0.9.0
torch: 2.2.2
tensorflow: 2.16.2
pandas: 2.3.2
polars: 1.32.3


In [130]:
import urllib.request
import zipfile
import os
from pathlib import Path

url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
zip_path = "sms_spam_collection.zip"
extracted_path = "sms_spam_collection"
data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"

def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
    """Download the spam archive, unpack it and give the data file a .tsv suffix.

    When ``data_file_path`` already exists nothing is downloaded; a notice is
    printed and the function returns immediately.

    Args:
        url: Address of the zip archive to fetch.
        zip_path: File the downloaded archive is written to.
        extracted_path: Directory the archive is extracted into.
        data_file_path: Path object naming the final (renamed) data file.
    """
    if data_file_path.exists():
        print("Data file already exists. Skipping download.")
        return

    # Fetch the archive: the response body is read in one go and saved to disk.
    with urllib.request.urlopen(url) as response, open(zip_path, "wb") as archive_file:
        payload = response.read()
        archive_file.write(payload)

    # Unpack everything the archive contains into the target directory.
    with zipfile.ZipFile(zip_path, "r") as archive:
        archive.extractall(extracted_path)

    # The extracted file has no extension; rename it to the .tsv target.
    unpacked_file = Path(extracted_path, "SMSSpamCollection")
    os.rename(unpacked_file, data_file_path)
    print(f"Data file downloaded and extracted to {data_file_path}")

try:
    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)
except Exception as e:
    print(f"An error occurred while downloading and extracting the data: {e}")

Data file already exists. Skipping download.


In [131]:
import pandas as pd

In [132]:
df = pd.read_csv(data_file_path, sep="\t", header=None, names=["label", "text"])
df[:5]

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [133]:
len(df)

5572

In [134]:
df["label"].value_counts()

label
ham     4825
spam     747
Name: count, dtype: int64

In [135]:
def create_balanced_dataset(df):
    """Downsample the "ham" rows so both classes have equally many examples.

    Args:
        df: DataFrame with a ``"label"`` column holding ``"ham"`` / ``"spam"``.

    Returns:
        A new DataFrame with the sampled ham rows followed by all spam rows.
        Original index values are kept.
    """
    spam_rows = df[df["label"] == "spam"]
    ham_rows = df[df["label"] == "ham"]

    # Draw exactly as many ham rows as there are spam rows (fixed seed).
    ham_sample = ham_rows.sample(n=len(spam_rows), random_state=123)

    return pd.concat([ham_sample, spam_rows])

balanced_df = create_balanced_dataset(df)
balanced_df['label'].value_counts()

label
ham     747
spam    747
Name: count, dtype: int64

In [136]:
map_dict = {'ham': 0, 'spam': 1}

# Rebuild the balanced frame from the raw data before encoding the labels so
# that this cell is idempotent: mapping labels that are already 0/1 through
# map_dict a second time would turn every label into NaN.
balanced_df = create_balanced_dataset(df)
balanced_df['label'] = balanced_df['label'].map(map_dict)


In [137]:
balanced_df['label'].value_counts()

label
0    747
1    747
Name: count, dtype: int64

In [138]:
def random_split(df, train_frac=0.8, validation_frac=0.2):
    """Shuffle ``df`` and cut it into train, validation and test parts.

    Args:
        df: DataFrame to split.
        train_frac: Fraction of rows that go into the training part.
        validation_frac: Fraction of rows that go into the validation part.

    Returns:
        ``(train_df, validation_df, test_df)``; the test part receives all
        rows left over after the first two parts.
    """
    # Shuffle all rows with a fixed seed and renumber them from 0.
    shuffled = df.sample(frac=1, random_state=123).reset_index(drop=True)
    total_rows = len(shuffled)

    # Positions at which the shuffled frame is cut.
    first_cut = int(train_frac * total_rows)
    second_cut = first_cut + int(validation_frac * total_rows)

    train_df = shuffled.iloc[:first_cut]
    validation_df = shuffled.iloc[first_cut:second_cut]
    test_df = shuffled.iloc[second_cut:]
    return train_df, validation_df, test_df

train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1)
print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(validation_df)}")
print(f"Test set size: {len(test_df)}")


Training set size: 1045
Validation set size: 149
Test set size: 300


# Create Data Loaders

In [139]:
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [140]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")
print(tokenizer.decode([50256]))

<|endoftext|>


In [141]:
tokenizer.n_vocab

50257

In [142]:
# We pad every sequence to a common length so that input lengths are consistent across batches
import torch
from torch.utils.data import Dataset, DataLoader

class SpamDataset(Dataset):
    """Tokenized text messages with labels, truncated/padded to one length.

    Args:
        csv_file: Path (``str`` or ``os.PathLike``) of a CSV file, or a pandas
            DataFrame. Either way the data must provide ``"text"`` and
            ``"label"`` columns.
        tokenizer: Object whose ``encode(text)`` returns a list of token ids.
        max_length: Length every encoded text is truncated and padded to.
            ``None`` uses the longest encoded text of this dataset.
        pad_token_id: Token id appended as padding. ``None`` falls back to the
            tokenizer's ``eot_token`` attribute when it has one.

    Raises:
        ValueError: If ``csv_file`` is neither a path nor a DataFrame, or if
            padding is needed but no pad token id could be determined.
    """

    def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=None):
        # isinstance (instead of an exact type comparison) also accepts
        # pathlib.Path objects and DataFrame subclasses.
        if isinstance(csv_file, (str, os.PathLike)):
            frame = pd.read_csv(csv_file)
        elif isinstance(csv_file, pd.DataFrame):
            frame = csv_file
        else:
            raise ValueError("csv_file must be a path string or a pandas DataFrame")
        # Renumber rows from 0 so positional access in __getitem__ lines up
        # with self.encoded_text.
        self.data = frame.reset_index(drop=True)

        self.encoded_text = [tokenizer.encode(txt) for txt in self.data["text"]]

        if max_length is None:
            self.max_length = self._longest_encoded_length()
        else:
            self.max_length = max_length
            # Truncate texts that are longer than max_length.
            self.encoded_text = [ids[: self.max_length] for ids in self.encoded_text]

        if pad_token_id is None:
            # Padding with None would only fail later, inside torch.tensor();
            # use the tokenizer's end-of-text id instead when it exposes one.
            pad_token_id = getattr(tokenizer, "eot_token", None)
        needs_padding = any(len(ids) < self.max_length for ids in self.encoded_text)
        if needs_padding and pad_token_id is None:
            raise ValueError(
                "pad_token_id is required because some texts are shorter than max_length"
            )

        # Pad every sequence on the right up to max_length.
        self.encoded_text = [
            ids + [pad_token_id] * (self.max_length - len(ids))
            for ids in self.encoded_text
        ]

    def __getitem__(self, index):
        """Return ``(token_ids, label)`` for row ``index`` as ``torch.long`` tensors."""
        encoded = self.encoded_text[index]
        label = self.data["label"].iloc[index]
        return (
            torch.tensor(encoded, dtype=torch.long),
            torch.tensor(label, dtype=torch.long),
        )

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.data)

    def _longest_encoded_length(self):
        """Length of the longest encoded text; 0 when the dataset is empty."""
        return max((len(ids) for ids in self.encoded_text), default=0)


In [143]:
train_dataset =  SpamDataset(
    csv_file=train_df,
    tokenizer=tokenizer,
    max_length=None,
    pad_token_id=tokenizer.n_vocab-1
)

In [144]:
train_dataset.max_length

120

In [145]:
val_dataset = SpamDataset(
    csv_file=validation_df,
    tokenizer=tokenizer,
    max_length=train_dataset.max_length,
    pad_token_id=tokenizer.n_vocab-1
)
test_dataset = SpamDataset(
    csv_file=test_df,
    tokenizer=tokenizer,
    max_length=train_dataset.max_length,
    pad_token_id=tokenizer.n_vocab-1
)

In [146]:
from torch.utils.data import DataLoader

num_workers = 0
batch_size = 8

torch.manual_seed(123)

train_loader = DataLoader(train_dataset, batch_size=batch_size, num_workers=num_workers, shuffle=True, drop_last=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False)

In [147]:
# Dry run

print("Train loader:")
for input_batch, label_batch in train_loader:
    pass
print("input batch shape:", input_batch.shape)
print("label batch shape:", label_batch.shape)

Train loader:
input batch shape: torch.Size([8, 120])
label batch shape: torch.Size([8])


In [148]:
print(f"{len(train_loader)} batches in train loader")
print(f"{len(val_loader)} batches in validation loader")
print(f"{len(test_loader)} batches in test loader")

130 batches in train loader
19 batches in validation loader
38 batches in test loader


In [149]:
temp = [1, 2, 3, 4]
temp + [99] * (6 - len(temp))
print(tokenizer.n_vocab-1)

50256


# Initializing the model weights

In [150]:
CHOOSE_MODEL = "gpt2-small (124M)"
BASE_CONFIG = {
    "vocab_size": tokenizer.n_vocab,
    "context_length": 1024,
    "drop_rate": 0.0,
    "qkv_bias": True,
}

model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

BASE_CONFIG.update(model_configs[CHOOSE_MODEL])

In [151]:
from gpt_download import download_and_load_gpt2
from gpt_helpers import load_weights_into_gpt,GPTModel

In [152]:
model_size = CHOOSE_MODEL.split(" ")[-1].lstrip("(").rstrip(")")
settings, params = download_and_load_gpt2(model_size=model_size, models_dir='gpt2', load=True)

In [153]:
model = GPTModel(BASE_CONFIG)
load_weights_into_gpt(model, params)

In [154]:
from gpt_helpers import (
    generate_text_simple,
    text_to_token_ids,
    token_ids_to_text
)

# Quick sanity check that the loaded model generates coherent text

text1 = "Every effort moves you"
token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids(text1, tokenizer),
    max_new_tokens=15,
    context_size=BASE_CONFIG["context_length"]
)
print(token_ids_to_text(token_ids, tokenizer))

Every effort moves you forward.

The first step is to understand the importance of your work


In [155]:
# Does instruction-style prompting work here?

text2 = (
    "Is the following text a spam? Answer with a yes or no:"
    " You are a winner you have been specially"
    " selected to recieve $1000 or a $2000 prize."
)

token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids(text2, tokenizer),
    max_new_tokens=22,
    context_size=BASE_CONFIG["context_length"]
)
print(token_ids_to_text(token_ids, tokenizer))

Is the following text a spam? Answer with a yes or no: You are a winner you have been specially selected to recieve $1000 or a $2000 prize.

You have been specially selected to receive $1000 or a $2000 prize. You have been specially selected


# Adding a classification head

In [156]:
model

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.0, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.0, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=7

In [157]:
for param in model.parameters():
    param.requires_grad = False

#freezing the model

In [158]:
torch.manual_seed(123)

num_classes = 2
model.out_head = torch.nn.Linear(BASE_CONFIG["emb_dim"], num_classes)

In [159]:
model

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.0, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.0, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=7

In [160]:
# Re-enable gradients for the last transformer block and the final norm layer;
# all other parameters keep whatever requires_grad setting they already have.
trainable_modules = (model.trf_blocks[-1], model.final_norm)
for trainable_module in trainable_modules:
    for param in trainable_module.parameters():
        param.requires_grad = True

In [161]:
inputs = tokenizer.encode("Do you have time?")
inputs = torch.tensor(inputs).unsqueeze(0)

print(inputs)
print(f"inputs.shape: {inputs.shape}")

tensor([[5211,  345,  423,  640,   30]])
inputs.shape: torch.Size([1, 5])


In [162]:
with torch.no_grad():
    outputs = model(inputs)

print(outputs)

tensor([[[-1.5854,  0.9904],
         [-3.7235,  7.4548],
         [-2.2661,  6.6049],
         [-3.5983,  3.9902],
         [-5.2433,  6.3857]]])


In [163]:
print(outputs[:,-1,:])

tensor([[-5.2433,  6.3857]])


In this classification task we look at the output of the last position. Because of the causal mask, each position only sees the tokens that come before it, so the last token is the only one that carries information about all the previous tokens.



# calculating the loss function

In [164]:
logits = outputs[:,-1,:]
probas = torch.softmax(logits, dim=-1)
print(probas.sum())
print(probas)

tensor(1.0000)
tensor([[8.9044e-06, 9.9999e-01]])


In [165]:
label = torch.argmax(probas).item()
label

1

In [166]:
# Softmax preserves the ordering of the logits, i.e. the position of the largest value does not change,
# so we can skip the softmax and take the argmax of the logits directly

label = torch.argmax(logits).item()
label

1

In [167]:
# The last token has the most information.
# Because of the causal attention mask we used, the initial tokens have less information; by the time we reach the last token, it has information from all the tokens prior to it.

def calc_accuracy_loader(dataloader, model, device, num_batches=None):
    """Return the fraction of examples whose predicted class equals the label.

    The predicted class of an example is the argmax over the logits the model
    outputs at the last sequence position.

    Args:
        dataloader: Sized iterable yielding ``(input_batch, label_batch)``.
        model: Called as ``model(input_batch)``; its output is indexed with
            ``[:, -1, :]``. The model is switched to eval mode and left there.
        device: Device both batch tensors are moved to.
        num_batches: Evaluate only the first ``num_batches`` batches; ``None``
            (default) evaluates the whole loader.

    Returns:
        The accuracy as a float, or ``nan`` when no example was evaluated
        (empty loader or ``num_batches`` of 0), mirroring how
        ``calc_loss_loader`` reports an empty loader.
    """
    model.eval()
    correct_preds, num_examples = 0, 0

    if num_batches is None:
        num_batches = len(dataloader)
    else:
        num_batches = min(num_batches, len(dataloader))

    with torch.no_grad():
        # zip with range() stops after num_batches batches without pulling an
        # extra, unused batch from the loader.
        for _, (input_batch, label_batch) in zip(range(num_batches), dataloader):
            input_batch, label_batch = input_batch.to(device), label_batch.to(device)
            logits = model(input_batch)[:, -1, :]
            predicted_labels = torch.argmax(logits, dim=-1)

            correct_preds += torch.sum(predicted_labels == label_batch).item()
            num_examples += label_batch.shape[0]

    if num_examples == 0:
        # Nothing evaluated: avoid ZeroDivisionError.
        return float("nan")
    return correct_preds / num_examples


In [168]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

torch.manual_seed(123)

train_accuracy = calc_accuracy_loader(train_loader, model, device, num_batches=10)
val_accuracy = calc_accuracy_loader(val_loader, model, device, num_batches=10)
test_accuracy = calc_accuracy_loader(test_loader, model, device, num_batches=10)

In [173]:
train_accuracy, val_accuracy, test_accuracy

(0.4625, 0.45, 0.4875)

In [177]:
def calc_loss_batch(input_batch, target_batch, model, device):
    """Cross-entropy loss of a single batch, scored at the last sequence position.

    Args:
        input_batch: Tensor passed to ``model`` after being moved to ``device``.
        target_batch: Tensor of target classes, moved to ``device`` as well.
        model: Callable whose output is indexed with ``[:, -1, :]``.
        device: Device the two batch tensors are moved to.

    Returns:
        The loss tensor returned by ``torch.nn.functional.cross_entropy``.
    """
    inputs_on_device = input_batch.to(device)
    targets_on_device = target_batch.to(device)

    # Only the logits of the final position take part in the loss.
    last_position_logits = model(inputs_on_device)[:, -1, :]
    return torch.nn.functional.cross_entropy(last_position_logits, targets_on_device)


In [178]:
def calc_loss_loader(data_loader, model, device, num_batches=None):
    """Average the per-batch loss over the first batches of ``data_loader``.

    Args:
        data_loader: Sized iterable yielding ``(input_batch, target_batch)``.
        model: Model handed to ``calc_loss_batch``.
        device: Device handed to ``calc_loss_batch``.
        num_batches: Number of batches to average over; ``None`` (default)
            uses every batch. Values larger than the loader are clamped to
            ``len(data_loader)``.

    Returns:
        The mean batch loss as a float, or ``nan`` when there is nothing to
        average (empty loader or ``num_batches`` <= 0).
    """
    if len(data_loader) == 0:
        return float("nan")

    if num_batches is None:
        num_batches = len(data_loader)
    else:
        # Reduce the number of batches to the total number of batches in the
        # data loader if num_batches exceeds it.
        num_batches = min(num_batches, len(data_loader))

    if num_batches <= 0:
        # Nothing to average: avoid dividing by zero below.
        return float("nan")

    total_loss = 0.0
    # zip with range() stops after num_batches batches without pulling an
    # extra, unused batch from the loader.
    for _, (input_batch, target_batch) in zip(range(num_batches), data_loader):
        loss = calc_loss_batch(input_batch, target_batch, model, device)
        total_loss += loss.item()
    return total_loss / num_batches

In [179]:
train_loss = calc_loss_loader(train_loader, model, device, num_batches=10)
train_loss

3.031183195114136

In [180]:
val_loss = calc_loss_loader(val_loader, model, device, num_batches=10)
val_loss

2.8360007405281067

In [181]:
test_loss = calc_loss_loader(test_loader, model, device, num_batches=10)
test_loss

2.6336529731750487