# Fine Tuning - Padding


Instruction Fine-tuning
- broader range of tasks when compared with classification
- can be used for models that need more flexibility
- harder to train, demands larger resources (computational and data)

Classification Fine-tuning 
- constrained to predicting the classes it encountered during training (e.g. spam vs. not spam)
- cannot say anything else about the text 
- generally easier to develop (given its restricted nature)
- can achieve great results on specific tasks


Test fine-tuning using spam messages dataset

In [1]:
# Download Spam Dataset 
import urllib.request
import zipfile
import os
from pathlib import Path


# Remote archive plus the local paths the download step works with.
url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
zip_path = "sms_spam_collection.zip"
extracted_path = "data/sms_spam_collection"
# Final location of the data file once it carries a .tsv extension.
data_file_path = Path(extracted_path).joinpath("SMSSpamCollection.tsv")


def download_and_unzip_spam_data(
        url, zip_path, extracted_path, data_file_path):
    """Download the zipped dataset, extract it, and rename the data file.

    Nothing is downloaded when ``data_file_path`` already exists.

    Args:
        url: Address of the zip archive to fetch.
        zip_path: Local path the downloaded archive is written to.
        extracted_path: Directory the archive is extracted into.
        data_file_path: Final path of the data file. May be a ``str`` or a
            ``pathlib.Path``.

    Returns:
        None. Progress is reported with ``print``.
    """
    # Accept plain strings as well as Path objects, like the other path
    # arguments; a bare str has no .exists() method.
    data_file_path = Path(data_file_path)

    if data_file_path.exists():
        print(f"{data_file_path} already exists. Skipping download "
              "and extraction."
        )
        return

    # Fetch the archive and store it on disk.
    with urllib.request.urlopen(url) as response:
        with open(zip_path, "wb") as out_file:
            out_file.write(response.read())

    # Unpack everything into the target directory.
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extracted_path)

    # The extracted file is expected under the name "SMSSpamCollection"
    # (no extension); move it to the requested path.
    original_file_path = Path(extracted_path) / "SMSSpamCollection"
    os.rename(original_file_path, data_file_path)
    print(f"File downloaded and saved as {data_file_path}")

# Fetch the dataset (skipped automatically when the .tsv file is already present).
download_and_unzip_spam_data(
    url=url,
    zip_path=zip_path,
    extracted_path=extracted_path,
    data_file_path=data_file_path,
)

data/sms_spam_collection/SMSSpamCollection.tsv already exists. Skipping download and extraction.


In [2]:
import pandas as pd

# Tab-separated file without a header row, so column names are supplied here.
# NOTE(review): default quote handling is in effect; messages containing
# unbalanced double quotes could be merged into one row — confirm the row
# count against the raw file.
df = pd.read_csv(
    data_file_path,
    sep="\t",
    header=None,
    names=["Label", "Text"],
)

df

Unnamed: 0,Label,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


## Preparing Dataset for Classification tasks

In [3]:
# Class distribution of the raw data, before any balancing.
_label_counts = df["Label"].value_counts()
print(_label_counts)

# undersample for simplicity
def create_balanced_dataset(df, random_state=123):
    """Undersample the larger class so "ham" and "spam" have equal row counts.

    Args:
        df: DataFrame with a "Label" column holding the strings "ham" and
            "spam".
        random_state: Seed passed to ``DataFrame.sample`` so the selection
            is reproducible. Defaults to 123.

    Returns:
        A new DataFrame with the sampled "ham" rows first, followed by the
        "spam" rows. Original index values are kept.
    """
    # Filter each class once and reuse the result.
    ham_rows = df[df["Label"] == "ham"]
    spam_rows = df[df["Label"] == "spam"]

    # Size of the smaller class; sampling more rows than a class has would raise.
    target = min(len(ham_rows), len(spam_rows))

    ham_subset = ham_rows.sample(target, random_state=random_state)
    # Spam is left untouched (original order) unless it is the larger class.
    if len(spam_rows) > target:
        spam_rows = spam_rows.sample(target, random_state=random_state)

    return pd.concat([ham_subset, spam_rows])

# Build the balanced frame and show its class distribution.
balanced_df = create_balanced_dataset(df=df)
_balanced_counts = balanced_df["Label"].value_counts()
print("\nBalanced dataset\n", _balanced_counts)

Label
ham     4825
spam     747
Name: count, dtype: int64

Balanced dataset
 Label
ham     747
spam    747
Name: count, dtype: int64


In [4]:
# Convert the string class names to integer class ids.
_label_to_id = {"ham": 0, "spam": 1}

# Series.map turns every value that is missing from the mapping into NaN, so
# running this cell a second time (labels already 0/1) would wipe the column.
# Skip the mapping when the column already holds only the integer ids.
_already_encoded = balanced_df["Label"].isin(list(_label_to_id.values())).all()
if not _already_encoded:
    balanced_df["Label"] = balanced_df["Label"].map(_label_to_id)


# split dataset for training
def random_split(df, train_frac, validation_frac):
    """Shuffle ``df`` and cut it into train, validation and test partitions.

    The rows are shuffled with a fixed seed (123) and re-indexed from zero.
    The first ``train_frac`` share of the rows becomes the training set, the
    next ``validation_frac`` share the validation set, and whatever remains
    the test set.

    Returns:
        Tuple ``(train_df, validation_df, test_df)``.
    """
    shuffled = df.sample(frac=1, random_state=123).reset_index(drop=True)
    total_rows = len(shuffled)

    # Partition sizes are rounded down; leftover rows fall into the test set.
    n_train = int(total_rows * train_frac)
    n_validation = int(total_rows * validation_frac)
    split_at = n_train + n_validation

    return (
        shuffled[:n_train],
        shuffled[n_train:split_at],
        shuffled[split_at:],
    )


train_df, validation_df, test_df = random_split(
    balanced_df, train_frac=0.7, validation_frac=0.1
)

# Persist each split so later steps can reload it from disk.
for _split_df, _csv_path in (
    (train_df, "data/train.csv"),
    (validation_df, "data/validation.csv"),
    (test_df, "data/test.csv"),
):
    _split_df.to_csv(_csv_path, index=None)


### Creating data loaders

Data loaders organize the messages into batches for fine-tuning (which is basically retraining).

Options:
- truncate messages to the shortest message in the batch (cheaper)
- pad all messages to the length of the longest one, using a dedicated padding token (preserves the content)

In [5]:
# pad batching using tiktoken
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")

# Look up the id(s) the tokenizer assigns to the end-of-text marker; the
# special token has to be explicitly allowed in the encode call.
_eot_token_ids = tokenizer.encode(
    "<|endoftext|>",
    allowed_special={"<|endoftext|>"},
)
print(_eot_token_ids)

[50256]


In [6]:
import torch
from torch.utils.data import Dataset
# Padding
class SpamDataset(Dataset):
    """Tokenized, fixed-length view of a CSV with "Text" and "Label" columns.

    Every message is encoded with ``tokenizer.encode``, cut down to
    ``max_length`` tokens, and right-padded with ``pad_token_id`` so that all
    samples share the same length.

    Args:
        csv_file: Anything ``pandas.read_csv`` can read.
        tokenizer: Object exposing ``encode(text)`` that returns a list of
            token ids.
        max_length: Target sequence length. ``None`` uses the longest
            encoded message found in this file.
        pad_token_id: Token id appended as padding.
    """

    PAD_TOKEN_ID = 50256  # <|endoftext|> token from gpt2

    def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=PAD_TOKEN_ID):
        self.data = pd.read_csv(csv_file)
        self.encoded_texts = [tokenizer.encode(text) for text in self.data["Text"]]

        if max_length is None:
            self.max_length = self._longest_encoded_length()
        else:
            self.max_length = max_length

        # Truncate and right-pad every sample in one pass. For samples longer
        # than max_length the pad count is negative, and multiplying a list by
        # a negative number yields an empty list, so nothing is appended.
        self.encoded_texts = [
            encoded_text[:self.max_length]
            + [pad_token_id] * (self.max_length - len(encoded_text))
            for encoded_text in self.encoded_texts
        ]

    def __getitem__(self, index):
        """Return ``(token_ids, label)`` as two ``torch.long`` tensors."""
        encoded = self.encoded_texts[index]
        # Select the column first: indexing the row first would build a full
        # row Series on every access just to read one value.
        label = self.data["Label"].iloc[index]
        return (
            torch.tensor(encoded, dtype=torch.long),
            torch.tensor(label, dtype=torch.long)
        )

    def __len__(self):
        """Number of samples (rows read from the CSV)."""
        return len(self.data)

    def _longest_encoded_length(self):
        """Length of the longest encoded message, or 0 when there are none."""
        return max(
            (len(encoded_text) for encoded_text in self.encoded_texts),
            default=0,
        )


In [7]:
# The training split is built with no length limit, so its longest encoded
# message determines the sequence length.
train_dataset = SpamDataset("data/train.csv", tokenizer, max_length=None)

# Validation and test reuse the training length so all splits produce
# sequences of the same size.
val_dataset = SpamDataset(
    "data/validation.csv", tokenizer, max_length=train_dataset.max_length
)
test_dataset = SpamDataset(
    "data/test.csv", tokenizer, max_length=train_dataset.max_length
)

print(train_dataset.max_length)

120


### Data Loaders


In [8]:
from torch.utils.data import DataLoader

num_workers = 0
batch_size = 8
torch.manual_seed(123)

# Only the training loader shuffles and drops a trailing partial batch.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
    num_workers=num_workers,
)

# Evaluation loaders keep every sample, including a smaller final batch.
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    drop_last=False,
    num_workers=num_workers,
)

test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    drop_last=False,
    num_workers=num_workers,
)

# Walk the whole training loader once; afterwards the loop variables hold
# the final batch, whose shapes are printed below.
for input_batch, target_batch in train_loader:
    continue

print("input dim:", input_batch.shape)
print("target dim:", target_batch.shape)

input dim: torch.Size([8, 120])
target dim: torch.Size([8])


In [9]:
# Number of batches produced by each loader.
t = len(train_loader)
v = len(val_loader)
ts = len(test_loader)

print(f"t,v, ts {t},{v},{ts}")

t,v, ts 130,19,38
