# Finetuning for classification

## Preparing the dataset

In [35]:
from importlib.metadata import version

from importlib.metadata import PackageNotFoundError

# Distributions whose installed versions are reported for reproducibility.
pkgs = [
    "matplotlib",
    "numpy",
    "tiktoken",
    "torch",
    "tensorflow",  # for the OpenAI pretrained weights
    "pandas",
    "polars",
]

for p in pkgs:
    # A missing package is reported instead of aborting the whole cell.
    try:
        print(f"{p}: {version(p)}")
    except PackageNotFoundError:
        print(f"{p}: not installed")

matplotlib: 3.10.5
numpy: 2.3.2
tiktoken: 0.9.0
torch: 2.2.2
tensorflow: 2.16.2
pandas: 2.3.2
polars: 1.32.3


In [36]:
import urllib.request
import zipfile
import os
from pathlib import Path

# Remote archive and the local locations used while downloading and unpacking it.
url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
zip_path = "sms_spam_collection.zip"
extracted_path = "sms_spam_collection"
# Final location of the data file once it has been given a .tsv suffix.
data_file_path = Path(extracted_path).joinpath("SMSSpamCollection.tsv")

def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
    """Fetch the SMS spam archive, unpack it and give the data file a .tsv suffix.

    When ``data_file_path`` already exists, only a notice is printed and
    nothing is downloaded.

    Args:
        url: Location of the zip archive to download.
        zip_path: Local path the downloaded archive is written to.
        extracted_path: Directory the archive is extracted into.
        data_file_path: ``pathlib.Path`` of the final ``.tsv`` file.
    """
    if data_file_path.exists():
        print("Data file already exists. Skipping download.")
        return

    # Read the complete archive from the URL and store it on disk.
    with urllib.request.urlopen(url) as response, open(zip_path, "wb") as archive_file:
        archive_file.write(response.read())

    # Unpack every member into the target directory.
    with zipfile.ZipFile(zip_path, "r") as archive:
        archive.extractall(extracted_path)

    # The extracted file has no extension; rename it to the .tsv target.
    extracted_file = Path(extracted_path) / "SMSSpamCollection"
    os.rename(extracted_file, data_file_path)
    print(f"Data file downloaded and extracted to {data_file_path}")

# Attempt the download; a failure is reported on stdout rather than raised.
try:
    download_and_unzip_spam_data(
        url=url,
        zip_path=zip_path,
        extracted_path=extracted_path,
        data_file_path=data_file_path,
    )
except Exception as err:
    print(f"An error occurred while downloading and extracting the data: {err}")

Data file already exists. Skipping download.


In [37]:
import pandas as pd

In [38]:
# The file is tab-separated and has no header row, so column names are supplied here.
df = pd.read_csv(
    data_file_path,
    sep="\t",
    header=None,
    names=["label", "text"],
)
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [39]:
# Number of rows in the raw frame.
df.shape[0]

5572

In [40]:
# Class distribution of the raw data.
df.loc[:, "label"].value_counts()

label
ham     4825
spam     747
Name: count, dtype: int64

In [41]:
def create_balanced_dataset(df):
    """Return a frame with as many 'ham' rows as there are 'spam' rows.

    All 'spam' rows are kept. The 'ham' rows are downsampled (fixed seed 123)
    to the same count and placed before the 'spam' rows. Original index
    labels are preserved.
    """
    spam_rows = df[df["label"] == "spam"]
    ham_rows = df[df["label"] == "ham"]

    # Draw exactly as many ham messages as there are spam messages.
    sampled_ham = ham_rows.sample(n=len(spam_rows), random_state=123)

    return pd.concat([sampled_ham, spam_rows])

# Downsample the majority class, then confirm both classes have equal counts.
balanced_df = create_balanced_dataset(df)
balanced_df.loc[:, "label"].value_counts()

label
ham     747
spam    747
Name: count, dtype: int64

In [42]:
# Integer class ids for the two string labels.
map_dict = {'ham': 0, 'spam': 1}

# Keep this cell safe to re-run: a plain .map(map_dict) turns the already-encoded
# 0/1 labels into NaN, so values without an entry in map_dict are left unchanged.
balanced_df['label'] = balanced_df['label'].map(lambda value: map_dict.get(value, value))


In [43]:
# Class distribution after the labels were encoded as integers.
balanced_df.loc[:, "label"].value_counts()

label
0    747
1    747
Name: count, dtype: int64

In [44]:
def random_split(df, train_frac=0.8, validation_frac=0.2):
    """Shuffle ``df`` (fixed seed 123) and cut it into three consecutive partitions.

    Args:
        df: DataFrame to split; it is not modified.
        train_frac: Fraction of rows assigned to the training partition.
        validation_frac: Fraction of rows assigned to the validation partition.

    Returns:
        Tuple ``(train_df, validation_df, test_df)``. The test partition holds
        whatever rows remain after the first two. The partitions are slices of
        one shuffled frame with a fresh 0-based index, so only the training
        partition's index starts at 0.

    Raises:
        ValueError: If a fraction is negative or the two fractions sum to more than 1.
    """
    if train_frac < 0 or validation_frac < 0:
        raise ValueError("train_frac and validation_frac must be non-negative")
    # Small tolerance so sums that are 1 up to floating-point rounding are accepted.
    if train_frac + validation_frac > 1 + 1e-9:
        raise ValueError("train_frac + validation_frac must not exceed 1")

    # Shuffle the entire dataset.
    shuffled = df.sample(frac=1, random_state=123).reset_index(drop=True)

    total = len(shuffled)
    train_end = int(train_frac * total)
    validation_end = train_end + int(validation_frac * total)

    train_df = shuffled.iloc[:train_end]
    validation_df = shuffled.iloc[train_end:validation_end]
    test_df = shuffled.iloc[validation_end:]

    return train_df, validation_df, test_df

# 70% train, 10% validation; the rows left over form the test set.
train_df, validation_df, test_df = random_split(
    balanced_df,
    train_frac=0.7,
    validation_frac=0.1,
)
print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(validation_df)}")
print(f"Test set size: {len(test_df)}")


Training set size: 1045
Validation set size: 149
Test set size: 300


# Create Data Loaders

In [45]:
# First five rows of the raw (unbalanced, string-labelled) frame.
df.iloc[:5]

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [46]:
import tiktoken

# Load the "gpt2" encoding and print the text that token id 50256 decodes to.
tokenizer = tiktoken.get_encoding("gpt2")
print(tokenizer.decode([50256]))

<|endoftext|>


In [47]:
# n_vocab - 1 is what the dataset cells below pass as pad_token_id.
tokenizer.n_vocab

50257

In [48]:
# We pad (or truncate) every encoded text to the same length so that input lengths are consistent across batches
import torch
from torch.utils.data import Dataset, DataLoader

class SpamDataset(Dataset):
    """Tokenized, fixed-length view of a labelled text table.

    Every text is encoded with ``tokenizer``, truncated when an explicit
    ``max_length`` is given, and right-padded with ``pad_token_id`` so that all
    sequences contain exactly ``max_length`` token ids.

    Args:
        csv_file: Path to a CSV file (``str`` or path-like object) or a pandas
            DataFrame. Either must provide ``"text"`` and ``"label"`` columns.
        tokenizer: Object whose ``encode(text)`` returns a list of token ids.
        max_length: Target sequence length. ``None`` uses the length of the
            longest encoded text.
        pad_token_id: Token id appended to sequences shorter than
            ``max_length``. May be ``None`` only if no sequence needs padding.

    Raises:
        ValueError: If ``csv_file`` has an unsupported type, or if padding is
            required but ``pad_token_id`` is ``None``.
    """

    def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=None):
        # isinstance (rather than an exact type comparison) also accepts
        # path-like objects and DataFrame subclasses.
        if isinstance(csv_file, str) or hasattr(csv_file, "__fspath__"):
            self.data = pd.read_csv(csv_file)
        elif isinstance(csv_file, pd.DataFrame):
            self.data = csv_file
        else:
            raise ValueError("csv_file must be a path string or a pandas DataFrame")

        self.encoded_text = [
            tokenizer.encode(txt) for txt in self.data["text"]
        ]

        if max_length is None:
            self.max_length = self._longest_encoded_length()
        else:
            self.max_length = max_length
            # Truncate texts that are longer than max_length.
            self.encoded_text = [
                token_ids[:self.max_length] for token_ids in self.encoded_text
            ]

        # Fail early instead of silently placing None inside the token lists,
        # which would only surface later when a tensor is built from them.
        needs_padding = any(
            len(token_ids) < self.max_length for token_ids in self.encoded_text
        )
        if pad_token_id is None and needs_padding:
            raise ValueError("pad_token_id must be given when sequences need padding")

        # Right-pad every sequence to max_length.
        self.encoded_text = [
            token_ids + [pad_token_id] * (self.max_length - len(token_ids))
            for token_ids in self.encoded_text
        ]

    def __getitem__(self, index):
        """Return ``(token_ids, label)`` as two ``torch.long`` tensors."""
        encoded = self.encoded_text[index]
        # Positional lookup: the frame may carry an index that does not start
        # at 0 (e.g. a slice of a larger frame), in which case the label-based
        # ``self.data["label"][index]`` raises KeyError.
        label = self.data["label"].iloc[index]
        return (
            torch.tensor(encoded, dtype=torch.long),
            torch.tensor(label, dtype=torch.long),
        )

    def __len__(self):
        """Return the number of rows in the underlying table."""
        return len(self.data)

    def _longest_encoded_length(self):
        """Return the length of the longest encoded text (0 if there are none)."""
        return max(
            (len(token_ids) for token_ids in self.encoded_text),
            default=0,
        )


In [49]:
train_dataset = SpamDataset(
    train_df,
    tokenizer,
    max_length=None,  # let the dataset use its longest encoded text
    pad_token_id=tokenizer.n_vocab - 1,
)

In [50]:
# Because max_length=None was passed, this is the length of the longest encoded training text.
train_dataset.max_length

120

In [51]:
# Validation and test data reuse the training set's sequence length and pad id,
# so all three datasets yield inputs of the same width.
val_dataset, test_dataset = (
    SpamDataset(
        csv_file=split_df,
        tokenizer=tokenizer,
        max_length=train_dataset.max_length,
        pad_token_id=tokenizer.n_vocab - 1,
    )
    for split_df in (validation_df, test_df)
)

In [52]:
from torch.utils.data import DataLoader

num_workers = 0
batch_size = 8

# Fix torch's global seed before building the loaders (the training loader shuffles).
torch.manual_seed(123)

# Only the training loader shuffles and drops the final incomplete batch.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    drop_last=True,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=num_workers,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=num_workers,
)

In [53]:
# Dry run: iterate over the whole training loader once to confirm it works.

print("Train loader:")
for input_batch, label_batch in train_loader:
    pass
# After the loop both names still refer to the last batch that was yielded.
print("input batch shape:", input_batch.shape)
print("label batch shape:", label_batch.shape)

Train loader:
input batch shape: torch.Size([8, 120])
label batch shape: torch.Size([8])


In [54]:
# Report how many batches each loader yields per pass.
print(
    f"{len(train_loader)} batches in train loader",
    f"{len(val_loader)} batches in validation loader",
    f"{len(test_loader)} batches in test loader",
    sep="\n",
)

130 batches in train loader
19 batches in validation loader
38 batches in test loader


In [33]:
# Scratch check of the padding arithmetic used in SpamDataset: a 4-item list
# is extended with filler values up to length 6.
temp = [1, 2, 3, 4]
temp + [99] * (6 - len(temp))  # result is discarded: neither printed nor the cell's last expression
# The pad id passed to the datasets above.
print(tokenizer.n_vocab-1)

50256
