# Finetuning for classification

## Preparing the dataset

In [5]:
from importlib.metadata import PackageNotFoundError, version

# Packages whose installed versions are reported below.
pkgs = [
    "matplotlib",
    "numpy",
    "tiktoken",
    "torch",
    "tensorflow",  # for OpenAI pretrained weights
    "pandas",
    "polars"
]

for p in pkgs:
    try:
        print(f"{p}: {version(p)}")
    except PackageNotFoundError:
        # Report the missing package and keep going so the remaining
        # versions are still listed.
        print(f"{p}: not installed")

matplotlib: 3.10.5
numpy: 2.3.2
tiktoken: 0.9.0
torch: 2.2.2
tensorflow: 2.16.2
pandas: 2.3.2
polars: 1.32.3


In [3]:
import urllib.request
import zipfile
import os
from pathlib import Path

# Remote archive location and the local paths used while fetching it.
url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
zip_path = "sms_spam_collection.zip"      # where the downloaded archive is saved
extracted_path = "sms_spam_collection"    # directory the archive is unpacked into
data_file_path = Path(extracted_path, "SMSSpamCollection.tsv")  # data file under its .tsv name

def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
    """Download a zip archive, extract it, and give the data file a .tsv name.

    Does nothing when ``data_file_path`` already exists.

    Args:
        url: Address of the zip archive to download.
        zip_path: Local path the downloaded archive is written to.
        extracted_path: Directory the archive is extracted into.
        data_file_path: Final location of the data file; a ``str`` or a
            ``pathlib.Path``.

    Returns:
        None.
    """
    import shutil

    # Accept plain strings as well as Path objects.
    data_file_path = Path(data_file_path)
    if data_file_path.exists():
        print("Data file already exists. Skipping download.")
        return

    # Stream the response to disk in chunks rather than holding the whole
    # archive in memory.
    with urllib.request.urlopen(url) as response, open(zip_path, "wb") as out_file:
        shutil.copyfileobj(response, out_file)

    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extracted_path)

    # The extracted file is named "SMSSpamCollection" without an extension;
    # rename it to the requested .tsv path.
    original_file_path = Path(extracted_path) / "SMSSpamCollection"
    os.rename(original_file_path, data_file_path)
    print(f"Data file downloaded and extracted to {data_file_path}")

# Fetch the dataset; a failure is reported as a printed message rather than
# a traceback.
try:
    download_and_unzip_spam_data(
        url=url,
        zip_path=zip_path,
        extracted_path=extracted_path,
        data_file_path=data_file_path,
    )
except Exception as err:
    print(f"An error occurred while downloading and extracting the data: {err}")

Data file downloaded and extracted to sms_spam_collection/SMSSpamCollection.tsv


In [6]:
import pandas as pd

In [7]:
# Tab-separated file read without a header row, so the column names are
# supplied explicitly.
df = pd.read_csv(
    data_file_path,
    sep="\t",
    header=None,
    names=["label", "text"],
)
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# Number of rows in the loaded frame
df.shape[0]

5572

In [9]:
# How many rows carry each label
df.loc[:, "label"].value_counts()

label
ham     4825
spam     747
Name: count, dtype: int64

In [11]:
def create_balanced_dataset(df):
    """Undersample so that 'ham' and 'spam' rows appear equally often.

    Args:
        df: DataFrame with a 'label' column holding 'ham' and 'spam'.

    Returns:
        A new DataFrame with the sampled ham rows followed by the spam rows,
        each class reduced to the size of the smaller one. Rows keep their
        original index labels.
    """
    spam_rows = df[df['label'] == 'spam']
    ham_rows = df[df['label'] == 'ham']

    # Size of the smaller class; asking `sample` for more rows than a class
    # contains would raise ValueError.
    n_per_class = min(len(spam_rows), len(ham_rows))

    ham_subset = ham_rows.sample(n=n_per_class, random_state=123)

    # Only downsample spam when it is the larger class, so in the usual case
    # every spam row is kept in its original order.
    if len(spam_rows) > n_per_class:
        spam_rows = spam_rows.sample(n=n_per_class, random_state=123)

    return pd.concat([ham_subset, spam_rows])

balanced_df = create_balanced_dataset(df)
# Per-label row counts of the balanced frame
balanced_df.loc[:, 'label'].value_counts()

label
ham     747
spam    747
Name: count, dtype: int64

In [12]:
map_dict = {'ham': 0, 'spam': 1}

# Encode only while the column still holds the raw string labels. Without this
# guard, re-running the cell would map the already-encoded 0/1 values to NaN.
if balanced_df['label'].isin(list(map_dict)).any():
    balanced_df['label'] = balanced_df['label'].map(map_dict)


In [14]:
# Per-label row counts after encoding
balanced_df.loc[:, 'label'].value_counts()

label
0    747
1    747
Name: count, dtype: int64

In [18]:
def random_split(df, train_frac=0.8, validation_frac=0.2, random_state=123):
    """Shuffle a DataFrame and cut it into train, validation and test parts.

    Args:
        df: DataFrame to split.
        train_frac: Fraction of rows assigned to the training part.
        validation_frac: Fraction of rows assigned to the validation part.
        random_state: Seed used for the shuffle. Defaults to 123.

    Returns:
        Tuple ``(train_df, validation_df, test_df)`` taken in that order from
        the shuffled, re-indexed frame. The test part receives whatever rows
        remain after the first two, so with the default fractions
        (0.8 + 0.2) little or nothing is left for it.
    """
    # Shuffle all rows and renumber them 0..n-1 so the cuts below are positional.
    shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    n_rows = len(shuffled)

    # int() truncates, so any rounding remainder ends up in the test part.
    train_end = int(train_frac * n_rows)
    validation_end = train_end + int(validation_frac * n_rows)

    train_df = shuffled.iloc[:train_end]
    validation_df = shuffled.iloc[train_end:validation_end]
    test_df = shuffled.iloc[validation_end:]

    return train_df, validation_df, test_df

# 70% of the rows for training, 10% for validation; the remainder is the test set.
train_df, validation_df, test_df = random_split(
    balanced_df, train_frac=0.7, validation_frac=0.1
)
print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(validation_df)}")
print(f"Test set size: {len(test_df)}")


Training set size: 1045
Validation set size: 149
Test set size: 300


# Create Data Loaders