In [1]:
import matplotlib
import numpy as np
import tiktoken
import pandas as pd

import mlx.core as mx
import mlx.nn as nn
import mlx.optimizers as optim
import mlx.data as dx

# Import data

In [2]:
import urllib.request
import zipfile
import os
from pathlib import Path

url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
zip_path = "__sms_spam_collection.zip"
extracted_path = "__sms_spam_collection"
data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"

def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
    """Download the SMS spam archive, unpack it, and add a ``.tsv`` suffix.

    Nothing is downloaded when ``data_file_path`` is already present.

    Args:
        url: Location of the zip archive to fetch.
        zip_path: Local path the downloaded archive is written to.
        extracted_path: Directory the archive is extracted into.
        data_file_path: ``pathlib.Path`` of the final data file; the extracted
            ``SMSSpamCollection`` file is renamed to this path.
    """
    if data_file_path.exists():
        print(f"{data_file_path} already exists. Skipping download and extraction.")
        return

    # Fetch the complete archive and store it on disk.
    with urllib.request.urlopen(url) as response, open(zip_path, "wb") as archive_file:
        archive_file.write(response.read())

    # Unpack every archive member into the target directory.
    with zipfile.ZipFile(zip_path, "r") as archive:
        archive.extractall(extracted_path)

    # The extracted data file has no extension; move it to the .tsv path.
    extracted_file = Path(extracted_path) / "SMSSpamCollection"
    os.rename(extracted_file, data_file_path)
    print(f"File downloaded and saved as {data_file_path}")

# Try the primary URL first; if it raises HTTPError, URLError or TimeoutError,
# retry once against the backup URL. Errors from the backup attempt propagate.
try:
    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)
except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:
    print(f"Primary URL failed: {e}. Trying backup URL...")
    # Note: this rebinds the module-level `url` to the backup location.
    url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path) 

__sms_spam_collection/SMSSpamCollection.tsv already exists. Skipping download and extraction.


In [3]:
df = pd.read_csv(data_file_path, sep="\t", header=None, names=["Label", "Text"])
df

Unnamed: 0,Label,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
df["Label"].value_counts()

Label
ham     4825
spam     747
Name: count, dtype: int64

# Preparing data

In [5]:
def create_balanced_dataset(df):
    """Balance the classes by undersampling "ham" down to the "spam" count.

    Args:
        df: DataFrame with a "Label" column holding "ham" / "spam" strings.

    Returns:
        DataFrame made of the sampled "ham" rows followed by all "spam" rows;
        the original index labels are kept.
    """
    spam_rows = df[df["Label"] == "spam"]
    ham_rows = df[df["Label"] == "ham"]

    # Draw one "ham" row per "spam" row; the fixed seed keeps the draw repeatable.
    ham_sample = ham_rows.sample(len(spam_rows), random_state=123)

    return pd.concat([ham_sample, spam_rows])


balanced_df = create_balanced_dataset(df)
print(balanced_df["Label"].value_counts())

Label
ham     747
spam    747
Name: count, dtype: int64


In [6]:
# Encode the string labels as integers. The fall-through lookup leaves values
# that are already encoded untouched, so re-running this cell is harmless
# (a plain dict-based .map would turn the existing 0/1 values into NaN).
label_to_id = {"ham": 0, "spam": 1}
balanced_df["Label"] = balanced_df["Label"].map(lambda value: label_to_id.get(value, value))
balanced_df

Unnamed: 0,Label,Text
4307,0,Awww dat is sweet! We can think of something t...
4138,0,Just got to &lt;#&gt;
4831,0,"The word ""Checkmate"" in chess comes from the P..."
4461,0,This is wishing you a great day. Moji told me ...
5440,0,Thank you. do you generally date the brothas?
...,...,...
5537,1,Want explicit SEX in 30 secs? Ring 02073162414...
5540,1,ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE ...
5547,1,Had your contract mobile 11 Mnths? Latest Moto...
5566,1,REMINDER FROM O2: To get 2.50 pounds free call...


In [7]:
def random_split(df, train_frac, validation_frac):
    """Shuffle ``df`` and cut it into train / validation / test partitions.

    Args:
        df: DataFrame to split.
        train_frac: Fraction of rows assigned to the training partition.
        validation_frac: Fraction of rows assigned to the validation partition.

    Returns:
        Tuple ``(train_df, validation_df, test_df)``; the test partition is
        whatever remains after the first two.
    """
    # Fixed seed keeps the shuffle repeatable; the previous index is discarded.
    shuffled = df.sample(frac=1, random_state=123).reset_index(drop=True)
    n_rows = len(shuffled)

    # Partition boundaries, truncated to whole rows.
    train_stop = int(n_rows * train_frac)
    validation_stop = train_stop + int(n_rows * validation_frac)

    return (
        shuffled[:train_stop],
        shuffled[train_stop:validation_stop],
        shuffled[validation_stop:],
    )

# 70% of the balanced rows go to training and 10% to validation.
train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1)
# Test size is implied to be 0.2 as the remainder

# Write each split to a CSV in the working directory; these files are read
# back later by SpamDataset.
train_df.to_csv("train.csv", index=None)
validation_df.to_csv("validation.csv", index=None)
test_df.to_csv("test.csv", index=None)

# Creating data loaders

In [8]:
# tokenizer = tiktoken.get_encoding("gpt2")
# print(tokenizer.encode("<|endoftext|>", allowed_special={"<|endoftext|>"}))

In [None]:
from mlx_lm import load
model_hf, tokenizer_hf = load("openai-community/gpt2")

Fetching 14 files:   0%|          | 0/14 [00:00<?, ?it/s]

In [10]:
class SpamDataset:
    """Tokenized, fixed-length view of a labelled text CSV.

    The CSV must provide a "Text" and a "Label" column. Every text is
    tokenized once at construction time, truncated to ``max_length`` (when one
    is given) and right-padded with ``pad_token_id`` so that all rows share
    the same length.

    Args:
        csv_file: Path of the CSV file, loaded with ``pd.read_csv``.
        tokenizer: Object exposing ``encode(text)`` that returns a list of
            token ids.
        max_length: Target sequence length. ``None`` means "use the longest
            encoded text in this file" (0 when the file has no rows).
        pad_token_id: Token id appended as padding (default 50256).
    """

    def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=50256):
        self.data = pd.read_csv(csv_file)

        # Pre-tokenize once so __getitem__ only has to wrap the ids.
        self.encoded_texts = [
            tokenizer.encode(text) for text in self.data["Text"]
        ]

        if max_length is None:
            self.max_length = self._longest_encoded_length()
        else:
            self.max_length = max_length
            # Truncate anything longer than the requested length.
            self.encoded_texts = [
                encoded_text[:self.max_length]
                for encoded_text in self.encoded_texts
            ]

        # Right-pad every sequence up to max_length.
        self.encoded_texts = [
            encoded_text + [pad_token_id] * (self.max_length - len(encoded_text))
            for encoded_text in self.encoded_texts
        ]

    def __getitem__(self, index):
        """Return ``(token_ids, label)`` for row ``index`` as int32 MLX arrays."""
        encoded = self.encoded_texts[index]
        label = self.data.iloc[index]["Label"]
        return (
            mx.array(encoded, dtype=mx.int32),
            mx.array(label, dtype=mx.int32),
        )

    def __len__(self):
        """Number of rows in the underlying CSV."""
        return len(self.data)

    def _longest_encoded_length(self):
        """Length of the longest encoded text, or 0 when there are no texts."""
        # default=0 keeps an empty CSV from raising ValueError in max().
        return max(
            (len(encoded_text) for encoded_text in self.encoded_texts),
            default=0,
        )

In [11]:
train_dataset = SpamDataset(
    csv_file="train.csv",
    max_length=None,
    tokenizer=tokenizer_hf
)
train_dataset.max_length

120

In [12]:
val_dataset = SpamDataset(
    csv_file="validation.csv",
    max_length=train_dataset.max_length,
    tokenizer=tokenizer_hf
)
test_dataset = SpamDataset(
    csv_file="test.csv",
    max_length=train_dataset.max_length,
    tokenizer=tokenizer_hf
)

In [13]:
class DataLoaderNP:
    """Minimal batch iterator that collates dataset items with NumPy.

    The dataset is materialized once at construction time into a list of
    ``{"input_ids", "label"}`` dicts; each iteration pass yields dicts of
    int32 MLX arrays under the same two keys.

    Args:
        dataset: Iterable of ``(input_ids, label)`` pairs.
        batch_size: Number of items per batch.
        shuffle: Whether to visit the items in a fresh random order on every pass.
        drop_last: If true, a trailing batch smaller than ``batch_size`` is
            skipped; otherwise it is yielded as a short batch.
        seed: Optional seed for shuffling. When given, the loader owns a
            private random state, so passes are reproducible yet differ from
            epoch to epoch and the global NumPy RNG is left untouched. When
            ``None``, the global NumPy RNG is used.
    """

    def __init__(self, dataset, batch_size, shuffle, drop_last, seed=None):
        self.dataset = [{"input_ids": x[0], "label": x[1]} for x in dataset]
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.drop_last = drop_last
        self.seed = seed
        # Private legacy RandomState: its first permutation matches what
        # np.random.seed(seed) followed by np.random.permutation would give.
        self._rng = np.random.RandomState(seed) if seed is not None else None

    def __call__(self):
        """Generate the batches of one pass over the dataset."""
        indices = np.arange(len(self.dataset))
        if self.shuffle:
            rng = self._rng if self._rng is not None else np.random
            indices = rng.permutation(indices)

        # With drop_last only full batches are produced; otherwise the final
        # short batch is included so that iteration agrees with __len__.
        n_items = len(indices)
        stop = n_items - self.batch_size + 1 if self.drop_last else n_items

        for start in range(0, stop, self.batch_size):
            batch_indices = indices[start:start + self.batch_size]
            batch = [self.dataset[idx] for idx in batch_indices]
            input_ids = np.array([item["input_ids"] for item in batch])
            labels = np.array([item["label"] for item in batch])
            yield {
                "input_ids": mx.array(input_ids, dtype=mx.int32),
                "label": mx.array(labels, dtype=mx.int32),
            }

    def __len__(self):
        """Number of batches one pass yields, honouring ``drop_last``."""
        n_batches = len(self.dataset)//self.batch_size
        if not self.drop_last:
            n_batches += int(len(self.dataset) % self.batch_size != 0)
        return n_batches

    def __iter__(self):
        return self.__call__()

batch_size = 8

train_loader = DataLoaderNP(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
    seed=None,
)

print("Train loader:")
for batch in train_loader:
    pass
print("Input batch dimensions:", batch["input_ids"].shape)
print("Label batch dimensions:", batch["label"], batch["label"].shape)

Train loader:
Input batch dimensions: (8, 120)
Label batch dimensions: array([0, 0, 0, ..., 0, 0, 0], dtype=int32) (8,)


In [14]:
val_loader = DataLoaderNP(
    dataset=val_dataset,
    batch_size=batch_size,
    shuffle=False,
    drop_last=True,
)
test_loader = DataLoaderNP(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
    drop_last=True,
)

# Initializing a model with pretrained weights
- TensorFlow 2 does not support Python 3.13 at the moment.

In [15]:
model_hf['model'].h[-1].ln_2

LayerNorm(768, eps=1e-05, affine=True)

In [16]:
class GPTClassifier(nn.Module):
    """Pretrained GPT model followed by a linear classification head.

    The pretrained model is frozen except for the last entry of its ``h``
    list and its ``ln_f`` module, which are unfrozen again. The linear head
    is applied to the pretrained model's output and produces ``num_classes``
    values per position.

    Args:
        pretrained_model: Model exposing ``freeze()``, ``h`` and ``ln_f``.
            ``freeze()`` is called on this object directly.
        num_classes: Output width of the classification head.
    """

    def __init__(self, pretrained_model, num_classes=2):
        super().__init__()
        backbone = pretrained_model.freeze()
        self.pretrained_model = backbone

        # The head's input width is read from the last block's ln_2 weight.
        hidden_size = pretrained_model.h[-1].ln_2.weight.shape[0]
        self.classifier = nn.Linear(hidden_size, num_classes)

        # Re-enable training for the last block and the final norm only.
        for module in (backbone.h[-1], backbone.ln_f):
            module.unfreeze()

    def __call__(self, inputs, mask=None, cache=None):
        """Run the pretrained model, then the classification head."""
        hidden = self.pretrained_model(inputs, mask, cache)
        return self.classifier(hidden)

In [17]:
mod_clf = GPTClassifier(model_hf['model'], num_classes=2)

In [18]:
from mlx.utils import tree_flatten
print(f"Pretrained model parameters: {sum(v.size for _, v in tree_flatten(mod_clf.parameters()))}")
print(f"trainable parameters: {sum(v.size for _, v in tree_flatten(mod_clf.trainable_parameters()))}")
print(f"trainable parameters (pretrained): {sum(v.size for _, v in tree_flatten(mod_clf.pretrained_model.trainable_parameters()))}")
print(f"trainable parameters (classifier): {sum(v.size for _, v in tree_flatten(mod_clf.classifier.trainable_parameters()))}")

Pretrained model parameters: 124441346
trainable parameters: 7090946
trainable parameters (pretrained): 7089408
trainable parameters (classifier): 1538


In [19]:
inputs = tokenizer_hf.encode("Do you have time")
inputs = mx.expand_dims(mx.array(inputs), axis=0)
print("Inputs:", inputs)
print("Inputs dimensions:", inputs.shape)

Inputs: array([[5211, 345, 423, 640]], dtype=int32)
Inputs dimensions: (1, 4)


In [20]:
outputs = mod_clf(mx.stop_gradient(inputs))
print("Outputs:", outputs)
print("Outputs dimensions:", outputs.shape)
print("Last output token:", outputs[:, -1, :])

Outputs: array([[[0.6055, -1.42779],
        [1.60927, -6.17181],
        [1.32532, -3.85031],
        [2.37668, -3.35481]]], dtype=float32)
Outputs dimensions: (1, 4, 2)
Last output token: array([[2.37668, -3.35481]], dtype=float32)


In [21]:
# Classify from the logits at the last sequence position.
logits = outputs[:, -1, :]
probas = nn.softmax(logits, axis=-1)
# Reduce over the class axis only: argmax without an axis works on the
# flattened array and would return a flat index once the batch size exceeds 1.
label_logit = mx.argmax(logits, axis=-1)
label_sfmx = mx.argmax(probas, axis=-1)

print("Class label:", label_logit.item(), label_sfmx.item())

Class label: 0 0


In [22]:
def calc_accuracy_loader(data_loader, model, no_grad, num_batches=None):
    """Fraction of correctly classified examples over (part of) a loader.

    Predictions are the argmax of the logits at the last sequence position.
    The model is switched to eval mode and is left in that mode.

    Args:
        data_loader: Sized iterable of batches with "input_ids" and "label".
        model: Callable returning logits indexable as ``[:, -1, :]``; must
            provide ``eval()``.
        no_grad: Accepted for interface compatibility. The inputs are always
            wrapped in ``mx.stop_gradient``, so the flag has no extra effect.
        num_batches: Upper bound on the number of batches to evaluate;
            ``None`` evaluates ``len(data_loader)`` batches.

    Returns:
        Accuracy as a float, or ``nan`` when no example was evaluated
        (matching what ``calc_loss_loader`` returns for an empty loader).
    """
    model.eval()
    correct_predictions, num_examples = 0, 0

    if num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))

    for i, batch in enumerate(data_loader):
        if i >= num_batches:
            break
        input_batch, target_batch = mx.array(batch['input_ids']), mx.array(batch['label'])

        # logits of the last output token
        logits = model(mx.stop_gradient(input_batch))[:, -1, :]
        predicted_labels = mx.argmax(logits, axis=-1)

        num_examples += predicted_labels.shape[0]
        correct_predictions += (predicted_labels == target_batch).sum().item()

    # An empty loader (or num_batches == 0) would otherwise divide by zero.
    if num_examples == 0:
        return float('nan')
    return correct_predictions / num_examples

In [23]:
train_accuracy = calc_accuracy_loader(train_loader, mod_clf, True, num_batches=5)
val_accuracy = calc_accuracy_loader(val_loader, mod_clf, True, num_batches=5)
test_accuracy = calc_accuracy_loader(test_loader, mod_clf, True, num_batches=5)

print(f"Training accuracy: {train_accuracy*100:.2f}%")
print(f"Validation accuracy: {val_accuracy*100:.2f}%")
print(f"Test accuracy: {test_accuracy*100:.2f}%")

Training accuracy: 50.00%
Validation accuracy: 50.00%
Test accuracy: 45.00%


In [24]:
def compute_ce_loss(model, inputs, targets):
    """Mean cross-entropy between the last-position logits and ``targets``.

    The model is an explicit argument (instead of pre-computed logits)
    because, per the original author's note, the gradients come out as zero
    otherwise.
    """
    # Disabled alternative kept from the original: select the logits at the
    # last non-padding position of each row instead of the final position.
    # pad_id = 50256
    # logits_all = model(inputs)
    # length_idx = (inputs != pad_id).sum(axis=1) - 1
    # batch_idx = mx.arange(len(length_idx))
    # logits = logits_all[batch_idx, length_idx, :]
    last_position_logits = model(inputs)[:, -1, :]
    loss = nn.losses.cross_entropy(last_position_logits, targets, reduction='mean')
    return loss

def calc_loss_batch(input_batch, target_batch, model, no_grad):
    """Compute the loss, and in training mode the gradients, for one batch.

    Args:
        input_batch: Token ids for the batch; converted with ``mx.array``.
        target_batch: Labels for the batch; converted with ``mx.array``.
        model: Model passed on to ``compute_ce_loss``.
        no_grad: When true only the forward loss is computed; when false the
            loss is wrapped in ``nn.value_and_grad`` to obtain gradients.

    Returns:
        ``(loss, grad)``. ``grad`` is the gradient tree from
        ``nn.value_and_grad`` when ``no_grad`` is false, and ``None`` when
        ``no_grad`` is true.
    """
    input_batch, target_batch = mx.array(input_batch), mx.array(target_batch)
    if no_grad:
        # Evaluation callers discard the gradients, so skip building the
        # value-and-grad transform and just run the loss.
        input_batch = mx.stop_gradient(input_batch)
        return compute_ce_loss(model, input_batch, target_batch), None

    # The loss function must receive the model itself for gradients to flow.
    batch_step_fn = nn.value_and_grad(model, compute_ce_loss)
    loss, grad = batch_step_fn(model, input_batch, target_batch)
    return loss, grad

def calc_loss_loader(data_loader, model, no_grad, num_batches=None):
    """Average the per-batch loss over the first ``num_batches`` batches.

    Args:
        data_loader: Sized iterable of batches with "input_ids" and "label".
        model: Model forwarded to ``calc_loss_batch``.
        no_grad: Forwarded to ``calc_loss_batch``.
        num_batches: Upper bound on the number of batches to use; ``None``
            uses ``len(data_loader)`` batches.

    Returns:
        Mean of the batch losses as a float, or ``nan`` when no batch was
        processed.
    """
    if len(data_loader) == 0:
        return float('nan')
    if num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))

    total_loss = 0.
    batches_seen = 0
    for i, batch in enumerate(data_loader):
        if i >= num_batches:
            break
        loss, _ = calc_loss_batch(batch['input_ids'], batch['label'], model, no_grad)
        total_loss += loss.item()
        batches_seen += 1

    # Divide by the batches actually processed: a loader may yield fewer
    # batches than len() reports, and num_batches may be 0.
    if batches_seen == 0:
        return float('nan')
    return total_loss / batches_seen

In [25]:
train_loss = calc_loss_loader(train_loader, mod_clf, True, num_batches=5)
val_loss = calc_loss_loader(val_loader, mod_clf, True, num_batches=5)
test_loss = calc_loss_loader(test_loader, mod_clf, True, num_batches=5)

print(f"Training loss: {train_loss:.4f}")
print(f"Validation loss: {val_loss:.4f}")
print(f"Test loss: {test_loss:.4f}")

Training loss: 3.3436
Validation loss: 2.9189
Test loss: 3.2112


In [26]:
def train_model_simple(model, train_loader, val_loader, optimizer, device, num_epochs,
                       eval_freq, eval_iter):
    """Run a simple training loop with periodic loss and accuracy reporting.

    Args:
        model: Model to train; must provide ``train()`` and ``parameters()``.
        train_loader: Iterable of training batches with "input_ids" and "label".
        val_loader: Iterable of validation batches with the same keys.
        optimizer: Optimizer exposing ``update(model, grads)`` and ``state``.
        device: Not used by this loop itself; only forwarded to
            ``evaluate_model``.
        num_epochs: Number of passes over ``train_loader``.
        eval_freq: Losses are evaluated whenever the step counter is a
            multiple of this value (step 0 included).
        eval_iter: Number of batches used for each loss/accuracy evaluation.

    Returns:
        ``(train_losses, val_losses, train_accs, val_accs, examples_seen)``:
        losses are recorded per evaluation, accuracies per epoch, and
        ``examples_seen`` is the total number of training examples processed.
    """
    train_losses, val_losses, train_accs, val_accs = [], [], [], []
    examples_seen, global_step = 0, -1

    for epoch in range(num_epochs):
        model.train()
        for batch in train_loader:
            input_batch, target_batch = batch['input_ids'], batch['label']
            loss, grads = calc_loss_batch(input_batch, target_batch, model, no_grad=False)
            optimizer.update(model, grads)
            # Force a graph evaluation
            mx.eval(model.parameters(), optimizer.state, loss)
            # Count examples (rows), not elements: .size would be
            # batch_size * sequence_length.
            examples_seen += input_batch.shape[0]
            global_step += 1

            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(model, train_loader, val_loader, device, eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                print(f"Epoch {epoch+1}, step {global_step:06d}: "
                      f"train loss {train_loss:.3f}, val loss {val_loss:.3f}")
        # calculate accuracy after each epoch
        train_accuracy = calc_accuracy_loader(train_loader, model, True, eval_iter)
        val_accuracy = calc_accuracy_loader(val_loader, model, True, eval_iter)
        print(f"Training accuracy: {train_accuracy*100:.2f}% | ", end="")
        print(f"Validation accuracy: {val_accuracy*100:.2f}%")
        train_accs.append(train_accuracy)
        val_accs.append(val_accuracy)
    return train_losses, val_losses, train_accs, val_accs, examples_seen

def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Return ``(train_loss, val_loss)`` measured on ``eval_iter`` batches each.

    The model is put into eval mode for the measurement and switched back to
    train mode afterwards. ``device`` is not used in the body.
    """
    model.eval()
    losses = tuple(
        calc_loss_loader(loader, model, True, eval_iter)
        for loader in (train_loader, val_loader)
    )
    model.train()
    return losses

In [27]:
tokenizer_hf.pad_token

In [28]:
import time
start_time = time.time()

# Seed MLX's RNG before building the classifier, presumably so the new
# classification head gets a reproducible initialization.
mx.random.seed(123)
# NOTE(review): this wraps the same `model_hf['model']` object as the earlier
# `mod_clf`. Check whether re-running this cell really restarts from the
# pretrained weights or continues from already fine-tuned ones (TODO confirm).
mod_clf = GPTClassifier(model_hf['model'], num_classes=2)
optimizer = optim.AdamW(learning_rate=5e-5, weight_decay=1e-1)

# TODO: training loss is not decreasing. needs investigation.
# NOTE(review): the output recorded for this cell shows the train loss falling
# from 2.375 at step 0 to below 0.3, so the TODO above looks stale. Confirm
# and remove it.
# TODO: fix datastream first.
# e.g., https://apeatling.com/articles/simple-guide-to-local-llm-fine-tuning-on-a-mac-with-mlx/ ??
num_epochs = 5
# `device` is passed as None; losses are evaluated every 50 steps on 5 batches.
train_losses, val_losses, train_accs, val_accs, examples_seen = train_model_simple(
    mod_clf, train_loader, val_loader, optimizer, None, 
    num_epochs=num_epochs, eval_freq=50, eval_iter=5,
)

end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")

Epoch 1, step 000000: train loss 2.375, val loss 3.165
Epoch 1, step 000050: train loss 0.455, val loss 0.474
Epoch 1, step 000100: train loss 0.071, val loss 0.060
Training accuracy: 95.00% | Validation accuracy: 95.00%
Epoch 2, step 000150: train loss 0.019, val loss 0.115
Epoch 2, step 000200: train loss 0.020, val loss 0.037
Epoch 2, step 000250: train loss 0.037, val loss 0.083
Training accuracy: 97.50% | Validation accuracy: 95.00%
Epoch 3, step 000300: train loss 0.044, val loss 0.102
Epoch 3, step 000350: train loss 0.107, val loss 0.130
Training accuracy: 97.50% | Validation accuracy: 97.50%
Epoch 4, step 000400: train loss 0.073, val loss 0.136
Epoch 4, step 000450: train loss 0.007, val loss 0.125
Epoch 4, step 000500: train loss 0.133, val loss 0.111
Training accuracy: 95.00% | Validation accuracy: 95.00%
Epoch 5, step 000550: train loss 0.006, val loss 0.096
Epoch 5, step 000600: train loss 0.267, val loss 0.062
Training accuracy: 100.00% | Validation accuracy: 97.50%
Trai

In [29]:
train_accs, val_accs

([0.95, 0.975, 0.975, 0.95, 1.0], [0.95, 0.95, 0.975, 0.95, 0.975])

In [30]:
train_accuracy = calc_accuracy_loader(train_loader, mod_clf, True)
val_accuracy = calc_accuracy_loader(val_loader, mod_clf, True)
test_accuracy = calc_accuracy_loader(test_loader, mod_clf, True)

print(f"Training accuracy: {train_accuracy*100:.2f}%")
print(f"Validation accuracy: {val_accuracy*100:.2f}%")
print(f"Test accuracy: {test_accuracy*100:.2f}%")

Training accuracy: 99.13%
Validation accuracy: 97.92%
Test accuracy: 96.28%
