In [94]:
import mlx.core as mx
import mlx.nn as nn
import numpy as np

# Preparing a dataset for supervised instruction finetuning

In [2]:
import json
import os
import urllib


def download_and_load_file(file_path, url):
    """Load a JSON file from ``file_path``, fetching it from ``url`` first if absent.

    Args:
        file_path: Local path of the JSON file. If it already exists, no
            download takes place and ``url`` is not used.
        url: Location the file is fetched from when ``file_path`` is missing.

    Returns:
        The object produced by ``json.load`` on the file contents.
    """
    # A bare ``import urllib`` does not guarantee that the ``urllib.request``
    # submodule has been loaded, so import it explicitly where it is needed.
    import urllib.request

    if not os.path.exists(file_path):
        with urllib.request.urlopen(url) as response:
            text_data = response.read().decode("utf-8")
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(text_data)

    # Always parse from disk so the cached and the freshly downloaded case
    # go through exactly the same code path.
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    return data


# Local cache location; the file is only downloaded when it is not present.
file_path = "instruction-data.json"
url = (
    "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch"
    "/main/ch07/01_main-chapter-code/instruction-data.json"
)

# `data` is the parsed JSON; later cells index it and read the
# 'instruction', 'input' and 'output' keys of its entries.
data = download_and_load_file(file_path, url)
print("Number of entries:", len(data))


Number of entries: 1100


In [3]:
print("example entry:\n", data[50])

example entry:
 {'instruction': 'Identify the correct spelling of the following word.', 'input': 'Ocassion', 'output': "The correct spelling is 'Occasion.'"}


In [4]:
print("example entry:\n", data[999])

example entry:
 {'instruction': "What is an antonym of 'complicated'?", 'input': '', 'output': "An antonym of 'complicated' is 'simple'."}


In [5]:
def format_input(entry):
    """Render the prompt portion (everything before the response) for one entry.

    Args:
        entry: Mapping with an ``'instruction'`` key and an ``'input'`` key.

    Returns:
        The fixed preamble followed by an ``### Instruction:`` section and,
        only when ``entry['input']`` is truthy, an ``### Input:`` section.
    """
    prompt_parts = [
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request. ",
        f"\n\n### Instruction:\n{entry['instruction']}",
    ]
    # Entries with an empty input get no "### Input:" section at all.
    if entry["input"]:
        prompt_parts.append(f"\n\n### Input:\n{entry['input']}")
    return "".join(prompt_parts)

In [6]:
# Show one complete example (prompt + expected answer) for an entry that has an input field.
model_input = format_input(data[50])
# The section header must read "### Response:" so this preview matches the
# text InstructionDataset builds for the same entry.
desired_response = f"\n\n### Response:\n{data[50]['output']}"
print(model_input + desired_response)

Below is an instruction that describes a task. Write a response that appropriately completes the request. 

### Instruction:
Identify the correct spelling of the following word.

### Input:
Ocassion

### Reseponse:
The correct spelling is 'Occasion.'


In [7]:
# Formatted example for an entry without an input field: the "### Input:"
# section is omitted entirely.
model_input = format_input(data[999])
# Header spelled "### Response:" to agree with the text InstructionDataset builds.
desired_response = f"\n\n### Response:\n{data[999]['output']}"
print(model_input + desired_response)

Below is an instruction that describes a task. Write a response that appropriately completes the request. 

### Instruction:
What is an antonym of 'complicated'?

### Reseponse:
An antonym of 'complicated' is 'simple'.


In [8]:
# Split sizes: 85% train, 10% test, the remainder is validation.
train_portion = int(len(data) * 0.85)
test_portion = int(len(data) * 0.10)
# Size of the remainder; the slices below do not read this value.
val_portion = len(data) - train_portion - test_portion
# Contiguous, unshuffled slices of `data` in the order train, test, validation.
train_data = data[:train_portion]
test_data = data[train_portion:train_portion + test_portion]
val_data = data[train_portion+test_portion:]

In [9]:
print("Training set length:", len(train_data))
print("Validation set length:", len(val_data))
print("Test set length:", len(test_data))

Training set length: 935
Validation set length: 55
Test set length: 110


# Organizing data into training batches

In [10]:
from mlx_lm import load
model_hf, tokenizer_hf = load("openai-community/gpt2-medium")

Fetching 15 files:   0%|          | 0/15 [00:00<?, ?it/s]

In [11]:
class InstructionDataset:
    """Instruction/response examples tokenized once at construction time.

    Each entry is rendered as ``format_input(entry)`` followed by a
    ``### Response:`` section containing ``entry['output']``, then passed to
    ``tokenizer.encode``. Indexing returns the encoded result for that entry.
    """

    def __init__(self, data, tokenizer):
        self.data = data

        def _encode(example):
            prompt = format_input(example)
            answer = f"\n\n### Response:\n{example['output']}"
            return tokenizer.encode(prompt + answer)

        self.encoded_texts = [_encode(example) for example in data]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.encoded_texts[index]

In [12]:
def custom_collate_draft_1(batch, pad_token_id=50256):
    """Right-pad every token sequence in ``batch`` to the longest one.

    Args:
        batch: Sequence of token-id sequences (lists or tuples of ints).
        pad_token_id: Token id appended as padding.

    Returns:
        A 2-D ``mx.array`` with one row per sequence; each row has the length
        of the longest sequence in ``batch``.

    Raises:
        ValueError: If ``batch`` is empty (raised by ``max``).
    """
    # Pad to longest + 1 so that every sequence carries at least one pad
    # token; the extra position is dropped again below.
    batch_max_length = max(len(item) + 1 for item in batch)

    inputs_lst = []
    for item in batch:
        # list(item) builds a fresh list, so the caller's sequence is never mutated.
        padded = list(item) + [pad_token_id] * (batch_max_length - len(item))
        # Remove the extra padded token that was added via the +1 above.
        inputs_lst.append(padded[:-1])

    # All rows have equal length, so one conversion already gives the
    # (batch, length) array; stacking the rows again is unnecessary.
    inputs_tensor = mx.array(inputs_lst)
    return inputs_tensor

In [13]:
inputs_1 = [0, 1, 2, 3, 4]
inputs_2 = [5, 6]
inputs_3 = [7, 8, 9]
batch = (
    inputs_1, inputs_2, inputs_3
)
print(custom_collate_draft_1(batch))

array([[0, 1, 2, 3, 4],
       [5, 6, 50256, 50256, 50256],
       [7, 8, 9, 50256, 50256]], dtype=int32)


In [14]:
def custom_collate_draft_2(batch, pad_token_id=50256):
    """Pad a batch and return next-token ``(inputs, targets)`` arrays.

    Args:
        batch: Sequence of token-id sequences (lists or tuples of ints).
        pad_token_id: Token id appended as padding.

    Returns:
        A tuple ``(inputs, targets)`` of 2-D ``mx.array`` objects of equal
        shape. ``targets`` is ``inputs`` shifted left by one position, with a
        pad token filling the final position.

    Raises:
        ValueError: If ``batch`` is empty (raised by ``max``).
    """
    # Pad to longest + 1: the extra pad token supplies the last target of the
    # longest sequence once inputs and targets are offset by one.
    batch_max_length = max(len(item) + 1 for item in batch)

    inputs_lst = []
    targets_lst = []
    for item in batch:
        # list(item) builds a fresh list, so the caller's sequence is never mutated.
        padded = list(item) + [pad_token_id] * (batch_max_length - len(item))
        # Inputs drop the last position, targets drop the first one.
        inputs_lst.append(padded[:-1])
        targets_lst.append(padded[1:])

    # Rows are already equal-length; a single conversion yields each 2-D array.
    inputs_tensor = mx.array(inputs_lst)
    targets_tensor = mx.array(targets_lst)
    return inputs_tensor, targets_tensor

In [15]:
inputs_1 = [0, 1, 2, 3, 4]
inputs_2 = [5, 6]
inputs_3 = [7, 8, 9]
batch = (
    inputs_1, inputs_2, inputs_3
)
print(custom_collate_draft_2(batch))

(array([[0, 1, 2, 3, 4],
       [5, 6, 50256, 50256, 50256],
       [7, 8, 9, 50256, 50256]], dtype=int32), array([[1, 2, 3, 4, 50256],
       [6, 50256, 50256, 50256, 50256],
       [8, 9, 50256, 50256, 50256]], dtype=int32))


In [None]:
def custom_collate_fn(batch,
                      pad_token_id=50256,
                      ignore_index=-100,
                      allowed_max_length=None
):
    """Pad a batch and build next-token targets with masked padding.

    Args:
        batch: Sequence of token-id sequences (lists or tuples of ints).
        pad_token_id: Token id appended as padding.
        ignore_index: Value written into target positions that should not
            count towards the loss.
        allowed_max_length: If not ``None``, every row of both outputs is
            truncated to at most this many positions.

    Returns:
        A tuple ``(inputs, targets)`` of 2-D int32 ``mx.array`` objects of
        equal shape. ``targets`` is ``inputs`` shifted left by one position.
        In each target row the first occurrence of ``pad_token_id`` is kept
        and every later occurrence is replaced by ``ignore_index``.

    Raises:
        ValueError: If ``batch`` is empty (raised by ``max``).
    """
    # Pad to longest + 1: the extra pad token supplies the last target of the
    # longest sequence once inputs and targets are offset by one.
    batch_max_length = max(len(item) + 1 for item in batch)

    inputs_lst = []
    targets_lst = []
    for item in batch:
        # list(item) builds a fresh list, so the caller's sequence is never mutated.
        padded = list(item) + [pad_token_id] * (batch_max_length - len(item))
        inputs = padded[:-1]
        targets = padded[1:]

        # Keep the first pad token as a real target and mask all later ones.
        # This runs on plain Python ints, which avoids evaluating one array
        # comparison per element.
        seen_pad = False
        for pos, token in enumerate(targets):
            if token == pad_token_id:
                if seen_pad:
                    targets[pos] = ignore_index
                seen_pad = True

        if allowed_max_length is not None:
            inputs = inputs[:allowed_max_length]
            targets = targets[:allowed_max_length]

        inputs_lst.append(inputs)
        targets_lst.append(targets)

    # Rows are equal-length, so a single conversion yields each 2-D array.
    # The dtype is pinned so that zero-width rows cannot change it.
    inputs_tensor = mx.array(inputs_lst, dtype=mx.int32)
    targets_tensor = mx.array(targets_lst, dtype=mx.int32)
    return inputs_tensor, targets_tensor

In [69]:
inputs_1 = [0, 1, 2, 3, 4]
inputs_2 = [5, 6]
inputs_3 = [7, 8, 9]
batch = (
    inputs_1, inputs_2, inputs_3
)
print(custom_collate_fn(batch))

(array([[0, 1, 2, 3, 4],
       [5, 6, 50256, 50256, 50256],
       [7, 8, 9, 50256, 50256]], dtype=int32), array([[1, 2, 3, 4, 50256],
       [6, 50256, -100, -100, -100],
       [8, 9, 50256, -100, -100]], dtype=int32))


In [85]:
logits_1 = mx.array(
    [[-1.0, 1.0], [-0.5, 1.5]]
)
targets_1 = mx.array([0, 1])
loss_1 = nn.losses.cross_entropy(logits_1, targets_1, reduction="mean")
print(loss_1)

array(1.12693, dtype=float32)


In [86]:
logits_2 = mx.array(
    [[-1.0, 1.0], [-0.5, 1.5], [-0.5, 1.5]]
)
targets_2 = mx.array([0, 1, 1])
loss_2 = nn.losses.cross_entropy(logits_2, targets_2, reduction="mean")
print(loss_2)

array(0.793595, dtype=float32)


In [93]:
# Same logits as loss_2, but the third target is the ignore marker (-100).
targets_3 = mx.array([0, 1, -100])
# Ask for per-position losses so the ignored position can be masked by hand.
loss_3 = nn.losses.cross_entropy(logits_2, targets_3, reduction="none")
loss_3_mask = targets_3 != -100
# Average over the unmasked positions only; the printed result equals loss_1.
loss_3 = (loss_3 * loss_3_mask).sum() / loss_3_mask.sum()
print("loss_1==loss_3", loss_1==loss_3, loss_1, loss_3)

loss_1==loss_3 array(True, dtype=bool) array(1.12693, dtype=float32) array(1.12693, dtype=float32)


# Creating data loaders for an instruction dataset

In [None]:
class DataLoaderNP:
    """Minimal batch iterator over an indexable dataset.

    Args:
        dataset: Object supporting ``len()`` and integer indexing.
        batch_size: Number of dataset items per batch.
        shuffle: If true, items are visited in a random order on every pass.
        drop_last: If true, a trailing batch with fewer than ``batch_size``
            items is skipped; otherwise it is yielded as a smaller batch.
        seed: Optional seed for the shuffle order. With a seed the sequence
            of passes is reproducible, while successive passes still differ
            from each other.
        collate_fn: Callable that turns a list of dataset items into an
            ``(input_ids, target_ids)`` pair. Required for iteration.

    Iterating yields dicts with the keys ``"input_ids"`` and ``"target_ids"``,
    each converted to an int32 ``mx.array``.
    """

    def __init__(self, dataset, batch_size,
                 shuffle, drop_last, seed=None,
                 collate_fn=None):
        self.dataset = dataset
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.drop_last = drop_last
        self.seed = seed
        self.collate_fn = collate_fn
        # A private generator, seeded once. This keeps NumPy's global random
        # state untouched and lets each pass draw a new permutation instead of
        # replaying the same one. RandomState is the generator type behind
        # np.random.seed, so the first pass keeps its previous order.
        self._rng = np.random.RandomState(seed) if seed is not None else None

    def __call__(self):
        """Return a generator over the batches of one pass through the dataset."""
        if self.collate_fn is None:
            raise ValueError(
                "DataLoaderNP needs a collate_fn returning (input_ids, target_ids)"
            )

        indices = np.arange(len(self.dataset))
        if self.shuffle:
            # Without a seed, fall back to NumPy's global generator.
            rng = getattr(self, "_rng", None)
            if rng is None:
                rng = np.random
            indices = rng.permutation(indices)

        # collect batches from the dataset
        for start in range(0, len(indices), self.batch_size):
            batch_indices = indices[start:start + self.batch_size]
            # Only the final slice can be short; skip it when requested so
            # that iteration agrees with __len__.
            if self.drop_last and len(batch_indices) < self.batch_size:
                break
            batch = [self.dataset[idx] for idx in batch_indices]
            input_ids, target_ids = self.collate_fn(batch)
            yield {
                "input_ids": mx.array(input_ids, dtype=mx.int32),
                "target_ids": mx.array(target_ids, dtype=mx.int32),
            }

    def __len__(self):
        """Number of batches one pass yields, taking ``drop_last`` into account."""
        n_batches = len(self.dataset)//self.batch_size
        if not self.drop_last:
            n_batches += int(len(self.dataset) % self.batch_size != 0)
        return n_batches

    def __iter__(self):
        return self.__call__()


In [119]:
# batch_size is also read by the cell that builds the validation and test loaders.
batch_size = 8
# Tokenizes every training entry up front.
train_dataset = InstructionDataset(train_data, tokenizer_hf)
# custom_collate_fn pads each batch to its own longest sequence, so the
# sequence dimension differs from batch to batch.
train_loader = DataLoaderNP(
    train_dataset, batch_size,
    shuffle=True, drop_last=True, seed=123,
    collate_fn=custom_collate_fn
)

In [121]:
print("Train loader:")
for batch in train_loader:
    print(batch['input_ids'].shape, batch['target_ids'].shape)
    # pass
# print("Input batch dimensions:", batch["input_ids"].shape)
# print("Target batch dimensions:", batch["target_ids"].shape)

Train loader:
(8, 90) (8, 90)
(8, 66) (8, 66)
(8, 70) (8, 70)
(8, 71) (8, 71)
(8, 71) (8, 71)
(8, 69) (8, 69)
(8, 58) (8, 58)
(8, 58) (8, 58)
(8, 64) (8, 64)
(8, 92) (8, 92)
(8, 61) (8, 61)
(8, 68) (8, 68)
(8, 65) (8, 65)
(8, 70) (8, 70)
(8, 75) (8, 75)
(8, 75) (8, 75)
(8, 61) (8, 61)
(8, 61) (8, 61)
(8, 80) (8, 80)
(8, 84) (8, 84)
(8, 63) (8, 63)
(8, 69) (8, 69)
(8, 81) (8, 81)
(8, 63) (8, 63)
(8, 72) (8, 72)
(8, 66) (8, 66)
(8, 81) (8, 81)
(8, 62) (8, 62)
(8, 74) (8, 74)
(8, 70) (8, 70)
(8, 68) (8, 68)
(8, 69) (8, 69)
(8, 61) (8, 61)
(8, 66) (8, 66)
(8, 81) (8, 81)
(8, 67) (8, 67)
(8, 66) (8, 66)
(8, 78) (8, 78)
(8, 69) (8, 69)
(8, 70) (8, 70)
(8, 67) (8, 67)
(8, 69) (8, 69)
(8, 69) (8, 69)
(8, 69) (8, 69)
(8, 89) (8, 89)
(8, 92) (8, 92)
(8, 64) (8, 64)
(8, 60) (8, 60)
(8, 66) (8, 66)
(8, 68) (8, 68)
(8, 84) (8, 84)
(8, 64) (8, 64)
(8, 59) (8, 59)
(8, 72) (8, 72)
(8, 63) (8, 63)
(8, 62) (8, 62)
(8, 75) (8, 75)
(8, 78) (8, 78)
(8, 84) (8, 84)
(8, 63) (8, 63)
(8, 76) (8, 76)
(8, 59) (8

In [122]:
print("Example input IDs:", batch["input_ids"][0])

Example input IDs: array([21106, 318, 281, ..., 50256, 50256, 50256], dtype=int32)


In [123]:
print("Example target IDs:", batch["target_ids"][0])

Example target IDs: array([318, 281, 12064, ..., -100, -100, -100], dtype=int32)


In [127]:
# NOTE(review): these are evaluation splits, yet both loaders shuffle and use
# drop_last=True. With batch_size = 8, the 55 validation and 110 test entries
# do not divide evenly, so the leftover entries are never yielded.
# Confirm this is intended; shuffle=False / drop_last=False may be preferable.
val_dataset = InstructionDataset(val_data, tokenizer_hf)
val_loader = DataLoaderNP(
    val_dataset, batch_size,
    shuffle=True, drop_last=True, seed=123,
    collate_fn=custom_collate_fn
)
test_dataset = InstructionDataset(test_data, tokenizer_hf)
test_loader = DataLoaderNP(
    test_dataset, batch_size,
    shuffle=True, drop_last=True, seed=123,
    collate_fn=custom_collate_fn
)

# Test the loaded pretrained LLM

In [None]:
from mlx_lm import generate

# The pretrained model is not capable of following the instruction yet:
# instead of answering, it keeps repeating the prompt structure.
input_text = format_input(val_data[0])
print(input_text)
generate(model_hf, tokenizer_hf, prompt=input_text)


Below is an instruction that describes a task. Write a response that appropriately completes the request. 

### Instruction:
Convert the active sentence to passive: 'The chef cooks the meal every day.'


"\n\n### Response:\n\nThe chef cooks the meal every day.\n\n### Instruction:\n\nConvert the active sentence to passive: 'The chef cooks the meal every day.'\n\n### Response:\n\nThe chef cooks the meal every day.\n\n### Instruction:\n\nConvert the active sentence to passive: 'The chef cooks the meal every day.'\n\n### Response:\n\nThe chef cooks the meal every day.\n\n### Instruction:\n\nConvert the active sentence to passive: 'The chef cooks the meal every day.'\n\n### Response:\n\nThe chef cooks the meal every day.\n\n### Instruction:\n\nConvert the active sentence to passive: 'The chef cooks the meal every day.'\n\n### Response:\n\nThe chef cooks the meal every day.\n\n### Instruction:\n\nConvert the active sentence to passive: 'The chef cooks the meal every day.'\n\n### Response:\n\nThe chef cooks the meal every day.\n\n### Instruction:\n\nConvert the active sentence to passive: 'The chef cooks the meal every day.'\n\n### Response:\n\nThe chef cooks the meal every day.\n\n### Instru

# Finetuning the LLM on instruction data

"Below is an instruction that describes a task. Write a response that appropriately completes the request. \n\n### Instruction:\nConvert the active sentence to passive: 'The chef cooks the meal every day.'"