In [1]:
import json
import os
import urllib

In [2]:
def download_and_load_file(file_path, url):
    if not os.path.exists(file_path):
        with urllib.request.urlopen(url) as response:
            text_data = response.read().decode("utf-8")

        with open(file_path, "w", encoding="utf-8") as fout:
            fout.write(text_data)
    #else:
    #    with open(file_path, "r", encoding="utf-8") as fin:
    #        text_data = fin.read()

    with open(file_path, "r") as fin:
        data = json.load(fin)

    return data

file_path = "ch07/instruction-data.json"
url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch07/01_main-chapter-code/instruction-data.json"
data = download_and_load_file(file_path, url)
print(len(data))

1100


In [3]:
print(data[50])

{'instruction': 'Identify the correct spelling of the following word.', 'input': 'Ocassion', 'output': "The correct spelling is 'Occasion.'"}


In [4]:
print(data[999])

{'instruction': "What is an antonym of 'complicated'?", 'input': '', 'output': "An antonym of 'complicated' is 'simple'."}


In [5]:
def format_input(entry):
    instruction_text = (
        f"Bellow is an instruction that describe a task. "
        f"Write a response that appropriately completes the request."
        f"\n\n### Instruction:\n{entry['instruction']}"
    )

    input_text = f"\n\n### Input: \n{entry['input']}" if entry['input'] else ""

    return instruction_text + input_text

model_input = format_input(data[50])
print(model_input)

Bellow is an instruction that describe a task. Write a response that appropriately completes the request.

### Instruction:
Identify the correct spelling of the following word.

### Input: 
Ocassion


In [6]:
desired_output = f"\n\n### Response: \n{data[50]['output']}"
print(model_input + desired_output)

Bellow is an instruction that describe a task. Write a response that appropriately completes the request.

### Instruction:
Identify the correct spelling of the following word.

### Input: 
Ocassion

### Response: 
The correct spelling is 'Occasion.'


In [7]:
model_input = format_input(data[999])
desired_output = f"\n\n### Response: \n{data[999]['output']}"
print(model_input + desired_output)

Bellow is an instruction that describe a task. Write a response that appropriately completes the request.

### Instruction:
What is an antonym of 'complicated'?

### Response: 
An antonym of 'complicated' is 'simple'.


In [8]:
train_portion = int(len(data) * 0.85)
test_portion = int(len(data) * 0.1)
val_portion = len(data) - train_portion - test_portion

In [9]:
train_data = data[:train_portion]
test_data = data[train_portion:train_portion+test_portion]
val_data = data[train_portion+test_portion:]

print(f"Training data: {len(train_data)}")
print(f"Test data: {len(test_data)}")
print(f"Validation data: {len(val_data)}")

Training data: 935
Test data: 110
Validation data: 55


In [10]:
import torch

def custom_collate_fn(
    batch,
    pad_token_id=50526,
    ignore_index=-100,
    allowed_max_length=None,
    device="cpu"
):
    batch_max_length = max([len(item)+1 for item in batch])
    inputs_lst, targets_lst = [], []

    for item in batch:
        new_item = item.copy()
        new_item += [pad_token_id]

        padded = new_item + [pad_token_id] * (batch_max_length - len(new_item))
        inputs = torch.tensor(padded[:-1])
        targets = torch.tensor(padded[1:])

        mask = targets == pad_token_id
        indices = torch.nonzero(mask).squeeze()

        if indices.numel() > 1:
            targets[indices[1:]] = ignore_index

        if allowed_max_length is not None:
            inputs = inputs[:allowed_max_length]
            targets = targets[:allowed_max_length]

        inputs_lst.append(inputs)
        targets_lst.append(targets)
    inputs_tensor = torch.stack(inputs_lst).to(device)
    targets_tensor = torch.stack(targets_lst).to(device)

    return inputs_tensor, targets_tensor

In [12]:
from functools import partial
device = torch.device("cpu")

customized_collate_fn = partial(
    custom_collate_fn,
    device = device,
    allowed_max_length=1024
)

In [13]:
inputs_1 = [0, 1, 2, 3, 4]
inputs_2 = [5, 6]
inputs_3 = [7, 8, 9]
batch = (inputs_1, inputs_2, inputs_3)

In [14]:
inputs, targets = custom_collate_fn(batch, device="mps")

In [22]:
print(inputs)
print(targets)

print(inputs.shape, targets.shape)

tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50526, 50526, 50526],
        [    7,     8,     9, 50526, 50526]], device='mps:0')
tensor([[    1,     2,     3,     4, 50526],
        [    6, 50526,  -100,  -100,  -100],
        [    8,     9, 50526,  -100,  -100]], device='mps:0')
torch.Size([3, 5]) torch.Size([3, 5])


In [16]:
from torch.utils.data import Dataset

class InstructionDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.encoded_texts = []

        for entry in data:
            instruction_plus_input = format_input(entry)
            response_text = f"\n\n### Response:\n{entry['output']}"
            full_text = instruction_plus_input + response_text
            self.encoded_texts.append(
                tokenizer.encode(full_text)
            )

    def __len__(self):
        return len(self.encoded_texts)

    def __getitem__(self, index):
        return self.encoded_texts[index]

In [17]:
from torch.utils.data import DataLoader
import tiktoken

num_workers = 0
batch_size = 8

torch.manual_seed(123)

tokenizer = tiktoken.get_encoding("gpt2")

train_dataset = InstructionDataset(train_data, tokenizer)
train_loader = DataLoader(
    train_dataset, 
    batch_size=batch_size, 
    collate_fn=customized_collate_fn,
    shuffle=True,
    drop_last=True,
    num_workers=num_workers
)

test_dataset = InstructionDataset(test_data, tokenizer)
test_loader = DataLoader(
    test_dataset, 
    batch_size=batch_size, 
    collate_fn=customized_collate_fn,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers
)

val_dataset = InstructionDataset(val_data, tokenizer)
val_loader = DataLoader(
    val_dataset, 
    batch_size=batch_size, 
    collate_fn=customized_collate_fn,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers
)

In [24]:
for inputs, targets in train_loader:
    print(inputs.shape, targets.shape)

torch.Size([8, 93]) torch.Size([8, 93])
torch.Size([8, 72]) torch.Size([8, 72])
torch.Size([8, 63]) torch.Size([8, 63])
torch.Size([8, 72]) torch.Size([8, 72])
torch.Size([8, 76]) torch.Size([8, 76])
torch.Size([8, 68]) torch.Size([8, 68])
torch.Size([8, 77]) torch.Size([8, 77])
torch.Size([8, 75]) torch.Size([8, 75])
torch.Size([8, 66]) torch.Size([8, 66])
torch.Size([8, 78]) torch.Size([8, 78])
torch.Size([8, 63]) torch.Size([8, 63])
torch.Size([8, 75]) torch.Size([8, 75])
torch.Size([8, 65]) torch.Size([8, 65])
torch.Size([8, 89]) torch.Size([8, 89])
torch.Size([8, 74]) torch.Size([8, 74])
torch.Size([8, 77]) torch.Size([8, 77])
torch.Size([8, 75]) torch.Size([8, 75])
torch.Size([8, 90]) torch.Size([8, 90])
torch.Size([8, 79]) torch.Size([8, 79])
torch.Size([8, 70]) torch.Size([8, 70])
torch.Size([8, 84]) torch.Size([8, 84])
torch.Size([8, 73]) torch.Size([8, 73])
torch.Size([8, 60]) torch.Size([8, 60])
torch.Size([8, 65]) torch.Size([8, 65])
torch.Size([8, 74]) torch.Size([8, 74])
