In [20]:
import json
import os
import urllib
import urllib.request

In [22]:
def download_and_load_file(file_path, url):
    """Load a JSON file from disk, downloading it first if it is missing.

    Args:
        file_path: Local path of the JSON file. Used as a cache: when the
            file already exists, ``url`` is not contacted.
        url: Location to fetch the file from when ``file_path`` is absent.
            The payload is decoded as UTF-8 text.

    Returns:
        The object produced by ``json.load`` on the file contents.

    Raises:
        urllib.error.URLError: If the download fails.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    if not os.path.exists(file_path):
        with urllib.request.urlopen(url) as response:
            text_data = response.read().decode("utf-8")

        # The target directory (e.g. "ch07/") may not exist on a fresh checkout;
        # create it so the write below does not fail with FileNotFoundError.
        parent_dir = os.path.dirname(file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        with open(file_path, "w", encoding="utf-8") as fout:
            fout.write(text_data)

    # Read with the same encoding used for writing; the platform default
    # encoding is not UTF-8 everywhere.
    with open(file_path, "r", encoding="utf-8") as fin:
        data = json.load(fin)

    return data

# Local cache path and remote source of the instruction dataset; the file is
# fetched from `url` only when nothing exists yet at `file_path`.
file_path = "ch07/instruction-data.json"
url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch07/01_main-chapter-code/instruction-data.json"
data = download_and_load_file(file_path, url)
# Number of top-level items in the parsed JSON (1100 in the recorded run).
print(len(data))

1100


In [23]:
# Inspect the entry at index 50; in the recorded run it has a non-empty 'input'.
print(data[50])

{'instruction': 'Identify the correct spelling of the following word.', 'input': 'Ocassion', 'output': "The correct spelling is 'Occasion.'"}


In [24]:
# Inspect the entry at index 999; in the recorded run its 'input' is empty.
print(data[999])

{'instruction': "What is an antonym of 'complicated'?", 'input': '', 'output': "An antonym of 'complicated' is 'simple'."}


In [25]:
def format_input(entry):
    """Build the prompt text for one instruction-dataset entry.

    The prompt consists of a fixed preamble, an ``### Instruction:`` section
    and, only when the entry carries a non-empty input, an ``### Input:``
    section.

    Args:
        entry: Mapping with an ``'instruction'`` key and an optional
            ``'input'`` key. A missing or empty ``'input'`` omits the input
            section.

    Returns:
        The prompt as a single string (without any response section).

    Raises:
        KeyError: If ``entry`` has no ``'instruction'`` key.
    """
    # Preamble wording fixed ("Below", "describes"): this text is part of every
    # prompt the model sees, so typos here propagate into all examples.
    instruction_text = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request."
        f"\n\n### Instruction:\n{entry['instruction']}"
    )

    # .get() so entries that omit the key behave like entries with an empty input.
    input_value = entry.get("input")
    input_text = f"\n\n### Input: \n{input_value}" if input_value else ""

    return instruction_text + input_text

# Format the entry at index 50 as a prompt and display it.
model_input = format_input(data[50])
print(model_input)

Bellow is an instruction that describe a task. Write a response that appropriately completes the request.

### Instruction:
Identify the correct spelling of the following word.

### Input: 
Ocassion


In [26]:
# Build the response section from the same entry's 'output' field and show the
# prompt and response concatenated.
desired_output = f"\n\n### Response: \n{data[50]['output']}"
print(model_input + desired_output)

Bellow is an instruction that describe a task. Write a response that appropriately completes the request.

### Instruction:
Identify the correct spelling of the following word.

### Input: 
Ocassion

### Response: 
The correct spelling is 'Occasion.'


In [27]:
# Build and print prompt plus response for the entry at index 999. Its 'input'
# is empty in the recorded run, so no "### Input" section appears in the output.
model_input = format_input(data[999])
desired_output = f"\n\n### Response: \n{data[999]['output']}"
print(model_input + desired_output)

Bellow is an instruction that describe a task. Write a response that appropriately completes the request.

### Instruction:
What is an antonym of 'complicated'?

### Response: 
An antonym of 'complicated' is 'simple'.


In [28]:
import torch

def custom_collate_fn(
    batch,
    pad_token_id=50526,
    ignore_index=-100,
    allowed_max_length=None,
    device="cpu"
):
    """Pad token-id sequences to a common length and build input/target tensors.

    Every sequence gets one ``pad_token_id`` appended and is then right-padded
    with ``pad_token_id`` up to the longest sequence in the batch plus one.
    ``inputs`` is the padded sequence without its last element; ``targets`` is
    the padded sequence without its first element (i.e. shifted by one). In
    ``targets`` the first padding position is kept and every later padding
    position is replaced by ``ignore_index``.

    Args:
        batch: Iterable of token-id sequences (lists, tuples, ...).
        pad_token_id: Id used both as the appended end marker and as padding.
            NOTE(review): GPT-2's end-of-text id is 50256; the default 50526
            may be a digit transposition -- confirm against the tokenizer and
            model vocabulary in use before training.
        ignore_index: Value written into the padded target positions.
        allowed_max_length: If given, inputs and targets are truncated to at
            most this many positions.
        device: Device the stacked tensors are moved to.

    Returns:
        Tuple ``(inputs_tensor, targets_tensor)`` of two integer tensors with
        one row per sequence.

    Raises:
        ValueError: If ``batch`` contains no sequences.
    """
    # Materialise once: works for tuples/generators and avoids mutating the
    # caller's sequences.
    sequences = [list(item) for item in batch]
    if not sequences:
        raise ValueError("custom_collate_fn received an empty batch")

    # +1 accounts for the pad/end token appended to every sequence.
    batch_max_length = max(len(seq) for seq in sequences) + 1
    inputs_lst, targets_lst = [], []

    for seq in sequences:
        padded = seq + [pad_token_id] * (batch_max_length - len(seq))
        # Explicit dtype keeps empty slices integer-typed as well.
        inputs = torch.tensor(padded[:-1], dtype=torch.long)
        targets = torch.tensor(padded[1:], dtype=torch.long)

        # Mask by position rather than by value, so a pad_token_id that occurs
        # inside the sequence itself is never overwritten. The first padding
        # target sits at index len(seq) - 1; for an empty sequence index 0 is
        # kept, matching what a value-based mask would retain.
        first_pad_target = max(len(seq) - 1, 0)
        targets[first_pad_target + 1:] = ignore_index

        if allowed_max_length is not None:
            inputs = inputs[:allowed_max_length]
            targets = targets[:allowed_max_length]

        inputs_lst.append(inputs)
        targets_lst.append(targets)

    inputs_tensor = torch.stack(inputs_lst).to(device)
    targets_tensor = torch.stack(targets_lst).to(device)

    return inputs_tensor, targets_tensor

In [29]:
# Three toy token-id sequences of different lengths (5, 2 and 3), grouped into
# one batch that is passed to custom_collate_fn in a later cell.
inputs_1 = [0, 1, 2, 3, 4]
inputs_2 = [5, 6]
inputs_3 = [7, 8, 9]
batch = (inputs_1, inputs_2, inputs_3)

In [30]:
# Use Apple's MPS backend when this machine provides it; otherwise fall back to
# the CPU so the cell also runs on hardware without MPS.
demo_device = "mps" if torch.backends.mps.is_available() else "cpu"
inputs, targets = custom_collate_fn(batch, device=demo_device)

In [31]:
# Display the padded inputs and the shifted, partially masked targets returned
# by the collate call.
print(inputs)
print(targets)

tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50526, 50526, 50526],
        [    7,     8,     9, 50526, 50526]], device='mps:0')
tensor([[    1,     2,     3,     4, 50526],
        [    6, 50526,  -100,  -100,  -100],
        [    8,     9, 50526,  -100,  -100]], device='mps:0')
