Chapter 7:Finetuning To Follow Instructions


In [5]:
from importlib.metadata import version 
pkgs = [
    "matplotlib",  # Plotting library
    "tiktoken",    # Tokenizer
    "torch",       # Deep learning library
    "tqdm",        # Progress bar
    "tensorflow",  # For OpenAI's pretrained weights
]
for p in pkgs:
    print(f"{p} version: {version(p)}")

matplotlib version: 3.10.0
tiktoken version: 0.8.0
torch version: 2.5.1
tqdm version: 4.67.1
tensorflow version: 2.18.0


In [10]:
import json 
import os 
import urllib 

def download_and_load_file(file_path ,url):
  if not os.path.exists(file_path):
    with urllib.request.urlopen(url) as response:
      text_data = response.read().decode("utf-8")
    with open(file_path , "w" , encoding="utf-8") as file:
      file.write(text_data)
  with open(file_path , "r" , encoding="utf-8") as file:
    data = json.load(file)
  return data 
file_path = "instruction-data.json"
url = (
    "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch"
    "/main/ch07/01_main-chapter-code/instruction-data.json"
)

data = download_and_load_file(file_path , url)
print("Number of entries :" , len(data))

Number of entries : 1100


In [11]:
print ("Exmple Entry :\n" ,data[50])
print ("Exmple Entry :\n" ,data[999])

Exmple Entry :
 {'instruction': 'Identify the correct spelling of the following word.', 'input': 'Ocassion', 'output': "The correct spelling is 'Occasion.'"}
Exmple Entry :
 {'instruction': "What is an antonym of 'complicated'?", 'input': '', 'output': "An antonym of 'complicated' is 'simple'."}


## 7.2  Preparing dataset for supervised instruction fine-tuning


In [12]:
def format_input(entry):
  instruction_text = (
    f"Below is an instruction that describes a task. "
    f"Write a response that appropriately completes the request."
    f"\n\\n### Instruction:\\n{entry['instruction']}"
  )
  
  input_text = (f"\n\\n### Input:\\n {entry['input']}" if entry["input"] else "")
  return instruction_text + input_text

In [14]:
model_input = format_input(data[50])
desired_response = f"\n\\n### Response:\\n{data[50]['output']}"
print(model_input + desired_response)

Below is an instruction that describes a task. Write a response that appropriately completes the request.
\n### Instruction:\nIdentify the correct spelling of the following word.
\n### Input:\n Ocassion
\n### Response:\nThe correct spelling is 'Occasion.'


In [15]:
model_input = format_input(data[999])
desired_response = f"\n\n### Response:\n{data[999]['output']}"
print(model_input + desired_response)

Below is an instruction that describes a task. Write a response that appropriately completes the request.
\n### Instruction:\nWhat is an antonym of 'complicated'?

### Response:
An antonym of 'complicated' is 'simple'.


In [20]:
train_portion = int(len(data) * 0.85)
test_portion = int(len(data) * 0.1)
val_portion = len(data) - train_portion - test_portion

train_data = data[:train_portion]
test_data = data[train_portion:train_portion + test_portion]
val_data = data[train_portion + test_portion:] 

print(f"train set length : {len(train_data)}")
print(f"test set length : {len(test_data)}")
print(f"val set length : {len(val_data)}")

train set length : 935
test set length : 110
val set length : 55


## 7.3 Organizing data into training batches


In [21]:
import torch 
from torch.utils.data import Dataset

class InstructionDataset(Dataset):
  def __init__(self, data , tokenizer):
    self.data = data
    self.encoded_texts = []
    for entry in data :
      instruction_plus_input = format_input(entry)
      response_text = f"\n\\n### Response:\\n{entry['output']}"
      full_text = instruction_plus_input + response_text 
      self.encoded_texts.append(
        tokenizer.encode(full_text)
      )
    
    def __getitem__(self, index):
      return self.encoded_texts[index]
    def __len__(self):
      return len(self.data)

In [22]:
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")

print(tokenizer.encode("<|endoftext|>", allowed_special={"<|endoftext|>"}))

[50256]
