In [None]:
# Install third-party dependencies into the running kernel's environment.
# NOTE(review): versions are unpinned, so a fresh run may resolve different releases;
# pandas is imported below but not listed here (presumably pulled in by `datasets` - confirm).
%pip install regex requests torch numpy transformers datasets

In [None]:
import pandas as pd
import torch
from datasets import load_dataset
from torch.utils.data import Dataset

from mingpt.bpe import BPETokenizer
from mingpt.model import GPT
from mingpt.trainer import Trainer

# 1. Dataset

In [7]:
class QADataset(Dataset):
    """Instruction/response pairs tokenized for next-token-prediction fine-tuning.

    Each record is rendered into one prompt string, tokenized once at
    construction time and truncated to ``block_size`` tokens. Items are served
    as fixed-length ``(x, y)`` tensors so that examples of different lengths
    can be stacked into a batch by the default ``DataLoader`` collation.
    """

    def __init__(self, data, tokenizer, block_size=128, pad_token_id=50256, ignore_index=-1):
        """
        Args:
            data (iterable of mappings): Records exposing an ``"instruction"``
                and a ``"demonstration"`` entry (question and answer text).
            tokenizer (BPETokenizer): Callable turning a string into a tensor of
                token IDs with a leading batch dimension of size 1.
            block_size (int): Maximum number of tokens kept per example.
            pad_token_id (int): Token ID appended to the inputs as padding.
                The default is assumed to be GPT-2's ``<|endoftext|>`` ID;
                change it when using a different vocabulary.
            ignore_index (int): Value written into padded target positions.
                NOTE(review): assumes the training loss skips targets equal to
                this value (-1 by default) - confirm against the model's loss.
        """
        self.tokenizer = tokenizer
        self.block_size = block_size
        self.pad_token_id = pad_token_id
        self.ignore_index = ignore_index
        # Tokenize eagerly so __getitem__ only has to slice and pad.
        self.data = [
            self.format_example(data_point["instruction"], data_point["demonstration"]) for data_point in data
        ]

    def format_example(self, question, answer):
        """Render one question/answer pair as a prompt and return its token IDs.

        The result is a plain list of ints, truncated to ``block_size`` entries;
        padding is applied later, in ``__getitem__``.
        """
        # NOTE(review): the tokenizer may encode the literal "<|endoftext|>" text as
        # ordinary sub-word tokens rather than the single special ID - verify.
        text = f"<|human|>: {question} \n <|assistant|>: {answer} <|endoftext|>"
        tokens = self.tokenizer(text)
        tokens = tokens.squeeze(0).tolist()  # drop the batch dimension

        return tokens[:self.block_size]

    def __len__(self):
        """Number of tokenized examples."""
        return len(self.data)

    def __getitem__(self, idx):
        """
        Returns:
            x (torch.Tensor): Input token IDs, length ``block_size - 1``,
                right-padded with ``pad_token_id``.
            y (torch.Tensor): Targets (inputs shifted left by one), length
                ``block_size - 1``, right-padded with ``ignore_index``.
        """
        tokens = self.data[idx]
        inputs = tokens[:-1]   # everything but the last token
        targets = tokens[1:]   # everything but the first token

        # Pad to a common length; without this, examples of different lengths
        # cannot be stacked into one batch tensor.
        n_pad = max(0, self.block_size - 1 - len(inputs))
        x = torch.tensor(inputs + [self.pad_token_id] * n_pad, dtype=torch.long)
        y = torch.tensor(targets + [self.ignore_index] * n_pad, dtype=torch.long)

        return x, y

In [None]:
data = load_dataset("HuggingFaceH4/helpful-instructions")
# Preview only the first rows; converting the whole train split to a DataFrame
# just to look at it is slow and memory-hungry.
pd.DataFrame(data["train"][:5])

In [None]:
# Byte-pair-encoding tokenizer from mingpt.bpe
tokenizer = BPETokenizer()

# Tokenize the whole train split up front, keeping at most 1024 tokens per example
train_dataset = QADataset(
    data["train"],
    tokenizer=tokenizer,
    block_size=1024,
)

# Sanity check: look at the first (input, target) pair
x, y = train_dataset[0]
print(f"Input tokens: {x}")
print(f"Output tokens: {y}")

# 2. Model Definition

In [None]:
model_type = 'gpt2'
# Use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the pretrained weights for `model_type` and move the model to `device`.
model = GPT.from_pretrained(model_type)
model.to(device)

In [11]:
def generate(prompt='', num_samples=1, steps=20, do_sample=True):
    """Generate continuations of `prompt` and print each one.

    Relies on the notebook-level `model` and `device` globals.

    Args:
        prompt (str): Text to continue. An empty string starts generation
            from a single `<|endoftext|>` token instead.
        num_samples (int): Number of continuations to generate and print.
        steps (int): Number of new tokens generated per continuation.
        do_sample (bool): Forwarded to `model.generate`.

    Returns:
        None. Each decoded sample is printed below an 80-dash separator.
    """
    tokenizer = BPETokenizer()
    if prompt == '':
        eos_id = tokenizer.encoder.encoder['<|endoftext|>']
        # Create the start token on the model's device; a CPU tensor here
        # would not match a model that lives on the GPU.
        x = torch.tensor([[eos_id]], dtype=torch.long, device=device)
    else:
        x = tokenizer(prompt).to(device)

    # One copy of the prompt per requested sample
    x = x.expand(num_samples, -1)

    y = model.generate(x, max_new_tokens=steps, do_sample=do_sample, top_k=40)

    for i in range(num_samples):
        out = tokenizer.decode(y[i].cpu().squeeze())
        print('\n'+'-'*80)
        print(out)

In [None]:
# Sample two short continuations for an instruction-style prompt
generate(
    prompt='How do I teach kids to meditate?',
    num_samples=2,
    steps=20,
)

# 3. Supervised Fine-Tuning (SFT)

In [None]:
# `Trainer` is imported in the imports cell at the top of the notebook.
train_config = Trainer.get_default_config()
# NOTE(review): 5e-4 looks high for fine-tuning a pretrained model, and
# max_iters = 1 presumably limits training to a single iteration (smoke test) -
# confirm both before a real run.
train_config.learning_rate = 5e-4
train_config.max_iters = 1
train_config.num_workers = 4
trainer = Trainer(train_config, model, train_dataset)

In [None]:
# Start fine-tuning with the trainer configured in the previous cell.
trainer.run()

In [None]:
# Switch the model to evaluation mode before sampling from it again.
# As the last expression of the cell, the call's return value is echoed as cell output.
model.eval()

In [None]:
# Sample two short continuations from the model after the training step
generate(
    prompt='Donald Trump?',
    num_samples=2,
    steps=20,
)