In [None]:
%pip install regex requests torch numpy transformers

In [1]:
from mingpt.bpe import BPETokenizer
from mingpt.model import GPT

import torch
from torch.utils.data import Dataset

# 1. Dataset

In [12]:
class QADataset(Dataset):
    def __init__(self, data, tokenizer, block_size=128, pad_token_id=0):
        """
        Args:
            data (list of tuples): List of (question, answer) pairs.
            tokenizer (BPETokenizer): A tokenizer instance to tokenize text into numerical IDs.
            block_size (int): Maximum sequence length.
            pad_token_id (int): The token ID used for padding.
        """
        self.tokenizer = tokenizer
        self.block_size = block_size
        self.pad_token_id = pad_token_id
        self.data = [
            self.format_example(q, a) for q, a in data
        ]

    def format_example(self, question, answer):
        """Formats the question-answer pair for the model, with padding."""
        text = f"<human>: {question} <assistant>: {answer}"
        tokens = self.tokenizer(text)
        tokens = tokens.squeeze(0).tolist()
        
        # Truncate to block_size
        tokens = tokens[:self.block_size]
        
        # Apply padding if necessary
        if len(tokens) < self.block_size:
            padding = [self.pad_token_id] * (self.block_size - len(tokens))
            tokens.extend(padding)
        
        return tokens

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """
        Returns:
            x (torch.Tensor): Input tokens (question + answer prompt).
            y (torch.Tensor): Target tokens (shifted output).
            attention_mask (torch.Tensor): Mask for padding tokens.
        """
        tokens = self.data[idx]
        x = torch.tensor(tokens[:-1], dtype=torch.long)  # Exclude last token for input
        y = torch.tensor(tokens[1:], dtype=torch.long)   # Exclude first token for output
        
        return x, y

# Sample question-answer data
qa_data = [
    ("What is AI?", "AI stands for Artificial Intelligence."),
    ("Who developed Python?", "Guido van Rossum developed Python."),
    ("What is deep learning?", "Deep learning is a subset of machine learning."),
]

# Initialize the BPETokenizer
tokenizer = BPETokenizer()

# Create the dataset with padding
train_dataset = QADataset(qa_data, tokenizer=tokenizer, block_size=16, pad_token_id=0)

# Example to check one item in the dataset
x, y = train_dataset[0]
print(f"Input tokens: {x}")
print(f"Output tokens: {y}")

Input tokens: tensor([   27, 10734, 31175,  1867,   318,  9552,    30,  1279,   562, 10167,
        31175,  9552,  6296,   329, 35941])
Output tokens: tensor([10734, 31175,  1867,   318,  9552,    30,  1279,   562, 10167, 31175,
         9552,  6296,   329, 35941,  9345])


In [13]:
model_type = 'gpt2'
device = 'cuda'

model = GPT.from_pretrained(model_type)
model.to(device)

number of parameters: 124.44M
149 161
HF: torch.Size([50257, 768]), MiniGPT: torch.Size([50257, 768])
HF: torch.Size([1024, 768]), MiniGPT: torch.Size([1024, 768])
HF: torch.Size([768]), MiniGPT: torch.Size([768])
HF: torch.Size([768]), MiniGPT: torch.Size([768])
HF: torch.Size([768, 2304]), MiniGPT: torch.Size([2304, 768])
HF: torch.Size([2304]), MiniGPT: torch.Size([2304])
HF: torch.Size([768, 768]), MiniGPT: torch.Size([768, 768])
HF: torch.Size([768]), MiniGPT: torch.Size([768])
HF: torch.Size([768]), MiniGPT: torch.Size([768])
HF: torch.Size([768]), MiniGPT: torch.Size([768])
HF: torch.Size([768, 3072]), MiniGPT: torch.Size([3072, 768])
HF: torch.Size([3072]), MiniGPT: torch.Size([3072])
HF: torch.Size([3072, 768]), MiniGPT: torch.Size([768, 3072])
HF: torch.Size([768]), MiniGPT: torch.Size([768])
HF: torch.Size([768]), MiniGPT: torch.Size([768])
HF: torch.Size([768]), MiniGPT: torch.Size([768])
HF: torch.Size([768, 2304]), MiniGPT: torch.Size([2304, 768])
HF: torch.Size([2304]), 

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=768, out_features=2304, bias=True)
          (c_proj): Linear(in_features=768, out_features=768, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): ModuleDict(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          (act): NewGELU()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(

In [14]:
from mingpt.trainer import Trainer

train_config = Trainer.get_default_config()
train_config.learning_rate = 5e-4
train_config.max_iters = 10
train_config.num_workers = 2
trainer = Trainer(train_config, model, train_dataset)

running on device cuda


In [15]:
trainer.run()

In [16]:
model.eval()

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=768, out_features=2304, bias=True)
          (c_proj): Linear(in_features=768, out_features=768, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): ModuleDict(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          (act): NewGELU()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(

In [17]:
def generate(prompt='', num_samples=1, steps=20, do_sample=True):
        
    # tokenize the input prompt into integer input sequence
    tokenizer = BPETokenizer()
    if prompt == '':
        # to create unconditional samples...
        # manually create a tensor with only the special <|endoftext|> token
        # similar to what openai's code does here https://github.com/openai/gpt-2/blob/master/src/generate_unconditional_samples.py
        x = torch.tensor([[tokenizer.encoder.encoder['<|endoftext|>']]], dtype=torch.long)
    else:
        x = tokenizer(prompt).to(device)

    # we'll process all desired num_samples in a batch, so expand out the batch dim
    x = x.expand(num_samples, -1)

    # forward the model `steps` times to get samples, in a batch
    y = model.generate(x, max_new_tokens=steps, do_sample=do_sample, top_k=40)
    
    for i in range(num_samples):
        out = tokenizer.decode(y[i].cpu().squeeze())
        print('-'*80)
        print(out)
        

In [21]:
generate(prompt='What is AI?', num_samples=10, steps=20)

--------------------------------------------------------------------------------
What is AI? AI stands for Artificial Intelligence Intelligence Intelligence Intelligence Intelligence stands for Artificial Intelligence Intelligence Intelligence Intelligence Intelligence Automation Intelligence
--------------------------------------------------------------------------------
What is AI? AI stands for Artificial Intelligence Intelligence Intelligence Intelligence stands for Artificial Intelligence Intelligence Intelligence Intelligence Intelligence Artificial Intelligence Intelligence Laboratory
--------------------------------------------------------------------------------
What is AI? <assistant>: AI stands for Artificial Intelligence Intelligence Intelligence Intelligence Intelligence Intelligence Intelligence Artificial Intelligence Intelligence? <
--------------------------------------------------------------------------------
What is AI? AI stands for Artificial Intelligence Intellig