In [2]:
# %pip install regex requests torch numpy transformers datasets evaluate rouge_score

In [17]:
!nvidia-smi

Tue Mar  4 10:23:02 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.124.04             Driver Version: 570.124.04     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA RTX 4000 Ada Gene...    Off |   00000000:01:00.0 Off |                  Off |
| 30%   38C    P2             19W /  130W |   19637MiB /  20475MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
from mingpt.bpe import BPETokenizer
from mingpt.model import GPT

import torch
from torch.utils.data import Dataset
from datasets import load_dataset
import pandas as pd
from evaluate import load

# 1. Dataset

In [3]:
class QADataset(Dataset):
    def __init__(self, data, tokenizer, block_size=128):
        self.tokenizer = tokenizer
        self.block_size = block_size
        self.data = [
            self.format_example(data_point["instruction"], data_point["demonstration"]) for data_point in data
        ]

    def format_example(self, question, answer):
        text = f"<|human|>: {question} \n <|assistant|>: {answer} <|endoftext|>"
        tokens = self.tokenizer(text)
        tokens = tokens.squeeze(0).tolist()[:self.block_size]        
        prompt = text.split("<|assistant|>:")[0] + "<|assistant|>:"
        self.assistant_index = len(self.tokenizer(prompt).squeeze(0).tolist())

        return tokens

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """
        Returns:
            x (torch.Tensor): Input tokens (question + answer prompt).
            y (torch.Tensor): Target tokens (shifted output).
        """
        tokens = self.data[idx]
        x = torch.tensor(tokens[:-1], dtype=torch.long)  # Exclude last token for input
        y = torch.tensor(tokens[1:], dtype=torch.long)   # Exclude first token for output

        y[:self.assistant_index] = -1  # Mask loss for assistant tokens
        
        return x, y

In [4]:
data = load_dataset("HuggingFaceH4/helpful-instructions")
pd.DataFrame(data["train"])

Unnamed: 0,instruction,demonstration,meta
0,"Hi, I want to learn to play horseshoes. Can yo...","I can, but maybe I should begin by telling you...",{'source': 'helpful-anthropic-raw'}
1,How do I teach kids to meditate?,Great question! That’s a really useful skill t...,{'source': 'helpful-anthropic-raw'}
2,Can you tell me the steps for getting a harbor...,Sure. I believe you’ll need a copy of the mari...,{'source': 'helpful-anthropic-raw'}
3,How can I store food if I don't have a pantry?,"You could store the food in a refrigerator, th...",{'source': 'helpful-anthropic-raw'}
4,what are some good novels for a 9 year old?,"That depends on the 9 year old, but if they li...",{'source': 'helpful-anthropic-raw'}
...,...,...,...
147701,"Given the following sentence, classify it into...",Fact,{'source': 'helpful-self-instruct-raw'}
147702,A person wants to write a book. he/she writes ...,Chapter 1 - The History of China\nChapter 2 - ...,{'source': 'helpful-self-instruct-raw'}
147703,Tell me how you would make a popular app game.,I would make a game that is similar to 2048. T...,{'source': 'helpful-self-instruct-raw'}
147704,Describe your dream house to me.\n\nOutput:,My dream house is a two-story building with a ...,{'source': 'helpful-self-instruct-raw'}


In [7]:
data["train"].select(range(4))[0]

{'instruction': 'Hi, I want to learn to play horseshoes. Can you teach me?',
 'demonstration': 'I can, but maybe I should begin by telling you that a typical game consists of 2 players and 6 or 8 horseshoes.',
 'meta': {'source': 'helpful-anthropic-raw'}}

In [5]:
tokenizer = BPETokenizer()

train_dataset = QADataset(data["train"], tokenizer=tokenizer, block_size=1024)

x, y = train_dataset[1]
print(f"Input tokens: {x}")
print(f"Output tokens: {y}")

Input tokens: tensor([   27,    91, 10734,    91, 31175,  1374,   466,   314,  4545,  3988,
          284,  1117, 12027,    30,   220,   198,  1279,    91,   562, 10167,
           91, 31175,  3878,  1808,     0,  1320,   447,   247,    82,   257,
         1107,  4465,  5032,   284, 32237,    11,   340,   460,  2222,  4167,
           11,  9480,    11,   290, 12157,    13,   314,   447,   247,    76,
         9675,   345,   765,   284,  4545,   534,  3988,   546,   340,    13,
         1279,    91,   437,  1659,  5239,    91])
Output tokens: tensor([   -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
           -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
           -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
           -1,    -1,   284, 32237,    11,   340,   460,  2222,  4167,    11,
         9480,    11,   290, 12157,    13,   314,   447,   247,    76,  9675,
          345,   765,   284,  4545,   534,  3988,   546,   340

# 2. Model Definition

In [6]:
model_type = 'gpt2'
device = 'cuda'

model = GPT.from_pretrained(model_type)
model.to(device)

number of parameters: 124.44M


GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=768, out_features=2304, bias=True)
          (c_proj): Linear(in_features=768, out_features=768, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): ModuleDict(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          (act): NewGELU()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(

In [7]:
def generate(prompt='', num_samples=1, steps=20, do_sample=True):

    tokenizer = BPETokenizer()
    if prompt == '':
        x = torch.tensor([[tokenizer.encoder.encoder['<|endoftext|>']]], dtype=torch.long)
    else:
        x = tokenizer(prompt).to(device)

    x = x.expand(num_samples, -1)

    y = model.generate(x, max_new_tokens=steps, do_sample=do_sample, top_k=40)
    
    for i in range(num_samples):
        out = tokenizer.decode(y[i].cpu().squeeze())
        print('\n'+'-'*80)
        print(out)

In [11]:
generate(prompt='How do I teach kids to meditate?', num_samples=2, steps=50)


--------------------------------------------------------------------------------
How do I teach kids to meditate?

Dana is a psychologist and her PhD is in psychology at the same institution a long term. The focus on the problem is that there are many people who would say if I teach the student how to meditate, that would be one of

--------------------------------------------------------------------------------
How do I teach kids to meditate? Are they smart people they can be better meditative practitioners? How do you teach them to meditate? By how do you do?

"This is not a matter of any one specific school," he says. "This one school we're


# 3. Finetuning With SFT

In [11]:
from mingpt.trainer import Trainer

train_config = Trainer.get_default_config()
train_config.learning_rate = 5e-4
train_config.batch_size = 6
train_config.max_iters = 200
train_config.num_workers = 1
trainer = Trainer(train_config, model, train_dataset)

running on device cuda


In [12]:
trainer.run()

Training Progress: 100%|██████████| 200/200 [00:31<00:00,  6.30it/s]

Step: 199 / 200
Loss: 1.007644
Learning Rate: 5.000000e-04
Learning Rate: 5.000000e-04
Training finished!





In [13]:
model.eval()

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=768, out_features=2304, bias=True)
          (c_proj): Linear(in_features=768, out_features=768, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): ModuleDict(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          (act): NewGELU()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(

In [16]:
generate(prompt='How do I teach kids to meditate?', num_samples=2, steps=50)


--------------------------------------------------------------------------------
How do I teach kids to meditate? 
 <|assistant|>: Are you interested in meditation? <|endoftext|> <|> - Listen to meditation.
- Go to a meditation practice. <|endoftext|> <|endoftext|

--------------------------------------------------------------------------------
How do I teach kids to meditate? What is the best way to meditate? 
 <|assistant|>: The best way to meditate is to practice meditation and meditation, then practice meditating to relieve yourself, and then practice meditating to manage the pressure to practice med


In [None]:
# save the model
torch.save(model.state_dict(), 'models/SFT_finetuned.pth')

# 4. Evaluations

In [None]:
# load the fine-tuned model
model.load_state_dict(torch.load('models/SFT_finetuned.pth'))
model.eval()

In [16]:
tokenizer = BPETokenizer()

def generate_evaluate(prompt='', num_samples=1, steps=20, do_sample=True):
    if prompt == '':
        x = torch.tensor([[tokenizer.encoder.encoder['<|endoftext|>']]], dtype=torch.long)
    else:
        x = tokenizer(prompt).to(device)

    x = x.expand(num_samples, -1)

    y = model.generate(x, max_new_tokens=steps, do_sample=do_sample, top_k=40)
    
    generated_responses = []
    for i in range(num_samples):
        out = tokenizer.decode(y[i].cpu().squeeze())
        generated_responses.append(out)
    
    return generated_responses

references = []
predictions = []

num_samples = 2

for i, sample in enumerate(data["train"].select(range(num_samples))):
    prompt = sample["instruction"]
    reference = [sample["demonstration"]]
    generated_text = generate_evaluate(prompt=prompt, num_samples=1, steps=50, do_sample=True)[0]

    references.append(reference)
    predictions.append(generated_text)

### 4.1 BLEU Score

In [17]:
bleu = load("bleu")

bleu_score = bleu.compute(predictions=predictions, references=references)
print("BLEU Score:", bleu_score)

BLEU Score: {'bleu': 0.0, 'precisions': [0.125, 0.00909090909090909, 0.0, 0.0], 'brevity_penalty': 1.0, 'length_ratio': 1.9649122807017543, 'translation_length': 112, 'reference_length': 57}


### 4.2 Rouge Score

In [18]:
rouge = load("rouge")

scores = rouge.compute(predictions=predictions, references=references)
print("ROUGE Scores:", scores)


ROUGE Scores: {'rouge1': np.float64(0.16991107906753183), 'rouge2': np.float64(0.0), 'rougeL': np.float64(0.12112472963229992), 'rougeLsum': np.float64(0.15621244893054553)}
