### Step 1: Install necessary packages

In [1]:
!pip install matplotlib
!pip install torch numpy transformers datasets tiktoken wandb tqdm


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Collecting torch
  Downloading torch-2.8.0-cp311-none-macosx_11_0_arm64.whl.metadata (30 kB)
Collecting transformers
  Downloading transformers-4.56.2-py3-none-any.whl.metadata (40 kB)
Collecting datasets
  Downloading datasets-4.1.1-py3-none-any.whl.metadata (18 kB)
Collecting tiktoken
  Downloading tiktoken-0.11.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (6.7 kB)
Collecting wandb
  Downloading wandb-0.22.0-py3-none-macosx_12_0_arm64.whl.metadata (10 kB)
Collecting filelock (from torch)
  Downloading filelock-3.19.1-py3-none-any.whl.metadata (2.1 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch)
  Downloading networkx-3.5-py3-none-any.whl.metadata (6.3

### Step 2: Package imports and configuration

In [2]:
import sys
import os
# Add the parent directory to the import path; this has to run before
# `from model import ...` below (presumably model.py lives there — TODO confirm).
sys.path.append(os.path.abspath("..")) 
# NOTE(review): restricts CUDA to GPU index 1 and is set before `import torch`;
# confirm that index exists on the target machine, otherwise no GPU will be visible.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import pickle
from model import GPT, GPTConfig
# NOTE(review): duplicate of the `import random` a few lines above; harmless.
import random
from tqdm import tqdm
import time
import json
import matplotlib.pyplot as plt
# Configuration
# Divisor applied to the (positive - negative) log-prob margin in the loss
# sketched in the training cell.
beta = 0.5
# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
base_lr = 1e-4  # presumably the optimizer learning rate — TODO confirm
epochs = 5  # number of passes over the preference pairs in the training cell
batch_size = 64  # pairs per batch; get_batches drops a trailing partial batch
max_length = 64  # every encoded sequence is padded/truncated to this many tokens
num_samples = 1  # NOTE(review): not referenced in the visible cells — confirm usage
# Sampling settings; the test cell's hint passes these to gpt.generate.
max_new_tokens = 200
temperature = 0.8
top_k = 200
# tokenizer
# SECURITY: pickle.load can execute arbitrary code while unpickling; only load
# meta.pkl from a source you trust.
with open("../sft/meta.pkl", "rb") as f:
    meta = pickle.load(f)
# Character <-> integer-id lookup tables used by encode()/decode().
stoi, itos = meta["stoi"], meta["itos"]
def encode(s):
    """Translate each character of ``s`` into its integer id via the ``stoi`` table.

    Raises KeyError if a character has no entry in ``stoi``.
    """
    ids = []
    for ch in s:
        ids.append(stoi[ch])
    return ids
def decode(l):
    """Translate a sequence of integer ids back into a string via the ``itos`` table."""
    chars = (itos[token_id] for token_id in l)
    return ''.join(chars)

### Step 3: Define helper functions

In [3]:
def compute_logprob(input_ids):
    """Return the mean per-token log-probability of each sequence under ``gpt``.

    Args:
        input_ids: 2-D tensor of token ids, one sequence per row. Token id 0 is
            treated as padding and excluded from the average.

    Returns:
        1-D tensor with one value per row: the average log-probability of the
        non-padding target tokens (next-token prediction, so position ``t``
        predicts token ``t + 1``). A row whose targets are all padding yields 0
        instead of NaN.
    """
    # Shift by one: the model sees tokens [0, T-1) and must predict tokens [1, T).
    inputs = input_ids[:, :-1]
    targets = input_ids[:, 1:]
    logits, _ = gpt(inputs, full_seq=True)
    B, T, V = logits.size()
    # Per-token negative log-likelihood; padding targets (id 0) contribute 0.
    token_nll = F.cross_entropy(
        logits.reshape(-1, V),
        targets.reshape(-1),
        ignore_index=0,
        reduction='none',
    ).reshape(B, T)
    attention_mask = (targets != 0).float()
    # Clamp the denominator so an all-padding row divides by 1 rather than 0;
    # a single NaN here would otherwise poison the whole batch loss.
    token_count = attention_mask.sum(dim=1).clamp(min=1.0)
    mean_nll = (token_nll * attention_mask).sum(dim=1) / token_count
    return -mean_nll

def pad_or_truncate(seq, max_length):
    """Return a list of exactly ``max_length`` ids built from ``seq``.

    Sequences that are too long keep their LAST ``max_length`` elements;
    shorter ones are right-padded with 0.

    Args:
        seq: list of token ids.
        max_length: target length, must be >= 0.

    Raises:
        ValueError: if ``max_length`` is negative.
    """
    if max_length < 0:
        raise ValueError(f"max_length must be non-negative, got {max_length}")
    overflow = len(seq) - max_length
    if overflow > 0:
        # Slice from the front by the overflow amount instead of using
        # seq[-max_length:], which returns the whole list when max_length == 0.
        return seq[overflow:]
    return seq + [0] * (-overflow)

def get_batches(lines, batch_size):
    """Yield shuffled ``(neg_tensor, pos_tensor)`` batches of encoded preference pairs.

    Args:
        lines: sequence of dicts with string values under 'negative' and 'positive'.
        batch_size: number of pairs per batch.

    Yields:
        Two long tensors on ``device``, each with ``batch_size`` rows of
        ``max_length`` token ids (negatives first, then positives). A trailing
        batch with fewer than ``batch_size`` pairs is dropped.
    """
    # Shuffle a copy so the caller's list keeps its order; shuffling in place
    # would silently reorder data that other cells may still be looking at.
    shuffled = list(lines)
    random.shuffle(shuffled)

    def encode_side(batch, key):
        # Each text gets four trailing newlines before encoding, then is
        # padded/truncated to the shared fixed length.
        rows = [pad_or_truncate(encode(p[key] + '\n\n\n\n'), max_length) for p in batch]
        return torch.tensor(rows, dtype=torch.long, device=device)

    for start in range(0, len(shuffled), batch_size):
        batch = shuffled[start:start + batch_size]
        if len(batch) < batch_size:
            continue
        neg_tensor = encode_side(batch, 'negative')
        pos_tensor = encode_side(batch, 'positive')
        yield neg_tensor, pos_tensor

### Step 4: Load the pretrained NanoGPT model

In [4]:
# Restore the SFT checkpoint and rebuild the model from its stored constructor arguments.
ckpt = torch.load("../sft/gpt.pt", map_location=device)
gptconf = GPTConfig(**ckpt['model_args'])
gpt = GPT(gptconf)

# Rename weights saved with the '_orig_mod.' prefix (presumably added by
# torch.compile — TODO confirm) so the keys match the plain module's names.
state_dict = ckpt['model']
unwanted_prefix = '_orig_mod.'
prefixed_keys = [name for name in state_dict if name.startswith(unwanted_prefix)]
for name in prefixed_keys:
    state_dict[name[len(unwanted_prefix):]] = state_dict.pop(name)

gpt.load_state_dict(state_dict)
# Move to the target device and switch to training mode; as the last
# expression this also displays the module structure.
gpt.to(device).train()

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(74, 348)
    (wpe): Embedding(256, 348)
    (drop): Dropout(p=0.2, inplace=False)
    (h): ModuleList(
      (0-5): 6 x Block(
        (ln_1): LayerNorm()
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=348, out_features=1044, bias=False)
          (c_proj): Linear(in_features=348, out_features=348, bias=False)
          (attn_dropout): Dropout(p=0.2, inplace=False)
          (resid_dropout): Dropout(p=0.2, inplace=False)
        )
        (ln_2): LayerNorm()
        (mlp): MLP(
          (c_fc): Linear(in_features=348, out_features=1392, bias=False)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=1392, out_features=348, bias=False)
          (dropout): Dropout(p=0.2, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm()
  )
  (lm_head): Linear(in_features=348, out_features=74, bias=False)
)

### Step 5: Load Data (**students are required to complete this part!**)

In [None]:
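# Load data from ./data/pos_neg_pairs.json
# NOTE: the code in this cell does not read that file; it generates questions,
# collects the model's answers as negatives, and writes them to dpo/pos_neg_pairs.json.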
import os, json, re
from tqdm import tqdm

def clean_first_line(s: str) -> str:
    """Return the first line of ``s`` with surrounding whitespace removed.

    An empty (or otherwise falsy) input yields the empty string.
    """
    if not s:
        return ""
    first_line = s.splitlines()[0]
    return first_line.strip()

def load_json_list(path: str):
    """Load the JSON document stored at ``path`` (expected to be a list).

    Args:
        path: file to read.

    Returns:
        The decoded JSON value, or ``[]`` when the file does not exist or
        cannot be read/decoded. In the latter case a RuntimeWarning is emitted,
        because callers that later save will overwrite the unreadable file.
    """
    import warnings

    if not os.path.exists(path):
        return []
    try:
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except (ValueError, OSError) as exc:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
        # Keep the best-effort "treat as empty" behaviour, but make it visible.
        warnings.warn(
            f"Could not load JSON from {path!r} ({exc}); treating it as an empty list",
            RuntimeWarning,
            stacklevel=2,
        )
        return []

def save_json_list(path: str, data):
    """Write ``data`` to ``path`` as indented JSON, creating parent directories.

    Args:
        path: destination file; may be a bare file name with no directory part.
        data: JSON-serialisable value to store.
    """
    parent = os.path.dirname(path)
    # os.makedirs("") raises FileNotFoundError, so only create a directory
    # when the path actually has one.
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)

# ---- 1) Python question generator (simple, vocab-safe) ----
def gen_simple_question():
    """Return one randomly generated arithmetic question string ending with '=?'.

    A question type is drawn uniformly from nine options using the global
    ``random`` state. Direct questions look like ``"12+7=?"``; "solve" questions
    look like ``"x*3=12, x=?"``. Operands ``a`` and ``b`` are drawn from 1..99.
    Both "div" and "div_exact" produce exact integer divisions, so division
    questions are drawn twice as often as each other type.
    """
    t = random.choice(["add", "sub", "mul", "div", "solve_mul", "solve_add", "solve_sub", "div_exact", "solve_div"])
    # Always draw both operands, even for types that do not use them, so the
    # sequence of RNG calls (and therefore seeded output) stays stable.
    a = random.randint(1, 99)
    b = random.randint(1, 99)

    if t == "add":
        return f"{a}+{b}=?"
    if t == "sub":
        return f"{a}-{b}=?"
    if t == "mul":
        return f"{a}*{b}=?"
    if t == "div":
        # Exact integer division: (a*b)/a = b
        return f"{a*b}/{a}=?"
    if t == "div_exact":
        # Exact integer division: (a*b)/b = a
        return f"{a*b}/{b}=?"
    if t == "solve_mul":
        # x*a = a*b  ->  x = b
        return f"x*{a}={a*b}, x=?"
    if t == "solve_add":
        # x+b = a+b  ->  x = a
        return f"x+{b}={a+b}, x=?"
    if t == "solve_sub":
        # x-small = big-small  ->  x = big; the right-hand side is never negative
        big = max(a, b)
        small = min(a, b)
        return f"x-{small}={big - small}, x=?"
    if t == "solve_div":
        # x/B = A  ->  x = A*B; B starts at 1 so it is never zero
        B = random.randint(1, 12)
        A = random.randint(1, 20)
        return f"x/{B}={A}, x=?"
    # Unreachable with the list above; fail loudly instead of returning None
    # if a new type is added to the list without a matching branch.
    raise ValueError(f"unhandled question type: {t}")

def gen_questions(n, dedup=True, seed=42):
    """Generate up to ``n`` question strings with ``gen_simple_question``.

    Re-seeds the global ``random`` module with ``seed`` first. At most ``2 * n``
    candidates are drawn, so fewer than ``n`` questions may come back when
    ``dedup`` discards repeats.
    """
    random.seed(seed)
    questions = []
    already_seen = set()
    attempts_left = n * 2  # oversample a bit to survive dedup
    while attempts_left > 0:
        attempts_left -= 1
        candidate = gen_simple_question()
        # guard: only keep strings in the expected '...=?' shape
        if not candidate.endswith("=?"):
            continue
        if dedup and candidate in already_seen:
            continue
        if dedup:
            already_seen.add(candidate)
        questions.append(candidate)
        if len(questions) >= n:
            break
    return questions

# ---- 2) Ask NanoGPT to answer (negative) and append to JSON ----
def collect_negatives_from_python_questions(
    out_path="dpo/pos_neg_pairs.json",
    n_samples=10,
    batch_questions=200,
    temperature=0.8,
    top_k=50,
):
    """Generate questions, record the model's answers as negatives, and append them to a JSON file.

    Each stored item is ``{"positive": "", "negative": <first line of model output>}``;
    positives are left blank. Questions already present in ``out_path``
    (recovered from the text up to '=?' in each stored negative) are skipped.

    NOTE(review): relies on a ``generate(prompt, max_new_tokens=..., temperature=...,
    top_k=..., stop_on_newline=..., include_prompt=...)`` helper that is not
    defined in the visible cells; it must exist in the kernel before this runs.

    Args:
        out_path: JSON file to extend (created if missing).
        n_samples: number of new items to append.
        batch_questions: maximum number of candidate questions drawn per round.
        temperature: sampling temperature forwarded to ``generate``.
        top_k: top-k cutoff forwarded to ``generate``.
    """
    data = load_json_list(out_path)

    # Rebuild the set of questions already answered so they are not asked again.
    existing_qs = set()
    for item in data:
        neg = clean_first_line(item.get("negative", ""))
        # the question is everything up to (and including) '=?'
        m = re.search(r"(.*=\?)", neg)
        if m:
            existing_qs.add(m.group(1).strip())

    added_total = 0
    # Give up after this many consecutive rounds that add nothing (every
    # candidate was a duplicate or produced empty output) so the loop is bounded.
    max_stalled_rounds = 5
    stalled_rounds = 0

    # Track progress per collected sample; a per-round bar never advances when
    # the loop exits early via `break`.
    with tqdm(total=max(n_samples, 0), desc="Collecting negatives") as pbar:
        while added_total < n_samples and stalled_rounds < max_stalled_rounds:
            need = min(batch_questions, n_samples - added_total)
            if need <= 0:
                break

            new_qs = gen_questions(need, dedup=True, seed=random.randint(0, 10**9))
            # drop any that already exist
            new_qs = [q for q in new_qs if q not in existing_qs]

            added_this_round = 0
            for q in new_qs:
                # Ask the model for its (bad) answer; keep only the first line,
                # which includes the prompt itself.
                neg = generate(
                    q,
                    max_new_tokens=60,
                    temperature=temperature,
                    top_k=top_k,
                    stop_on_newline=True,
                    include_prompt=True,  # ensures "q ...answer" on one line
                )
                neg = clean_first_line(neg)

                if not neg:
                    continue

                # append with positive blank
                data.append({"positive": "", "negative": neg})
                existing_qs.add(q)
                added_total += 1
                added_this_round += 1
                pbar.update(1)

                # periodic save so progress is not lost on interruption
                if added_total % 100 == 0:
                    save_json_list(out_path, data)
                    pbar.set_postfix_str(f"saved={len(data)} last='{q}'")

                if added_total >= n_samples:
                    break

            stalled_rounds = stalled_rounds + 1 if added_this_round == 0 else 0

    save_json_list(out_path, data)
    print(f"Done. Appended {added_total} items. Total now: {len(data)} in {out_path}")

# ---- Run it ----
# Example: collect 10 negatives (positives left blank); raise n_samples
# (e.g. to 2000) for a full run.
# NOTE(review): the training cell reads a variable named `lines`, which is not
# assigned anywhere in the visible cells — confirm where the pairs are loaded.
collect_negatives_from_python_questions(out_path="dpo/pos_neg_pairs.json", n_samples=10)

Collecting negatives:   0%|          | 0/1 [00:01<?, ?it/s]

Done. Appended 10 items. Total now: 10 in dpo/pos_neg_pairs.json





### Step 6: Build the optimizer and scheduler (**students are required to complete this part!**)

In [8]:
# recommend to use the AdamW optimizer 

### Step 7: Begin training (**students are required to complete this part!**)

In [None]:
# Number of full batches per epoch (get_batches drops a trailing partial batch).
total_steps = len(lines) // batch_size
# The optimizer cell (Step 6) only contains a recommendation to use AdamW and
# defines nothing, so the optimizer is built here. Move this line to Step 6 if
# that cell is filled in.
optimizer = torch.optim.AdamW(gpt.parameters(), lr=base_lr)
for epoch in range(epochs):
    pbar = tqdm(get_batches(lines, batch_size), total=total_steps, desc=f"epoch {epoch + 1}/{epochs}")
    for step, (neg_tensor, pos_tensor) in enumerate(pbar):
        # Mean per-token log-probabilities of the rejected and preferred answers.
        neg_logprob = compute_logprob(neg_tensor)
        pos_logprob = compute_logprob(pos_tensor)
        # Preference loss from the assignment hint: push the positive above the
        # negative, plus a small term that directly raises the positive's likelihood.
        loss = -F.logsigmoid((pos_logprob - neg_logprob) / beta).mean() - pos_logprob.mean() * 0.1
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
        pbar.set_postfix(loss=f"{loss.item():.4f}")
    # Overwrite the checkpoint after every epoch; it stores the weights under
    # 'model_state_dict', which the test cell knows how to read.
    ckpt_path = "./dpo.pt"
    torch.save({
        "model_state_dict": gpt.state_dict(),
        "model_args": ckpt['model_args'],
    }, ckpt_path)
    print(f"Saved checkpoint to {ckpt_path}")

### Step 8: Begin testing (**students are required to complete this part!**)

In [None]:
# Load the fine-tuned model
ckpt_path = "../dpo/dpo.pt"
checkpoint = torch.load(ckpt_path, map_location=device)
gptconf = GPTConfig(**checkpoint['model_args'])
# Use the configured device rather than calling .cuda() unconditionally, which
# fails on machines without CUDA even though `device` already falls back to 'cpu'.
gpt = GPT(gptconf).to(device)
# Checkpoints may store the weights under 'model' (SFT format) or
# 'model_state_dict' (written by the training cell).
state_dict = checkpoint['model'] if 'model' in checkpoint else checkpoint['model_state_dict']
unwanted_prefix = '_orig_mod.'
for k in list(state_dict.keys()):
    if k.startswith(unwanted_prefix):
        state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
gpt.load_state_dict(state_dict)
# Test
gpt.eval()
test_set = ["17+19=?", "3*17=?", "72/4=?", "72-x=34,x=?", "x*11=44,x=?", "3*17=?", "72/4=?", "72-x=34,x=?"]
with torch.no_grad():
    for prompt in test_set: 
        prompt_ids = encode(prompt)
        ###########################################################
        # Please complete the test code here!
        # ...
        # gpt.generate(x, max_new_tokens, temperature=temperature, top_k=top_k)
        # ...
        ###########################################################