<a href="https://colab.research.google.com/github/superasymmetry/ARC-AGI-Fun/blob/main/Potemkin.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Potemkin Benchmark

In [None]:
!pip install -q trl accelerate bitsandbytes datasets sentencepiece jsonlines

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m504.6/504.6 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 MB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m70.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m64.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m55.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
!pip uninstall -y trl
!pip install -U "transformers>=4.42" "accelerate>=0.31" peft
!pip install -U git+https://github.com/huggingface/trl@main


In [None]:
from huggingface_hub import login
from google.colab import userdata
# Fetch the Hugging Face access token from Colab's user-data store.
HF_TOKEN = userdata.get('HF_TOKEN')

if HF_TOKEN:
    login(HF_TOKEN)
    # Deliberately do NOT print the token: cell output is stored in the
    # notebook file and would leak the credential when the notebook is shared.
    print("Successfully logged in to Hugging Face!")
else:
    print("Token is not set. Please save the token first.")


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch, re

# model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
# tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
# model = AutoModelForCausalLM.from_pretrained(
#     model_id, load_in_4bit=True, token=HF_TOKEN, device_map="auto"
# )


# Imported locally so this cell is self-contained.
from transformers import BitsAndBytesConfig

model_id = "Qwen/Qwen2.5-7B"
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    # NOTE(review): kept from the original; recent transformers releases are
    # believed to ship Qwen2 natively, so this may be unnecessary - confirm.
    trust_remote_code=True,
)

# 4-bit loading is requested through a BitsAndBytesConfig. Passing
# `load_in_4bit=True` straight to from_pretrained is the deprecated spelling.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    device_map="auto",                 # or {"": 0} for single‑GPU
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
)

# Case-insensitive, whole-word matchers used to read a YES/NO verdict out of
# free-form model output.
YES_RE = re.compile(r"\b(yes|correct)\b", flags=re.IGNORECASE)
NO_RE = re.compile(r"\b(no|incorrect|wrong)\b", flags=re.IGNORECASE)

# quick sanity check
# model_inputs = tokenizer(["An ABAB rhyme scheme"], return_tensors="pt").to("cuda")
# generated_ids = model.generate(**model_inputs)
# tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

# Get Topics Dataset

In [None]:
import jsonlines
import torch
from torch.utils.data import IterableDataset, DataLoader
import random


class PromptStream(IterableDataset):
    """Endless, reshuffled stream of ``{"prompt": ...}`` records.

    Args:
        path: JSON-lines file whose records each carry a ``"prompt"`` key.
        tokenizer: Tokenizer object kept on the instance as ``self.tokenizer``.
            The caller's tokenizer is stored as given instead of downloading
            a hard-coded one, so the dataset matches whichever model is in use.
    """

    def __init__(self, path, tokenizer):
        with jsonlines.open(path) as reader:
            self.prompts = [record["prompt"] for record in reader]
        self.tokenizer = tokenizer

    def __len__(self):
        """Number of records read from the file (``None`` prompts included)."""
        return len(self.prompts)

    def __iter__(self):
        """Yield prompts forever, reshuffling ``self.prompts`` in place each pass.

        Records whose prompt is ``None`` are skipped. If there is nothing to
        yield at all, the iterator ends immediately rather than looping forever
        without producing an item.
        """
        if all(p is None for p in self.prompts):
            return
        while True:
            random.shuffle(self.prompts)
            for p in self.prompts:
                if p is not None:
                    yield {"prompt": p}

In [None]:
from torch.utils.data import DataLoader

# Build the prompt stream from the task file and wrap it in a batching loader.
dataset = PromptStream('doing_tasks_dataset.jsonl', tokenizer)
dataloader = DataLoader(
    dataset,
    batch_size=32,
    num_workers=4,
    drop_last=True,
    timeout=60,
)

In [None]:
# Quick look at the dataset object and at the iterator it hands out.
print(dataset)
print(iter(dataset))

In [None]:
def generate(inputs, temperature):
    """Generate one completion with the module-level ``model``.

    Args:
        inputs: Either tokenised model inputs (a mapping that can be unpacked
            into ``model.generate``) or a plain prompt string, which is
            tokenised here with the module-level ``tokenizer``.
        temperature: Sampling temperature. Values above 0 enable sampling;
            ``0``/``None`` fall back to greedy decoding.

    Returns:
        The first returned sequence decoded to text, special tokens skipped.
    """
    if isinstance(inputs, str):
        inputs = tokenizer(inputs, return_tensors="pt").to(model.device)

    # Temperature is only honoured when sampling is switched on; without
    # `do_sample=True` every temperature would produce the same greedy output.
    sampling = temperature is not None and temperature > 0
    gen_kwargs = {"do_sample": sampling}
    if sampling:
        gen_kwargs["temperature"] = temperature

    answer_ids = model.generate(**inputs, **gen_kwargs)
    return tokenizer.decode(answer_ids[0], skip_special_tokens=True)

def grade(execution, prompt, temperature=0, unclear_penalty=-0.2):
    """Ask the module-level ``model`` to judge an answer and map it to a reward.

    Args:
        execution: Proposed answer text to be judged.
        prompt: The original task text.
        temperature: Sampling temperature for the judge; ``0`` (default) means
            greedy decoding.
        unclear_penalty: Reward returned when the verdict has neither a YES nor
            a NO keyword.

    Returns:
        ``1.0`` if the verdict matches ``YES_RE`` and not ``NO_RE``, ``-1.0`` if
        it matches ``NO_RE``, otherwise ``unclear_penalty``.
    """
    critique_prompt = (
        f"Here is the original task:\n\n{prompt}\n\n"
        f"Here is a proposed answer:\n{execution}\n\n"
        "Does the proposed answer correctly and deeply satisfy the task? "
        "Answer with either YES or NO and give one short reason."
    )
    inputs = tokenizer(critique_prompt, return_tensors="pt").to(model.device)

    # Only pass a temperature when actually sampling; 0 means greedy decoding.
    sampling = temperature is not None and temperature > 0
    gen_kwargs = {"max_new_tokens": 128, "do_sample": sampling}
    if sampling:
        gen_kwargs["temperature"] = temperature

    with torch.no_grad():
        verdict_ids = model.generate(**inputs, **gen_kwargs)

    # Decode only the newly generated tokens. The returned sequence starts with
    # the critique prompt, whose own text contains "YES or NO"; parsing it too
    # would make NO_RE match every time and pin the reward at -1.0.
    prompt_len = inputs["input_ids"].shape[1]
    verdict = tokenizer.decode(verdict_ids[0][prompt_len:], skip_special_tokens=True)

    # Parse verdict -> reward. A verdict containing both keywords counts as NO.
    said_no = NO_RE.search(verdict) is not None
    said_yes = YES_RE.search(verdict) is not None
    if said_no:
        return -1.0
    if said_yes:
        return 1.0
    return unclear_penalty

# def group_fn(prompts, model, tokenizer, device):
#     groups, rewards = [], []
#     for p in prompts:
#         ans  = [generate(p,t) for t in (0.05,0.15,0.40,0.70,1.10)]
#         rew  = [grade(a,p) for a in ans]
#         # centre inside group
#         r    = torch.tensor(rew)
#         r    = (r - r.mean()) / (r.std() + 1e-7)
#         groups.append(ans)
#         rewards.append(r.tolist())
#     return groups, rewards

def group_fn(prompt_dicts, model, tokenizer, device):
    """Sample a group of answers per prompt and score them relative to the group.

    Args:
        prompt_dicts: Iterable of records, each with a ``"prompt"`` key.
        model: Accepted for interface compatibility; generation and grading go
            through the module-level ``generate``/``grade`` helpers.
        tokenizer: Tokenizer used to encode each prompt before generation.
        device: Device the encoded prompt is moved to.

    Returns:
        ``(groups, rewards)``: for every prompt, the list of generated answers
        and the list of standardised rewards (mean removed, divided by the
        group's standard deviation plus a small epsilon).
    """
    temperatures = (0.05, 0.15, 0.40, 0.70, 1.10)
    groups, rewards = [], []
    for item in prompt_dicts:
        p = item["prompt"]
        # `generate` unpacks its first argument into model.generate, so it needs
        # tokenised inputs rather than the raw prompt string.
        inputs = tokenizer(p, return_tensors="pt").to(device)
        ans = [generate(inputs, t) for t in temperatures]
        rew = [grade(a, p) for a in ans]
        r = torch.tensor(rew)
        r = (r - r.mean()) / (r.std() + 1e-7)
        groups.append(ans)
        rewards.append(r.tolist())
    return groups, rewards



# GRPO

In [None]:
!pip install trl[vllm]



In [None]:
from trl import GRPOTrainer, GRPOConfig
from datasets import IterableDataset

# Imported locally so this cell is self-contained.
from transformers import BitsAndBytesConfig

model_id = "meta-llama/Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    trust_remote_code=True,            # carried over from the Qwen cell
)

# 4-bit loading is requested through a BitsAndBytesConfig. Passing
# `load_in_4bit=True` straight to from_pretrained is the deprecated spelling.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    device_map="auto",                 # or {"": 0} for single‑GPU
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
)
# Checkpoints and logs go to a dedicated local folder. Reusing the hub id as
# output_dir would create ./meta-llama/Llama-3.1-8B-Instruct on disk, and a
# later from_pretrained("meta-llama/Llama-3.1-8B-Instruct") would then resolve
# to that local directory instead of the hub repository.
cfg = GRPOConfig(
    output_dir="Llama-3.1-8B-Instruct-GRPO",
    learning_rate=2e-5,
    max_steps=2000,
    report_to=['tensorboard'],
)

# Pass-through collator: returns the batch unchanged. Not passed to the trainer below.
identity_collator = lambda batch: batch

# NOTE(review): TRL's documentation describes reward functions as callables
# taking `prompts`/`completions` and returning a list of floats; confirm that
# group_fn's (prompt_dicts, model, tokenizer, device) signature is accepted.
trainer = GRPOTrainer(
    model = model,
    processing_class= tokenizer,
    train_dataset = PromptStream("doing_tasks_dataset.jsonl", tokenizer),
    reward_funcs=group_fn,
    args = cfg          # contains max_steps
)
trainer.train()

In [None]:
from torch.optim import AdamW
from peft import LoraConfig, get_peft_model
# Load every prompt from the task file. The reader is used as a context manager
# so the file handle is closed once the list is built. Records without a prompt
# are dropped, matching the None-skipping done by PromptStream.
with jsonlines.open("doing_tasks_dataset.jsonl") as reader:
    prompts = [obj["prompt"] for obj in reader if obj["prompt"] is not None]

# Hyperparameters for the hand-written training loop.
BATCH_SIZE = 4       # prompts sampled per optimisation step
DEVICE = "cuda"      # device for tokenised inputs and the policy model
MAX_STEPS = 2000     # number of optimisation steps
KL_BETA = 0.02       # weight of the KL term in the loss

# LoRA adapter configuration for the policy model.
lora_cfg = LoraConfig(
    r=64, lora_alpha=16, lora_dropout=0.1,
    target_modules=["q_proj", "v_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM"
)
# Wrap the policy with adapters and place it on the configured device
# (DEVICE instead of a second hard-coded "cuda").
model = get_peft_model(model, lora_cfg).to(DEVICE)

# Frozen reference copy used for the KL term in the training loop.
ref_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B-Instruct", device_map="auto", torch_dtype=torch.bfloat16
).eval()
# Freeze in one call; avoids leaving a stray loop variable in the notebook namespace.
ref_model.requires_grad_(False)

# Hand the optimiser only the parameters that will actually receive gradients.
optimizer = AdamW(
    [param for param in model.parameters() if param.requires_grad],
    lr=2e-5,
)


In [None]:

def sequence_logprob(m, txt):
    """Summed log-probability that model ``m`` assigns to the tokens of ``txt``.

    ``txt`` is tokenised with the module-level ``tokenizer`` and moved to
    ``DEVICE``. Gradients are tracked only when ``m`` is in training mode;
    otherwise the computation runs with autograd disabled.

    Returns:
        A scalar tensor: the negative of the summed next-token cross-entropy.
    """
    with torch.set_grad_enabled(m.training):
        encoded = tokenizer(txt, return_tensors="pt").to(DEVICE)
        all_logits = m(**encoded).logits
        # The logits at position i score the token at position i + 1, so drop
        # the last prediction and the first label to line the two up.
        predictions = all_logits[:, :-1]
        targets = encoded.input_ids[:, 1:]
        vocab_size = predictions.size(-1)
        summed_nll = torch.nn.functional.cross_entropy(
            predictions.reshape(-1, vocab_size),
            targets.reshape(-1),
            reduction="sum",
        )
        lp = -summed_nll
    return lp


# Broader verdict matchers (case-insensitive, whole words only): affirmative
# and negative keywords the judge model may use.
YES_RE = re.compile(r"\b(yes|correct|right|true)\b", flags=re.IGNORECASE)
NO_RE = re.compile(r"\b(no|incorrect|wrong|false)\b", flags=re.IGNORECASE)

def grade(answer, prompt, unclear_penalty=-0.2):
    """Have the module-level ``model`` judge ``answer`` and map it to a reward.

    Args:
        answer: Candidate answer text.
        prompt: The task the answer responds to.
        unclear_penalty: Reward used when the verdict contains neither a
            ``YES_RE`` nor a ``NO_RE`` keyword.

    Returns:
        ``1.0`` for a YES-only verdict, ``-1.0`` when a NO keyword appears,
        otherwise ``unclear_penalty``.
    """
    critique = (
        f"Task:\n{prompt}\n\n"
        f"Answer:\n{answer}\n\n"
        "Is the answer correct and deep? Reply YES or NO and one reason."
    )
    inp = tokenizer(critique, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        # Greedy decoding, stated explicitly: a temperature has no effect
        # unless sampling is enabled.
        vid = model.generate(
            **inp,
            max_new_tokens=128,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the generated continuation. The returned sequence begins with
    # the critique itself, which contains "correct", "YES" and "NO"; parsing it
    # too would make NO_RE match every time and pin the reward at -1.0.
    prompt_len = inp["input_ids"].shape[1]
    verdict = tokenizer.decode(vid[0][prompt_len:], skip_special_tokens=True)

    if NO_RE.search(verdict):
        return -1.0
    if YES_RE.search(verdict):
        return 1.0
    return unclear_penalty

def generate(prompt, temperature):
    """Generate up to 256 new tokens for ``prompt`` with the module-level ``model``.

    Args:
        prompt: Prompt text; tokenised with the module-level ``tokenizer``.
        temperature: Sampling temperature. Values above 0 enable sampling;
            ``0`` falls back to greedy decoding.

    Returns:
        The first returned sequence decoded to text, special tokens skipped.
    """
    inp = tokenizer(prompt, return_tensors="pt").to(DEVICE)

    # Sampling must be switched on for the temperature to matter; otherwise
    # every temperature in a group would yield the same greedy answer.
    sampling = temperature > 0
    gen_kwargs = {
        "max_new_tokens": 256,
        "do_sample": sampling,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if sampling:
        gen_kwargs["temperature"] = temperature

    out = model.generate(**inp, **gen_kwargs)
    return tokenizer.decode(out[0], skip_special_tokens=True)

# Temperatures used to draw the group of answers for each prompt.
TEMPS = (0.05, 0.15, 0.40, 0.70, 1.10)

for step in range(1, MAX_STEPS + 1):
    # Sample BATCH_SIZE prompts so the batch matches the normalisation below.
    batch_prompts = random.sample(prompts, BATCH_SIZE)

    policy_loss = 0.0
    kl_sum      = 0.0

    for p in batch_prompts:
        # Draw and grade the answers in eval mode (no dropout while sampling).
        model.eval()
        answers = [generate(p, t) for t in TEMPS]
        rewards = [grade(a, p)    for a in answers]

        # Centre and scale the rewards within the group (GRPO advantage).
        rew_t = torch.tensor(rewards, dtype=torch.float32, device=DEVICE)
        adv   = (rew_t - rew_t.mean()) / (rew_t.std() + 1e-7)

        # sequence_logprob tracks gradients only for a module in training mode,
        # so switch the policy to train mode before scoring the answers.
        model.train()
        for a, w in zip(answers, adv):
            lp   = sequence_logprob(model, a)       # policy, with gradients
            rlp  = sequence_logprob(ref_model, a)   # frozen reference, no gradients
            policy_loss += -w * lp
            kl_sum      += (lp - rlp)

    # Average both terms over every (prompt, answer) pair in the step.
    n_samples    = BATCH_SIZE * len(TEMPS)
    policy_loss /= n_samples
    kl_term      = KL_BETA * kl_sum / n_samples
    loss         = policy_loss + kl_term

    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()
    optimizer.zero_grad(set_to_none=True)

    if step % 50 == 0:
        print(f"[{step}/{MAX_STEPS}] loss={loss.item():.3f}  pol={policy_loss.item():.3f}  kl={kl_term.item():.3f}")

    # Periodically save the adapter weights and tokenizer.
    if step % 500 == 0:
        model.save_pretrained(f"ckpt_step_{step}")
        tokenizer.save_pretrained(f"ckpt_step_{step}")



In [None]:
import inspect
# Show which parameters GRPOTrainer's constructor accepts in the installed trl.
print(str(inspect.signature(GRPOTrainer.__init__)))

In [None]:
import jsonlines
import torch
from torch.utils.data import IterableDataset, DataLoader
import random

class PromptStream(IterableDataset):
    """Endless, reshuffled stream of tokenised prompts.

    Args:
        path: JSON-lines file whose records each carry a ``"prompt"`` key.
        tokenizer: Callable used to encode each prompt; stored as ``self.tok``.
    """

    def __init__(self, path, tokenizer):
        with jsonlines.open(path) as r:
            self.prompts = [o["prompt"] for o in r]
        self.tok = tokenizer

    def __len__(self):
        """Number of records read from the file (``None`` prompts included)."""
        return len(self.prompts)

    def __iter__(self):
        """Yield ``input_ids``/``attention_mask`` for each prompt, forever.

        ``self.prompts`` is reshuffled in place at the start of every pass.
        Missing (``None``) prompts are skipped before tokenisation; the guard
        belongs on the prompt, since the encoding returned by the tokenizer is
        never ``None``. If there is nothing to yield, the iterator ends
        immediately instead of looping forever.
        """
        if all(p is None for p in self.prompts):
            return
        while True:
            random.shuffle(self.prompts)
            for p in self.prompts:
                if p is None:
                    continue
                enc = self.tok(p, return_tensors="pt", truncation=True, padding=False)
                # Drop the batch dimension; only tensors are yielded, no strings.
                yield {
                    "input_ids": enc["input_ids"].squeeze(0),
                    "attention_mask": enc["attention_mask"].squeeze(0),
                }

# Rebuild the stream with the tensor-yielding PromptStream and wrap it in a loader.
dataset = PromptStream('doing_tasks_dataset.jsonl', tokenizer)
dataloader = DataLoader(
    dataset,
    batch_size=32,
    num_workers=4,
    drop_last=True,
    timeout=60,
)
# Create (and discard) an iterator over the stream.
iter(dataset)

def group_fn(batch, model, tokenizer, device):
    """Decode a batch of tokenised prompts, sample answers and score each group.

    Args:
        batch: Sequence of records, each holding an ``"input_ids"`` entry.
        model: Not used directly; generation and grading go through the
            module-level ``generate``/``grade`` helpers.
        tokenizer: Used to decode the ``input_ids`` back into prompt text.
        device: Device on which the reward tensors are created.

    Returns:
        ``(answer_groups, reward_groups)``: per prompt, the list of answers and
        a float32 tensor of rewards standardised within the group.
    """
    temperatures = (0.05, 0.15, 0.40, 0.70, 1.10)
    token_rows = [example["input_ids"] for example in batch]
    decoded_prompts = tokenizer.batch_decode(token_rows, skip_special_tokens=True)

    answer_groups = []
    reward_groups = []
    for text in decoded_prompts:
        # One candidate answer per temperature, then one raw score per candidate.
        candidates = [generate(text, temp) for temp in temperatures]
        scores = torch.tensor(
            [grade(candidate, text) for candidate in candidates],
            dtype=torch.float32,
            device=device,
        )
        # Standardise within the group; the epsilon guards against zero spread.
        standardised = (scores - scores.mean()) / (scores.std() + 1e-7)
        answer_groups.append(candidates)
        reward_groups.append(standardised)
    return answer_groups, reward_groups

# GRPO run settings: output folder, learning rate and step budget.
cfg = GRPOConfig(
    output_dir="Qwen2-0.5B-GRPO",
    learning_rate=2e-5,
    max_steps=2000,
)

# Fresh prompt stream for this training run.
stream = PromptStream("doing_tasks_dataset.jsonl", tokenizer)

# Wire model, tokenizer, data and reward function into the trainer, then train.
trainer = GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    train_dataset=stream,
    args=cfg,
    reward_funcs=group_fn,
)
trainer.train()


# Direct Preference Optimization

In [None]:
# Sketch of a DPO data-collection and training step.
# NOTE(review): this cell is pseudo-code and will not run as written:
#   - `ask_and_verify`, `prompt` and `training_args` are not defined anywhere
#     in the visible notebook;
#   - the `for ...` placeholder in the list comprehension is not valid Python.
# Sample two answers for the same prompt at different temperatures; the third
# returned value of each call is used as that answer's score.
ans1, _, r1 = ask_and_verify(prompt, temperature=0.7)
ans2, _, r2 = ask_and_verify(prompt, temperature=1.1)
# The higher-scoring answer becomes "chosen"; on a tie ans2 is chosen.
chosen, rejected = (ans1, ans2) if r1 > r2 else (ans2, ans1)
from trl import DPOTrainer
# Placeholder: one {"prompt", "chosen", "rejected"} record per collected pair.
dataset = [{"prompt": prompt, "chosen": chosen, "rejected": rejected} for ...]
# NOTE(review): confirm these keyword arguments against the installed trl
# version before running.
trainer = DPOTrainer(
    model,
    ref_model=None,           # uses frozen copy internally
    args=training_args,
    beta=0.1,                 # softness of preference
    train_dataset=dataset
)
trainer.train()

In [None]:
from peft import LoraConfig, get_peft_model
from torch.optim import AdamW

# Imported locally so this cell is self-contained.
from transformers import BitsAndBytesConfig

# LoRA adapters on the attention and MLP projections. task_type matches the
# earlier LoRA cell so both wrap the model as a causal LM.
# NOTE(review): if `model` was already wrapped by an earlier get_peft_model
# call, this wraps it a second time - confirm that is intended.
lora = LoraConfig(
    r=64,
    target_modules=["q_proj", "v_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora)
optimizer = AdamW(model.parameters(), lr=1e-5)
KL_BETA   = 0.02   # weight of the KL term

# Frozen 4-bit reference model. Quantisation is requested through a
# BitsAndBytesConfig; the bare `load_in_4bit=True` kwarg is deprecated.
ref_model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-7B",
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    device_map="cuda",
)
ref_model.eval()
ref_model.requires_grad_(False)