<a href="https://colab.research.google.com/github/superasymmetry/AGI-Fun/blob/main/potemkintraining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q trl accelerate bitsandbytes datasets sentencepiece jsonlines
!pip install git+https://github.com/huggingface/transformers

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch, re

# 4-bit NF4 quantization settings (with double quantization) for the base model.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float32,
)

model_id = "Qwen/Qwen2-0.5B-Instruct"

# The tokenizer carries no weights, so it takes no quantization settings.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    trust_remote_code=True,
)

# Quantization is a model-loading option: the config has to be passed here,
# otherwise `bnb_config` is built but the weights are loaded unquantized.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=bnb_config,
)

# Case-insensitive whole-word matchers for a yes/no style verdict.
YES_RE = re.compile(r"\b(yes|correct)\b", re.I)
NO_RE = re.compile(r"\b(no|incorrect|wrong)\b", re.I)

# quick sanity check
# model_inputs = tokenizer(["An ABAB rhyme scheme"], return_tensors="pt").to("cuda")
# generated_ids = model.generate(**model_inputs)
# tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

In [None]:
from trl import GRPOTrainer, GRPOConfig
from datasets import IterableDataset

# NOTE(review): this cell rebinds `model_id`, `tokenizer` and `model`, replacing
# whatever an earlier cell loaded under those names.
model_id = "meta-llama/Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    trust_remote_code=True,
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    device_map="auto",                 # or {"": 0} for single-GPU
    # 4-bit loading is requested through a quantization config rather than a
    # bare `load_in_4bit=True` keyword on from_pretrained.
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
)

# Keep the output directory distinct from the Hub repo id: a local folder named
# "meta-llama/Llama-3.1-8B-Instruct" would be picked up by from_pretrained as a
# local checkpoint the next time this cell runs, instead of the Hub model.
cfg = GRPOConfig(
    output_dir="grpo-llama-3.1-8b-instruct",
    learning_rate=2e-5,
    max_steps=2000,
    report_to=['tensorboard'],
)

# Pass-through collator (returns the batch unchanged); not used by the trainer below.
identity_collator = lambda batch: batch

# NOTE(review): `PromptStream` and `group_fn` are not defined in the visible
# part of this notebook - they must exist before this cell is run.
trainer = GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    train_dataset=PromptStream("doing_tasks_dataset.jsonl", tokenizer),
    reward_funcs=group_fn,
    args=cfg,          # carries max_steps
)
trainer.train()

In [None]:
from torch.optim import AdamW
from peft import LoraConfig, get_peft_model
import jsonlines  # pip-installed by the first cell but never imported before use

# Load every prompt up front; the context manager closes the file once read.
with jsonlines.open("doing_tasks_dataset.jsonl") as reader:
    prompts = [obj["prompt"] for obj in reader]

# LoRA adapter settings applied to the attention and MLP projections listed below.
lora_cfg = LoraConfig(
    r=64, lora_alpha=16, lora_dropout=0.1,
    target_modules=["q_proj", "v_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_cfg).to("cuda")

# Frozen reference model used for the KL-style penalty in the training loop.
# NOTE(review): the reference is always Qwen2-0.5B-Instruct. If `model` was
# rebound to a different checkpoint by another cell, policy and reference no
# longer share a tokenizer/vocabulary - confirm which base model is intended.
ref_model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2-0.5B-Instruct", device_map="auto",
    # torch_dtype=torch.bfloat16
).eval()
for p in ref_model.parameters():
    p.requires_grad = False


# Only the trainable (adapter) parameters are handed to the optimizer.
optimizer = AdamW([p for p in model.parameters() if p.requires_grad], lr=2e-5)

In [None]:
# Settings shared by the helper functions and the hand-written training loop.
DEVICE = "cuda"     # device that tokenized inputs are moved to
BATCH_SIZE = 4      # used to normalise the per-step loss
MAX_STEPS = 200     # number of optimisation steps
KL_BETA = 0.02      # weight of the policy-vs-reference penalty term

def sequence_logprob(m, txt):
    """Return the summed next-token log-probability of ``txt`` under model ``m``.

    The text is tokenized with the module-level ``tokenizer`` and moved to
    ``DEVICE``. Each token is scored given the tokens before it, and the
    per-token log-probabilities are summed into a single scalar tensor.

    Gradients are tracked only when ``m`` has at least one trainable
    parameter, so a fully frozen model is scored without building a graph.

    Args:
        m: causal language model whose forward pass returns ``.logits``.
        txt: text to score.

    Returns:
        Scalar tensor holding the total log-probability of the sequence.
    """
    # Models come back from `from_pretrained` in eval mode, so `m.training` is
    # not a reliable signal for "this is the policy being optimised". Keying on
    # trainable parameters keeps the result differentiable for the policy even
    # if nobody called `.train()`.
    needs_grad = any(p.requires_grad for p in m.parameters())
    with torch.enable_grad() if needs_grad else torch.no_grad():
        tok = tokenizer(txt, return_tensors="pt").to(DEVICE)
        out = m(**tok)
        # Shift by one: the logits at position i predict the token at i + 1.
        logits = out.logits[:, :-1]
        labels = tok.input_ids[:, 1:]
        # Cross-entropy is the negative log-likelihood, so negate the sum.
        lp = -torch.nn.functional.cross_entropy(
            logits.reshape(-1, logits.size(-1)),
            labels.reshape(-1),
            reduction="sum",
        )
    return lp


# Case-insensitive whole-word matchers for positive / negative verdict keywords.
YES_RE = re.compile(r"\b(yes|correct|right|true)\b", re.I)
NO_RE  = re.compile(r"\b(no|incorrect|wrong|false)\b", re.I)

def grade(answer, prompt, unclear_penalty=-0.2):
    """Ask ``model`` to judge ``answer`` for ``prompt`` and map the reply to a reward.

    Args:
        answer: candidate answer text to be judged.
        prompt: the task the answer responds to.
        unclear_penalty: reward returned when the reply contains neither a
            positive nor a negative keyword.

    Returns:
        ``-1.0`` if the reply contains a negative keyword, ``1.0`` if it
        contains only positive keywords, otherwise ``unclear_penalty``.
    """
    critique = (
        f"Task:\n{prompt}\n\n"
        f"Answer:\n{answer}\n\n"
        "Is the answer correct and deep? Reply YES or NO and one reason."
    )
    inp = tokenizer(critique, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        vid = model.generate(**inp, max_new_tokens=128, temperature=0.01, pad_token_id=tokenizer.eos_token_id)

    # The generated sequence starts with the critique prompt, which itself
    # contains "correct", "YES" and "NO". Decoding it would make both patterns
    # match on every call, so only the newly generated tokens are inspected.
    prompt_len = inp.input_ids.shape[1]
    verdict = tokenizer.decode(vid[0][prompt_len:], skip_special_tokens=True)

    # A negative keyword anywhere in the reply wins over a positive one.
    if NO_RE.search(verdict):
        return -1.0
    if YES_RE.search(verdict):
        return 1.0
    return unclear_penalty

def generate(prompt, temperature):
    """Sample one continuation of ``prompt`` from ``model`` at the given temperature.

    Args:
        prompt: text to continue.
        temperature: sampling temperature forwarded to ``model.generate``.

    Returns:
        The decoded output sequence with special tokens removed.
    """
    inp = tokenizer(prompt, return_tensors="pt").to(DEVICE)
    out = model.generate(
        **inp,
        max_new_tokens=256,
        # Temperature only has an effect when sampling is on; request it
        # explicitly instead of relying on the checkpoint's generation config.
        do_sample=True,
        temperature=temperature,
        # Same padding choice as grade().
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(out[0], skip_special_tokens=True)

import random  # needed for prompt sampling; not imported by any earlier cell shown

# One answer is generated per temperature for every prompt.
TEMPS = (0.05, 0.15, 0.40, 0.70, 1.10)
# Checkpoint interval in steps; the final step is always saved as well, so a
# run shorter than the interval still leaves a checkpoint behind.
SAVE_EVERY = 500

if not prompts:
    raise ValueError("No prompts loaded - nothing to train on.")

for step in range(1, MAX_STEPS + 1):
    # Never ask for more prompts than exist (random.sample would raise).
    batch_prompts = random.sample(prompts, min(BATCH_SIZE, len(prompts)))
    n_samples = len(batch_prompts) * len(TEMPS)

    policy_loss = 0.0
    kl_sum      = 0.0

    for p in batch_prompts:
        # Sample and grade in eval mode so dropout does not perturb generation.
        model.eval()
        answers = [generate(p, t) for t in TEMPS]
        rewards = [grade(a, p)    for a in answers]

        # centre rewards (GRPO advantage): normalise within this prompt's group
        rew_t = torch.tensor(rewards, dtype=torch.float32, device=DEVICE)
        adv   = (rew_t - rew_t.mean()) / (rew_t.std() + 1e-7)

        # Score in train mode so the policy log-probs carry gradients.
        model.train()
        for a, w in zip(answers, adv):
            lp   = sequence_logprob(model, a)
            rlp  = sequence_logprob(ref_model, a)
            policy_loss += -w * lp      # advantage-weighted log-likelihood
            kl_sum      += (lp - rlp)   # log-ratio of policy to reference

    # Average over every sampled answer in the step.
    policy_loss /= n_samples
    kl_term      = KL_BETA * kl_sum / n_samples
    loss         = policy_loss + kl_term

    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()
    optimizer.zero_grad(set_to_none=True)

    print(f"[{step}/{MAX_STEPS}] loss={loss.item():.3f}  pol={policy_loss.item():.3f}  kl={kl_term.item():.3f}")

    if step % SAVE_EVERY == 0 or step == MAX_STEPS:
        model.save_pretrained(f"ckpt_step_{step}")
        tokenizer.save_pretrained(f"ckpt_step_{step}")

In [None]:
# save model
from huggingface_hub import login
from google.colab import userdata
# Read the Hugging Face access token from the Colab secrets store.
HF_TOKEN = userdata.get('HF_TOKEN')

if HF_TOKEN:
    login(HF_TOKEN)
    # Do not echo the token: cell output is stored inside the notebook file,
    # so printing it would leak the credential wherever the notebook is shared.
    print("Successfully logged in to Hugging Face!")
else:
    print("Token is not set. Please save the token first.")

# Push the LoRA adapters
model.push_to_hub("stringbot/potemkin")
