# 02: GRPO on GSM8K
Sweep hyperparameters for GRPO training on Qwen2.5-0.5B-GSM8K-SFT.
Uses binary reward (1.0/0.0) and parameterized SFT/new data mix.

In [None]:
!pip install -q --no-deps git+https://github.com/tripathysagar/rlhf-gsm8k.git
!pip install -qUU trl

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for gsm8k-utils (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m540.5/540.5 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import torch
from types import SimpleNamespace
from datasets import load_dataset, concatenate_datasets
from trl import GRPOConfig, GRPOTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainerCallback
from peft import LoraConfig, get_peft_model
from gsm8k_utils import *

from transformers import set_seed
set_seed(1337)

In [None]:
# ── Config ───────────────────────────────────────────────────────────────────
# Baseline hyperparameters for a single run; sweep entries override one key
# at a time on top of this mapping.
cfg = {
    # Model: checkpoint to fine-tune and LoRA adapter sizing
    "model_id": "tripathysagar/Qwen2.5-0.5B-GSM8K-SFT",
    "lora_r": 16,
    "lora_alpha": 32,

    # GRPO: values forwarded to GRPOConfig
    "num_generations": 8,
    "max_completion_length": 256,
    "max_steps": 100,
    "per_device_train_batch_size": 4,
    "gradient_accumulation_steps": 4,
    "learning_rate": 5e-5,
    "beta": 0.04,
    "warmup_steps": 20,

    # wandb: project and default run name
    "wandb_project": "grpo-gsm8k",
    "wandb_run_name": "grpo-qwen2.5-0.5B-lora",

    # sft data mix
    # DO NOT MODIFY grpo_train_size — fixed budget matching SFT split size
    "grpo_train_size": 1024,
    # 0.0 = all new, 0.5 = half/half, 1.0 = all SFT
    "sft_frac": 0.5,
}

In [None]:
import wandb
from google.colab import userdata

# Log in to Weights & Biases with the API key that google.colab.userdata
# returns for the name 'wandb'; this keeps the key out of the notebook source.
wandb.login(key=userdata.get('wandb'))

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: [wandb.login()] Using explicit session credentials for https://api.wandb.ai.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mtripathysagar08[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

## Helpers

In [None]:
class VibecheckCallback(TrainerCallback):
    """Trainer callback that echoes reward and a sample completion to stdout."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Print the logged reward and the head of the first completion.

        Does nothing when ``logs`` is missing or empty. Each line is printed
        only if its key ('reward' / 'completions') is present in ``logs``.
        """
        if not logs:
            return

        mean_reward = logs.get('reward')
        samples = logs.get('completions')

        if mean_reward is not None:
            print(f"Step {state.global_step} | reward: {mean_reward:.3f}")

        # Show only the first 200 characters of the first sample.
        if samples and len(samples) > 0:
            print(f"Step {state.global_step} | Sample:\n{samples[0][:200]}...")


def reward_fn(completions, **kwargs):
    """Score each completion 1.0 if its extracted answer equals the gold, else 0.0.

    Args:
        completions: generated completions, paired positionally with
            ``kwargs["gold"]``.
        **kwargs: must contain ``"gold"``, the reference answers as strings
            (thousands separators are stripped before parsing).

    Returns:
        A list of floats, one per (completion, gold) pair. Any exception
        raised while scoring a pair is printed and scored as 0.0.
    """
    references = kwargs["gold"]
    scores = []
    for completion, reference in zip(completions, references):
        try:
            predicted, _ = extract_answer(completion)
            # Gold may look like "1,234" or "12.0"; normalise to an int.
            target = int(float(reference.replace(",", "")))
            score = 1.0 if predicted == target else 0.0
        except Exception as err:
            print(f"[reward_fn error] gold={reference!r} comp={completion[:80]!r} err={err}")
            score = 0.0
        scores.append(score)
    return scores


## GRPOExperiment

In [None]:
class GRPOExperiment:
    """Run one GRPO fine-tuning experiment: data, model, training, evaluation.

    Calling the instance executes every stage in order and returns the
    accuracy reported by ``perf_check`` on the held-out evaluation split.
    ``cleanup`` always runs afterwards, even when a stage raises, so a failed
    run does not leave a wandb run open or a model resident on the GPU.
    """

    def __init__(self, cfg):
        """Store the hyperparameters.

        Args:
            cfg: a dict of hyperparameters (wrapped in a ``SimpleNamespace``)
                or any object that already exposes them as attributes.
        """
        self.cfg = SimpleNamespace(**cfg) if isinstance(cfg, dict) else cfg

    def load_ds(self):
        """Load the evaluation split and build the training mix.

        The evaluation set is the "test" half of a seeded 50/50 split of the
        GSM8K test split. The training set depends on ``cfg.sft_frac``:

        * ``-1``: the whole shuffled GSM8K training split.
        * ``0.0 .. 1.0``: ``cfg.grpo_train_size`` rows in total, of which a
          fraction ``sft_frac`` comes from the first ``grpo_train_size`` rows
          of the shuffled training split and the rest from the remaining rows.
          NOTE(review): the first ``grpo_train_size`` rows are presumably the
          rows used for SFT — confirm against the SFT notebook.

        Raises:
            ValueError: if ``sft_frac`` is neither -1 nor within [0, 1].
        """
        sft_frac = self.cfg.sft_frac
        use_full_train = sft_frac == -1
        # Fail fast with a clear message: a negative fraction would otherwise
        # silently yield an all-new mix, and a fraction above 1 would only
        # fail later inside the dataset selection.
        if not use_full_train and not 0 <= sft_frac <= 1:
            raise ValueError(
                f"sft_frac must be -1 (full train split) or within [0, 1], got {sft_frac!r}"
            )

        test_ds = load_dataset("openai/gsm8k", "main", split="test")
        self.test_ds = test_ds.train_test_split(test_size=0.5, seed=42)["test"]

        all_train = load_dataset("openai/gsm8k", "main", split="train").shuffle(seed=42)
        if use_full_train:
            # for training on complete dataset
            self.train_ds = all_train
            return

        n_sft = int(self.cfg.grpo_train_size * sft_frac)
        n_new = self.cfg.grpo_train_size - n_sft

        sft_rows = all_train.select(range(self.cfg.grpo_train_size))
        new_rows = all_train.select(range(self.cfg.grpo_train_size, len(all_train)))

        # Re-shuffle each pool before taking its share; an empty selection is
        # used when a pool contributes nothing.
        sft_rows = sft_rows.shuffle(seed=42).select(range(n_sft)) if n_sft > 0 else sft_rows.select([])
        new_rows = new_rows.shuffle(seed=42).select(range(n_new)) if n_new > 0 else new_rows.select([])

        self.train_ds = concatenate_datasets([sft_rows, new_rows])

    def _fmt(self, ds):
        """Extract gold answers and format prompts.

        Maps ``extract_gold`` over ``ds`` (dropping the "answer" column),
        renames "question" to "prompt", then rewrites each prompt with
        ``format_prompt`` and the experiment's tokenizer. Requires
        ``setup_model`` to have run so that ``self.tokenizer`` exists.
        """
        ds = ds.map(extract_gold, remove_columns=["answer"]).rename_column("question", "prompt")
        return ds.map(lambda x: {"prompt": format_prompt(x["prompt"], self.tokenizer)})

    def fmt_ds(self):
        """Format train and test datasets for GRPO."""
        self.train_ds = self._fmt(self.train_ds)
        self.test_ds = self._fmt(self.test_ds)

    def setup_model(self):
        """Load the tokenizer and bf16 base model and wrap it with LoRA adapters."""
        lora_config = LoraConfig(
            r=self.cfg.lora_r,
            lora_alpha=self.cfg.lora_alpha,
            target_modules="all-linear",
            task_type="CAUSAL_LM",
        )

        self.tokenizer = AutoTokenizer.from_pretrained(self.cfg.model_id)
        model = AutoModelForCausalLM.from_pretrained(
            self.cfg.model_id,
            dtype=torch.bfloat16,
            device_map="auto",
        )
        model.gradient_checkpointing_enable()
        self.model = get_peft_model(model, lora_config)
        self.model.print_trainable_parameters()

    def setup_trainer(self):
        """Start the wandb run and build the ``GRPOTrainer`` from ``cfg``."""
        wandb.init(project=self.cfg.wandb_project, name=self.cfg.wandb_run_name)

        grpo_config = GRPOConfig(
            output_dir="grpo_qwen_gsm8k",
            num_generations=self.cfg.num_generations,
            max_completion_length=self.cfg.max_completion_length,
            max_steps=self.cfg.max_steps,
            per_device_train_batch_size=self.cfg.per_device_train_batch_size,
            gradient_accumulation_steps=self.cfg.gradient_accumulation_steps,
            learning_rate=self.cfg.learning_rate,
            logging_steps=1,             # increase for longer runs (e.g. 10)
            report_to='wandb',
            beta=self.cfg.beta,
            lr_scheduler_type="cosine",
            warmup_steps=self.cfg.warmup_steps,
            weight_decay=0.01,
            save_strategy="no",          # change to "steps" for longer runs
            # save_steps=100,            # uncomment for longer runs
            # save_total_limit=2,        # uncomment for longer runs
            bf16=True,
            fp16=False,
        )

        self.trainer = GRPOTrainer(
            model=self.model,
            reward_funcs=reward_fn,
            args=grpo_config,
            train_dataset=self.train_ds,
            processing_class=self.tokenizer,
            #callbacks=[VibecheckCallback()],
        )

    def train(self):
        """Run GRPO training."""
        self.trainer.train()

    def eval(self):
        """Merge the LoRA weights, score the model on the test split, log it.

        Returns:
            The accuracy returned by ``perf_check``; it is also logged to
            wandb under "eval/accuracy".
        """
        self.model = self.model.merge_and_unload()
        accuracy, _ = perf_check(self.model, self.tokenizer, self.test_ds)
        wandb.log({"eval/accuracy": accuracy})
        return accuracy

    def cleanup(self):
        """Close the wandb run and release the model and trainer.

        Safe to call after a partially completed run: attributes that were
        never created are skipped instead of raising ``AttributeError``.
        """
        import gc

        wandb.finish()
        for attr in ("model", "trainer"):
            if hasattr(self, attr):
                delattr(self, attr)
        # Collect unreachable objects first so the tensors they hold are
        # actually freed before the CUDA cache is emptied.
        gc.collect()
        torch.cuda.empty_cache()

    def __call__(self):
        """Run the full pipeline and return the evaluation accuracy.

        ``setup_model`` runs before ``fmt_ds`` because prompt formatting needs
        the tokenizer. ``cleanup`` runs in a ``finally`` block so resources
        are released whether or not a stage raises.
        """
        try:
            self.load_ds()
            self.setup_model()
            self.fmt_ds()
            self.setup_trainer()
            self.train()
            return self.eval()
        finally:
            self.cleanup()

## Run Experiment

In [None]:
#exp = GRPOExperiment(cfg)
#exp()

In [None]:
# One-at-a-time sweep: each entry overrides a single key of `cfg`.
# Entries equal to the cfg defaults (num_generations=8, beta=0.04,
# learning_rate=5e-5, sft_frac=0.5) all train the same baseline configuration.
all_sweeps = [
    {"num_generations": 8},
    {"num_generations": 16},
    {"beta": 0.01},
    {"beta": 0.02},
    {"beta": 0.04},
    {"learning_rate": 1e-5},
    {"learning_rate": 5e-5},
    {"learning_rate": 1e-4},
    {"sft_frac": 0.0},
    {"sft_frac": 0.5},
    {"sft_frac": 1.0},
]

results = []
for s in all_sweeps:
    run_name = "sweep-" + "-".join(f"{k}={v}" for k, v in s.items())
    run_cfg = {**cfg, **s, "wandb_run_name": run_name}
    # Re-seed before every run (same seed as the notebook-level call). The
    # single set_seed at import time only fixes the RNG state for the first
    # run; later runs would otherwise start from whatever state the previous
    # run left behind, making each result depend on the sweep order.
    set_seed(1337)
    acc = GRPOExperiment(run_cfg)()
    results.append({**s, "accuracy": acc})
    print(f"{s} → accuracy: {acc:.4f}")

# Last expression of the cell: displays the collected results in the notebook.
results


README.md: 0.00B [00:00, ?B/s]

main/train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

main/test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/290 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

trainable params: 8,798,208 || all params: 502,830,976 || trainable%: 1.7497


Map:   0%|          | 0/1024 [00:00<?, ? examples/s]

Map:   0%|          | 0/1024 [00:00<?, ? examples/s]

Map:   0%|          | 0/660 [00:00<?, ? examples/s]

Map:   0%|          | 0/660 [00:00<?, ? examples/s]

Passing `generation_config` together with generation-related arguments=({'disable_compile'}) is deprecated and will be removed in future versions. Please pass either a `generation_config` object OR all generation parameters explicitly, but not both.


Step,Training Loss
1,0.015197
2,-0.162927
3,-0.0451
4,4.7e-05
5,0.0197
6,-0.012993
7,4.2e-05
8,0.01542
9,5.5e-05
10,-0.025615


Evaluating: 100%|██████████| 11/11 [00:46<00:00,  4.25s/it]


Final Accuracy: 216/660 = 32.73%





0,1
eval/accuracy,▁
profiling/Time taken: GRPOTrainer._calculate_rewards,▇▄▇▅▄▆▂▆▅▂▄▆▃▂▁▆▇▆█▃▅▂▂▃▅▆▃▂▄▂▅▇▅▇▇▂▂█▂▅
profiling/Time taken: GRPOTrainer._get_per_token_logps_and_entropies,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▃▂▁▁
profiling/Time taken: GRPOTrainer._prepare_inputs,▁▁▅▁██▁▁▆▁▅▆▁▁▁▁▇▁▁▁▁▁▁▁▁▁▁█▁▁█▁▁▁▁▁▁▁▅▁
profiling/Time taken: GRPOTrainer.compute_loss,▂▁▁▁▁▁▁▁▁▁▁▇▁▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
profiling/Time taken: GRPOTrainer.reward_fn,▃▂▄▄▄▃▁▃▂▃▂▇█▃▃▂▂▁▃▃▆▆▂▄▄▂▄▅▃█▄▅▆▅▂▄▃▃▁▂
profiling/Time taken: GRPOTrainer.transformers.generate,▅▁▆███▇▅█▅▇█▇▂▂▆██▅▃▆▆▄▄█▄▆▄▃█▅███▃▇▆▃▆█
train/clip_ratio/high_max,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/clip_ratio/high_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/clip_ratio/low_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/accuracy,0.32727
profiling/Time taken: GRPOTrainer._calculate_rewards,0.00067
profiling/Time taken: GRPOTrainer._get_per_token_logps_and_entropies,0.05258
profiling/Time taken: GRPOTrainer._prepare_inputs,0.0
profiling/Time taken: GRPOTrainer.compute_loss,0.05485
profiling/Time taken: GRPOTrainer.reward_fn,0.00021
profiling/Time taken: GRPOTrainer.transformers.generate,3.00119
total_flos,0
train/clip_ratio/high_max,0
train/clip_ratio/high_mean,0


{'num_generations': 8} → accuracy: 0.3273


Loading weights:   0%|          | 0/290 [00:00<?, ?it/s]

trainable params: 8,798,208 || all params: 502,830,976 || trainable%: 1.7497


Map:   0%|          | 0/1024 [00:00<?, ? examples/s]

Map:   0%|          | 0/660 [00:00<?, ? examples/s]

Step,Training Loss
1,0.0
2,-0.02062
3,0.09153
4,5.5e-05
5,0.104457
6,-0.061076
7,0.105904
8,-0.039386
9,0.057511
10,4.6e-05


Evaluating: 100%|██████████| 11/11 [00:48<00:00,  4.41s/it]


Final Accuracy: 195/660 = 29.55%





0,1
eval/accuracy,▁
profiling/Time taken: GRPOTrainer._calculate_rewards,█▂▄▄▂▆▃▄▇▂▂▂▁▄▂▆▅▃▂▅▅▄▃▆▄▂▂▅▂▇▇▇▁▄▃▄▆▃▆▇
profiling/Time taken: GRPOTrainer._get_per_token_logps_and_entropies,▆█▂▂▁▂▇▁▂▇▂▇▂▁▁▂▁▁▁▆▁▂▁▁▁▁▁▁▂▂█▂▂▂▂█▂▇▂▂
profiling/Time taken: GRPOTrainer._prepare_inputs,▆▁▇█▁▁▆▁▁▁▁▁▇▁▇▁▁▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
profiling/Time taken: GRPOTrainer.compute_loss,▃▂▅▄▅▄▄▂▃▃▄▄▃▃▃▅▃▁▄▅▁▃▄▄▇▃▃▄▄▂▅█▆▅▃▃▄▅▄▇
profiling/Time taken: GRPOTrainer.reward_fn,▃▇▂█▂▂▄▆▄▄▄▂▄▄▂▄▃▄▄▆▇▂▃▆▃▁▃▂▄▄▇▄▆█▆▇▅▄▇▃
profiling/Time taken: GRPOTrainer.transformers.generate,▄▇▆▇▄▇▂▄▅▇▃▁▆▁▄▇▅▁▇▁▇▇▆▃▅█▂▆▅▅█▅▇▃▆█▇█▅▆
train/clip_ratio/high_max,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/clip_ratio/high_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/clip_ratio/low_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/accuracy,0.29545
profiling/Time taken: GRPOTrainer._calculate_rewards,0.00075
profiling/Time taken: GRPOTrainer._get_per_token_logps_and_entropies,0.05311
profiling/Time taken: GRPOTrainer._prepare_inputs,0.0
profiling/Time taken: GRPOTrainer.compute_loss,0.05755
profiling/Time taken: GRPOTrainer.reward_fn,0.00025
profiling/Time taken: GRPOTrainer.transformers.generate,6.29192
total_flos,0
train/clip_ratio/high_max,0
train/clip_ratio/high_mean,0


{'num_generations': 16} → accuracy: 0.2955


Loading weights:   0%|          | 0/290 [00:00<?, ?it/s]

trainable params: 8,798,208 || all params: 502,830,976 || trainable%: 1.7497


Map:   0%|          | 0/1024 [00:00<?, ? examples/s]

Map:   0%|          | 0/660 [00:00<?, ? examples/s]

Step,Training Loss
1,0.015197
2,-0.162927
3,-0.014141
4,0.054268
5,0.026064
6,-0.01445
7,0.071454
8,0.056176
9,1.2e-05
10,0.063876


Evaluating: 100%|██████████| 11/11 [00:44<00:00,  4.07s/it]


Final Accuracy: 205/660 = 31.06%





0,1
eval/accuracy,▁
profiling/Time taken: GRPOTrainer._calculate_rewards,▂▇█▅▇▄▃▄▁▆▂▂▅▆▆▅▆▅▇▇▅▅▅▅▆▅▃▁▂▃▅▅▄▃▆▇▁██▃
profiling/Time taken: GRPOTrainer._get_per_token_logps_and_entropies,▁▁▁▁▁▁▁▁▃▃▁▃▁▁▁▁▁▁▁▁▁▁▁▁▃▁▁▁▂▁▁▁▁▁█▂▂▁▁▁
profiling/Time taken: GRPOTrainer._prepare_inputs,▁▁▁▁█▁▁▇▁█▁▄▁▁▇▁▁▁▁▁▅▁▁▄▁▇▁▁▅▄▁▁▆▁▁▁▁▄▁▆
profiling/Time taken: GRPOTrainer.compute_loss,▃▁▂▂▆▃▆▄▄▃█▆▅▅▃▆▇▆▄▄▄▆▅▄▂▄▁▁▂▁▂▄▁▂▄▃▁▄▇▂
profiling/Time taken: GRPOTrainer.reward_fn,▂▃▂▁█▄▃▄▃▄▁▃▃▃▂▃▅▂▃▃▃▃▂▄▂▃▂▂▂▄▂▂▁▁▂▂▂▁▂▂
profiling/Time taken: GRPOTrainer.transformers.generate,▃█▇█▇▁▂█▇▄▅█▅▄▇▆▂▂▄▃▆▆▄▄██▅▂▄▂▅▅▄▅▂▃▂▆▇▁
train/clip_ratio/high_max,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/clip_ratio/high_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/clip_ratio/low_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/accuracy,0.31061
profiling/Time taken: GRPOTrainer._calculate_rewards,0.00075
profiling/Time taken: GRPOTrainer._get_per_token_logps_and_entropies,0.05666
profiling/Time taken: GRPOTrainer._prepare_inputs,0.0
profiling/Time taken: GRPOTrainer.compute_loss,0.0591
profiling/Time taken: GRPOTrainer.reward_fn,0.00024
profiling/Time taken: GRPOTrainer.transformers.generate,3.31602
total_flos,0
train/clip_ratio/high_max,0
train/clip_ratio/high_mean,0


{'beta': 0.01} → accuracy: 0.3106


Loading weights:   0%|          | 0/290 [00:00<?, ?it/s]

trainable params: 8,798,208 || all params: 502,830,976 || trainable%: 1.7497


Map:   0%|          | 0/1024 [00:00<?, ? examples/s]

Map:   0%|          | 0/660 [00:00<?, ? examples/s]

Step,Training Loss
1,0.015197
2,-0.162927
3,-0.014132
4,0.000982
5,0.014681
6,-0.007394
7,2.4e-05
8,0.014027
9,2.3e-05
10,-0.021679


Evaluating: 100%|██████████| 11/11 [00:49<00:00,  4.53s/it]


Final Accuracy: 207/660 = 31.36%





0,1
eval/accuracy,▁
profiling/Time taken: GRPOTrainer._calculate_rewards,▆▃▃▃▃▂▄▄█▁▁▁▂▃▇▇▂▄▂▄▃▃▆▇▃▅▁▅▃▂▆▃▄▄▃▁█▅▆▁
profiling/Time taken: GRPOTrainer._get_per_token_logps_and_entropies,█▂▇▂▂▁▁▇▁▁▁▂▁▁▂▇▁█▁▇▂▁▁▁▂▁▁▁▁▂▁▂▁▁▇▁▁▁▁█
profiling/Time taken: GRPOTrainer._prepare_inputs,▁▁▁▁▆▁█▁▁▁▁▁▁█▁▅▁▁▁▇▁▁▆▆▅▅▁▁▁▁▁▇▁▁▁▇▁▇▁▁
profiling/Time taken: GRPOTrainer.compute_loss,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█▁
profiling/Time taken: GRPOTrainer.reward_fn,▂▅▅▃▄▄▃▇▄▂▃▅▂▅▆▂█▃▂▅▄▄▂▄▃▂▃▂▄▅▃▇▂▃▁▃▃▇▄▃
profiling/Time taken: GRPOTrainer.transformers.generate,▂▄▁▅▄█▂▅▃▅▃▇▃▇▆▇▄▃▂▃▄▅▃▇▂▇▅▃▆▆▆▆▆▆▆▅▆▆▄▁
train/clip_ratio/high_max,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/clip_ratio/high_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/clip_ratio/low_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/accuracy,0.31364
profiling/Time taken: GRPOTrainer._calculate_rewards,0.00064
profiling/Time taken: GRPOTrainer._get_per_token_logps_and_entropies,0.05273
profiling/Time taken: GRPOTrainer._prepare_inputs,0.0
profiling/Time taken: GRPOTrainer.compute_loss,0.05556
profiling/Time taken: GRPOTrainer.reward_fn,0.00022
profiling/Time taken: GRPOTrainer.transformers.generate,3.78248
total_flos,0
train/clip_ratio/high_max,0
train/clip_ratio/high_mean,0


{'beta': 0.02} → accuracy: 0.3136


Loading weights:   0%|          | 0/290 [00:00<?, ?it/s]

trainable params: 8,798,208 || all params: 502,830,976 || trainable%: 1.7497


Map:   0%|          | 0/1024 [00:00<?, ? examples/s]

Map:   0%|          | 0/660 [00:00<?, ? examples/s]

Step,Training Loss
1,0.015197
2,-0.162927
3,-0.014115
4,3.6e-05
5,0.015439
6,0.00688
7,4.5e-05
8,0.021041
9,4.7e-05
10,0.03


Evaluating: 100%|██████████| 11/11 [00:48<00:00,  4.42s/it]


Final Accuracy: 200/660 = 30.30%





0,1
eval/accuracy,▁
profiling/Time taken: GRPOTrainer._calculate_rewards,▅▅▃▃▇▅▅▂▃▆▁▇▂▇▃▅▅▃▃▆▃▄▃▇▆▆▆▃▇▃▂▇█▃▆▇▄▇▂▆
profiling/Time taken: GRPOTrainer._get_per_token_logps_and_entropies,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▃▂▁█▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▃▁▁
profiling/Time taken: GRPOTrainer._prepare_inputs,█▁▁▁▁▁▇▁▁▆▁▁▁▁▁▇▁█▇▁▁▆▇▁▁▇▁▇▁▁▁▁▁▁▅▁▁▅▁▅
profiling/Time taken: GRPOTrainer.compute_loss,▁▇▄▆▆▄▄▅▆▅▇▁▅▇▄▅▆█▇▅▄▂▁▄▄▆▆▄▄▃▂▅▆▄▃▅▄▆█▆
profiling/Time taken: GRPOTrainer.reward_fn,▃▃▅█▄▃▃▆▄▃▄▁▃▄▇▂▆▄▄▃▄▂▅▅▄▄▇▃▃▂▄▄▆▂▃▅▃▆▃▄
profiling/Time taken: GRPOTrainer.transformers.generate,█▄▂▅▇██▁▃▂█▇▃▃▃▇▃▄▇▅▂█▄▃▂▃▁▂▅▄▄▆▅▇▂▂▄▂█▂
train/clip_ratio/high_max,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/clip_ratio/high_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/clip_ratio/low_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/accuracy,0.30303
profiling/Time taken: GRPOTrainer._calculate_rewards,0.00102
profiling/Time taken: GRPOTrainer._get_per_token_logps_and_entropies,0.05845
profiling/Time taken: GRPOTrainer._prepare_inputs,1e-05
profiling/Time taken: GRPOTrainer.compute_loss,0.06138
profiling/Time taken: GRPOTrainer.reward_fn,0.00022
profiling/Time taken: GRPOTrainer.transformers.generate,4.39074
total_flos,0
train/clip_ratio/high_max,0
train/clip_ratio/high_mean,0


{'beta': 0.04} → accuracy: 0.3030


Loading weights:   0%|          | 0/290 [00:00<?, ?it/s]

trainable params: 8,798,208 || all params: 502,830,976 || trainable%: 1.7497


Map:   0%|          | 0/1024 [00:00<?, ? examples/s]

Map:   0%|          | 0/660 [00:00<?, ? examples/s]

Step,Training Loss
1,0.015197
2,-0.162927
3,-0.066763
4,-0.00152
5,0.028723
6,0.002512
7,0.013263
8,-0.013163
9,-0.004042
10,-0.01492


Evaluating: 100%|██████████| 11/11 [00:45<00:00,  4.15s/it]


Final Accuracy: 179/660 = 27.12%





0,1
eval/accuracy,▁
profiling/Time taken: GRPOTrainer._calculate_rewards,▂▂▄▅▅▂▆▄▂▂▄▂▃▆▁▃▇▆█▃▄▄▄▄▂▂▆▆▄▂▂▄▆▂▃▄▁▂▃▅
profiling/Time taken: GRPOTrainer._get_per_token_logps_and_entropies,▁▁▂▂▁▂▂▁█▁▁▁▁▂▁▁▁▁▃▁▁▁▁▂▁▁▁▃▁▁▁▁▁▁▂▁▃▁▁▁
profiling/Time taken: GRPOTrainer._prepare_inputs,▁▁▁▁▁▇▁▁▁▁▁▇▁▁█▁▁▁▁▁▆█▁▁▁▁▇▁▁▁▁▁▁▁▁▁█▁▁▁
profiling/Time taken: GRPOTrainer.compute_loss,▇▆▄▂▃▄▃▄▅▄▂▃▂▁▅▃▄▄▃▄▅▄▆█▅█▅▄▅▄▅▇▆▄▄▁▃▄▅▃
profiling/Time taken: GRPOTrainer.reward_fn,▂▃▃▂▄▃▃▂▃▁▂▂▁▁▄▃▄▄▅▅▃▃▄▃▃█▅▄▃▂▂▂▅▄▄▂▃▂▃▂
profiling/Time taken: GRPOTrainer.transformers.generate,▅▃▅▄█▅▆▄▃▆▇▃▄▄▇▃██▄▅█▃█▅█▃█▆▅▄█▇▇▃█▃▄▃▃▁
train/clip_ratio/high_max,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/clip_ratio/high_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/clip_ratio/low_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/accuracy,0.27121
profiling/Time taken: GRPOTrainer._calculate_rewards,0.00092
profiling/Time taken: GRPOTrainer._get_per_token_logps_and_entropies,0.0592
profiling/Time taken: GRPOTrainer._prepare_inputs,1e-05
profiling/Time taken: GRPOTrainer.compute_loss,0.0613
profiling/Time taken: GRPOTrainer.reward_fn,0.00024
profiling/Time taken: GRPOTrainer.transformers.generate,2.93897
total_flos,0
train/clip_ratio/high_max,0
train/clip_ratio/high_mean,0


{'learning_rate': 1e-05} → accuracy: 0.2712


Loading weights:   0%|          | 0/290 [00:00<?, ?it/s]

trainable params: 8,798,208 || all params: 502,830,976 || trainable%: 1.7497


Map:   0%|          | 0/1024 [00:00<?, ? examples/s]

Map:   0%|          | 0/660 [00:00<?, ? examples/s]

Step,Training Loss
1,0.015197
2,-0.162927
3,-0.014115
4,3.6e-05
5,0.015439
6,0.00688
7,4.5e-05
8,0.021041
9,4.7e-05
10,0.03


Evaluating: 100%|██████████| 11/11 [00:47<00:00,  4.31s/it]


Final Accuracy: 200/660 = 30.30%





0,1
eval/accuracy,▁
profiling/Time taken: GRPOTrainer._calculate_rewards,▂▂▃▃▆▂▃▂▆▅▇▁▃▃▆▂▂▃▅▂▅▃▄▅▆▆▃▂▂▂▆▆▆▄█▆▃▅▆▅
profiling/Time taken: GRPOTrainer._get_per_token_logps_and_entropies,▁▁▂▁▁▁▂▁▁▂▁▁▁▁▁█▁▁▁▁▁▁█▁▁▂▁▁▁▁█▂▁▁▁▁▁▂▁▁
profiling/Time taken: GRPOTrainer._prepare_inputs,█▁▁▆▆█▁▁▇▁▁▁▁▁▅▁▁▁▁▆▁█▁▁▁▆▁▁▁▁▁▁▇▁▁▁▁▅▁▁
profiling/Time taken: GRPOTrainer.compute_loss,▇▄▅▆▄▇▄▆▄▆▅▅▆▄█▄▃▄▂▅▄▅█▅▅▁▄▅▄▄▅▄▅▃▃▃▂▅▃▅
profiling/Time taken: GRPOTrainer.reward_fn,▂▂▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▂▂▂▂▂▁▂▂▂▂▂▂▃▂▁▁█▂▂▂▃
profiling/Time taken: GRPOTrainer.transformers.generate,▃█▆██▆█▄▁▆▅▃▃▄▇▄▂▁▇▇▃▅▄▃▂▄▃▅▅▆▆█▂▅▇▃██▄▂
train/clip_ratio/high_max,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/clip_ratio/high_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/clip_ratio/low_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/accuracy,0.30303
profiling/Time taken: GRPOTrainer._calculate_rewards,0.00102
profiling/Time taken: GRPOTrainer._get_per_token_logps_and_entropies,0.05548
profiling/Time taken: GRPOTrainer._prepare_inputs,1e-05
profiling/Time taken: GRPOTrainer.compute_loss,0.05843
profiling/Time taken: GRPOTrainer.reward_fn,0.0002
profiling/Time taken: GRPOTrainer.transformers.generate,4.18562
total_flos,0
train/clip_ratio/high_max,0
train/clip_ratio/high_mean,0


{'learning_rate': 5e-05} → accuracy: 0.3030


Loading weights:   0%|          | 0/290 [00:00<?, ?it/s]

trainable params: 8,798,208 || all params: 502,830,976 || trainable%: 1.7497


Map:   0%|          | 0/1024 [00:00<?, ? examples/s]

Map:   0%|          | 0/660 [00:00<?, ? examples/s]

Step,Training Loss
1,0.015197
2,-0.162927
3,-0.033961
4,0.112657
5,0.000656
6,-0.015614
7,4.7e-05
8,0.073139
9,5.7e-05
10,0.020621


Evaluating: 100%|██████████| 11/11 [00:49<00:00,  4.46s/it]


Final Accuracy: 193/660 = 29.24%





0,1
eval/accuracy,▁
profiling/Time taken: GRPOTrainer._calculate_rewards,▃▃▆▄▆▃▅▂▁▂▁▆▆▇▆▆▃▅▅▂▄▇▇█▄▂▂▂▆▆▅▄▆▂▅▃▁▄▁▄
profiling/Time taken: GRPOTrainer._get_per_token_logps_and_entropies,▇▁▇█▁▇▁▁▁▁▁█▁█▂▂▂▂▂▁▂█▂▂▁▁▂▇▂▂▁▇▂▂▁█▂▂█▁
profiling/Time taken: GRPOTrainer._prepare_inputs,▇▁▁▁▁▁▇▁▁▁▁▁▁▁▆▁██▁▁▁▁▁▁▁▁▁▁█▁▁▁▁▅▁▇▁▁▄▅
profiling/Time taken: GRPOTrainer.compute_loss,▅▇▇▄▄▃▃▇▁▂▄▂▃▂▄▆▂▅██▄▅▄▃▇▃▇▅▇▇▇▇▅▇▃▆▃▆▆▄
profiling/Time taken: GRPOTrainer.reward_fn,▃▃▄▇█▂▁█▄▁▄▅▄▄▄▆▄▃▂▅▅▃▃▅▄▆▅▆▄▇▄▄▄▆▄▃▂▂▁▃
profiling/Time taken: GRPOTrainer.transformers.generate,▃▇▆▆▇▆▇▇▄▂▄▃▅▆▄▅▅▇▃▇█▂▇▆▅█▄▇▇▇█▆▇█▃▇▆▄▄▁
train/clip_ratio/high_max,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/clip_ratio/high_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/clip_ratio/low_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/accuracy,0.29242
profiling/Time taken: GRPOTrainer._calculate_rewards,0.00089
profiling/Time taken: GRPOTrainer._get_per_token_logps_and_entropies,0.05949
profiling/Time taken: GRPOTrainer._prepare_inputs,1e-05
profiling/Time taken: GRPOTrainer.compute_loss,0.06182
profiling/Time taken: GRPOTrainer.reward_fn,0.00022
profiling/Time taken: GRPOTrainer.transformers.generate,3.32994
total_flos,0
train/clip_ratio/high_max,0
train/clip_ratio/high_mean,0


{'learning_rate': 0.0001} → accuracy: 0.2924


Loading weights:   0%|          | 0/290 [00:00<?, ?it/s]

trainable params: 8,798,208 || all params: 502,830,976 || trainable%: 1.7497


Map:   0%|          | 0/1024 [00:00<?, ? examples/s]

Map:   0%|          | 0/1024 [00:00<?, ? examples/s]

Map:   0%|          | 0/660 [00:00<?, ? examples/s]

Step,Training Loss
1,-0.008808
2,0.090837
3,-0.03623
4,-0.012781
5,0.096515
6,0.074719
7,-0.005574
8,-0.016433
9,0.107041
10,-0.04528


Evaluating: 100%|██████████| 11/11 [00:50<00:00,  4.58s/it]


Final Accuracy: 209/660 = 31.67%





0,1
eval/accuracy,▁
profiling/Time taken: GRPOTrainer._calculate_rewards,▂▄█▂▅▃▂▄▄▂▅▅▂▁▂▅▅▂▃▁▂▂▄▅▆▄▃▆▂▂▅▅▂▂█▅▃▄▅▆
profiling/Time taken: GRPOTrainer._get_per_token_logps_and_entropies,██▂▂▂▁▂▂▁▁▁▂▁▁▇▁▁▁▂▁▇▂▇▁▁█▂▂▆▁▁▇▁▇▁▂▇▂▂▂
profiling/Time taken: GRPOTrainer._prepare_inputs,▃▇▁▁▁▁█▁█▁▇▅▁▁▁▆▁▁▁▁▇▁▇▁▇▁▁▁▁█▅▁▁█▁▆▁▁▁▁
profiling/Time taken: GRPOTrainer.compute_loss,▂▆▄▆▄▅▅▅▅█▅▄▃▁▆▂▇▁▅▅▅▄▆▅▅█▆▄▃▅▄▇█▆▅▇▄▆▃▅
profiling/Time taken: GRPOTrainer.reward_fn,▃▃▂▂▁▁▂▃▂▃▃▄▂▁▃▃▂▁▆▃▂█▁▃▄▂▃▁▃▁▁▄▄▃▂▂▁▂▃▃
profiling/Time taken: GRPOTrainer.transformers.generate,▂▇▄▄█▄▅██▃▇▄▄▆▄▃█▄▅█▄▆▃▇▂▇█▄██▃▁▅█▄▆█▄▇▅
train/clip_ratio/high_max,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/clip_ratio/high_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/clip_ratio/low_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/accuracy,0.31667
profiling/Time taken: GRPOTrainer._calculate_rewards,0.00112
profiling/Time taken: GRPOTrainer._get_per_token_logps_and_entropies,0.05486
profiling/Time taken: GRPOTrainer._prepare_inputs,1e-05
profiling/Time taken: GRPOTrainer.compute_loss,0.05745
profiling/Time taken: GRPOTrainer.reward_fn,0.00022
profiling/Time taken: GRPOTrainer.transformers.generate,4.15994
total_flos,0
train/clip_ratio/high_max,0
train/clip_ratio/high_mean,0


{'sft_frac': 0.0} → accuracy: 0.3167


Loading weights:   0%|          | 0/290 [00:00<?, ?it/s]

trainable params: 8,798,208 || all params: 502,830,976 || trainable%: 1.7497


Map:   0%|          | 0/1024 [00:00<?, ? examples/s]

Map:   0%|          | 0/660 [00:00<?, ? examples/s]

Step,Training Loss
1,0.015197
2,-0.162927
3,-0.014115
4,3.6e-05
5,0.015439
6,0.00688
7,4.5e-05
8,0.021041
9,4.7e-05
10,0.03


Evaluating: 100%|██████████| 11/11 [00:48<00:00,  4.37s/it]


Final Accuracy: 200/660 = 30.30%





0,1
eval/accuracy,▁
profiling/Time taken: GRPOTrainer._calculate_rewards,▃▁▂▅▃▇▅▅▅▂▂▂▅▃▃▆▆▄▅▂▃█▁█▁▆▆▄█▅▄▅▇▇▁▅▆▃▁▂
profiling/Time taken: GRPOTrainer._get_per_token_logps_and_entropies,▁▂▁▁▁▃▁▂▁▁▁▁▂▁▁▂▁▁▂█▂▃▁▂▁▁▁▃▁▁▁▁▁▁▁▁▁▂▁▁
profiling/Time taken: GRPOTrainer._prepare_inputs,▁▁█▁█▁▁▆▁▁▁▅▁▁▁▁▆▁█▆▁█▁▁▁▁▁▁▁▇▁▁▆▁▁▆▅█▁█
profiling/Time taken: GRPOTrainer.compute_loss,▄▅▃▄▃▄▄█▁▄▄▅▅▆▄▃▆▄▄▃▅▂▄▆▅▄▃▃▆▆▃▃▂▁▄▄▃▅▅▂
profiling/Time taken: GRPOTrainer.reward_fn,▂▂█▇▄▃▃▄▄▃▆▃▂▄▆▅▃▂▃▂▂▁▄▅▃▂▂▃▆▄▃▃▂▄▃▃▄▄▅▅
profiling/Time taken: GRPOTrainer.transformers.generate,▄▂▆▇▇▁▁▇▄▃▆▄▇▄▄▂▄▄█▇▄▃▄▆▄▄▂▄▄▅█▂▃▇█▇▇▂▃▂
train/clip_ratio/high_max,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/clip_ratio/high_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/clip_ratio/low_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/accuracy,0.30303
profiling/Time taken: GRPOTrainer._calculate_rewards,0.00069
profiling/Time taken: GRPOTrainer._get_per_token_logps_and_entropies,0.06304
profiling/Time taken: GRPOTrainer._prepare_inputs,0.0
profiling/Time taken: GRPOTrainer.compute_loss,0.06596
profiling/Time taken: GRPOTrainer.reward_fn,0.00022
profiling/Time taken: GRPOTrainer.transformers.generate,4.41135
total_flos,0
train/clip_ratio/high_max,0
train/clip_ratio/high_mean,0


{'sft_frac': 0.5} → accuracy: 0.3030


Loading weights:   0%|          | 0/290 [00:00<?, ?it/s]

trainable params: 8,798,208 || all params: 502,830,976 || trainable%: 1.7497


Map:   0%|          | 0/1024 [00:00<?, ? examples/s]

Map:   0%|          | 0/1024 [00:00<?, ? examples/s]

Map:   0%|          | 0/660 [00:00<?, ? examples/s]

Step,Training Loss
1,0.015197
2,0.092934
3,0.129489
4,-0.014396
5,-0.018466
6,3.8e-05
7,0.022653
8,4.5e-05
9,-0.00853
10,0.049697


Evaluating: 100%|██████████| 11/11 [00:47<00:00,  4.33s/it]


Final Accuracy: 223/660 = 33.79%





0,1
eval/accuracy,▁
profiling/Time taken: GRPOTrainer._calculate_rewards,▅▆▆▂▆▆▃▇▂▆▂▄▂▇▃▄▃▅▂█▃▃▂▄▅▅▁▆▆▂▅▄▆▃▄▇▂▇▂▃
profiling/Time taken: GRPOTrainer._get_per_token_logps_and_entropies,▇▂█▂█▂▂▂█▂▁▂▂▁▁▁▂▁▁▁▇▂▁▁▂▁▁▁▁▁▁▁▁▁█▁█▁▂▁
profiling/Time taken: GRPOTrainer._prepare_inputs,▁▁▁▁▁▁▁▁▁▁▁▆▁▁▁▁▁▇▅▁▁█▁▄▁▁▄▁▁▇▁▅▁▁▁█▁▁▁▁
profiling/Time taken: GRPOTrainer.compute_loss,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
profiling/Time taken: GRPOTrainer.reward_fn,▃▅▂▄▃▅▄█▃▅▃▄▂▂▂▃▅▃▂▂▂▄▄▄▆▃▇▂▁▁▂▃▄▃▅▃▂▂▆▃
profiling/Time taken: GRPOTrainer.transformers.generate,▄▂▃▅████▃▄▇█▆▄▅▃▆▅▅▇██▅█▂▄▄▄▂██▃▇▅▂█▃▆█▁
train/clip_ratio/high_max,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/clip_ratio/high_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/clip_ratio/low_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/accuracy,0.33788
profiling/Time taken: GRPOTrainer._calculate_rewards,0.0007
profiling/Time taken: GRPOTrainer._get_per_token_logps_and_entropies,0.35988
profiling/Time taken: GRPOTrainer._prepare_inputs,0.0
profiling/Time taken: GRPOTrainer.compute_loss,0.36222
profiling/Time taken: GRPOTrainer.reward_fn,0.00021
profiling/Time taken: GRPOTrainer.transformers.generate,2.80541
total_flos,0
train/clip_ratio/high_max,0
train/clip_ratio/high_mean,0


{'sft_frac': 1.0} → accuracy: 0.3379


[{'num_generations': 8, 'accuracy': 0.32727272727272727},
 {'num_generations': 16, 'accuracy': 0.29545454545454547},
 {'beta': 0.01, 'accuracy': 0.3106060606060606},
 {'beta': 0.02, 'accuracy': 0.31363636363636366},
 {'beta': 0.04, 'accuracy': 0.30303030303030304},
 {'learning_rate': 1e-05, 'accuracy': 0.27121212121212124},
 {'learning_rate': 5e-05, 'accuracy': 0.30303030303030304},
 {'learning_rate': 0.0001, 'accuracy': 0.2924242424242424},
 {'sft_frac': 0.0, 'accuracy': 0.31666666666666665},
 {'sft_frac': 0.5, 'accuracy': 0.30303030303030304},
 {'sft_frac': 1.0, 'accuracy': 0.3378787878787879}]

In [None]:
from collections import defaultdict

# Bucket the sweep results by the one hyperparameter each run varied
# (the only key in a result other than "accuracy").
grouped = defaultdict(list)
for entry in results:
    varied_keys = [key for key in entry if key != "accuracy"]
    grouped[varied_keys[0]].append(entry)

# Report the highest-accuracy run within each bucket.
for varied, entries in grouped.items():
    winner = max(entries, key=lambda run: run["accuracy"])
    print(f"Best {varied}: {winner}")


Best num_generations: {'num_generations': 8, 'accuracy': 0.32727272727272727}
Best beta: {'beta': 0.02, 'accuracy': 0.31363636363636366}
Best learning_rate: {'learning_rate': 5e-05, 'accuracy': 0.30303030303030304}
Best sft_frac: {'sft_frac': 1.0, 'accuracy': 0.3378787878787879}


## Results

### Best per sweep category

| Param | Best Value | Accuracy | Notes |
|---|---|---|---|
| `num_generations` | 16* | 36.4% | High variance: 16 ranged 29.5–36.4% (29.5% in the run logged above); 8 averaged ~31.6% (32.7% in the run logged above) |
| `beta` | 0.02* | 31.4% | Near-flat (30.3–31.4%); not sensitive |
| `learning_rate` | 5e-5 | 30.3% | 1e-5 too slow (27.1%); 1e-4 too aggressive (29.2%) |
| `sft_frac` | 0.0* | 31.7% | All-new data (31.7%) beat the 50/50 mix (30.3%), but 1.0 scored highest at 33.8%** |

*\* More experiments needed — high variance between runs; multiple seeds required to confirm.*

*\*\* sft_frac=1.0 surprisingly strong — model may benefit from reinforcing known problems.*

### Best combined config (heuristic)

```python
best_cfg = {**cfg,
    "num_generations": 16,
    "beta": 0.02,
    "learning_rate": 5e-5,
    "sft_frac": 0.0,
    "max_steps": 500,  # longer final run
}
```

## Conclusion

Sweep over 100 steps, binary reward (1.0/0.0), seed=1337.

| Param | Values Tested | Best | Accuracy |
|---|---|---|---|
| `num_generations` | 8, 16 | **16** | 36.4%* |
| `beta` | 0.01, 0.02, 0.04 | **0.02** | 31.4% |
| `learning_rate` | 1e-5, 5e-5, 1e-4 | **5e-5** | 30.3% |
| `sft_frac` | 0.0, 0.5, 1.0 | **1.0** | 33.8% |

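*\* High run-to-run variance observed — multiple seeds needed to confirm. These are heuristic picks from single-run sweeps. The 36.4% figure for `num_generations=16` is not from the run logged in this notebook, which scored 29.5% (vs. 32.7% for 8). `sft_frac=1.0` scored highest in the sweep, although the heuristic combined config above uses `sft_frac=0.0`.*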