In [1]:
import gymnasium as gym
import torch, torch.nn as nn, torch.optim as optim
import numpy as np
from torch.distributions.categorical import Categorical

# --- small policy ---
class Policy(nn.Module):
    def __init__(self, obs_dim, act_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_dim, 128), nn.Tanh(),
            nn.Linear(128, 128), nn.Tanh(),
            nn.Linear(128, act_dim)
        )
    def forward(self, x): return self.net(x)           # logits
    def dist(self, x): return Categorical(logits=self(x))

def rollout_group(env_fn, policy, G, max_steps=256, gamma=0.99, device="cpu"):
    data, ep_returns = [], []
    for _ in range(G):
        env = env_fn()
        s, _ = env.reset()
        ep = {"s":[], "a":[], "logp":[], "r":[], "done":[], "s_next":[]}
        for t in range(max_steps):
            s_t = torch.tensor(s, dtype=torch.float32, device=device)
            d = policy.dist(s_t)
            a = d.sample()
            logp = d.log_prob(a).item()
            s2, r, terminated, truncated, _ = env.step(a.item())
            ep["s"].append(s); ep["a"].append(a.item()); ep["logp"].append(logp)
            ep["r"].append(r); ep["done"].append(terminated or truncated); ep["s_next"].append(s2)
            s = s2
            if terminated or truncated: break
        # Monte-Carlo return
        R, Gt = 0.0, []
        for r in reversed(ep["r"]):
            R = r + gamma * R
            Gt.append(R)
        ep["G"] = list(reversed(Gt))
        ep_returns.append(sum(ep["r"]))
        data.append(ep)
        env.close()
    return data, np.array(ep_returns, dtype=np.float32)

def build_batch(data, advantages):
    S,A,LOGP,ADV = [],[],[],[]
    for ep, adv in zip(data, advantages):
        n = len(ep["s"])
        S.append(torch.tensor(ep["s"], dtype=torch.float32))
        A.append(torch.tensor(ep["a"], dtype=torch.int64))
        LOGP.append(torch.tensor(ep["logp"], dtype=torch.float32))
        ADV.append(torch.full((n,), adv, dtype=torch.float32))   # outcome supervision
    return torch.cat(S), torch.cat(A), torch.cat(LOGP), torch.cat(ADV)

def estimate_group_advantages_outcome(ep_returns, eps=1e-8):
    m, s = ep_returns.mean(), ep_returns.std()
    if s < eps: s = eps
    return (ep_returns - m) / s

def kl_to_ref(policy, policy_ref, states):
    with torch.no_grad():
        p_ref = policy_ref.dist(states)
    p = policy.dist(states)
    # D_KL(p || p_ref)
    return torch.distributions.kl_divergence(p, p_ref).mean()


In [2]:
# ---- training loop ----
env_id = "LunarLander-v3"
def make_env(): return gym.make(env_id)

obs_dim = make_env().observation_space.shape[0]
act_dim = make_env().action_space.n

device = "cuda" if torch.cuda.is_available() else "cpu"
pi = Policy(obs_dim, act_dim).to(device)
pi_ref = Policy(obs_dim, act_dim).to(device)
pi_ref.load_state_dict(pi.state_dict())  # initial reference

opt = optim.Adam(pi.parameters(), lr=3e-4)

G = 16; T = 1200; epochs = 8; minibatches = 16
clip_eps = 0.2; beta_kl = 0.02; ent_coef = 0.01

for it in range(200):
    data, epR = rollout_group(make_env, pi, G, max_steps=T, device=device)
    adv_ep = estimate_group_advantages_outcome(epR)           # outcome supervision
    # (optional) normalize again:
    adv_ep = (adv_ep - adv_ep.mean()) / (adv_ep.std()+1e-8)

    S, A, LOGP_OLD, ADV = build_batch(data, adv_ep)
    perm = torch.randperm(len(S))
    S, A, LOGP_OLD, ADV = S[perm].to(device), A[perm].to(device), LOGP_OLD[perm].to(device), ADV[perm].to(device)

    for _ in range(epochs):
        for mb in torch.chunk(torch.arange(len(S)), minibatches):
            s, a, logp_old, adv = S[mb], A[mb], LOGP_OLD[mb], ADV[mb]
            dist = pi.dist(s)
            logp = dist.log_prob(a)
            ratio = (logp - logp_old).exp()
            unclipped = ratio * adv
            clipped = torch.clamp(ratio, 1.0-clip_eps, 1.0+clip_eps) * adv
            L_clip = torch.min(unclipped, clipped).mean()
            ent = dist.entropy().mean()
            dkl = kl_to_ref(pi, pi_ref, s)
            loss = -(L_clip - beta_kl*dkl + ent_coef*ent)
            opt.zero_grad(); loss.backward(); nn.utils.clip_grad_norm_(pi.parameters(), 0.5); opt.step()

    # Update reference occasionally (or keep fixed)
    if (it+1) % 20 == 0:
        pi_ref.load_state_dict(pi.state_dict())
    print(f"Iter {it} | avg return {epR.mean():.1f} | KL {dkl.item():.4f}")


  from pkg_resources import resource_stream, resource_exists
  S.append(torch.tensor(ep["s"], dtype=torch.float32))


Iter 0 | avg return -194.1 | KL 0.0181
Iter 1 | avg return -172.5 | KL 0.0183
Iter 2 | avg return -196.9 | KL 0.0390
Iter 3 | avg return -132.1 | KL 0.0366
Iter 4 | avg return -98.9 | KL 0.0402
Iter 5 | avg return -148.8 | KL 0.0589
Iter 6 | avg return -123.4 | KL 0.0658
Iter 7 | avg return -139.1 | KL 0.0675
Iter 8 | avg return -155.0 | KL 0.0735
Iter 9 | avg return -134.0 | KL 0.0452
Iter 10 | avg return -137.6 | KL 0.0609
Iter 11 | avg return -134.8 | KL 0.0556
Iter 12 | avg return -124.7 | KL 0.0407
Iter 13 | avg return -138.0 | KL 0.0623
Iter 14 | avg return -111.2 | KL 0.0443
Iter 15 | avg return -112.1 | KL 0.0284
Iter 16 | avg return -130.2 | KL 0.0398
Iter 17 | avg return -115.6 | KL 0.0592
Iter 18 | avg return -130.8 | KL 0.0355


KeyboardInterrupt: 

In [35]:
import gymnasium as gym
import torch, torch.nn as nn, torch.optim as optim 
import numpy as np
from torch.distributions.categorical import Categorical
from multiprocessing import Pool, get_context
from functools import partial
from pathlib import Path
import time
import os, time
import multiprocessing as mp
torch.set_num_threads(1)  # avoid OpenMP storms inside each worker

# === Policy ===
class Policy(nn.Module):
    def __init__(self, obs_dim, act_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_dim, 128), nn.Tanh(),
            nn.Linear(128, 128), nn.Tanh(),
            nn.Linear(128, act_dim)
        )
    def forward(self, x): return self.net(x)
    def dist(self, x): return Categorical(logits=self(x))

# === Parallel rollout function ===
# ---- replace rollout_group_parallel with CPU workers and fork ----
def rollout_one(env_id, state_dict_cpu, max_steps=256, gamma=0.99):
    """Run one rollout entirely on CPU (safe with fork)."""
    env = gym.make(env_id)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.n
    policy = Policy(obs_dim, act_dim)               # CPU
    policy.load_state_dict(state_dict_cpu)
    policy.eval()

    s, _ = env.reset()
    ep = {"s":[], "a":[], "logp":[], "r":[], "done":[], "s_next":[]}
    for _ in range(max_steps):
        s_t = torch.tensor(s, dtype=torch.float32).unsqueeze(0)  # CPU tensor
        with torch.no_grad():
            dist = policy.dist(s_t)
            a = dist.sample()
            logp = dist.log_prob(a).item()
        s2, r, terminated, truncated, _ = env.step(a.item())
        ep["s"].append(s); ep["a"].append(a.item()); ep["logp"].append(logp)
        ep["r"].append(r); ep["done"].append(terminated or truncated); ep["s_next"].append(s2)
        s = s2
        if terminated or truncated:
            break

    # MC return
    R, Gt = 0.0, []
    for r in reversed(ep["r"]):
        R = r + gamma * R
        Gt.append(R)
    ep["G"] = list(reversed(Gt))
    env.close()
    return ep, sum(ep["r"])

def rollout_group_parallel(env_id, policy_gpu, G, max_steps=256, gamma=0.99, n_workers=None):
    """Fork-safe parallel rollouts on CPU; training stays on GPU."""
    # ship a CPU copy of weights (don’t pass CUDA tensors!)
    state_dict_cpu = {k: v.detach().cpu() for k, v in policy_gpu.state_dict().items()}
    if n_workers is None:
        n_workers = min(G, os.cpu_count() or 1)

    # Use 'fork' on Linux to avoid pickling issues
    if mp.get_start_method(allow_none=True) != "fork":
        try:
            mp.set_start_method("fork", force=True)
        except RuntimeError:
            pass  # already set (ok)

    with mp.Pool(processes=n_workers) as pool:
        results = pool.starmap(
            rollout_one,
            [(env_id, state_dict_cpu, max_steps, gamma) for _ in range(G)]
        )
    data, ep_returns = zip(*results)
    return list(data), np.array(ep_returns, dtype=np.float32)


# === Helper functions ===
def build_batch(data, advantages):
    S,A,LOGP,ADV = [],[],[],[]
    for ep, adv in zip(data, advantages):
        n = len(ep["s"])
        S.append(torch.tensor(ep["s"], dtype=torch.float32))
        A.append(torch.tensor(ep["a"], dtype=torch.int64))
        LOGP.append(torch.tensor(ep["logp"], dtype=torch.float32))
        ADV.append(torch.full((n,), adv, dtype=torch.float32))
    return torch.cat(S), torch.cat(A), torch.cat(LOGP), torch.cat(ADV)

def estimate_group_advantages_outcome(ep_returns, eps=1e-8):
    m, s = ep_returns.mean(), ep_returns.std()
    if s < eps: s = eps
    return (ep_returns - m) / s

def kl_to_ref(policy, policy_ref, states):
    with torch.no_grad():
        p_ref = policy_ref.dist(states)
    p = policy.dist(states)
    return torch.distributions.kl_divergence(p, p_ref).mean()

# === Training loop ===
env_id = "LunarLander-v3"
env = gym.make(env_id)
obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.n
env.close()

device = "cuda" if torch.cuda.is_available() else "cpu"
device = 'cpu'
pi = Policy(obs_dim, act_dim).to(device)
pi_ref = Policy(obs_dim, act_dim).to(device)
pi_ref.load_state_dict(pi.state_dict())

opt = optim.Adam(pi.parameters(), lr=3e-4)
G = 32             # one rollout per CPU core
T = 1024
epochs = 8
minibatches = 16
clip_eps = 0.2
beta_kl = 0.05
ent_coef = 0.01

for it in range(300):
    t0 = time.time()
    data, epR = rollout_group_parallel(env_id, pi, G, max_steps=T, n_workers=24)
    adv_ep = estimate_group_advantages_outcome(epR)
    adv_ep = (adv_ep - adv_ep.mean()) / (adv_ep.std() + 1e-8)

    S, A, LOGP_OLD, ADV = build_batch(data, adv_ep)
    perm = torch.randperm(len(S))
    S, A, LOGP_OLD, ADV = S[perm].to(device), A[perm].to(device), LOGP_OLD[perm].to(device), ADV[perm].to(device)

    for _ in range(epochs):
        for mb in torch.chunk(torch.arange(len(S)), minibatches):
            s, a, logp_old, adv = S[mb], A[mb], LOGP_OLD[mb], ADV[mb]
            dist = pi.dist(s)
            logp = dist.log_prob(a)
            ratio = (logp - logp_old).exp()
            unclipped = ratio * adv
            clipped = torch.clamp(ratio, 1.0-clip_eps, 1.0+clip_eps) * adv
            L_clip = torch.min(unclipped, clipped).mean()
            ent = dist.entropy().mean()
            dkl = kl_to_ref(pi, pi_ref, s)
            loss = -(L_clip - beta_kl*dkl + ent_coef*ent)
            opt.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(pi.parameters(), 0.5)
            opt.step()

    if (it+1) % 20 == 0:
        pi_ref.load_state_dict(pi.state_dict())

    dt = time.time() - t0
    print(f"Iter {it} | avgR {epR.mean():.1f} | KL {dkl.item():.4f} | time {dt:.2f}s")


Iter 0 | avgR -235.4 | KL 0.0116 | time 0.99s
Iter 1 | avgR -188.0 | KL 0.0155 | time 1.33s
Iter 2 | avgR -104.3 | KL 0.0252 | time 1.34s
Iter 3 | avgR -116.3 | KL 0.0282 | time 1.01s
Iter 4 | avgR -128.7 | KL 0.0300 | time 0.97s
Iter 5 | avgR -166.1 | KL 0.0370 | time 0.96s
Iter 6 | avgR -121.3 | KL 0.0255 | time 1.05s
Iter 7 | avgR -143.9 | KL 0.0521 | time 1.24s
Iter 8 | avgR -191.6 | KL 0.0620 | time 1.26s
Iter 9 | avgR -119.6 | KL 0.0524 | time 1.33s
Iter 10 | avgR -151.3 | KL 0.0554 | time 1.04s
Iter 11 | avgR -112.0 | KL 0.0457 | time 1.09s
Iter 12 | avgR -121.5 | KL 0.0572 | time 1.07s
Iter 13 | avgR -161.4 | KL 0.1003 | time 1.03s
Iter 14 | avgR -111.2 | KL 0.0541 | time 1.04s
Iter 15 | avgR -111.9 | KL 0.0565 | time 1.03s
Iter 16 | avgR -103.2 | KL 0.0837 | time 0.97s
Iter 17 | avgR -119.0 | KL 0.0924 | time 1.30s
Iter 18 | avgR -108.0 | KL 0.0739 | time 1.13s
Iter 19 | avgR -110.6 | KL 0.0650 | time 0.99s
Iter 20 | avgR -115.6 | KL 0.0067 | time 1.04s
Iter 21 | avgR -134.7 |

In [7]:
# ===== PPO for LunarLander-v3 (parallel, apples-to-apples with your GRPO) =====
import gymnasium as gym
import torch, torch.nn as nn, torch.optim as optim
import numpy as np, os, time, multiprocessing as mp
from torch.distributions.categorical import Categorical

torch.set_num_threads(1)

# === Actor-Critic ===
class ActorCritic(nn.Module):
    def __init__(self, obs_dim, act_dim):
        super().__init__()
        self.pi = nn.Sequential(
            nn.Linear(obs_dim, 128), nn.Tanh(),
            nn.Linear(128, 128), nn.Tanh(),
            nn.Linear(128, act_dim)
        )
        self.v = nn.Sequential(
            nn.Linear(obs_dim, 128), nn.Tanh(),
            nn.Linear(128, 128), nn.Tanh(),
            nn.Linear(128, 1)
        )
    def dist(self, x): return Categorical(logits=self.pi(x))
    def value(self, x): return self.v(x).squeeze(-1)

# === Worker rollout ===
def rollout_one_ppo(env_id, state_dict_cpu, max_steps=256, gamma=0.99):
    env = gym.make(env_id)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.n

    ac = ActorCritic(obs_dim, act_dim)  # CPU
    ac.load_state_dict(state_dict_cpu)
    ac.eval()

    s, _ = env.reset()
    ep = {"s":[], "a":[], "logp":[], "r":[], "done":[], "v":[], "s_next":[]}

    for _ in range(max_steps):
        s_t = torch.tensor(s, dtype=torch.float32).unsqueeze(0)
        with torch.no_grad():
            d = ac.dist(s_t)
            a = d.sample()
            logp = d.log_prob(a).item()
            v = ac.value(s_t).item()

        s2, r, terminated, truncated, _ = env.step(a.item())
        ep["s"].append(s); ep["a"].append(a.item()); ep["logp"].append(logp)
        ep["r"].append(r); ep["done"].append(terminated or truncated)
        ep["v"].append(v); ep["s_next"].append(s2)
        s = s2
        if terminated or truncated: break

    # bootstrap value for final state
    with torch.no_grad():
        s_last = torch.tensor(s, dtype=torch.float32).unsqueeze(0)
        v_last = ac.value(s_last).item()

    env.close()
    return ep, sum(ep["r"]), v_last

def rollout_group_parallel_ppo(env_id, ac_gpu, G, max_steps=256, gamma=0.99, n_workers=None):
    # ship CPU weights safely (no CUDA tensors)
    state_dict_cpu = {k: v.detach().cpu() for k, v in ac_gpu.state_dict().items()}
    if n_workers is None:
        n_workers = min(G, os.cpu_count() or 1)
    if mp.get_start_method(allow_none=True) != "fork":
        try: mp.set_start_method("fork", force=True)
        except RuntimeError: pass

    with mp.Pool(processes=n_workers) as pool:
        results = pool.starmap(
            rollout_one_ppo,
            [(env_id, state_dict_cpu, max_steps, gamma) for _ in range(G)]
        )

    data, ep_returns, v_last = zip(*results)
    return list(data), np.array(ep_returns, dtype=np.float32), np.array(v_last, dtype=np.float32)

# === GAE advantages ===
def compute_gae_for_episode(r, v, v_last, done, gamma=0.99, lam=0.95):
    # r, v are lists; done is list of bools (episode end flags)
    T = len(r)
    adv = np.zeros(T, dtype=np.float32)
    lastgaelam = 0.0
    for t in reversed(range(T)):
        nextv = v_last if t == T-1 else v[t+1]
        nonterminal = 0.0 if done[t] else 1.0
        delta = r[t] + gamma * nextv * nonterminal - v[t]
        lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
        adv[t] = lastgaelam
    ret = adv + np.array(v, dtype=np.float32)
    return adv, ret

# === Build flat batch ===
def build_minibatch(data, v_last_list, gamma=0.99, lam=0.95):
    S, A, LOGP_OLD, ADV, RET = [], [], [], [], []
    for ep, v_last in zip(data, v_last_list):
        adv, ret = compute_gae_for_episode(ep["r"], ep["v"], v_last, ep["done"], gamma, lam)
        S.append(torch.tensor(ep["s"], dtype=torch.float32))
        A.append(torch.tensor(ep["a"], dtype=torch.int64))
        LOGP_OLD.append(torch.tensor(ep["logp"], dtype=torch.float32))
        ADV.append(torch.tensor(adv, dtype=torch.float32))
        RET.append(torch.tensor(ret, dtype=torch.float32))
    S = torch.cat(S); A = torch.cat(A); LOGP_OLD = torch.cat(LOGP_OLD)
    ADV = torch.cat(ADV); RET = torch.cat(RET)
    # normalize advantages
    ADV = (ADV - ADV.mean()) / (ADV.std() + 1e-8)
    return S, A, LOGP_OLD, ADV, RET

# === Train ===
def train_ppo(
    env_id="LunarLander-v3",
    total_iters=50,
    G=16, T=1024,
    epochs=16, minibatches=16,
    gamma=0.99, lam=0.95,
    clip_eps=0.2, vf_coef=0.5, ent_coef=0.01,
    lr=3e-4, max_grad_norm=0.5,
    device=None
):
    env = gym.make(env_id)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.n
    env.close()

    device = device or ("cuda" if torch.cuda.is_available() else "cpu")
    # print(device)
    device = 'cpu'
    ac = ActorCritic(obs_dim, act_dim).to(device)
    opt = optim.Adam(ac.parameters(), lr=lr)

    for it in range(total_iters):
        t0 = time.time()
        data, epR, v_last = rollout_group_parallel_ppo(env_id, ac, G, max_steps=T, gamma=gamma)
        S, A, LOGP_OLD, ADV, RET = build_minibatch(data, v_last, gamma=gamma, lam=lam)

        # shuffle & move
        idx = torch.randperm(len(S))
        S, A, LOGP_OLD, ADV, RET = S[idx].to(device), A[idx].to(device), LOGP_OLD[idx].to(device), ADV[idx].to(device), RET[idx].to(device)

        for _ in range(epochs):
            for mb in torch.chunk(torch.arange(len(S)), minibatches):
                s, a, logp_old, adv, ret = S[mb], A[mb], LOGP_OLD[mb], ADV[mb], RET[mb]
                dist = ac.dist(s)
                logp = dist.log_prob(a)
                ratio = (logp - logp_old).exp()

                # Policy clip loss
                L_clip = torch.min(ratio * adv, torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * adv).mean()

                # Value loss (MSE)
                v_pred = ac.value(s)
                v_loss = 0.5 * (ret - v_pred).pow(2).mean()  # unclipped; can add value clipping if desired

                # Entropy bonus
                ent = dist.entropy().mean()

                loss = -(L_clip) + vf_coef * v_loss - ent_coef * ent
                opt.zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(ac.parameters(), max_grad_norm)
                opt.step()

        dt = time.time() - t0
        print(f"[PPO] Iter {it} | avgR {epR.mean():.1f} | time {dt:.2f}s")

    return ac


In [8]:
from dense_scripts.utils import record_videos
model = train_ppo()
device = "cuda"
env_id = "LunarLander-v3"
model.to(device)
record_videos(model, env_id, video_dir=f"videos/PPO", episodes=6, device=device)

[PPO] Iter 0 | avgR -142.8 | time 1.18s
[PPO] Iter 1 | avgR -292.3 | time 1.17s
[PPO] Iter 2 | avgR -295.6 | time 1.12s
[PPO] Iter 3 | avgR -345.6 | time 1.02s
[PPO] Iter 4 | avgR -263.3 | time 1.04s
[PPO] Iter 5 | avgR -191.1 | time 1.13s
[PPO] Iter 6 | avgR -125.2 | time 1.11s
[PPO] Iter 7 | avgR -205.2 | time 1.15s
[PPO] Iter 8 | avgR -224.4 | time 1.17s
[PPO] Iter 9 | avgR -105.3 | time 1.15s
[PPO] Iter 10 | avgR -108.7 | time 1.14s
[PPO] Iter 11 | avgR -114.3 | time 1.11s
[PPO] Iter 12 | avgR -174.8 | time 1.09s
[PPO] Iter 13 | avgR -141.5 | time 1.20s
[PPO] Iter 14 | avgR -198.9 | time 1.07s
[PPO] Iter 15 | avgR -189.2 | time 1.17s
[PPO] Iter 16 | avgR -82.2 | time 1.31s
[PPO] Iter 17 | avgR -90.9 | time 1.20s
[PPO] Iter 18 | avgR -116.9 | time 1.42s
[PPO] Iter 19 | avgR -104.6 | time 1.23s
[PPO] Iter 20 | avgR -22.2 | time 1.59s
[PPO] Iter 21 | avgR -119.1 | time 1.29s
[PPO] Iter 22 | avgR -54.8 | time 1.19s
[PPO] Iter 23 | avgR -56.7 | time 1.16s
[PPO] Iter 24 | avgR -64.1 | ti



✅ Saved MP4 video: videos/PPO/LunarLander-v3_ep01_R235.3.mp4 | Reward: 235.3




✅ Saved MP4 video: videos/PPO/LunarLander-v3_ep02_R-6.5.mp4 | Reward: -6.5




✅ Saved MP4 video: videos/PPO/LunarLander-v3_ep03_R243.0.mp4 | Reward: 243.0




✅ Saved MP4 video: videos/PPO/LunarLander-v3_ep04_R253.9.mp4 | Reward: 253.9




✅ Saved MP4 video: videos/PPO/LunarLander-v3_ep05_R237.5.mp4 | Reward: 237.5




✅ Saved MP4 video: videos/PPO/LunarLander-v3_ep06_R232.9.mp4 | Reward: 232.9


In [8]:
import gymnasium as gym
from pathlib import Path
import imageio

# === After training ===
def record_video(policy, env_id="LunarLander-v3", video_dir="videos", episodes=3, device="cpu"):
    """
    Record video episodes of the trained GRPO agent.
    """
    env = gym.make(env_id, render_mode="rgb_array")
    Path(video_dir).mkdir(exist_ok=True)

    for ep in range(episodes):
        frames = []
        s, _ = env.reset()
        done = False
        total_reward = 0
        while not done:
            s_t = torch.tensor(s, dtype=torch.float32, device=device)
            with torch.no_grad():
                dist = policy.dist(s_t)
                a = dist.probs.argmax().item()   # Greedy action for visualization
            s, r, terminated, truncated, _ = env.step(a)
            done = terminated or truncated
            total_reward += r
            frames.append(env.render())

        # Save the video
        video_path = Path(video_dir) / f"episode_{ep+1:02d}_R{total_reward:.1f}.mp4"
        imageio.mimsave(video_path, frames, fps=30)
        print(f"Saved: {video_path} | Total reward: {total_reward:.1f}")

    env.close()


In [None]:
from gymnasium.envs.box2d.lunar_lander import LunarLander
import numpy as np


class SparseLunarLander(LunarLander):
    """
    Sparse reward version of LunarLander by directly inheriting from LunarLander.
    
    The original LunarLander provides dense rewards at each step based on:
    - Distance to landing pad
    - Velocity
    - Angle
    - Leg contact
    - Fuel usage
    
    This sparse version only provides rewards at episode termination:
    - +100 for successful landing (both legs on ground, stable)
    - Penalty for crashes, scaled by:
      * Crash velocity (softer crashes get less penalty)
      * Distance from landing pad (farther = worse penalty)
      * Number of legs touching (landing on legs is better than body crash)
    """
    
    def __init__(self, success_reward=100.0, max_crash_penalty=-120.0, 
                 velocity_penalty_scale=50.0, leg_bonus_scale=20.0,
                 distance_penalty_scale=15.0, tilt_penalty_scale=15.0, 
                 out_of_bounds_penalty=-5.0, max_episode_steps=1000, **kwargs):
        """
        Initialize the sparse reward LunarLander.
        
        Args:
            success_reward: Reward for successful landing (default: 100.0)
            max_crash_penalty: Maximum penalty for worst crash (default: -100.0)
            velocity_penalty_scale: Scale for velocity-based penalty (default: 30.0)
            leg_bonus_scale: Bonus reduction for each leg touching (default: 20.0)
            distance_penalty_scale: Scale for distance-from-pad penalty (default: 10.0)
            tilt_penalty_scale: Scale for tilt/angle penalty (default: 15.0)
            out_of_bounds_penalty: Additional penalty for flying off screen (default: -40.0)
            max_episode_steps: Maximum number of steps per episode (default: 1000)
            **kwargs: Additional arguments passed to LunarLander constructor
        """
        super().__init__(**kwargs)
        
        self.success_reward = success_reward
        self.max_crash_penalty = max_crash_penalty
        self.velocity_penalty_scale = velocity_penalty_scale
        self.leg_bonus_scale = leg_bonus_scale
        self.distance_penalty_scale = distance_penalty_scale
        self.tilt_penalty_scale = tilt_penalty_scale
        self.out_of_bounds_penalty = out_of_bounds_penalty
        self.max_episode_steps = max_episode_steps
        
        # Track episode statistics
        self.episode_steps = 0
        self.total_original_reward = 0.0
        
    def reset(self, **kwargs):
        """Reset the environment and tracking variables."""
        self.episode_steps = 0
        self.total_original_reward = 0.0
        return super().reset(**kwargs)
    
    def step(self, action):
        """
        Override step method to provide sparse rewards.
        
        Args:
            action: The action to take
            
        Returns:
            observation, sparse_reward, terminated, truncated, info
        """
        # Call parent's step method to get original behavior
        observation, original_reward, terminated, truncated, info = super().step(action)
        
        # Track original reward for comparison
        self.total_original_reward += original_reward
        self.episode_steps += 1
        
        # Check if we've exceeded the step limit
        if self.episode_steps >= self.max_episode_steps:
            truncated = True
            info['TimeLimit.truncated'] = True
        
        # Calculate sparse reward - no reward during episode, only at termination
        sparse_reward = 0.0
        
        if terminated or truncated:
            # Check if this was a successful landing or a crash
            if original_reward > 0:
                # Successful landing (lander came to rest - not awake)
                sparse_reward += self.success_reward
                info['landing_success'] = True
                info['crash_velocity'] = 0.0
                info['legs_touching'] = 2
                info['distance_from_pad'] = 0.0
                info['out_of_bounds'] = False
            else:
                # Failed landing - calculate penalty based on crash severity
                info['landing_success'] = False
                
                # Get position (observation[0] is x position, landing pad is at x=0)
                x_position = observation[0]
                y_position = observation[1]
                distance_from_pad = np.sqrt(x_position**2 + y_position**2)
                
                # Check if lander flew off screen
                # LunarLander terminates when abs(state[0]) >= 1.0 (see lunar_lander.py line 658)
                # The observation space bounds are [-2.5, 2.5] but termination happens at [-1.0, 1.0]
                # Y position doesn't trigger termination, only x position does
                out_of_bounds = abs(x_position) >= 1.0
                info['out_of_bounds'] = out_of_bounds
                
                # Get velocity at crash (from observation)
                # observation[2] = horizontal velocity, observation[3] = vertical velocity
                vx = observation[2]
                vy = observation[3]
                crash_velocity = np.sqrt(vx**2 + vy**2)
                
                # Get angle/tilt at crash (observation[4] is angle in radians)
                # Angle of 0 means upright, larger angles mean more tilted
                angle = abs(observation[4])
                
                # Check how many legs were touching
                legs_touching = int(self.legs[0].ground_contact) + int(self.legs[1].ground_contact)
                
                # Calculate velocity-based penalty (continuous)
                # Higher velocity = worse penalty
                velocity_penalty = -self.velocity_penalty_scale * crash_velocity
                
                # Calculate distance-based penalty (continuous)
                # Farther from landing pad = worse penalty
                # Landing pad is at x=0, typical crashes are within [-1, 1] range
                distance_penalty = -self.distance_penalty_scale * distance_from_pad
                
                # Calculate tilt-based penalty (continuous)
                # More tilted = worse penalty
                tilt_penalty = -self.tilt_penalty_scale * angle
                
                # Reduce penalty for each leg touching (bonus for landing on legs)
                leg_bonus = self.leg_bonus_scale * legs_touching
                
                # Add extra penalty if flew off screen
                oob_penalty = self.out_of_bounds_penalty if out_of_bounds else 0.0
                
                # Total crash penalty (clamped to max_crash_penalty)
                crash_penalty = velocity_penalty + distance_penalty + tilt_penalty + leg_bonus + oob_penalty
                crash_penalty = max(crash_penalty, self.max_crash_penalty)
                
                sparse_reward += crash_penalty
                
                # Store crash details in info
                info['crash_velocity'] = float(crash_velocity)
                info['crash_angle'] = float(angle)
                info['distance_from_pad'] = float(distance_from_pad)
                info['legs_touching'] = legs_touching
                info['velocity_penalty'] = float(velocity_penalty)
                info['distance_penalty'] = float(distance_penalty)
                info['tilt_penalty'] = float(tilt_penalty)
                info['leg_bonus'] = float(leg_bonus)
                info['out_of_bounds_penalty'] = float(oob_penalty)
                info['crash_penalty'] = float(crash_penalty)
            
            # Add statistics to info
            info['sparse_reward'] = sparse_reward
            info['original_total_reward'] = self.total_original_reward
            info['episode_length'] = self.episode_steps
        
        return observation, sparse_reward, terminated, truncated, info

In [6]:
from dense_scripts.GRPO.grpo import PerStepAdvGRPO, GRPOConfig
from dense_scripts.utils.policies import SimpleGRPOPolicy

gamma = 0.98
cfg = GRPOConfig(
    env_id="LunarLander-v3",
    G=32, T=1024, epochs=8, minibatches=16, gamma=gamma,
    ent_coef=0.01, beta_kl=0.02, target_kl=0.015,
    n_workers=24, log_dir="./runs/GRPO_Lander"
)

pi = SimpleGRPOPolicy(obs_dim=8, act_dim=4, hidden=128)

trainer = PerStepAdvGRPO(policy=pi, config=cfg, device="cpu")
pi_star = trainer.train(iters=300)


GRPO:   0%|▏                                                        | 1/300 [00:01<05:14,  1.05s/it, KL=0.009, avgR=-147.6, beta=0.0133, it_s=1.05]

Iter 0000 | avgR -147.59 ± 80.85 | KL 0.0086 (β=0.01333) | Lclip 0.0073 Lkl 0.0086 Lent 1.3729 | steps 2925 eps 32 | time 1.05s


GRPO:   1%|▍                                                        | 2/300 [00:02<05:02,  1.01s/it, KL=0.016, avgR=-160.7, beta=0.0133, it_s=0.98]

Iter 0001 | avgR -160.71 ± 93.40 | KL 0.0162 (β=0.01333) | Lclip 0.0162 Lkl 0.0162 Lent 1.3381 | steps 6407 eps 64 | time 0.98s


GRPO:   1%|▌                                                       | 3/300 [00:03<04:57,  1.00s/it, KL=0.007, avgR=-128.3, beta=0.00889, it_s=0.98]

Iter 0002 | avgR -128.32 ±113.92 | KL 0.0074 (β=0.008889) | Lclip 0.0056 Lkl 0.0074 Lent 1.3297 | steps 9995 eps 96 | time 0.98s


GRPO:   1%|▋                                                       | 4/300 [00:04<05:12,  1.06s/it, KL=0.011, avgR=-165.0, beta=0.00889, it_s=1.13]

Iter 0003 | avgR -164.96 ±128.62 | KL 0.0105 (β=0.008889) | Lclip 0.0088 Lkl 0.0105 Lent 1.3171 | steps 13225 eps 128 | time 1.13s


GRPO:   2%|▉                                                        | 5/300 [00:05<05:13,  1.06s/it, KL=0.010, avgR=-83.9, beta=0.00593, it_s=1.07]

Iter 0004 | avgR  -83.91 ± 50.29 | KL 0.0097 (β=0.005926) | Lclip 0.0081 Lkl 0.0097 Lent 1.3135 | steps 16938 eps 160 | time 1.07s


GRPO:   2%|█▏                                                       | 6/300 [00:06<05:26,  1.11s/it, KL=0.010, avgR=-94.7, beta=0.00593, it_s=1.20]

Iter 0005 | avgR  -94.73 ± 41.27 | KL 0.0101 (β=0.005926) | Lclip 0.0126 Lkl 0.0101 Lent 1.2563 | steps 20409 eps 192 | time 1.20s


GRPO:   2%|█▎                                                       | 7/300 [00:07<05:23,  1.10s/it, KL=0.012, avgR=-76.9, beta=0.00593, it_s=1.09]

Iter 0006 | avgR  -76.95 ± 62.84 | KL 0.0119 (β=0.005926) | Lclip 0.0073 Lkl 0.0119 Lent 1.1999 | steps 24003 eps 224 | time 1.09s


GRPO:   3%|█▍                                                      | 8/300 [00:08<05:10,  1.06s/it, KL=0.009, avgR=-166.1, beta=0.00395, it_s=0.98]

Iter 0007 | avgR -166.06 ±123.45 | KL 0.0090 (β=0.003951) | Lclip 0.0113 Lkl 0.0090 Lent 1.2132 | steps 28156 eps 256 | time 0.98s


GRPO:   3%|█▋                                                       | 9/300 [00:09<05:39,  1.17s/it, KL=0.006, avgR=-88.8, beta=0.00263, it_s=1.39]

Iter 0008 | avgR  -88.84 ± 83.94 | KL 0.0055 (β=0.002634) | Lclip 0.0081 Lkl 0.0055 Lent 1.2261 | steps 34219 eps 288 | time 1.39s


GRPO:   3%|█▊                                                     | 10/300 [00:11<06:12,  1.29s/it, KL=0.008, avgR=-114.8, beta=0.00176, it_s=1.55]

Iter 0009 | avgR -114.80 ± 87.98 | KL 0.0076 (β=0.001756) | Lclip 0.0098 Lkl 0.0076 Lent 1.2647 | steps 40094 eps 320 | time 1.55s


GRPO:   4%|██                                                      | 11/300 [00:12<06:33,  1.36s/it, KL=0.005, avgR=-57.6, beta=0.00117, it_s=1.53]

Iter 0010 | avgR  -57.63 ± 91.42 | KL 0.0049 (β=0.001171) | Lclip 0.0052 Lkl 0.0049 Lent 1.2329 | steps 47622 eps 352 | time 1.53s


GRPO:   4%|██▏                                                     | 12/300 [00:14<06:34,  1.37s/it, KL=0.007, avgR=-61.4, beta=0.00078, it_s=1.38]

Iter 0011 | avgR  -61.41 ± 69.33 | KL 0.0074 (β=0.0007804) | Lclip 0.0090 Lkl 0.0074 Lent 1.1719 | steps 52304 eps 384 | time 1.38s


GRPO:   4%|██▍                                                     | 13/300 [00:15<06:34,  1.38s/it, KL=0.007, avgR=-56.1, beta=0.00052, it_s=1.39]

Iter 0012 | avgR  -56.09 ± 76.30 | KL 0.0073 (β=0.0005202) | Lclip 0.0052 Lkl 0.0073 Lent 1.1921 | steps 57800 eps 416 | time 1.39s


GRPO:   5%|██▌                                                    | 14/300 [00:16<05:59,  1.26s/it, KL=0.006, avgR=-31.5, beta=0.000347, it_s=0.98]

Iter 0013 | avgR  -31.53 ± 27.94 | KL 0.0056 (β=0.0003468) | Lclip 0.0071 Lkl 0.0056 Lent 1.1232 | steps 61363 eps 448 | time 0.98s


GRPO:   5%|██▊                                                    | 15/300 [00:18<06:10,  1.30s/it, KL=0.005, avgR=-21.8, beta=0.000231, it_s=1.40]

Iter 0014 | avgR  -21.75 ± 49.78 | KL 0.0046 (β=0.0002312) | Lclip 0.0038 Lkl 0.0046 Lent 1.1702 | steps 66208 eps 480 | time 1.40s


GRPO:   5%|██▉                                                    | 16/300 [00:19<06:22,  1.35s/it, KL=0.006, avgR=-51.8, beta=0.000154, it_s=1.45]

Iter 0015 | avgR  -51.84 ± 77.39 | KL 0.0064 (β=0.0001541) | Lclip 0.0081 Lkl 0.0064 Lent 1.1220 | steps 72044 eps 512 | time 1.45s


GRPO:   6%|███                                                    | 17/300 [00:21<06:41,  1.42s/it, KL=0.006, avgR=-25.4, beta=0.000103, it_s=1.59]

Iter 0016 | avgR  -25.41 ± 44.16 | KL 0.0061 (β=0.0001028) | Lclip 0.0044 Lkl 0.0061 Lent 1.1819 | steps 79025 eps 544 | time 1.59s


GRPO:   6%|███▍                                                     | 18/300 [00:22<06:44,  1.43s/it, KL=0.005, avgR=-27.4, beta=0.0001, it_s=1.46]

Iter 0017 | avgR  -27.41 ± 78.81 | KL 0.0055 (β=0.0001) | Lclip 0.0051 Lkl 0.0055 Lent 1.2003 | steps 87894 eps 576 | time 1.46s


GRPO:   6%|███▌                                                     | 19/300 [00:24<06:51,  1.47s/it, KL=0.005, avgR=-42.4, beta=0.0001, it_s=1.54]

Iter 0018 | avgR  -42.40 ± 90.31 | KL 0.0046 (β=0.0001) | Lclip 0.0047 Lkl 0.0046 Lent 1.2576 | steps 102296 eps 608 | time 1.54s


GRPO:   7%|███▊                                                      | 20/300 [00:25<07:06,  1.52s/it, KL=0.004, avgR=14.4, beta=0.0001, it_s=1.66]

Iter 0019 | avgR   14.38 ± 33.66 | KL 0.0039 (β=0.0001) | Lclip 0.0024 Lkl 0.0039 Lent 1.1933 | steps 111860 eps 640 | time 1.66s


GRPO:   7%|████                                                      | 21/300 [00:27<06:55,  1.49s/it, KL=0.005, avgR=-7.3, beta=0.0001, it_s=1.40]

Iter 0020 | avgR   -7.32 ± 43.51 | KL 0.0050 (β=0.0001) | Lclip 0.0041 Lkl 0.0050 Lent 1.1825 | steps 119660 eps 672 | time 1.40s


GRPO:   7%|████▎                                                     | 22/300 [00:28<07:01,  1.51s/it, KL=0.006, avgR=-9.5, beta=0.0001, it_s=1.57]

Iter 0021 | avgR   -9.46 ± 49.31 | KL 0.0063 (β=0.0001) | Lclip 0.0065 Lkl 0.0063 Lent 1.1283 | steps 126025 eps 704 | time 1.57s


GRPO:   8%|████▎                                                    | 23/300 [00:30<07:11,  1.56s/it, KL=0.009, avgR=-13.8, beta=0.0001, it_s=1.65]

Iter 0022 | avgR  -13.83 ± 50.02 | KL 0.0087 (β=0.0001) | Lclip 0.0083 Lkl 0.0087 Lent 1.2085 | steps 135063 eps 736 | time 1.65s


GRPO:   8%|████▋                                                     | 24/300 [00:31<07:03,  1.53s/it, KL=0.005, avgR=-2.8, beta=0.0001, it_s=1.48]

Iter 0023 | avgR   -2.81 ± 53.42 | KL 0.0050 (β=0.0001) | Lclip 0.0037 Lkl 0.0050 Lent 1.1486 | steps 146786 eps 768 | time 1.48s


GRPO:   8%|████▊                                                     | 25/300 [00:33<06:56,  1.52s/it, KL=0.004, avgR=-9.9, beta=0.0001, it_s=1.47]

Iter 0024 | avgR   -9.92 ± 51.13 | KL 0.0043 (β=0.0001) | Lclip 0.0037 Lkl 0.0043 Lent 1.1622 | steps 156058 eps 800 | time 1.47s


GRPO:   9%|█████                                                      | 26/300 [00:34<06:58,  1.53s/it, KL=0.006, avgR=9.3, beta=0.0001, it_s=1.55]

Iter 0025 | avgR    9.33 ± 50.89 | KL 0.0058 (β=0.0001) | Lclip 0.0040 Lkl 0.0058 Lent 1.1771 | steps 165699 eps 832 | time 1.55s


GRPO:   9%|█████▏                                                    | 27/300 [00:36<07:09,  1.57s/it, KL=0.006, avgR=10.2, beta=0.0001, it_s=1.67]

Iter 0026 | avgR   10.22 ± 71.61 | KL 0.0057 (β=0.0001) | Lclip 0.0033 Lkl 0.0057 Lent 1.2065 | steps 181967 eps 864 | time 1.67s


GRPO:   9%|█████▍                                                    | 28/300 [00:38<07:06,  1.57s/it, KL=0.006, avgR=26.4, beta=0.0001, it_s=1.55]

Iter 0027 | avgR   26.35 ± 37.07 | KL 0.0058 (β=0.0001) | Lclip 0.0055 Lkl 0.0058 Lent 1.1727 | steps 189931 eps 896 | time 1.55s


GRPO:  10%|█████▌                                                    | 29/300 [00:39<07:06,  1.57s/it, KL=0.006, avgR=20.9, beta=0.0001, it_s=1.59]

Iter 0028 | avgR   20.94 ± 35.48 | KL 0.0059 (β=0.0001) | Lclip 0.0050 Lkl 0.0059 Lent 1.2134 | steps 202536 eps 928 | time 1.59s


GRPO:  10%|█████▊                                                    | 30/300 [00:41<07:06,  1.58s/it, KL=0.006, avgR=33.1, beta=0.0001, it_s=1.59]

Iter 0029 | avgR   33.08 ± 39.55 | KL 0.0060 (β=0.0001) | Lclip 0.0046 Lkl 0.0060 Lent 1.1727 | steps 215049 eps 960 | time 1.59s


GRPO:  10%|█████▉                                                    | 31/300 [00:43<07:37,  1.70s/it, KL=0.006, avgR=39.6, beta=0.0001, it_s=1.98]

Iter 0030 | avgR   39.57 ± 59.38 | KL 0.0057 (β=0.0001) | Lclip 0.0043 Lkl 0.0057 Lent 1.1667 | steps 234892 eps 992 | time 1.98s


GRPO:  11%|██████▏                                                   | 32/300 [00:45<07:38,  1.71s/it, KL=0.005, avgR=50.3, beta=0.0001, it_s=1.73]

Iter 0031 | avgR   50.26 ± 40.45 | KL 0.0050 (β=0.0001) | Lclip 0.0021 Lkl 0.0050 Lent 1.2061 | steps 250867 eps 1024 | time 1.73s


GRPO:  11%|██████▍                                                   | 33/300 [00:46<07:38,  1.72s/it, KL=0.004, avgR=48.9, beta=0.0001, it_s=1.73]

Iter 0032 | avgR   48.86 ± 69.13 | KL 0.0042 (β=0.0001) | Lclip 0.0018 Lkl 0.0042 Lent 1.1852 | steps 270662 eps 1056 | time 1.73s


GRPO:  11%|██████▌                                                   | 34/300 [00:48<07:51,  1.77s/it, KL=0.005, avgR=56.2, beta=0.0001, it_s=1.90]

Iter 0033 | avgR   56.19 ± 59.98 | KL 0.0050 (β=0.0001) | Lclip 0.0025 Lkl 0.0050 Lent 1.2151 | steps 294375 eps 1088 | time 1.90s


GRPO:  12%|██████▊                                                   | 35/300 [00:50<07:52,  1.78s/it, KL=0.003, avgR=34.1, beta=0.0001, it_s=1.80]

Iter 0034 | avgR   34.10 ± 76.60 | KL 0.0031 (β=0.0001) | Lclip 0.0012 Lkl 0.0031 Lent 1.1963 | steps 315085 eps 1120 | time 1.80s


GRPO:  12%|██████▉                                                   | 36/300 [00:53<08:41,  1.98s/it, KL=0.004, avgR=86.9, beta=0.0001, it_s=2.42]

Iter 0035 | avgR   86.91 ± 38.51 | KL 0.0035 (β=0.0001) | Lclip 0.0014 Lkl 0.0035 Lent 1.2435 | steps 344652 eps 1152 | time 2.42s


GRPO:  12%|███████▏                                                  | 37/300 [00:54<08:24,  1.92s/it, KL=0.005, avgR=67.4, beta=0.0001, it_s=1.78]

Iter 0036 | avgR   67.39 ± 53.84 | KL 0.0050 (β=0.0001) | Lclip 0.0025 Lkl 0.0050 Lent 1.2199 | steps 364099 eps 1184 | time 1.78s


GRPO:  13%|███████▎                                                  | 38/300 [00:56<08:16,  1.89s/it, KL=0.005, avgR=71.7, beta=0.0001, it_s=1.84]

Iter 0037 | avgR   71.70 ± 48.31 | KL 0.0046 (β=0.0001) | Lclip 0.0022 Lkl 0.0046 Lent 1.2422 | steps 383756 eps 1216 | time 1.84s


GRPO:  13%|███████▌                                                  | 39/300 [00:58<08:28,  1.95s/it, KL=0.005, avgR=68.5, beta=0.0001, it_s=2.07]

Iter 0038 | avgR   68.53 ± 51.68 | KL 0.0047 (β=0.0001) | Lclip 0.0028 Lkl 0.0047 Lent 1.1811 | steps 407965 eps 1248 | time 2.07s


GRPO:  13%|███████▋                                                  | 40/300 [01:00<08:26,  1.95s/it, KL=0.006, avgR=75.0, beta=0.0001, it_s=1.95]

Iter 0039 | avgR   74.96 ± 42.79 | KL 0.0065 (β=0.0001) | Lclip 0.0038 Lkl 0.0065 Lent 1.1908 | steps 432510 eps 1280 | time 1.95s


GRPO:  14%|███████▉                                                  | 41/300 [01:02<08:45,  2.03s/it, KL=0.004, avgR=44.7, beta=0.0001, it_s=2.22]

Iter 0040 | avgR   44.72 ± 79.12 | KL 0.0038 (β=0.0001) | Lclip 0.0034 Lkl 0.0038 Lent 1.1467 | steps 455500 eps 1312 | time 2.22s


GRPO:  14%|████████                                                  | 42/300 [01:05<09:15,  2.15s/it, KL=0.004, avgR=97.9, beta=0.0001, it_s=2.44]

Iter 0041 | avgR   97.90 ± 32.78 | KL 0.0040 (β=0.0001) | Lclip 0.0015 Lkl 0.0040 Lent 1.1912 | steps 486814 eps 1344 | time 2.44s


GRPO:  14%|████████▎                                                 | 43/300 [01:07<08:51,  2.07s/it, KL=0.004, avgR=80.6, beta=0.0001, it_s=1.87]

Iter 0042 | avgR   80.65 ± 50.90 | KL 0.0037 (β=0.0001) | Lclip 0.0011 Lkl 0.0037 Lent 1.1823 | steps 510542 eps 1376 | time 1.87s


GRPO:  15%|████████▌                                                 | 44/300 [01:09<08:45,  2.05s/it, KL=0.006, avgR=65.1, beta=0.0001, it_s=2.01]

Iter 0043 | avgR   65.07 ± 66.58 | KL 0.0061 (β=0.0001) | Lclip 0.0032 Lkl 0.0061 Lent 1.1928 | steps 535072 eps 1408 | time 2.01s


GRPO:  15%|████████▋                                                 | 45/300 [01:11<08:52,  2.09s/it, KL=0.005, avgR=92.7, beta=0.0001, it_s=2.17]

Iter 0044 | avgR   92.70 ± 45.70 | KL 0.0048 (β=0.0001) | Lclip 0.0013 Lkl 0.0048 Lent 1.2115 | steps 561651 eps 1440 | time 2.17s


GRPO:  15%|████████▉                                                 | 46/300 [01:13<09:06,  2.15s/it, KL=0.004, avgR=85.1, beta=0.0001, it_s=2.29]

Iter 0045 | avgR   85.13 ± 40.14 | KL 0.0041 (β=0.0001) | Lclip 0.0015 Lkl 0.0041 Lent 1.1869 | steps 587935 eps 1472 | time 2.29s


GRPO:  16%|█████████                                                 | 47/300 [01:15<09:14,  2.19s/it, KL=0.004, avgR=89.0, beta=0.0001, it_s=2.29]

Iter 0046 | avgR   89.00 ± 50.35 | KL 0.0044 (β=0.0001) | Lclip 0.0023 Lkl 0.0044 Lent 1.1645 | steps 614159 eps 1504 | time 2.29s


GRPO:  16%|█████████▎                                                | 48/300 [01:18<09:25,  2.24s/it, KL=0.007, avgR=98.2, beta=0.0001, it_s=2.36]

Iter 0047 | avgR   98.19 ± 46.36 | KL 0.0067 (β=0.0001) | Lclip 0.0032 Lkl 0.0067 Lent 1.1717 | steps 641978 eps 1536 | time 2.36s


GRPO:  16%|█████████▍                                                | 49/300 [01:20<09:30,  2.27s/it, KL=0.003, avgR=97.4, beta=0.0001, it_s=2.33]

Iter 0048 | avgR   97.36 ± 53.69 | KL 0.0032 (β=0.0001) | Lclip 0.0020 Lkl 0.0032 Lent 1.1530 | steps 669035 eps 1568 | time 2.33s


GRPO:  17%|█████████▌                                               | 50/300 [01:23<10:07,  2.43s/it, KL=0.006, avgR=109.2, beta=0.0001, it_s=2.79]

Iter 0049 | avgR  109.21 ± 36.51 | KL 0.0056 (β=0.0001) | Lclip 0.0030 Lkl 0.0056 Lent 1.1647 | steps 699389 eps 1600 | time 2.79s


GRPO:  17%|█████████▊                                                | 51/300 [01:25<10:00,  2.41s/it, KL=0.005, avgR=97.3, beta=0.0001, it_s=2.36]

Iter 0050 | avgR   97.25 ± 49.33 | KL 0.0052 (β=0.0001) | Lclip 0.0021 Lkl 0.0052 Lent 1.1522 | steps 728284 eps 1632 | time 2.36s


GRPO:  17%|██████████                                                | 52/300 [01:28<09:57,  2.41s/it, KL=0.005, avgR=72.3, beta=0.0001, it_s=2.41]

Iter 0051 | avgR   72.31 ± 57.91 | KL 0.0048 (β=0.0001) | Lclip 0.0020 Lkl 0.0048 Lent 1.1355 | steps 755758 eps 1664 | time 2.41s


GRPO:  18%|██████████▏                                               | 53/300 [01:30<09:44,  2.37s/it, KL=0.004, avgR=87.9, beta=0.0001, it_s=2.26]

Iter 0052 | avgR   87.93 ± 55.35 | KL 0.0039 (β=0.0001) | Lclip 0.0026 Lkl 0.0039 Lent 1.1486 | steps 781459 eps 1696 | time 2.26s


GRPO:  18%|██████████▍                                               | 54/300 [01:32<09:25,  2.30s/it, KL=0.005, avgR=71.7, beta=0.0001, it_s=2.13]

Iter 0053 | avgR   71.68 ± 51.47 | KL 0.0047 (β=0.0001) | Lclip 0.0016 Lkl 0.0047 Lent 1.1347 | steps 806080 eps 1728 | time 2.13s


GRPO:  18%|██████████▋                                               | 55/300 [01:35<09:31,  2.33s/it, KL=0.004, avgR=87.7, beta=0.0001, it_s=2.41]

Iter 0054 | avgR   87.74 ± 49.33 | KL 0.0041 (β=0.0001) | Lclip 0.0015 Lkl 0.0041 Lent 1.1258 | steps 833168 eps 1760 | time 2.41s


GRPO:  19%|██████████▊                                               | 56/300 [01:37<09:25,  2.32s/it, KL=0.004, avgR=74.4, beta=0.0001, it_s=2.28]

Iter 0055 | avgR   74.39 ± 63.18 | KL 0.0038 (β=0.0001) | Lclip 0.0023 Lkl 0.0038 Lent 1.1368 | steps 860245 eps 1792 | time 2.28s


GRPO:  19%|███████████                                               | 57/300 [01:39<09:19,  2.30s/it, KL=0.005, avgR=82.3, beta=0.0001, it_s=2.27]

Iter 0056 | avgR   82.31 ± 48.16 | KL 0.0055 (β=0.0001) | Lclip 0.0012 Lkl 0.0055 Lent 1.1742 | steps 887321 eps 1824 | time 2.27s


GRPO:  19%|███████████▏                                              | 58/300 [01:41<09:23,  2.33s/it, KL=0.006, avgR=77.5, beta=0.0001, it_s=2.38]

Iter 0057 | avgR   77.48 ± 39.92 | KL 0.0057 (β=0.0001) | Lclip 0.0021 Lkl 0.0057 Lent 1.1350 | steps 914406 eps 1856 | time 2.38s


GRPO:  20%|███████████▍                                              | 59/300 [01:44<09:22,  2.33s/it, KL=0.005, avgR=91.7, beta=0.0001, it_s=2.34]

Iter 0058 | avgR   91.66 ± 52.64 | KL 0.0054 (β=0.0001) | Lclip 0.0017 Lkl 0.0054 Lent 1.1463 | steps 941157 eps 1888 | time 2.34s


GRPO:  20%|███████████▌                                              | 60/300 [01:46<08:55,  2.23s/it, KL=0.003, avgR=79.9, beta=0.0001, it_s=1.98]

Iter 0059 | avgR   79.89 ± 62.99 | KL 0.0028 (β=0.0001) | Lclip 0.0018 Lkl 0.0028 Lent 1.0732 | steps 965128 eps 1920 | time 1.98s


GRPO:  20%|███████████▊                                              | 61/300 [01:48<08:56,  2.25s/it, KL=0.006, avgR=98.8, beta=0.0001, it_s=2.28]

Iter 0060 | avgR   98.76 ± 45.99 | KL 0.0059 (β=0.0001) | Lclip 0.0021 Lkl 0.0059 Lent 1.1208 | steps 992389 eps 1952 | time 2.28s


GRPO:  21%|███████████▉                                              | 62/300 [01:50<09:02,  2.28s/it, KL=0.004, avgR=95.4, beta=0.0001, it_s=2.35]

Iter 0061 | avgR   95.44 ± 46.72 | KL 0.0043 (β=0.0001) | Lclip 0.0011 Lkl 0.0043 Lent 1.1513 | steps 1021190 eps 1984 | time 2.35s


GRPO:  21%|███████████▉                                             | 63/300 [01:53<09:04,  2.30s/it, KL=0.005, avgR=104.7, beta=0.0001, it_s=2.34]

Iter 0062 | avgR  104.68 ± 38.21 | KL 0.0049 (β=0.0001) | Lclip 0.0029 Lkl 0.0049 Lent 1.1171 | steps 1050784 eps 2016 | time 2.34s


GRPO:  21%|████████████▏                                            | 64/300 [01:55<09:03,  2.30s/it, KL=0.007, avgR=102.3, beta=0.0001, it_s=2.31]

Iter 0063 | avgR  102.26 ± 48.98 | KL 0.0068 (β=0.0001) | Lclip 0.0029 Lkl 0.0068 Lent 1.1320 | steps 1079522 eps 2048 | time 2.31s


GRPO:  22%|████████████▎                                            | 65/300 [01:57<09:02,  2.31s/it, KL=0.005, avgR=103.5, beta=0.0001, it_s=2.33]

Iter 0064 | avgR  103.53 ± 43.20 | KL 0.0052 (β=0.0001) | Lclip 0.0025 Lkl 0.0052 Lent 1.0888 | steps 1109037 eps 2080 | time 2.33s


GRPO:  22%|████████████▌                                            | 66/300 [02:00<09:05,  2.33s/it, KL=0.002, avgR=100.5, beta=0.0001, it_s=2.37]

Iter 0065 | avgR  100.48 ± 45.65 | KL 0.0023 (β=0.0001) | Lclip 0.0020 Lkl 0.0023 Lent 1.0513 | steps 1138083 eps 2112 | time 2.37s


GRPO:  22%|████████████▋                                            | 67/300 [02:02<08:59,  2.32s/it, KL=0.005, avgR=106.6, beta=0.0001, it_s=2.28]

Iter 0066 | avgR  106.62 ± 43.50 | KL 0.0053 (β=0.0001) | Lclip 0.0012 Lkl 0.0053 Lent 1.1157 | steps 1166757 eps 2144 | time 2.28s


GRPO:  23%|█████████████▏                                            | 68/300 [02:04<08:32,  2.21s/it, KL=0.007, avgR=68.5, beta=0.0001, it_s=1.95]

Iter 0067 | avgR   68.52 ± 69.57 | KL 0.0066 (β=0.0001) | Lclip 0.0024 Lkl 0.0066 Lent 1.1052 | steps 1188954 eps 2176 | time 1.95s


GRPO:  23%|█████████████                                            | 69/300 [02:06<08:40,  2.25s/it, KL=0.008, avgR=118.3, beta=0.0001, it_s=2.36]

Iter 0068 | avgR  118.27 ± 26.13 | KL 0.0077 (β=0.0001) | Lclip 0.0024 Lkl 0.0077 Lent 1.0869 | steps 1220096 eps 2208 | time 2.36s


GRPO:  23%|█████████████▎                                           | 70/300 [02:09<08:48,  2.30s/it, KL=0.004, avgR=112.5, beta=0.0001, it_s=2.40]

Iter 0069 | avgR  112.48 ± 39.57 | KL 0.0044 (β=0.0001) | Lclip 0.0019 Lkl 0.0044 Lent 1.0760 | steps 1251350 eps 2240 | time 2.40s


GRPO:  24%|█████████████▍                                           | 71/300 [02:11<08:50,  2.32s/it, KL=0.005, avgR=110.9, beta=0.0001, it_s=2.36]

Iter 0070 | avgR  110.88 ± 48.47 | KL 0.0049 (β=0.0001) | Lclip 0.0009 Lkl 0.0049 Lent 1.0807 | steps 1279998 eps 2272 | time 2.36s


GRPO:  24%|█████████████▋                                           | 72/300 [02:14<08:57,  2.36s/it, KL=0.004, avgR=101.6, beta=0.0001, it_s=2.45]

Iter 0071 | avgR  101.59 ± 43.83 | KL 0.0044 (β=0.0001) | Lclip 0.0021 Lkl 0.0044 Lent 1.0968 | steps 1307895 eps 2304 | time 2.45s


GRPO:  24%|██████████████                                            | 73/300 [02:16<08:53,  2.35s/it, KL=0.005, avgR=91.5, beta=0.0001, it_s=2.33]

Iter 0072 | avgR   91.48 ± 55.80 | KL 0.0054 (β=0.0001) | Lclip 0.0004 Lkl 0.0054 Lent 1.0989 | steps 1334561 eps 2336 | time 2.33s


GRPO:  25%|██████████████▎                                           | 74/300 [02:18<08:58,  2.38s/it, KL=0.004, avgR=77.0, beta=0.0001, it_s=2.45]

Iter 0073 | avgR   77.01 ± 39.99 | KL 0.0039 (β=0.0001) | Lclip 0.0015 Lkl 0.0039 Lent 1.0518 | steps 1363318 eps 2368 | time 2.45s


GRPO:  25%|██████████████▌                                           | 75/300 [02:21<08:50,  2.36s/it, KL=0.003, avgR=96.6, beta=0.0001, it_s=2.29]

Iter 0074 | avgR   96.61 ± 45.63 | KL 0.0033 (β=0.0001) | Lclip 0.0011 Lkl 0.0033 Lent 1.0915 | steps 1391542 eps 2400 | time 2.29s


GRPO:  25%|██████████████▋                                           | 76/300 [02:23<08:43,  2.34s/it, KL=0.005, avgR=98.5, beta=0.0001, it_s=2.29]

Iter 0075 | avgR   98.46 ± 57.34 | KL 0.0046 (β=0.0001) | Lclip 0.0020 Lkl 0.0046 Lent 1.0912 | steps 1419630 eps 2432 | time 2.29s


GRPO:  26%|██████████████▋                                          | 77/300 [02:25<08:35,  2.31s/it, KL=0.004, avgR=106.2, beta=0.0001, it_s=2.24]

Iter 0076 | avgR  106.22 ± 46.51 | KL 0.0042 (β=0.0001) | Lclip 0.0020 Lkl 0.0042 Lent 1.0530 | steps 1446708 eps 2464 | time 2.24s


GRPO:  26%|███████████████                                           | 78/300 [02:27<08:27,  2.28s/it, KL=0.005, avgR=91.3, beta=0.0001, it_s=2.22]

Iter 0077 | avgR   91.31 ± 55.66 | KL 0.0046 (β=0.0001) | Lclip 0.0039 Lkl 0.0046 Lent 1.1412 | steps 1473795 eps 2496 | time 2.22s


GRPO:  26%|███████████████▎                                          | 79/300 [02:29<07:55,  2.15s/it, KL=0.004, avgR=81.4, beta=0.0001, it_s=1.84]

Iter 0078 | avgR   81.43 ± 55.79 | KL 0.0036 (β=0.0001) | Lclip 0.0017 Lkl 0.0036 Lent 1.0534 | steps 1497013 eps 2528 | time 1.84s


GRPO:  27%|███████████████▏                                         | 80/300 [02:32<07:58,  2.17s/it, KL=0.005, avgR=111.8, beta=0.0001, it_s=2.22]

Iter 0079 | avgR  111.78 ± 43.58 | KL 0.0047 (β=0.0001) | Lclip 0.0024 Lkl 0.0047 Lent 1.1128 | steps 1523747 eps 2560 | time 2.22s


GRPO:  27%|███████████████▋                                          | 81/300 [02:34<08:05,  2.22s/it, KL=0.004, avgR=96.7, beta=0.0001, it_s=2.31]

Iter 0080 | avgR   96.71 ± 52.08 | KL 0.0039 (β=0.0001) | Lclip 0.0019 Lkl 0.0039 Lent 1.0992 | steps 1550929 eps 2592 | time 2.31s


GRPO:  27%|███████████████▌                                         | 82/300 [02:36<08:10,  2.25s/it, KL=0.006, avgR=101.4, beta=0.0001, it_s=2.32]

Iter 0081 | avgR  101.36 ± 41.25 | KL 0.0058 (β=0.0001) | Lclip 0.0024 Lkl 0.0058 Lent 1.1390 | steps 1580600 eps 2624 | time 2.32s


GRPO:  28%|████████████████                                          | 83/300 [02:39<08:14,  2.28s/it, KL=0.005, avgR=98.1, beta=0.0001, it_s=2.34]

Iter 0082 | avgR   98.09 ± 36.83 | KL 0.0053 (β=0.0001) | Lclip 0.0029 Lkl 0.0053 Lent 1.1002 | steps 1609340 eps 2656 | time 2.34s


GRPO:  28%|███████████████▉                                         | 84/300 [02:41<08:21,  2.32s/it, KL=0.005, avgR=109.6, beta=0.0001, it_s=2.43]

Iter 0083 | avgR  109.64 ± 34.68 | KL 0.0052 (β=0.0001) | Lclip 0.0025 Lkl 0.0052 Lent 1.0958 | steps 1639409 eps 2688 | time 2.43s


GRPO:  28%|████████████████▍                                         | 85/300 [02:43<08:19,  2.32s/it, KL=0.004, avgR=85.9, beta=0.0001, it_s=2.32]

Iter 0084 | avgR   85.88 ± 38.40 | KL 0.0043 (β=0.0001) | Lclip 0.0013 Lkl 0.0043 Lent 1.1036 | steps 1667320 eps 2720 | time 2.32s


GRPO:  29%|████████████████▎                                        | 86/300 [02:46<08:11,  2.30s/it, KL=0.004, avgR=101.7, beta=0.0001, it_s=2.24]

Iter 0085 | avgR  101.73 ± 53.58 | KL 0.0038 (β=0.0001) | Lclip 0.0015 Lkl 0.0038 Lent 1.0076 | steps 1695893 eps 2752 | time 2.24s


GRPO:  29%|████████████████▊                                         | 87/300 [02:48<08:22,  2.36s/it, KL=0.005, avgR=84.3, beta=0.0001, it_s=2.50]

Iter 0086 | avgR   84.30 ± 39.13 | KL 0.0049 (β=0.0001) | Lclip 0.0019 Lkl 0.0049 Lent 1.0437 | steps 1725683 eps 2784 | time 2.50s


GRPO:  29%|█████████████████                                         | 88/300 [02:50<08:22,  2.37s/it, KL=0.005, avgR=82.8, beta=0.0001, it_s=2.39]

Iter 0087 | avgR   82.82 ± 35.27 | KL 0.0053 (β=0.0001) | Lclip 0.0016 Lkl 0.0053 Lent 1.0468 | steps 1754615 eps 2816 | time 2.39s


GRPO:  30%|█████████████████▏                                        | 89/300 [02:53<08:25,  2.40s/it, KL=0.003, avgR=97.8, beta=0.0001, it_s=2.45]

Iter 0088 | avgR   97.77 ± 59.51 | KL 0.0028 (β=0.0001) | Lclip 0.0010 Lkl 0.0028 Lent 0.9169 | steps 1782826 eps 2848 | time 2.45s


GRPO:  30%|█████████████████▍                                        | 90/300 [02:55<08:18,  2.37s/it, KL=0.004, avgR=75.5, beta=0.0001, it_s=2.31]

Iter 0089 | avgR   75.53 ± 46.54 | KL 0.0045 (β=0.0001) | Lclip 0.0024 Lkl 0.0045 Lent 0.9676 | steps 1810981 eps 2880 | time 2.31s


GRPO:  30%|█████████████████▎                                       | 91/300 [02:57<08:00,  2.30s/it, KL=0.003, avgR=104.0, beta=0.0001, it_s=2.12]

Iter 0090 | avgR  103.99 ± 71.66 | KL 0.0034 (β=0.0001) | Lclip 0.0017 Lkl 0.0034 Lent 0.9296 | steps 1836352 eps 2912 | time 2.12s


GRPO:  31%|█████████████████▍                                       | 92/300 [03:00<08:03,  2.33s/it, KL=0.003, avgR=106.4, beta=0.0001, it_s=2.39]

Iter 0091 | avgR  106.35 ± 52.05 | KL 0.0034 (β=0.0001) | Lclip 0.0018 Lkl 0.0034 Lent 1.0470 | steps 1865389 eps 2944 | time 2.39s


GRPO:  31%|█████████████████▉                                        | 93/300 [03:02<07:38,  2.21s/it, KL=0.006, avgR=80.2, beta=0.0001, it_s=1.95]

Iter 0092 | avgR   80.16 ± 59.38 | KL 0.0059 (β=0.0001) | Lclip 0.0012 Lkl 0.0059 Lent 1.0967 | steps 1887902 eps 2976 | time 1.95s


GRPO:  31%|██████████████████▏                                       | 94/300 [03:04<07:45,  2.26s/it, KL=0.004, avgR=90.6, beta=0.0001, it_s=2.36]

Iter 0093 | avgR   90.64 ± 50.09 | KL 0.0041 (β=0.0001) | Lclip 0.0014 Lkl 0.0041 Lent 1.0315 | steps 1914642 eps 3008 | time 2.36s


GRPO:  32%|██████████████████▎                                       | 95/300 [03:06<07:30,  2.20s/it, KL=0.003, avgR=74.3, beta=0.0001, it_s=2.05]

Iter 0094 | avgR   74.33 ± 71.53 | KL 0.0032 (β=0.0001) | Lclip 0.0007 Lkl 0.0032 Lent 0.9970 | steps 1936987 eps 3040 | time 2.05s


GRPO:  32%|██████████████████▌                                       | 96/300 [03:08<07:26,  2.19s/it, KL=0.003, avgR=97.2, beta=0.0001, it_s=2.17]

Iter 0095 | avgR   97.17 ± 64.65 | KL 0.0033 (β=0.0001) | Lclip 0.0011 Lkl 0.0033 Lent 0.9909 | steps 1962124 eps 3072 | time 2.17s


GRPO:  32%|██████████████████▊                                       | 97/300 [03:11<07:29,  2.22s/it, KL=0.004, avgR=97.1, beta=0.0001, it_s=2.27]

Iter 0096 | avgR   97.10 ± 53.70 | KL 0.0045 (β=0.0001) | Lclip 0.0023 Lkl 0.0045 Lent 1.0137 | steps 1988141 eps 3104 | time 2.27s


GRPO:  33%|██████████████████▌                                      | 98/300 [03:13<07:27,  2.21s/it, KL=0.003, avgR=106.7, beta=0.0001, it_s=2.20]

Iter 0097 | avgR  106.74 ± 64.23 | KL 0.0029 (β=0.0001) | Lclip 0.0008 Lkl 0.0029 Lent 1.0151 | steps 2013205 eps 3136 | time 2.20s


GRPO:  33%|██████████████████▊                                      | 99/300 [03:15<07:22,  2.20s/it, KL=0.004, avgR=116.9, beta=0.0001, it_s=2.17]

Iter 0098 | avgR  116.90 ± 58.21 | KL 0.0036 (β=0.0001) | Lclip 0.0013 Lkl 0.0036 Lent 0.9606 | steps 2039562 eps 3168 | time 2.17s


GRPO:  33%|███████████████████                                      | 100/300 [03:17<07:08,  2.14s/it, KL=0.004, avgR=95.2, beta=0.0001, it_s=2.00]

Iter 0099 | avgR   95.21 ± 75.02 | KL 0.0044 (β=0.0001) | Lclip 0.0021 Lkl 0.0044 Lent 0.9252 | steps 2063007 eps 3200 | time 2.00s


GRPO:  34%|██████████████████▊                                     | 101/300 [03:19<07:10,  2.17s/it, KL=0.004, avgR=109.1, beta=0.0001, it_s=2.22]

Iter 0100 | avgR  109.11 ± 63.66 | KL 0.0041 (β=0.0001) | Lclip 0.0018 Lkl 0.0041 Lent 0.9273 | steps 2087538 eps 3232 | time 2.22s


GRPO:  34%|███████████████████▍                                     | 102/300 [03:21<07:15,  2.20s/it, KL=0.005, avgR=94.1, beta=0.0001, it_s=2.27]

Iter 0101 | avgR   94.13 ± 60.24 | KL 0.0045 (β=0.0001) | Lclip 0.0016 Lkl 0.0045 Lent 0.9638 | steps 2113397 eps 3264 | time 2.27s


GRPO:  34%|███████████████████▌                                     | 103/300 [03:24<07:08,  2.18s/it, KL=0.005, avgR=99.6, beta=0.0001, it_s=2.12]

Iter 0102 | avgR   99.62 ± 61.36 | KL 0.0048 (β=0.0001) | Lclip 0.0029 Lkl 0.0048 Lent 0.9552 | steps 2138835 eps 3296 | time 2.12s


GRPO:  35%|███████████████████▍                                    | 104/300 [03:26<07:01,  2.15s/it, KL=0.003, avgR=109.1, beta=0.0001, it_s=2.09]

Iter 0103 | avgR  109.11 ± 78.56 | KL 0.0033 (β=0.0001) | Lclip 0.0015 Lkl 0.0033 Lent 0.8574 | steps 2162212 eps 3328 | time 2.09s


GRPO:  35%|███████████████████▌                                    | 105/300 [03:28<07:16,  2.24s/it, KL=0.004, avgR=101.1, beta=0.0001, it_s=2.44]

Iter 0104 | avgR  101.09 ± 85.24 | KL 0.0042 (β=0.0001) | Lclip 0.0017 Lkl 0.0042 Lent 0.9557 | steps 2186339 eps 3360 | time 2.44s


GRPO:  35%|████████████████████▏                                    | 106/300 [03:30<06:59,  2.16s/it, KL=0.006, avgR=98.4, beta=0.0001, it_s=1.98]

Iter 0105 | avgR   98.40 ± 70.97 | KL 0.0059 (β=0.0001) | Lclip 0.0033 Lkl 0.0059 Lent 1.0145 | steps 2208950 eps 3392 | time 1.98s


GRPO:  36%|███████████████████▉                                    | 107/300 [03:32<07:00,  2.18s/it, KL=0.004, avgR=123.4, beta=0.0001, it_s=2.21]

Iter 0106 | avgR  123.36 ± 53.59 | KL 0.0038 (β=0.0001) | Lclip 0.0037 Lkl 0.0038 Lent 1.0057 | steps 2236423 eps 3424 | time 2.21s


GRPO:  36%|████████████████████▏                                   | 108/300 [03:34<06:47,  2.12s/it, KL=0.005, avgR=112.6, beta=0.0001, it_s=1.98]

Iter 0107 | avgR  112.56 ± 78.35 | KL 0.0050 (β=0.0001) | Lclip 0.0015 Lkl 0.0050 Lent 1.0229 | steps 2259799 eps 3456 | time 1.98s


GRPO:  36%|████████████████████▋                                    | 109/300 [03:36<06:41,  2.10s/it, KL=0.004, avgR=99.4, beta=0.0001, it_s=2.06]

Iter 0108 | avgR   99.43 ± 55.93 | KL 0.0042 (β=0.0001) | Lclip 0.0022 Lkl 0.0042 Lent 1.0083 | steps 2286033 eps 3488 | time 2.06s


GRPO:  37%|████████████████████▉                                    | 110/300 [03:39<06:49,  2.16s/it, KL=0.004, avgR=95.5, beta=0.0001, it_s=2.27]

Iter 0109 | avgR   95.49 ± 50.25 | KL 0.0043 (β=0.0001) | Lclip 0.0026 Lkl 0.0043 Lent 0.9614 | steps 2313057 eps 3520 | time 2.27s


GRPO:  37%|████████████████████▋                                   | 111/300 [03:41<06:48,  2.16s/it, KL=0.004, avgR=122.7, beta=0.0001, it_s=2.17]

Iter 0110 | avgR  122.67 ± 60.34 | KL 0.0035 (β=0.0001) | Lclip 0.0019 Lkl 0.0035 Lent 0.9296 | steps 2337537 eps 3552 | time 2.17s


GRPO:  37%|████████████████████▉                                   | 112/300 [03:43<06:43,  2.15s/it, KL=0.003, avgR=118.5, beta=0.0001, it_s=2.11]

Iter 0111 | avgR  118.53 ± 69.55 | KL 0.0029 (β=0.0001) | Lclip 0.0010 Lkl 0.0029 Lent 1.0140 | steps 2361728 eps 3584 | time 2.11s


GRPO:  38%|█████████████████████                                   | 113/300 [03:45<06:23,  2.05s/it, KL=0.005, avgR=104.7, beta=0.0001, it_s=1.83]

Iter 0112 | avgR  104.71 ± 90.35 | KL 0.0048 (β=0.0001) | Lclip 0.0012 Lkl 0.0048 Lent 0.9266 | steps 2382151 eps 3616 | time 1.83s


GRPO:  38%|█████████████████████▎                                  | 114/300 [03:47<06:21,  2.05s/it, KL=0.003, avgR=140.5, beta=0.0001, it_s=2.03]

Iter 0113 | avgR  140.47 ± 63.22 | KL 0.0034 (β=0.0001) | Lclip 0.0016 Lkl 0.0034 Lent 0.9552 | steps 2405280 eps 3648 | time 2.03s


GRPO:  38%|█████████████████████▍                                  | 115/300 [03:49<06:17,  2.04s/it, KL=0.002, avgR=130.4, beta=0.0001, it_s=2.02]

Iter 0114 | avgR  130.44 ± 63.60 | KL 0.0022 (β=0.0001) | Lclip 0.0015 Lkl 0.0022 Lent 0.9193 | steps 2429023 eps 3680 | time 2.02s


GRPO:  39%|██████████████████████                                   | 116/300 [03:51<06:34,  2.14s/it, KL=0.003, avgR=99.2, beta=0.0001, it_s=2.37]

Iter 0115 | avgR   99.22 ± 54.41 | KL 0.0027 (β=0.0001) | Lclip 0.0005 Lkl 0.0027 Lent 0.9530 | steps 2456809 eps 3712 | time 2.37s


GRPO:  39%|██████████████████████▏                                  | 117/300 [03:53<06:22,  2.09s/it, KL=0.004, avgR=99.8, beta=0.0001, it_s=1.96]

Iter 0116 | avgR   99.79 ± 74.74 | KL 0.0039 (β=0.0001) | Lclip 0.0033 Lkl 0.0039 Lent 0.9131 | steps 2479119 eps 3744 | time 1.96s


GRPO:  39%|██████████████████████▍                                  | 118/300 [03:55<06:22,  2.10s/it, KL=0.006, avgR=94.1, beta=0.0001, it_s=2.13]

Iter 0117 | avgR   94.07 ± 51.18 | KL 0.0057 (β=0.0001) | Lclip 0.0012 Lkl 0.0057 Lent 0.9372 | steps 2504938 eps 3776 | time 2.13s


GRPO:  40%|██████████████████████▏                                 | 119/300 [03:58<06:31,  2.16s/it, KL=0.004, avgR=109.1, beta=0.0001, it_s=2.31]

Iter 0118 | avgR  109.06 ± 60.04 | KL 0.0038 (β=0.0001) | Lclip 0.0007 Lkl 0.0038 Lent 0.9325 | steps 2532727 eps 3808 | time 2.31s


GRPO:  40%|██████████████████████▍                                 | 120/300 [03:59<06:11,  2.06s/it, KL=0.004, avgR=115.7, beta=0.0001, it_s=1.83]

Iter 0119 | avgR  115.75 ± 68.14 | KL 0.0043 (β=0.0001) | Lclip 0.0016 Lkl 0.0043 Lent 0.9528 | steps 2555362 eps 3840 | time 1.83s


GRPO:  40%|██████████████████████▉                                  | 121/300 [04:02<06:13,  2.09s/it, KL=0.004, avgR=98.4, beta=0.0001, it_s=2.14]

Iter 0120 | avgR   98.36 ± 66.64 | KL 0.0036 (β=0.0001) | Lclip 0.0005 Lkl 0.0036 Lent 0.9388 | steps 2580599 eps 3872 | time 2.14s


GRPO:  41%|██████████████████████▊                                 | 122/300 [04:04<06:16,  2.11s/it, KL=0.004, avgR=130.8, beta=0.0001, it_s=2.17]

Iter 0121 | avgR  130.78 ± 60.49 | KL 0.0036 (β=0.0001) | Lclip 0.0027 Lkl 0.0036 Lent 0.9453 | steps 2605414 eps 3904 | time 2.17s


GRPO:  41%|██████████████████████▉                                 | 123/300 [04:06<06:02,  2.05s/it, KL=0.004, avgR=120.6, beta=0.0001, it_s=1.90]

Iter 0122 | avgR  120.65 ± 71.14 | KL 0.0036 (β=0.0001) | Lclip 0.0006 Lkl 0.0036 Lent 0.9122 | steps 2629196 eps 3936 | time 1.90s


GRPO:  41%|███████████████████████▌                                 | 124/300 [04:07<05:47,  1.98s/it, KL=0.005, avgR=84.9, beta=0.0001, it_s=1.80]

Iter 0123 | avgR   84.93 ± 67.14 | KL 0.0053 (β=0.0001) | Lclip 0.0035 Lkl 0.0053 Lent 0.9208 | steps 2651202 eps 3968 | time 1.80s


GRPO:  42%|███████████████████████▊                                 | 125/300 [04:10<05:52,  2.01s/it, KL=0.004, avgR=98.1, beta=0.0001, it_s=2.10]

Iter 0124 | avgR   98.05 ± 65.25 | KL 0.0040 (β=0.0001) | Lclip 0.0025 Lkl 0.0040 Lent 0.8886 | steps 2676741 eps 4000 | time 2.10s


GRPO:  42%|███████████████████████▉                                 | 126/300 [04:12<06:09,  2.12s/it, KL=0.003, avgR=96.1, beta=0.0001, it_s=2.38]

Iter 0125 | avgR   96.06 ± 57.84 | KL 0.0029 (β=0.0001) | Lclip 0.0022 Lkl 0.0029 Lent 0.9806 | steps 2703585 eps 4032 | time 2.38s


GRPO:  42%|███████████████████████▋                                | 127/300 [04:14<06:13,  2.16s/it, KL=0.003, avgR=109.0, beta=0.0001, it_s=2.23]

Iter 0126 | avgR  108.99 ± 84.01 | KL 0.0034 (β=0.0001) | Lclip 0.0013 Lkl 0.0034 Lent 0.9201 | steps 2729343 eps 4064 | time 2.23s


GRPO:  43%|███████████████████████▉                                | 128/300 [04:16<05:58,  2.09s/it, KL=0.003, avgR=110.7, beta=0.0001, it_s=1.92]

Iter 0127 | avgR  110.67 ± 64.70 | KL 0.0035 (β=0.0001) | Lclip 0.0016 Lkl 0.0035 Lent 0.8997 | steps 2753046 eps 4096 | time 1.92s


GRPO:  43%|████████████████████████                                | 129/300 [04:18<05:45,  2.02s/it, KL=0.004, avgR=131.9, beta=0.0001, it_s=1.86]

Iter 0128 | avgR  131.94 ± 80.92 | KL 0.0040 (β=0.0001) | Lclip 0.0021 Lkl 0.0040 Lent 0.8766 | steps 2775444 eps 4128 | time 1.86s


GRPO:  43%|████████████████████████▎                               | 130/300 [04:20<05:37,  1.99s/it, KL=0.003, avgR=128.0, beta=0.0001, it_s=1.91]

Iter 0129 | avgR  128.02 ± 72.05 | KL 0.0033 (β=0.0001) | Lclip 0.0014 Lkl 0.0033 Lent 0.9246 | steps 2797986 eps 4160 | time 1.91s


GRPO:  44%|████████████████████████▉                                | 131/300 [04:22<05:45,  2.04s/it, KL=0.004, avgR=77.0, beta=0.0001, it_s=2.17]

Iter 0130 | avgR   76.99 ± 63.08 | KL 0.0036 (β=0.0001) | Lclip 0.0021 Lkl 0.0036 Lent 0.9602 | steps 2822779 eps 4192 | time 2.17s


GRPO:  44%|█████████████████████████                                | 132/300 [04:24<05:36,  2.00s/it, KL=0.003, avgR=88.2, beta=0.0001, it_s=1.89]

Iter 0131 | avgR   88.24 ± 77.54 | KL 0.0033 (β=0.0001) | Lclip 0.0021 Lkl 0.0033 Lent 0.8935 | steps 2845570 eps 4224 | time 1.89s


GRPO:  44%|█████████████████████████▎                               | 133/300 [04:26<05:37,  2.02s/it, KL=0.003, avgR=95.9, beta=0.0001, it_s=2.07]

Iter 0132 | avgR   95.88 ± 75.19 | KL 0.0029 (β=0.0001) | Lclip 0.0020 Lkl 0.0029 Lent 0.9267 | steps 2869975 eps 4256 | time 2.07s


GRPO:  45%|█████████████████████████▍                               | 134/300 [04:28<05:25,  1.96s/it, KL=0.005, avgR=97.3, beta=0.0001, it_s=1.83]

Iter 0133 | avgR   97.29 ± 82.91 | KL 0.0046 (β=0.0001) | Lclip 0.0019 Lkl 0.0046 Lent 0.9271 | steps 2890646 eps 4288 | time 1.83s


GRPO:  45%|█████████████████████████▏                              | 135/300 [04:30<05:29,  2.00s/it, KL=0.004, avgR=118.5, beta=0.0001, it_s=2.06]

Iter 0134 | avgR  118.47 ± 61.47 | KL 0.0043 (β=0.0001) | Lclip 0.0023 Lkl 0.0043 Lent 0.9795 | steps 2915653 eps 4320 | time 2.06s


GRPO:  45%|█████████████████████████▍                              | 136/300 [04:32<05:27,  2.00s/it, KL=0.005, avgR=110.9, beta=0.0001, it_s=2.00]

Iter 0135 | avgR  110.92 ± 70.61 | KL 0.0050 (β=0.0001) | Lclip 0.0003 Lkl 0.0050 Lent 0.9963 | steps 2938013 eps 4352 | time 2.00s


GRPO:  46%|█████████████████████████▌                              | 137/300 [04:34<05:30,  2.03s/it, KL=0.002, avgR=149.8, beta=0.0001, it_s=2.09]

Iter 0136 | avgR  149.76 ± 64.16 | KL 0.0024 (β=0.0001) | Lclip 0.0005 Lkl 0.0024 Lent 0.7994 | steps 2960372 eps 4384 | time 2.09s


GRPO:  46%|█████████████████████████▊                              | 138/300 [04:36<05:25,  2.01s/it, KL=0.004, avgR=107.9, beta=0.0001, it_s=1.96]

Iter 0137 | avgR  107.86 ± 61.02 | KL 0.0038 (β=0.0001) | Lclip 0.0021 Lkl 0.0038 Lent 0.9226 | steps 2984955 eps 4416 | time 1.96s


GRPO:  46%|█████████████████████████▉                              | 139/300 [04:38<05:34,  2.07s/it, KL=0.004, avgR=101.4, beta=0.0001, it_s=2.23]

Iter 0138 | avgR  101.44 ± 58.66 | KL 0.0037 (β=0.0001) | Lclip 0.0010 Lkl 0.0037 Lent 0.9754 | steps 3012031 eps 4448 | time 2.23s


GRPO:  47%|██████████████████████████▏                             | 140/300 [04:40<05:28,  2.05s/it, KL=0.005, avgR=101.5, beta=0.0001, it_s=1.99]

Iter 0139 | avgR  101.51 ± 66.08 | KL 0.0047 (β=0.0001) | Lclip 0.0019 Lkl 0.0047 Lent 1.0017 | steps 3036352 eps 4480 | time 1.99s


GRPO:  47%|██████████████████████████▎                             | 141/300 [04:42<05:19,  2.01s/it, KL=0.006, avgR=101.0, beta=0.0001, it_s=1.91]

Iter 0140 | avgR  101.05 ± 65.90 | KL 0.0060 (β=0.0001) | Lclip 0.0020 Lkl 0.0060 Lent 0.9937 | steps 3061575 eps 4512 | time 1.91s


GRPO:  47%|██████████████████████████▉                              | 142/300 [04:44<05:27,  2.07s/it, KL=0.004, avgR=96.8, beta=0.0001, it_s=2.21]

Iter 0141 | avgR   96.77 ± 58.30 | KL 0.0036 (β=0.0001) | Lclip 0.0013 Lkl 0.0036 Lent 0.9392 | steps 3089976 eps 4544 | time 2.21s


GRPO:  48%|██████████████████████████▋                             | 143/300 [04:46<05:22,  2.05s/it, KL=0.003, avgR=108.4, beta=0.0001, it_s=2.01]

Iter 0142 | avgR  108.41 ± 58.94 | KL 0.0033 (β=0.0001) | Lclip 0.0015 Lkl 0.0033 Lent 0.9251 | steps 3115400 eps 4576 | time 2.01s


GRPO:  48%|██████████████████████████▉                             | 144/300 [04:49<05:36,  2.16s/it, KL=0.003, avgR=125.7, beta=0.0001, it_s=2.39]

Iter 0143 | avgR  125.69 ± 70.90 | KL 0.0028 (β=0.0001) | Lclip 0.0015 Lkl 0.0028 Lent 0.9423 | steps 3139586 eps 4608 | time 2.39s


GRPO:  48%|███████████████████████████                             | 145/300 [04:51<05:29,  2.13s/it, KL=0.003, avgR=130.2, beta=0.0001, it_s=2.05]

Iter 0144 | avgR  130.20 ± 61.77 | KL 0.0029 (β=0.0001) | Lclip 0.0008 Lkl 0.0029 Lent 0.9085 | steps 3163830 eps 4640 | time 2.05s


GRPO:  49%|███████████████████████████▎                            | 146/300 [04:53<05:18,  2.07s/it, KL=0.004, avgR=125.8, beta=0.0001, it_s=1.94]

Iter 0145 | avgR  125.80 ± 72.47 | KL 0.0039 (β=0.0001) | Lclip 0.0004 Lkl 0.0039 Lent 0.8871 | steps 3186545 eps 4672 | time 1.94s


GRPO:  49%|███████████████████████████▍                            | 147/300 [04:55<05:08,  2.02s/it, KL=0.004, avgR=124.8, beta=0.0001, it_s=1.89]

Iter 0146 | avgR  124.76 ± 73.62 | KL 0.0039 (β=0.0001) | Lclip 0.0009 Lkl 0.0039 Lent 0.9414 | steps 3208750 eps 4704 | time 1.89s


GRPO:  49%|███████████████████████████▋                            | 148/300 [04:57<05:05,  2.01s/it, KL=0.006, avgR=110.0, beta=0.0001, it_s=1.98]

Iter 0147 | avgR  110.01 ± 61.81 | KL 0.0060 (β=0.0001) | Lclip 0.0038 Lkl 0.0060 Lent 0.9242 | steps 3233675 eps 4736 | time 1.98s


GRPO:  50%|███████████████████████████▊                            | 149/300 [04:59<05:03,  2.01s/it, KL=0.003, avgR=125.4, beta=0.0001, it_s=2.00]

Iter 0148 | avgR  125.39 ± 64.88 | KL 0.0027 (β=0.0001) | Lclip 0.0003 Lkl 0.0027 Lent 0.8848 | steps 3258585 eps 4768 | time 2.00s


GRPO:  50%|████████████████████████████                            | 150/300 [05:01<04:58,  1.99s/it, KL=0.003, avgR=106.4, beta=0.0001, it_s=1.96]

Iter 0149 | avgR  106.37 ± 64.22 | KL 0.0030 (β=0.0001) | Lclip 0.0010 Lkl 0.0030 Lent 0.8946 | steps 3282386 eps 4800 | time 1.96s


GRPO:  50%|████████████████████████████▏                           | 151/300 [05:03<05:02,  2.03s/it, KL=0.004, avgR=109.6, beta=0.0001, it_s=2.11]

Iter 0150 | avgR  109.63 ± 59.05 | KL 0.0040 (β=0.0001) | Lclip 0.0029 Lkl 0.0040 Lent 0.9139 | steps 3307822 eps 4832 | time 2.11s


GRPO:  51%|████████████████████████████▎                           | 152/300 [05:05<04:58,  2.02s/it, KL=0.003, avgR=129.2, beta=0.0001, it_s=1.99]

Iter 0151 | avgR  129.23 ± 67.27 | KL 0.0030 (β=0.0001) | Lclip 0.0015 Lkl 0.0030 Lent 0.9018 | steps 3330845 eps 4864 | time 1.99s


GRPO:  51%|█████████████████████████████                            | 153/300 [05:07<05:13,  2.13s/it, KL=0.006, avgR=92.0, beta=0.0001, it_s=2.40]

Iter 0152 | avgR   92.04 ± 49.96 | KL 0.0063 (β=0.0001) | Lclip 0.0038 Lkl 0.0063 Lent 0.9854 | steps 3359388 eps 4896 | time 2.40s


GRPO:  51%|████████████████████████████▋                           | 154/300 [05:09<05:21,  2.20s/it, KL=0.004, avgR=102.0, beta=0.0001, it_s=2.35]

Iter 0153 | avgR  101.99 ± 48.99 | KL 0.0045 (β=0.0001) | Lclip 0.0018 Lkl 0.0045 Lent 0.9781 | steps 3387481 eps 4928 | time 2.35s


GRPO:  52%|████████████████████████████▉                           | 155/300 [05:12<05:14,  2.17s/it, KL=0.004, avgR=111.8, beta=0.0001, it_s=2.10]

Iter 0154 | avgR  111.81 ± 64.53 | KL 0.0037 (β=0.0001) | Lclip 0.0023 Lkl 0.0037 Lent 0.8683 | steps 3413422 eps 4960 | time 2.10s


GRPO:  52%|█████████████████████████████                           | 156/300 [05:14<05:06,  2.13s/it, KL=0.003, avgR=110.3, beta=0.0001, it_s=2.03]

Iter 0155 | avgR  110.30 ± 59.15 | KL 0.0031 (β=0.0001) | Lclip 0.0011 Lkl 0.0031 Lent 0.9083 | steps 3440011 eps 4992 | time 2.03s


GRPO:  52%|█████████████████████████████▊                           | 157/300 [05:16<05:15,  2.21s/it, KL=0.004, avgR=92.9, beta=0.0001, it_s=2.38]

Iter 0156 | avgR   92.90 ± 53.46 | KL 0.0042 (β=0.0001) | Lclip 0.0018 Lkl 0.0042 Lent 0.8709 | steps 3468499 eps 5024 | time 2.38s


GRPO:  53%|█████████████████████████████▍                          | 158/300 [05:18<05:14,  2.21s/it, KL=0.004, avgR=115.8, beta=0.0001, it_s=2.23]

Iter 0157 | avgR  115.75 ± 58.43 | KL 0.0044 (β=0.0001) | Lclip 0.0011 Lkl 0.0044 Lent 0.8271 | steps 3495215 eps 5056 | time 2.23s


GRPO:  53%|█████████████████████████████▋                          | 159/300 [05:20<05:13,  2.22s/it, KL=0.004, avgR=105.8, beta=0.0001, it_s=2.23]

Iter 0158 | avgR  105.77 ± 66.71 | KL 0.0035 (β=0.0001) | Lclip 0.0013 Lkl 0.0035 Lent 0.9044 | steps 3522082 eps 5088 | time 2.23s


GRPO:  53%|█████████████████████████████▊                          | 160/300 [05:23<05:13,  2.24s/it, KL=0.004, avgR=111.3, beta=0.0001, it_s=2.27]

Iter 0159 | avgR  111.35 ± 64.29 | KL 0.0037 (β=0.0001) | Lclip 0.0017 Lkl 0.0037 Lent 0.9269 | steps 3549291 eps 5120 | time 2.27s


GRPO:  54%|██████████████████████████████                          | 161/300 [05:25<05:03,  2.18s/it, KL=0.004, avgR=125.2, beta=0.0001, it_s=2.05]

Iter 0160 | avgR  125.18 ± 61.35 | KL 0.0044 (β=0.0001) | Lclip 0.0019 Lkl 0.0044 Lent 0.8783 | steps 3575533 eps 5152 | time 2.05s


GRPO:  54%|██████████████████████████████▏                         | 162/300 [05:27<04:59,  2.17s/it, KL=0.003, avgR=114.7, beta=0.0001, it_s=2.13]

Iter 0161 | avgR  114.69 ± 63.13 | KL 0.0026 (β=0.0001) | Lclip 0.0012 Lkl 0.0026 Lent 0.8973 | steps 3601594 eps 5184 | time 2.13s


GRPO:  54%|██████████████████████████████▍                         | 163/300 [05:29<04:50,  2.12s/it, KL=0.003, avgR=112.2, beta=0.0001, it_s=2.02]

Iter 0162 | avgR  112.19 ± 65.64 | KL 0.0031 (β=0.0001) | Lclip 0.0017 Lkl 0.0031 Lent 0.8813 | steps 3626069 eps 5216 | time 2.02s


GRPO:  55%|██████████████████████████████▌                         | 164/300 [05:31<04:45,  2.10s/it, KL=0.004, avgR=113.1, beta=0.0001, it_s=2.04]

Iter 0163 | avgR  113.11 ± 65.53 | KL 0.0044 (β=0.0001) | Lclip 0.0019 Lkl 0.0044 Lent 0.9271 | steps 3651826 eps 5248 | time 2.04s


GRPO:  55%|██████████████████████████████▊                         | 165/300 [05:33<04:47,  2.13s/it, KL=0.004, avgR=125.7, beta=0.0001, it_s=2.19]

Iter 0164 | avgR  125.71 ± 64.63 | KL 0.0035 (β=0.0001) | Lclip 0.0010 Lkl 0.0035 Lent 0.8918 | steps 3677791 eps 5280 | time 2.19s


GRPO:  55%|██████████████████████████████▉                         | 166/300 [05:35<04:41,  2.10s/it, KL=0.003, avgR=118.6, beta=0.0001, it_s=2.04]

Iter 0165 | avgR  118.62 ± 71.25 | KL 0.0027 (β=0.0001) | Lclip 0.0018 Lkl 0.0027 Lent 0.9213 | steps 3699257 eps 5312 | time 2.04s


GRPO:  56%|███████████████████████████████▏                        | 167/300 [05:37<04:34,  2.06s/it, KL=0.003, avgR=133.0, beta=0.0001, it_s=1.97]

Iter 0166 | avgR  133.01 ± 67.28 | KL 0.0029 (β=0.0001) | Lclip 0.0007 Lkl 0.0029 Lent 0.8885 | steps 3721780 eps 5344 | time 1.97s


GRPO:  56%|███████████████████████████████▎                        | 168/300 [05:39<04:28,  2.03s/it, KL=0.003, avgR=118.7, beta=0.0001, it_s=1.95]

Iter 0167 | avgR  118.70 ± 75.29 | KL 0.0029 (β=0.0001) | Lclip 0.0010 Lkl 0.0029 Lent 0.8992 | steps 3745265 eps 5376 | time 1.95s


GRPO:  56%|███████████████████████████████▌                        | 169/300 [05:41<04:30,  2.07s/it, KL=0.004, avgR=108.4, beta=0.0001, it_s=2.15]

Iter 0168 | avgR  108.38 ± 60.70 | KL 0.0042 (β=0.0001) | Lclip 0.0003 Lkl 0.0042 Lent 0.8864 | steps 3770539 eps 5408 | time 2.15s


GRPO:  57%|███████████████████████████████▋                        | 170/300 [05:43<04:17,  1.98s/it, KL=0.003, avgR=140.8, beta=0.0001, it_s=1.78]

Iter 0169 | avgR  140.83 ± 71.47 | KL 0.0030 (β=0.0001) | Lclip 0.0015 Lkl 0.0030 Lent 0.8715 | steps 3791129 eps 5440 | time 1.78s


GRPO:  57%|███████████████████████████████▉                        | 171/300 [05:45<04:23,  2.05s/it, KL=0.003, avgR=110.0, beta=0.0001, it_s=2.19]

Iter 0170 | avgR  109.97 ± 68.54 | KL 0.0035 (β=0.0001) | Lclip 0.0020 Lkl 0.0035 Lent 0.8736 | steps 3815599 eps 5472 | time 2.19s


GRPO:  57%|████████████████████████████████                        | 172/300 [05:48<04:31,  2.12s/it, KL=0.003, avgR=107.5, beta=0.0001, it_s=2.29]

Iter 0171 | avgR  107.52 ± 62.02 | KL 0.0029 (β=0.0001) | Lclip 0.0007 Lkl 0.0029 Lent 0.8809 | steps 3841818 eps 5504 | time 2.29s


GRPO:  58%|████████████████████████████████▊                        | 173/300 [05:50<04:25,  2.09s/it, KL=0.005, avgR=78.1, beta=0.0001, it_s=2.03]

Iter 0172 | avgR   78.05 ± 56.62 | KL 0.0047 (β=0.0001) | Lclip 0.0021 Lkl 0.0047 Lent 0.9047 | steps 3867754 eps 5536 | time 2.03s


GRPO:  58%|████████████████████████████████▍                       | 174/300 [05:52<04:28,  2.13s/it, KL=0.002, avgR=113.2, beta=0.0001, it_s=2.22]

Iter 0173 | avgR  113.16 ± 67.68 | KL 0.0025 (β=0.0001) | Lclip 0.0011 Lkl 0.0025 Lent 0.8451 | steps 3893009 eps 5568 | time 2.22s


GRPO:  58%|████████████████████████████████▋                       | 175/300 [05:54<04:19,  2.07s/it, KL=0.005, avgR=131.4, beta=0.0001, it_s=1.93]

Iter 0174 | avgR  131.38 ± 77.37 | KL 0.0053 (β=0.0001) | Lclip 0.0014 Lkl 0.0053 Lent 0.8613 | steps 3913404 eps 5600 | time 1.93s


GRPO:  59%|████████████████████████████████▊                       | 176/300 [05:56<04:18,  2.08s/it, KL=0.004, avgR=104.5, beta=0.0001, it_s=2.11]

Iter 0175 | avgR  104.48 ± 66.02 | KL 0.0041 (β=0.0001) | Lclip 0.0010 Lkl 0.0041 Lent 0.9147 | steps 3939384 eps 5632 | time 2.11s


GRPO:  59%|█████████████████████████████████                       | 177/300 [05:58<04:16,  2.09s/it, KL=0.004, avgR=106.9, beta=0.0001, it_s=2.09]

Iter 0176 | avgR  106.89 ± 64.58 | KL 0.0036 (β=0.0001) | Lclip 0.0027 Lkl 0.0036 Lent 0.8893 | steps 3965853 eps 5664 | time 2.09s


GRPO:  59%|█████████████████████████████████▏                      | 178/300 [06:00<04:11,  2.06s/it, KL=0.004, avgR=121.3, beta=0.0001, it_s=2.01]

Iter 0177 | avgR  121.27 ± 71.44 | KL 0.0039 (β=0.0001) | Lclip 0.0009 Lkl 0.0039 Lent 0.8557 | steps 3988754 eps 5696 | time 2.01s


GRPO:  60%|█████████████████████████████████▍                      | 179/300 [06:02<04:02,  2.00s/it, KL=0.005, avgR=112.1, beta=0.0001, it_s=1.86]

Iter 0178 | avgR  112.09 ± 86.61 | KL 0.0047 (β=0.0001) | Lclip 0.0017 Lkl 0.0047 Lent 0.9431 | steps 4012394 eps 5728 | time 1.86s


GRPO:  60%|█████████████████████████████████▌                      | 180/300 [06:04<04:04,  2.04s/it, KL=0.004, avgR=106.6, beta=0.0001, it_s=2.12]

Iter 0179 | avgR  106.64 ± 61.10 | KL 0.0038 (β=0.0001) | Lclip 0.0010 Lkl 0.0038 Lent 0.9167 | steps 4037988 eps 5760 | time 2.12s


GRPO:  60%|██████████████████████████████████▍                      | 181/300 [06:06<04:12,  2.12s/it, KL=0.003, avgR=98.4, beta=0.0001, it_s=2.32]

Iter 0180 | avgR   98.36 ± 51.15 | KL 0.0035 (β=0.0001) | Lclip 0.0007 Lkl 0.0035 Lent 0.9229 | steps 4065507 eps 5792 | time 2.32s


GRPO:  61%|█████████████████████████████████▉                      | 182/300 [06:08<04:12,  2.14s/it, KL=0.004, avgR=120.6, beta=0.0001, it_s=2.16]

Iter 0181 | avgR  120.55 ± 60.09 | KL 0.0043 (β=0.0001) | Lclip 0.0024 Lkl 0.0043 Lent 0.9273 | steps 4091631 eps 5824 | time 2.16s


GRPO:  61%|██████████████████████████████████▏                     | 183/300 [06:10<04:07,  2.11s/it, KL=0.004, avgR=111.8, beta=0.0001, it_s=2.05]

Iter 0182 | avgR  111.83 ± 64.14 | KL 0.0044 (β=0.0001) | Lclip 0.0031 Lkl 0.0044 Lent 0.8354 | steps 4116350 eps 5856 | time 2.05s


GRPO:  61%|██████████████████████████████████▎                     | 184/300 [06:12<04:00,  2.07s/it, KL=0.005, avgR=123.6, beta=0.0001, it_s=1.97]

Iter 0183 | avgR  123.60 ± 73.49 | KL 0.0047 (β=0.0001) | Lclip 0.0023 Lkl 0.0047 Lent 0.8938 | steps 4138185 eps 5888 | time 1.97s


GRPO:  62%|██████████████████████████████████▌                     | 185/300 [06:14<03:56,  2.05s/it, KL=0.004, avgR=118.8, beta=0.0001, it_s=2.01]

Iter 0184 | avgR  118.84 ± 67.09 | KL 0.0037 (β=0.0001) | Lclip 0.0024 Lkl 0.0037 Lent 0.9417 | steps 4161944 eps 5920 | time 2.01s


GRPO:  62%|██████████████████████████████████▋                     | 186/300 [06:16<03:47,  2.00s/it, KL=0.004, avgR=126.8, beta=0.0001, it_s=1.86]

Iter 0185 | avgR  126.84 ± 88.31 | KL 0.0038 (β=0.0001) | Lclip 0.0015 Lkl 0.0038 Lent 0.8906 | steps 4184303 eps 5952 | time 1.86s


GRPO:  62%|██████████████████████████████████▉                     | 187/300 [06:18<03:45,  1.99s/it, KL=0.003, avgR=100.9, beta=0.0001, it_s=1.99]

Iter 0186 | avgR  100.86 ± 72.78 | KL 0.0030 (β=0.0001) | Lclip 0.0018 Lkl 0.0030 Lent 0.8819 | steps 4209351 eps 5984 | time 1.99s


GRPO:  63%|███████████████████████████████████                     | 188/300 [06:21<03:53,  2.08s/it, KL=0.003, avgR=126.4, beta=0.0001, it_s=2.29]

Iter 0187 | avgR  126.45 ± 68.65 | KL 0.0035 (β=0.0001) | Lclip 0.0011 Lkl 0.0035 Lent 0.8384 | steps 4234662 eps 6016 | time 2.29s


GRPO:  63%|███████████████████████████████████▎                    | 189/300 [06:23<03:53,  2.10s/it, KL=0.004, avgR=123.3, beta=0.0001, it_s=2.15]

Iter 0188 | avgR  123.31 ± 62.61 | KL 0.0039 (β=0.0001) | Lclip 0.0019 Lkl 0.0039 Lent 0.8877 | steps 4261268 eps 6048 | time 2.15s


GRPO:  63%|███████████████████████████████████▍                    | 190/300 [06:25<03:48,  2.08s/it, KL=0.004, avgR=134.3, beta=0.0001, it_s=2.02]

Iter 0189 | avgR  134.31 ± 66.72 | KL 0.0035 (β=0.0001) | Lclip 0.0017 Lkl 0.0035 Lent 0.9124 | steps 4285797 eps 6080 | time 2.02s


GRPO:  64%|████████████████████████████████████▎                    | 191/300 [06:27<04:00,  2.21s/it, KL=0.005, avgR=90.2, beta=0.0001, it_s=2.50]

Iter 0190 | avgR   90.16 ± 60.68 | KL 0.0048 (β=0.0001) | Lclip 0.0012 Lkl 0.0048 Lent 0.9393 | steps 4314164 eps 6112 | time 2.50s


GRPO:  64%|███████████████████████████████████▊                    | 192/300 [06:30<04:02,  2.24s/it, KL=0.003, avgR=116.6, beta=0.0001, it_s=2.32]

Iter 0191 | avgR  116.59 ± 56.68 | KL 0.0034 (β=0.0001) | Lclip 0.0022 Lkl 0.0034 Lent 1.0017 | steps 4341931 eps 6144 | time 2.32s


GRPO:  64%|████████████████████████████████████                    | 193/300 [06:32<04:06,  2.31s/it, KL=0.004, avgR=103.6, beta=0.0001, it_s=2.46]

Iter 0192 | avgR  103.56 ± 47.80 | KL 0.0041 (β=0.0001) | Lclip 0.0014 Lkl 0.0041 Lent 1.0024 | steps 4372362 eps 6176 | time 2.46s


GRPO:  65%|████████████████████████████████████▏                   | 194/300 [06:34<03:54,  2.21s/it, KL=0.004, avgR=124.6, beta=0.0001, it_s=1.98]

Iter 0193 | avgR  124.57 ± 69.35 | KL 0.0045 (β=0.0001) | Lclip 0.0013 Lkl 0.0045 Lent 0.9310 | steps 4397546 eps 6208 | time 1.98s


GRPO:  65%|████████████████████████████████████▍                   | 195/300 [06:36<03:56,  2.25s/it, KL=0.003, avgR=118.4, beta=0.0001, it_s=2.34]

Iter 0194 | avgR  118.37 ± 62.23 | KL 0.0034 (β=0.0001) | Lclip 0.0020 Lkl 0.0034 Lent 0.8711 | steps 4424785 eps 6240 | time 2.34s


GRPO:  65%|█████████████████████████████████████▏                   | 196/300 [06:38<03:48,  2.20s/it, KL=0.005, avgR=98.2, beta=0.0001, it_s=2.07]

Iter 0195 | avgR   98.21 ± 69.40 | KL 0.0048 (β=0.0001) | Lclip 0.0009 Lkl 0.0048 Lent 0.9585 | steps 4451077 eps 6272 | time 2.07s


GRPO:  66%|████████████████████████████████████▊                   | 197/300 [06:41<03:49,  2.23s/it, KL=0.006, avgR=104.5, beta=0.0001, it_s=2.30]

Iter 0196 | avgR  104.51 ± 51.22 | KL 0.0056 (β=0.0001) | Lclip 0.0017 Lkl 0.0056 Lent 0.9656 | steps 4479360 eps 6304 | time 2.30s


GRPO:  66%|████████████████████████████████████▉                   | 198/300 [06:43<03:50,  2.26s/it, KL=0.004, avgR=118.6, beta=0.0001, it_s=2.32]

Iter 0197 | avgR  118.63 ± 60.86 | KL 0.0036 (β=0.0001) | Lclip 0.0014 Lkl 0.0036 Lent 0.9386 | steps 4505825 eps 6336 | time 2.32s


GRPO:  66%|█████████████████████████████████████▊                   | 199/300 [06:45<03:38,  2.16s/it, KL=0.003, avgR=85.7, beta=0.0001, it_s=1.94]

Iter 0198 | avgR   85.69 ± 65.06 | KL 0.0030 (β=0.0001) | Lclip 0.0018 Lkl 0.0030 Lent 0.9437 | steps 4529836 eps 6368 | time 1.94s


GRPO:  67%|█████████████████████████████████████▎                  | 200/300 [06:47<03:28,  2.08s/it, KL=0.004, avgR=119.0, beta=0.0001, it_s=1.89]

Iter 0199 | avgR  118.97 ± 71.43 | KL 0.0038 (β=0.0001) | Lclip 0.0015 Lkl 0.0038 Lent 0.9363 | steps 4552797 eps 6400 | time 1.89s


GRPO:  67%|█████████████████████████████████████▌                  | 201/300 [06:49<03:29,  2.11s/it, KL=0.003, avgR=124.8, beta=0.0001, it_s=2.18]

Iter 0200 | avgR  124.81 ± 62.26 | KL 0.0034 (β=0.0001) | Lclip 0.0015 Lkl 0.0034 Lent 0.8932 | steps 4580024 eps 6432 | time 2.18s


GRPO:  67%|█████████████████████████████████████▋                  | 202/300 [06:51<03:22,  2.07s/it, KL=0.004, avgR=146.1, beta=0.0001, it_s=1.97]

Iter 0201 | avgR  146.13 ± 68.82 | KL 0.0043 (β=0.0001) | Lclip 0.0023 Lkl 0.0043 Lent 0.9272 | steps 4602584 eps 6464 | time 1.97s


GRPO:  68%|█████████████████████████████████████▉                  | 203/300 [06:53<03:20,  2.07s/it, KL=0.004, avgR=119.9, beta=0.0001, it_s=2.06]

Iter 0202 | avgR  119.93 ± 65.40 | KL 0.0039 (β=0.0001) | Lclip 0.0017 Lkl 0.0039 Lent 0.9715 | steps 4627757 eps 6496 | time 2.06s


GRPO:  68%|██████████████████████████████████████                  | 204/300 [06:55<03:19,  2.08s/it, KL=0.004, avgR=109.6, beta=0.0001, it_s=2.09]

Iter 0203 | avgR  109.59 ± 61.08 | KL 0.0036 (β=0.0001) | Lclip 0.0006 Lkl 0.0036 Lent 0.9793 | steps 4654208 eps 6528 | time 2.09s


GRPO:  68%|██████████████████████████████████████▎                 | 205/300 [06:57<03:20,  2.11s/it, KL=0.004, avgR=126.1, beta=0.0001, it_s=2.20]

Iter 0204 | avgR  126.07 ± 67.18 | KL 0.0038 (β=0.0001) | Lclip 0.0016 Lkl 0.0038 Lent 0.9343 | steps 4678164 eps 6560 | time 2.20s


GRPO:  69%|██████████████████████████████████████▍                 | 206/300 [06:59<03:18,  2.11s/it, KL=0.005, avgR=117.7, beta=0.0001, it_s=2.09]

Iter 0205 | avgR  117.74 ± 65.76 | KL 0.0048 (β=0.0001) | Lclip 0.0015 Lkl 0.0048 Lent 0.8972 | steps 4703645 eps 6592 | time 2.09s


GRPO:  69%|██████████████████████████████████████▋                 | 207/300 [07:02<03:18,  2.14s/it, KL=0.004, avgR=111.5, beta=0.0001, it_s=2.20]

Iter 0206 | avgR  111.52 ± 58.44 | KL 0.0035 (β=0.0001) | Lclip 0.0018 Lkl 0.0035 Lent 1.0019 | steps 4730150 eps 6624 | time 2.20s


GRPO:  69%|██████████████████████████████████████▊                 | 208/300 [07:04<03:24,  2.22s/it, KL=0.004, avgR=121.0, beta=0.0001, it_s=2.42]

Iter 0207 | avgR  121.03 ± 44.20 | KL 0.0038 (β=0.0001) | Lclip 0.0009 Lkl 0.0038 Lent 0.9655 | steps 4759155 eps 6656 | time 2.42s


GRPO:  70%|███████████████████████████████████████                 | 209/300 [07:06<03:24,  2.25s/it, KL=0.004, avgR=125.1, beta=0.0001, it_s=2.30]

Iter 0208 | avgR  125.10 ± 60.80 | KL 0.0035 (β=0.0001) | Lclip 0.0002 Lkl 0.0035 Lent 0.9700 | steps 4785459 eps 6688 | time 2.30s


GRPO:  70%|███████████████████████████████████████▏                | 210/300 [07:09<03:22,  2.25s/it, KL=0.004, avgR=125.2, beta=0.0001, it_s=2.24]

Iter 0209 | avgR  125.18 ± 76.38 | KL 0.0042 (β=0.0001) | Lclip 0.0015 Lkl 0.0042 Lent 0.9626 | steps 4811668 eps 6720 | time 2.24s


GRPO:  70%|███████████████████████████████████████▍                | 211/300 [07:11<03:24,  2.29s/it, KL=0.004, avgR=111.8, beta=0.0001, it_s=2.41]

Iter 0210 | avgR  111.77 ± 50.97 | KL 0.0035 (β=0.0001) | Lclip 0.0015 Lkl 0.0035 Lent 1.0323 | steps 4839253 eps 6752 | time 2.41s


GRPO:  71%|███████████████████████████████████████▌                | 212/300 [07:13<03:15,  2.23s/it, KL=0.004, avgR=131.6, beta=0.0001, it_s=2.06]

Iter 0211 | avgR  131.60 ± 72.05 | KL 0.0039 (β=0.0001) | Lclip 0.0009 Lkl 0.0039 Lent 0.9707 | steps 4863133 eps 6784 | time 2.06s


GRPO:  71%|███████████████████████████████████████▊                | 213/300 [07:15<03:11,  2.20s/it, KL=0.003, avgR=127.5, beta=0.0001, it_s=2.14]

Iter 0212 | avgR  127.48 ± 56.06 | KL 0.0035 (β=0.0001) | Lclip 0.0022 Lkl 0.0035 Lent 0.9971 | steps 4889275 eps 6816 | time 2.14s


GRPO:  71%|███████████████████████████████████████▉                | 214/300 [07:17<03:02,  2.13s/it, KL=0.006, avgR=140.8, beta=0.0001, it_s=1.95]

Iter 0213 | avgR  140.77 ± 62.62 | KL 0.0059 (β=0.0001) | Lclip 0.0031 Lkl 0.0059 Lent 0.9786 | steps 4912919 eps 6848 | time 1.95s


GRPO:  72%|████████████████████████████████████████▏               | 215/300 [07:19<03:04,  2.17s/it, KL=0.004, avgR=125.8, beta=0.0001, it_s=2.26]

Iter 0214 | avgR  125.82 ± 51.20 | KL 0.0035 (β=0.0001) | Lclip 0.0014 Lkl 0.0035 Lent 1.0303 | steps 4939856 eps 6880 | time 2.26s


GRPO:  72%|████████████████████████████████████████▎               | 216/300 [07:22<02:59,  2.13s/it, KL=0.004, avgR=138.1, beta=0.0001, it_s=2.05]

Iter 0215 | avgR  138.11 ± 64.96 | KL 0.0036 (β=0.0001) | Lclip 0.0012 Lkl 0.0036 Lent 0.9939 | steps 4963512 eps 6912 | time 2.05s


GRPO:  72%|████████████████████████████████████████▌               | 217/300 [07:24<02:59,  2.16s/it, KL=0.003, avgR=124.0, beta=0.0001, it_s=2.24]

Iter 0216 | avgR  123.97 ± 52.15 | KL 0.0035 (β=0.0001) | Lclip 0.0019 Lkl 0.0035 Lent 1.0015 | steps 4991489 eps 6944 | time 2.24s


GRPO:  73%|████████████████████████████████████████▋               | 218/300 [07:26<02:53,  2.11s/it, KL=0.007, avgR=124.2, beta=0.0001, it_s=1.99]

Iter 0217 | avgR  124.21 ± 62.23 | KL 0.0069 (β=0.0001) | Lclip 0.0022 Lkl 0.0069 Lent 1.0321 | steps 5016318 eps 6976 | time 1.99s


GRPO:  73%|████████████████████████████████████████▉               | 219/300 [07:28<02:55,  2.17s/it, KL=0.004, avgR=119.9, beta=0.0001, it_s=2.29]

Iter 0218 | avgR  119.90 ± 50.20 | KL 0.0043 (β=0.0001) | Lclip 0.0020 Lkl 0.0043 Lent 1.0107 | steps 5044596 eps 7008 | time 2.29s


GRPO:  73%|█████████████████████████████████████████               | 220/300 [07:30<02:57,  2.22s/it, KL=0.005, avgR=111.3, beta=0.0001, it_s=2.34]

Iter 0219 | avgR  111.26 ± 61.59 | KL 0.0046 (β=0.0001) | Lclip 0.0002 Lkl 0.0046 Lent 1.0180 | steps 5072320 eps 7040 | time 2.34s


GRPO:  74%|█████████████████████████████████████████▎              | 221/300 [07:33<02:59,  2.27s/it, KL=0.004, avgR=110.4, beta=0.0001, it_s=2.38]

Iter 0220 | avgR  110.41 ± 42.74 | KL 0.0040 (β=0.0001) | Lclip 0.0015 Lkl 0.0040 Lent 1.0463 | steps 5102033 eps 7072 | time 2.38s


GRPO:  74%|█████████████████████████████████████████▍              | 222/300 [07:35<02:59,  2.30s/it, KL=0.004, avgR=119.7, beta=0.0001, it_s=2.38]

Iter 0221 | avgR  119.70 ± 40.14 | KL 0.0039 (β=0.0001) | Lclip 0.0009 Lkl 0.0039 Lent 1.0300 | steps 5131847 eps 7104 | time 2.38s


GRPO:  74%|█████████████████████████████████████████▋              | 223/300 [07:38<02:59,  2.33s/it, KL=0.003, avgR=126.1, beta=0.0001, it_s=2.39]

Iter 0222 | avgR  126.13 ± 62.26 | KL 0.0032 (β=0.0001) | Lclip 0.0004 Lkl 0.0032 Lent 1.0168 | steps 5158388 eps 7136 | time 2.39s


GRPO:  75%|█████████████████████████████████████████▊              | 224/300 [07:40<02:56,  2.32s/it, KL=0.005, avgR=114.2, beta=0.0001, it_s=2.29]

Iter 0223 | avgR  114.19 ± 49.13 | KL 0.0046 (β=0.0001) | Lclip 0.0023 Lkl 0.0046 Lent 0.9962 | steps 5186590 eps 7168 | time 2.29s


GRPO:  75%|██████████████████████████████████████████              | 225/300 [07:42<02:57,  2.36s/it, KL=0.003, avgR=110.1, beta=0.0001, it_s=2.45]

Iter 0224 | avgR  110.14 ± 48.61 | KL 0.0034 (β=0.0001) | Lclip 0.0012 Lkl 0.0034 Lent 1.0185 | steps 5214032 eps 7200 | time 2.45s


GRPO:  75%|██████████████████████████████████████████▏             | 226/300 [07:45<02:53,  2.35s/it, KL=0.005, avgR=112.4, beta=0.0001, it_s=2.31]

Iter 0225 | avgR  112.35 ± 46.65 | KL 0.0049 (β=0.0001) | Lclip 0.0021 Lkl 0.0049 Lent 1.0093 | steps 5241218 eps 7232 | time 2.31s


GRPO:  76%|██████████████████████████████████████████▎             | 227/300 [07:47<02:52,  2.36s/it, KL=0.003, avgR=124.6, beta=0.0001, it_s=2.39]

Iter 0226 | avgR  124.57 ± 49.99 | KL 0.0033 (β=0.0001) | Lclip 0.0021 Lkl 0.0033 Lent 0.9686 | steps 5269371 eps 7264 | time 2.39s


GRPO:  76%|██████████████████████████████████████████▌             | 228/300 [07:49<02:45,  2.30s/it, KL=0.004, avgR=113.3, beta=0.0001, it_s=2.14]

Iter 0227 | avgR  113.27 ± 50.86 | KL 0.0045 (β=0.0001) | Lclip 0.0017 Lkl 0.0045 Lent 0.9765 | steps 5296241 eps 7296 | time 2.14s


GRPO:  76%|██████████████████████████████████████████▋             | 229/300 [07:51<02:40,  2.27s/it, KL=0.004, avgR=121.4, beta=0.0001, it_s=2.19]

Iter 0228 | avgR  121.36 ± 61.12 | KL 0.0041 (β=0.0001) | Lclip 0.0015 Lkl 0.0041 Lent 0.9461 | steps 5324041 eps 7328 | time 2.19s


GRPO:  77%|██████████████████████████████████████████▉             | 230/300 [07:53<02:34,  2.20s/it, KL=0.007, avgR=143.9, beta=0.0001, it_s=2.05]

Iter 0229 | avgR  143.91 ± 74.62 | KL 0.0070 (β=0.0001) | Lclip 0.0007 Lkl 0.0070 Lent 0.9555 | steps 5347816 eps 7360 | time 2.05s


GRPO:  77%|███████████████████████████████████████████             | 231/300 [07:56<02:34,  2.24s/it, KL=0.003, avgR=135.8, beta=0.0001, it_s=2.33]

Iter 0230 | avgR  135.79 ± 55.92 | KL 0.0032 (β=0.0001) | Lclip 0.0008 Lkl 0.0032 Lent 0.9215 | steps 5376622 eps 7392 | time 2.33s


GRPO:  77%|███████████████████████████████████████████▎            | 232/300 [07:58<02:33,  2.25s/it, KL=0.004, avgR=115.5, beta=0.0001, it_s=2.29]

Iter 0231 | avgR  115.51 ± 47.14 | KL 0.0039 (β=0.0001) | Lclip 0.0019 Lkl 0.0039 Lent 0.9727 | steps 5406338 eps 7424 | time 2.29s


GRPO:  78%|███████████████████████████████████████████▍            | 233/300 [08:00<02:27,  2.20s/it, KL=0.004, avgR=126.9, beta=0.0001, it_s=2.07]

Iter 0232 | avgR  126.87 ± 68.65 | KL 0.0042 (β=0.0001) | Lclip 0.0025 Lkl 0.0042 Lent 0.9612 | steps 5431178 eps 7456 | time 2.07s


GRPO:  78%|███████████████████████████████████████████▋            | 234/300 [08:03<02:28,  2.25s/it, KL=0.003, avgR=130.1, beta=0.0001, it_s=2.37]

Iter 0233 | avgR  130.05 ± 66.52 | KL 0.0034 (β=0.0001) | Lclip 0.0014 Lkl 0.0034 Lent 0.9719 | steps 5458125 eps 7488 | time 2.37s


GRPO:  78%|███████████████████████████████████████████▊            | 235/300 [08:05<02:25,  2.24s/it, KL=0.004, avgR=124.5, beta=0.0001, it_s=2.22]

Iter 0234 | avgR  124.46 ± 58.18 | KL 0.0038 (β=0.0001) | Lclip 0.0016 Lkl 0.0038 Lent 0.9771 | steps 5486435 eps 7520 | time 2.22s


GRPO:  79%|████████████████████████████████████████████            | 236/300 [08:07<02:21,  2.21s/it, KL=0.004, avgR=127.0, beta=0.0001, it_s=2.14]

Iter 0235 | avgR  127.05 ± 68.72 | KL 0.0041 (β=0.0001) | Lclip 0.0014 Lkl 0.0041 Lent 0.9360 | steps 5512197 eps 7552 | time 2.14s


GRPO:  79%|████████████████████████████████████████████▏           | 237/300 [08:09<02:24,  2.30s/it, KL=0.003, avgR=119.5, beta=0.0001, it_s=2.50]

Iter 0236 | avgR  119.51 ± 64.66 | KL 0.0033 (β=0.0001) | Lclip 0.0007 Lkl 0.0033 Lent 0.9363 | steps 5538317 eps 7584 | time 2.50s


GRPO:  79%|████████████████████████████████████████████▍           | 238/300 [08:11<02:16,  2.20s/it, KL=0.004, avgR=108.4, beta=0.0001, it_s=1.97]

Iter 0237 | avgR  108.41 ± 62.83 | KL 0.0043 (β=0.0001) | Lclip 0.0023 Lkl 0.0043 Lent 0.9419 | steps 5561884 eps 7616 | time 1.97s


GRPO:  80%|████████████████████████████████████████████▌           | 239/300 [08:14<02:15,  2.22s/it, KL=0.003, avgR=121.1, beta=0.0001, it_s=2.25]

Iter 0238 | avgR  121.08 ± 59.78 | KL 0.0032 (β=0.0001) | Lclip 0.0008 Lkl 0.0032 Lent 0.9671 | steps 5588407 eps 7648 | time 2.25s


GRPO:  80%|████████████████████████████████████████████▊           | 240/300 [08:15<02:06,  2.11s/it, KL=0.004, avgR=141.3, beta=0.0001, it_s=1.87]

Iter 0239 | avgR  141.29 ± 75.73 | KL 0.0036 (β=0.0001) | Lclip 0.0015 Lkl 0.0036 Lent 0.9353 | steps 5610610 eps 7680 | time 1.87s


GRPO:  80%|████████████████████████████████████████████▉           | 241/300 [08:18<02:09,  2.19s/it, KL=0.004, avgR=109.1, beta=0.0001, it_s=2.37]

Iter 0240 | avgR  109.07 ± 49.38 | KL 0.0035 (β=0.0001) | Lclip 0.0014 Lkl 0.0035 Lent 0.9527 | steps 5640851 eps 7712 | time 2.37s


GRPO:  81%|█████████████████████████████████████████████▏          | 242/300 [08:20<02:11,  2.27s/it, KL=0.005, avgR=122.7, beta=0.0001, it_s=2.47]

Iter 0241 | avgR  122.75 ± 56.96 | KL 0.0049 (β=0.0001) | Lclip 0.0019 Lkl 0.0049 Lent 0.9410 | steps 5669052 eps 7744 | time 2.47s


GRPO:  81%|█████████████████████████████████████████████▎          | 243/300 [08:23<02:09,  2.28s/it, KL=0.003, avgR=119.0, beta=0.0001, it_s=2.27]

Iter 0242 | avgR  119.04 ± 63.60 | KL 0.0027 (β=0.0001) | Lclip 0.0001 Lkl 0.0027 Lent 0.9278 | steps 5695255 eps 7776 | time 2.27s


GRPO:  81%|█████████████████████████████████████████████▌          | 244/300 [08:25<02:07,  2.28s/it, KL=0.003, avgR=119.2, beta=0.0001, it_s=2.28]

Iter 0243 | avgR  119.22 ± 61.13 | KL 0.0032 (β=0.0001) | Lclip 0.0024 Lkl 0.0032 Lent 0.9093 | steps 5722376 eps 7808 | time 2.28s


GRPO:  82%|█████████████████████████████████████████████▋          | 245/300 [08:27<02:05,  2.28s/it, KL=0.004, avgR=124.0, beta=0.0001, it_s=2.28]

Iter 0244 | avgR  124.01 ± 52.85 | KL 0.0044 (β=0.0001) | Lclip 0.0012 Lkl 0.0044 Lent 0.9756 | steps 5750780 eps 7840 | time 2.28s


GRPO:  82%|█████████████████████████████████████████████▉          | 246/300 [08:29<02:00,  2.24s/it, KL=0.004, avgR=136.3, beta=0.0001, it_s=2.13]

Iter 0245 | avgR  136.26 ± 56.96 | KL 0.0041 (β=0.0001) | Lclip 0.0012 Lkl 0.0041 Lent 0.9716 | steps 5776193 eps 7872 | time 2.13s


GRPO:  82%|██████████████████████████████████████████████          | 247/300 [08:31<01:57,  2.22s/it, KL=0.005, avgR=113.0, beta=0.0001, it_s=2.19]

Iter 0246 | avgR  112.97 ± 64.58 | KL 0.0047 (β=0.0001) | Lclip 0.0029 Lkl 0.0047 Lent 0.9366 | steps 5802542 eps 7904 | time 2.19s


GRPO:  83%|██████████████████████████████████████████████▎         | 248/300 [08:34<01:56,  2.25s/it, KL=0.004, avgR=128.5, beta=0.0001, it_s=2.31]

Iter 0247 | avgR  128.53 ± 58.27 | KL 0.0044 (β=0.0001) | Lclip 0.0016 Lkl 0.0044 Lent 0.9679 | steps 5829020 eps 7936 | time 2.31s


GRPO:  83%|██████████████████████████████████████████████▍         | 249/300 [08:36<01:51,  2.18s/it, KL=0.004, avgR=141.9, beta=0.0001, it_s=2.02]

Iter 0248 | avgR  141.87 ± 69.10 | KL 0.0045 (β=0.0001) | Lclip 0.0015 Lkl 0.0045 Lent 0.9315 | steps 5853368 eps 7968 | time 2.02s


GRPO:  83%|██████████████████████████████████████████████▋         | 250/300 [08:38<01:47,  2.15s/it, KL=0.005, avgR=142.7, beta=0.0001, it_s=2.08]

Iter 0249 | avgR  142.73 ± 68.30 | KL 0.0051 (β=0.0001) | Lclip 0.0020 Lkl 0.0051 Lent 0.9056 | steps 5877672 eps 8000 | time 2.08s


GRPO:  84%|██████████████████████████████████████████████▊         | 251/300 [08:40<01:44,  2.12s/it, KL=0.003, avgR=119.3, beta=0.0001, it_s=2.05]

Iter 0250 | avgR  119.26 ± 67.22 | KL 0.0030 (β=0.0001) | Lclip 0.0013 Lkl 0.0030 Lent 0.9150 | steps 5900765 eps 8032 | time 2.05s


GRPO:  84%|███████████████████████████████████████████████         | 252/300 [08:42<01:41,  2.12s/it, KL=0.003, avgR=120.5, beta=0.0001, it_s=2.09]

Iter 0251 | avgR  120.51 ± 60.31 | KL 0.0029 (β=0.0001) | Lclip 0.0013 Lkl 0.0029 Lent 0.8955 | steps 5926264 eps 8064 | time 2.09s


GRPO:  84%|███████████████████████████████████████████████▏        | 253/300 [08:44<01:35,  2.03s/it, KL=0.004, avgR=155.2, beta=0.0001, it_s=1.82]

Iter 0252 | avgR  155.18 ± 70.19 | KL 0.0037 (β=0.0001) | Lclip 0.0016 Lkl 0.0037 Lent 0.8710 | steps 5946078 eps 8096 | time 1.82s


GRPO:  85%|███████████████████████████████████████████████▍        | 254/300 [08:46<01:35,  2.07s/it, KL=0.003, avgR=131.4, beta=0.0001, it_s=2.17]

Iter 0253 | avgR  131.38 ± 64.21 | KL 0.0033 (β=0.0001) | Lclip 0.0018 Lkl 0.0033 Lent 0.8932 | steps 5971737 eps 8128 | time 2.17s


GRPO:  85%|████████████████████████████████████████████████▍        | 255/300 [08:49<01:41,  2.25s/it, KL=0.003, avgR=99.6, beta=0.0001, it_s=2.65]

Iter 0254 | avgR   99.58 ± 58.44 | KL 0.0034 (β=0.0001) | Lclip 0.0009 Lkl 0.0034 Lent 0.9194 | steps 5997833 eps 8160 | time 2.65s


GRPO:  85%|███████████████████████████████████████████████▊        | 256/300 [08:51<01:38,  2.24s/it, KL=0.003, avgR=114.7, beta=0.0001, it_s=2.22]

Iter 0255 | avgR  114.74 ± 62.24 | KL 0.0033 (β=0.0001) | Lclip 0.0005 Lkl 0.0033 Lent 0.9149 | steps 6025086 eps 8192 | time 2.22s


GRPO:  86%|███████████████████████████████████████████████▉        | 257/300 [08:53<01:32,  2.15s/it, KL=0.004, avgR=146.7, beta=0.0001, it_s=1.94]

Iter 0256 | avgR  146.69 ± 68.29 | KL 0.0038 (β=0.0001) | Lclip 0.0009 Lkl 0.0038 Lent 0.8850 | steps 6048451 eps 8224 | time 1.94s


GRPO:  86%|████████████████████████████████████████████████▏       | 258/300 [08:55<01:27,  2.08s/it, KL=0.003, avgR=142.4, beta=0.0001, it_s=1.92]

Iter 0257 | avgR  142.39 ± 66.77 | KL 0.0032 (β=0.0001) | Lclip 0.0006 Lkl 0.0032 Lent 0.8930 | steps 6072008 eps 8256 | time 1.92s


GRPO:  86%|████████████████████████████████████████████████▎       | 259/300 [08:57<01:24,  2.06s/it, KL=0.004, avgR=133.8, beta=0.0001, it_s=2.00]

Iter 0258 | avgR  133.81 ± 72.22 | KL 0.0037 (β=0.0001) | Lclip 0.0015 Lkl 0.0037 Lent 0.9128 | steps 6095809 eps 8288 | time 2.00s


GRPO:  87%|████████████████████████████████████████████████▌       | 260/300 [08:59<01:22,  2.06s/it, KL=0.003, avgR=126.1, beta=0.0001, it_s=2.07]

Iter 0259 | avgR  126.08 ± 59.61 | KL 0.0034 (β=0.0001) | Lclip 0.0008 Lkl 0.0034 Lent 0.9635 | steps 6120650 eps 8320 | time 2.07s


GRPO:  87%|████████████████████████████████████████████████▋       | 261/300 [09:01<01:23,  2.14s/it, KL=0.004, avgR=139.6, beta=0.0001, it_s=2.31]

Iter 0260 | avgR  139.57 ± 60.76 | KL 0.0038 (β=0.0001) | Lclip 0.0023 Lkl 0.0038 Lent 0.9296 | steps 6144201 eps 8352 | time 2.31s


GRPO:  87%|████████████████████████████████████████████████▉       | 262/300 [09:03<01:22,  2.18s/it, KL=0.004, avgR=139.4, beta=0.0001, it_s=2.28]

Iter 0261 | avgR  139.42 ± 53.09 | KL 0.0036 (β=0.0001) | Lclip 0.0004 Lkl 0.0036 Lent 0.9695 | steps 6170899 eps 8384 | time 2.28s


GRPO:  88%|█████████████████████████████████████████████████       | 263/300 [09:06<01:19,  2.15s/it, KL=0.004, avgR=141.5, beta=0.0001, it_s=2.06]

Iter 0262 | avgR  141.50 ± 65.27 | KL 0.0036 (β=0.0001) | Lclip 0.0010 Lkl 0.0036 Lent 0.9557 | steps 6195638 eps 8416 | time 2.06s


GRPO:  88%|█████████████████████████████████████████████████▎      | 264/300 [09:08<01:18,  2.17s/it, KL=0.003, avgR=129.1, beta=0.0001, it_s=2.23]

Iter 0263 | avgR  129.07 ± 74.17 | KL 0.0033 (β=0.0001) | Lclip 0.0007 Lkl 0.0033 Lent 0.9497 | steps 6219804 eps 8448 | time 2.23s


GRPO:  88%|█████████████████████████████████████████████████▍      | 265/300 [09:10<01:14,  2.12s/it, KL=0.003, avgR=133.8, beta=0.0001, it_s=2.00]

Iter 0264 | avgR  133.81 ± 64.31 | KL 0.0028 (β=0.0001) | Lclip 0.0013 Lkl 0.0028 Lent 0.9467 | steps 6244325 eps 8480 | time 2.00s


GRPO:  89%|█████████████████████████████████████████████████▋      | 266/300 [09:12<01:10,  2.07s/it, KL=0.005, avgR=149.1, beta=0.0001, it_s=1.96]

Iter 0265 | avgR  149.14 ± 71.23 | KL 0.0050 (β=0.0001) | Lclip 0.0028 Lkl 0.0050 Lent 0.9069 | steps 6266268 eps 8512 | time 1.96s


GRPO:  89%|█████████████████████████████████████████████████▊      | 267/300 [09:14<01:06,  2.01s/it, KL=0.005, avgR=145.7, beta=0.0001, it_s=1.87]

Iter 0266 | avgR  145.69 ± 70.23 | KL 0.0051 (β=0.0001) | Lclip 0.0027 Lkl 0.0051 Lent 0.9319 | steps 6289140 eps 8544 | time 1.87s


GRPO:  89%|██████████████████████████████████████████████████      | 268/300 [09:16<01:04,  2.02s/it, KL=0.004, avgR=127.7, beta=0.0001, it_s=2.02]

Iter 0267 | avgR  127.67 ± 69.93 | KL 0.0041 (β=0.0001) | Lclip 0.0007 Lkl 0.0041 Lent 0.9372 | steps 6313242 eps 8576 | time 2.02s


GRPO:  90%|██████████████████████████████████████████████████▏     | 269/300 [09:18<01:02,  2.00s/it, KL=0.004, avgR=147.3, beta=0.0001, it_s=1.97]

Iter 0268 | avgR  147.32 ± 67.01 | KL 0.0036 (β=0.0001) | Lclip 0.0015 Lkl 0.0036 Lent 0.8721 | steps 6337127 eps 8608 | time 1.97s


GRPO:  90%|██████████████████████████████████████████████████▍     | 270/300 [09:20<01:00,  2.01s/it, KL=0.004, avgR=153.3, beta=0.0001, it_s=2.01]

Iter 0269 | avgR  153.27 ± 64.32 | KL 0.0036 (β=0.0001) | Lclip 0.0009 Lkl 0.0036 Lent 0.9495 | steps 6360883 eps 8640 | time 2.01s


GRPO:  90%|██████████████████████████████████████████████████▌     | 271/300 [09:22<01:01,  2.11s/it, KL=0.003, avgR=136.5, beta=0.0001, it_s=2.33]

Iter 0270 | avgR  136.52 ± 58.73 | KL 0.0034 (β=0.0001) | Lclip 0.0018 Lkl 0.0034 Lent 0.8804 | steps 6386460 eps 8672 | time 2.33s


GRPO:  91%|██████████████████████████████████████████████████▊     | 272/300 [09:24<00:58,  2.09s/it, KL=0.005, avgR=142.3, beta=0.0001, it_s=2.06]

Iter 0271 | avgR  142.29 ± 65.30 | KL 0.0053 (β=0.0001) | Lclip 0.0023 Lkl 0.0053 Lent 0.9531 | steps 6411057 eps 8704 | time 2.06s


GRPO:  91%|██████████████████████████████████████████████████▉     | 273/300 [09:26<00:57,  2.14s/it, KL=0.004, avgR=122.9, beta=0.0001, it_s=2.24]

Iter 0272 | avgR  122.89 ± 59.23 | KL 0.0039 (β=0.0001) | Lclip 0.0010 Lkl 0.0039 Lent 0.9243 | steps 6438548 eps 8736 | time 2.24s


GRPO:  91%|███████████████████████████████████████████████████▏    | 274/300 [09:29<00:56,  2.17s/it, KL=0.002, avgR=112.5, beta=0.0001, it_s=2.25]

Iter 0273 | avgR  112.47 ± 62.44 | KL 0.0025 (β=0.0001) | Lclip 0.0004 Lkl 0.0025 Lent 0.9285 | steps 6465447 eps 8768 | time 2.25s


GRPO:  92%|████████████████████████████████████████████████████▎    | 275/300 [09:31<00:54,  2.19s/it, KL=0.004, avgR=96.1, beta=0.0001, it_s=2.23]

Iter 0274 | avgR   96.06 ± 73.46 | KL 0.0042 (β=0.0001) | Lclip 0.0017 Lkl 0.0042 Lent 0.9746 | steps 6492906 eps 8800 | time 2.23s


GRPO:  92%|███████████████████████████████████████████████████▌    | 276/300 [09:33<00:53,  2.24s/it, KL=0.004, avgR=102.1, beta=0.0001, it_s=2.35]

Iter 0275 | avgR  102.13 ± 46.55 | KL 0.0044 (β=0.0001) | Lclip 0.0010 Lkl 0.0044 Lent 0.9929 | steps 6523919 eps 8832 | time 2.35s


GRPO:  92%|███████████████████████████████████████████████████▋    | 277/300 [09:35<00:49,  2.17s/it, KL=0.005, avgR=114.9, beta=0.0001, it_s=2.02]

Iter 0276 | avgR  114.86 ± 57.25 | KL 0.0045 (β=0.0001) | Lclip 0.0014 Lkl 0.0045 Lent 0.9909 | steps 6549848 eps 8864 | time 2.02s


GRPO:  93%|███████████████████████████████████████████████████▉    | 278/300 [09:37<00:46,  2.09s/it, KL=0.002, avgR=132.3, beta=0.0001, it_s=1.90]

Iter 0277 | avgR  132.32 ± 69.53 | KL 0.0023 (β=0.0001) | Lclip 0.0006 Lkl 0.0023 Lent 0.9271 | steps 6573870 eps 8896 | time 1.90s


GRPO:  93%|████████████████████████████████████████████████████    | 279/300 [09:39<00:44,  2.10s/it, KL=0.005, avgR=132.3, beta=0.0001, it_s=2.12]

Iter 0278 | avgR  132.30 ± 67.45 | KL 0.0051 (β=0.0001) | Lclip 0.0022 Lkl 0.0051 Lent 0.9825 | steps 6598840 eps 8928 | time 2.12s


GRPO:  93%|████████████████████████████████████████████████████▎   | 280/300 [09:41<00:41,  2.06s/it, KL=0.005, avgR=147.3, beta=0.0001, it_s=1.94]

Iter 0279 | avgR  147.33 ± 63.42 | KL 0.0055 (β=0.0001) | Lclip -0.0015 Lkl 0.0055 Lent 0.9530 | steps 6622446 eps 8960 | time 1.94s


GRPO:  94%|████████████████████████████████████████████████████▍   | 281/300 [09:43<00:39,  2.09s/it, KL=0.004, avgR=142.3, beta=0.0001, it_s=2.16]

Iter 0280 | avgR  142.34 ± 61.69 | KL 0.0035 (β=0.0001) | Lclip 0.0009 Lkl 0.0035 Lent 0.9702 | steps 6648368 eps 8992 | time 2.16s


GRPO:  94%|████████████████████████████████████████████████████▋   | 282/300 [09:45<00:38,  2.12s/it, KL=0.003, avgR=117.7, beta=0.0001, it_s=2.20]

Iter 0281 | avgR  117.72 ± 66.36 | KL 0.0035 (β=0.0001) | Lclip 0.0013 Lkl 0.0035 Lent 0.8807 | steps 6675430 eps 9024 | time 2.20s


GRPO:  94%|████████████████████████████████████████████████████▊   | 283/300 [09:47<00:34,  2.04s/it, KL=0.004, avgR=158.4, beta=0.0001, it_s=1.85]

Iter 0282 | avgR  158.38 ± 56.57 | KL 0.0037 (β=0.0001) | Lclip 0.0015 Lkl 0.0037 Lent 0.8765 | steps 6697196 eps 9056 | time 1.85s


GRPO:  95%|█████████████████████████████████████████████████████   | 284/300 [09:50<00:33,  2.12s/it, KL=0.005, avgR=135.2, beta=0.0001, it_s=2.31]

Iter 0283 | avgR  135.20 ± 53.79 | KL 0.0048 (β=0.0001) | Lclip 0.0019 Lkl 0.0048 Lent 0.9822 | steps 6723949 eps 9088 | time 2.31s


GRPO:  95%|█████████████████████████████████████████████████████▏  | 285/300 [09:52<00:32,  2.18s/it, KL=0.004, avgR=144.5, beta=0.0001, it_s=2.31]

Iter 0284 | avgR  144.54 ± 60.23 | KL 0.0037 (β=0.0001) | Lclip 0.0014 Lkl 0.0037 Lent 0.9346 | steps 6750759 eps 9120 | time 2.31s


GRPO:  95%|█████████████████████████████████████████████████████▍  | 286/300 [09:54<00:30,  2.16s/it, KL=0.004, avgR=128.5, beta=0.0001, it_s=2.10]

Iter 0285 | avgR  128.50 ± 60.74 | KL 0.0036 (β=0.0001) | Lclip 0.0005 Lkl 0.0036 Lent 0.9427 | steps 6776705 eps 9152 | time 2.10s


GRPO:  96%|█████████████████████████████████████████████████████▌  | 287/300 [09:56<00:29,  2.24s/it, KL=0.004, avgR=126.9, beta=0.0001, it_s=2.44]

Iter 0286 | avgR  126.92 ± 53.55 | KL 0.0041 (β=0.0001) | Lclip 0.0012 Lkl 0.0041 Lent 1.0032 | steps 6804248 eps 9184 | time 2.44s


GRPO:  96%|█████████████████████████████████████████████████████▊  | 288/300 [09:59<00:27,  2.28s/it, KL=0.003, avgR=126.9, beta=0.0001, it_s=2.38]

Iter 0287 | avgR  126.94 ± 56.44 | KL 0.0034 (β=0.0001) | Lclip 0.0016 Lkl 0.0034 Lent 0.9538 | steps 6832592 eps 9216 | time 2.38s


GRPO:  96%|█████████████████████████████████████████████████████▉  | 289/300 [10:01<00:24,  2.20s/it, KL=0.004, avgR=155.7, beta=0.0001, it_s=1.99]

Iter 0288 | avgR  155.75 ± 68.90 | KL 0.0037 (β=0.0001) | Lclip 0.0009 Lkl 0.0037 Lent 0.9067 | steps 6856608 eps 9248 | time 1.99s


GRPO:  97%|██████████████████████████████████████████████████████▏ | 290/300 [10:03<00:22,  2.28s/it, KL=0.004, avgR=119.5, beta=0.0001, it_s=2.48]

Iter 0289 | avgR  119.48 ± 44.04 | KL 0.0042 (β=0.0001) | Lclip 0.0015 Lkl 0.0042 Lent 1.0451 | steps 6885006 eps 9280 | time 2.48s


GRPO:  97%|██████████████████████████████████████████████████████▎ | 291/300 [10:06<00:21,  2.38s/it, KL=0.004, avgR=134.3, beta=0.0001, it_s=2.62]

Iter 0290 | avgR  134.31 ± 54.60 | KL 0.0035 (β=0.0001) | Lclip 0.0006 Lkl 0.0035 Lent 1.0161 | steps 6912804 eps 9312 | time 2.62s


GRPO:  97%|██████████████████████████████████████████████████████▌ | 292/300 [10:08<00:18,  2.36s/it, KL=0.005, avgR=128.4, beta=0.0001, it_s=2.30]

Iter 0291 | avgR  128.37 ± 51.06 | KL 0.0046 (β=0.0001) | Lclip 0.0017 Lkl 0.0046 Lent 0.9804 | steps 6941604 eps 9344 | time 2.30s


GRPO:  98%|██████████████████████████████████████████████████████▋ | 293/300 [10:10<00:16,  2.32s/it, KL=0.006, avgR=135.1, beta=0.0001, it_s=2.22]

Iter 0292 | avgR  135.09 ± 60.46 | KL 0.0061 (β=0.0001) | Lclip 0.0006 Lkl 0.0061 Lent 1.0605 | steps 6967951 eps 9376 | time 2.22s


GRPO:  98%|██████████████████████████████████████████████████████▉ | 294/300 [10:13<00:13,  2.23s/it, KL=0.005, avgR=154.9, beta=0.0001, it_s=2.01]

Iter 0293 | avgR  154.86 ± 72.35 | KL 0.0053 (β=0.0001) | Lclip 0.0026 Lkl 0.0053 Lent 0.9709 | steps 6991686 eps 9408 | time 2.01s


GRPO:  98%|███████████████████████████████████████████████████████ | 295/300 [10:15<00:11,  2.25s/it, KL=0.004, avgR=135.4, beta=0.0001, it_s=2.29]

Iter 0294 | avgR  135.41 ± 58.30 | KL 0.0045 (β=0.0001) | Lclip 0.0012 Lkl 0.0045 Lent 1.0154 | steps 7019364 eps 9440 | time 2.29s


GRPO:  99%|███████████████████████████████████████████████████████▎| 296/300 [10:17<00:08,  2.25s/it, KL=0.004, avgR=134.5, beta=0.0001, it_s=2.25]

Iter 0295 | avgR  134.54 ± 49.07 | KL 0.0044 (β=0.0001) | Lclip 0.0014 Lkl 0.0044 Lent 1.0339 | steps 7048560 eps 9472 | time 2.25s


GRPO:  99%|███████████████████████████████████████████████████████▍| 297/300 [10:19<00:06,  2.25s/it, KL=0.004, avgR=129.7, beta=0.0001, it_s=2.24]

Iter 0296 | avgR  129.67 ± 56.90 | KL 0.0036 (β=0.0001) | Lclip 0.0007 Lkl 0.0036 Lent 0.9758 | steps 7076996 eps 9504 | time 2.24s


GRPO:  99%|███████████████████████████████████████████████████████▋| 298/300 [10:21<00:04,  2.18s/it, KL=0.003, avgR=143.6, beta=0.0001, it_s=2.01]

Iter 0297 | avgR  143.59 ± 68.99 | KL 0.0030 (β=0.0001) | Lclip 0.0017 Lkl 0.0030 Lent 0.9266 | steps 7101244 eps 9536 | time 2.01s


GRPO: 100%|███████████████████████████████████████████████████████▊| 299/300 [10:23<00:02,  2.10s/it, KL=0.004, avgR=140.3, beta=0.0001, it_s=1.92]

Iter 0298 | avgR  140.32 ± 69.37 | KL 0.0036 (β=0.0001) | Lclip 0.0007 Lkl 0.0036 Lent 0.9437 | steps 7124222 eps 9568 | time 1.92s


GRPO: 100%|████████████████████████████████████████████████████████| 300/300 [10:26<00:00,  2.09s/it, KL=0.004, avgR=131.1, beta=0.0001, it_s=2.47]

Iter 0299 | avgR  131.13 ± 53.75 | KL 0.0039 (β=0.0001) | Lclip 0.0014 Lkl 0.0039 Lent 0.9404 | steps 7152469 eps 9600 | time 2.47s





In [7]:
from dense_scripts.utils import record_videos

device = "cuda"
env_id = "LunarLander-v3"
pi_star.to(device)
record_videos(pi_star, env_id, video_dir=f"videos/GRPO/gamma_{gamma}", episodes=6, device=device)



✅ Saved MP4 video: videos/GRPO/gamma_0.98/LunarLander-v3_ep01_R231.5.mp4 | Reward: 231.5




✅ Saved MP4 video: videos/GRPO/gamma_0.98/LunarLander-v3_ep02_R235.5.mp4 | Reward: 235.5




✅ Saved MP4 video: videos/GRPO/gamma_0.98/LunarLander-v3_ep03_R232.1.mp4 | Reward: 232.1




✅ Saved MP4 video: videos/GRPO/gamma_0.98/LunarLander-v3_ep04_R258.4.mp4 | Reward: 258.4




✅ Saved MP4 video: videos/GRPO/gamma_0.98/LunarLander-v3_ep05_R128.2.mp4 | Reward: 128.2




✅ Saved MP4 video: videos/GRPO/gamma_0.98/LunarLander-v3_ep06_R227.7.mp4 | Reward: 227.7


In [3]:
from dense_scripts.GRPO.grpo import PerStepAdvGRPO, GRPOConfig
from dense_scripts.utils.policies import SimpleGRPOPolicy
from dense_scripts.utils import record_videos

gamma = 0.98
G=32
env_id = "LunarLander-v3"
cfg = GRPOConfig(
    env_id="LunarLander-v3",
    G=G, T=1024, epochs=8, minibatches=32, gamma=gamma,
    ent_coef=0.01, beta_kl=0.02, target_kl=0.015,
    n_workers=24, log_dir="./runs/GRPO_Lander"
)

pi = SimpleGRPOPolicy(obs_dim=8, act_dim=4, hidden=128)

trainer = PerStepAdvGRPO(policy=pi, config=cfg, device="cpu")
pi_star = trainer.train(iters=300)

device = "cuda"
pi_star.to(device)
record_videos(pi_star, env_id, video_dir=f"videos/GRPO/gamma_{gamma}_{G}", episodes=12, device=device)

GRPO:   0%|▏                                                        | 1/300 [00:01<06:32,  1.31s/it, KL=0.010, avgR=-132.2, beta=0.0133, it_s=1.31]

Iter 0000 | avgR -132.17 ± 76.77 | KL 0.0098 (β=0.01333) | Lclip 0.0071 Lkl 0.0098 Lent 1.3735 | steps 2838 eps 32 | time 1.31s


GRPO:   1%|▎                                                       | 2/300 [00:02<06:24,  1.29s/it, KL=0.007, avgR=-109.5, beta=0.00889, it_s=1.27]

Iter 0001 | avgR -109.48 ± 35.21 | KL 0.0072 (β=0.008889) | Lclip 0.0061 Lkl 0.0072 Lent 1.3506 | steps 5592 eps 64 | time 1.27s


GRPO:   1%|▌                                                       | 3/300 [00:03<06:07,  1.24s/it, KL=0.010, avgR=-107.2, beta=0.00889, it_s=1.17]

Iter 0002 | avgR -107.19 ± 35.43 | KL 0.0103 (β=0.008889) | Lclip 0.0102 Lkl 0.0103 Lent 1.3426 | steps 8707 eps 96 | time 1.17s


GRPO:   1%|▋                                                       | 4/300 [00:04<06:00,  1.22s/it, KL=0.009, avgR=-108.2, beta=0.00593, it_s=1.19]

Iter 0003 | avgR -108.24 ± 51.13 | KL 0.0094 (β=0.005926) | Lclip 0.0084 Lkl 0.0094 Lent 1.3167 | steps 11994 eps 128 | time 1.19s


GRPO:   2%|▉                                                        | 5/300 [00:06<05:59,  1.22s/it, KL=0.007, avgR=-96.1, beta=0.00395, it_s=1.22]

Iter 0004 | avgR  -96.11 ± 35.86 | KL 0.0073 (β=0.003951) | Lclip 0.0059 Lkl 0.0073 Lent 1.3233 | steps 15312 eps 160 | time 1.22s


GRPO:   2%|█                                                       | 6/300 [00:07<05:57,  1.22s/it, KL=0.010, avgR=-140.3, beta=0.00263, it_s=1.21]

Iter 0005 | avgR -140.26 ± 85.05 | KL 0.0099 (β=0.002634) | Lclip 0.0094 Lkl 0.0099 Lent 1.2940 | steps 18824 eps 192 | time 1.21s


GRPO:   2%|█▎                                                      | 7/300 [00:08<05:58,  1.22s/it, KL=0.008, avgR=-138.3, beta=0.00176, it_s=1.24]

Iter 0006 | avgR -138.33 ± 86.81 | KL 0.0075 (β=0.001756) | Lclip 0.0105 Lkl 0.0075 Lent 1.2709 | steps 22619 eps 224 | time 1.24s


GRPO:   3%|█▍                                                      | 8/300 [00:09<05:59,  1.23s/it, KL=0.012, avgR=-102.7, beta=0.00176, it_s=1.24]

Iter 0007 | avgR -102.66 ± 64.12 | KL 0.0123 (β=0.001756) | Lclip 0.0102 Lkl 0.0123 Lent 1.2115 | steps 26068 eps 256 | time 1.24s


GRPO:   3%|█▋                                                      | 9/300 [00:11<06:31,  1.34s/it, KL=0.008, avgR=-132.7, beta=0.00117, it_s=1.59]

Iter 0008 | avgR -132.66 ± 70.18 | KL 0.0076 (β=0.001171) | Lclip 0.0064 Lkl 0.0076 Lent 1.2358 | steps 31716 eps 288 | time 1.59s


GRPO:   3%|█▊                                                      | 10/300 [00:12<06:44,  1.39s/it, KL=0.009, avgR=-50.8, beta=0.00078, it_s=1.50]

Iter 0009 | avgR  -50.76 ± 39.98 | KL 0.0093 (β=0.0007804) | Lclip 0.0103 Lkl 0.0093 Lent 1.2296 | steps 37387 eps 320 | time 1.50s


GRPO:   4%|██                                                      | 11/300 [00:14<07:00,  1.46s/it, KL=0.007, avgR=-30.1, beta=0.00052, it_s=1.60]

Iter 0010 | avgR  -30.05 ± 47.25 | KL 0.0074 (β=0.0005202) | Lclip 0.0070 Lkl 0.0074 Lent 1.2269 | steps 42892 eps 352 | time 1.60s


GRPO:   4%|██▏                                                    | 12/300 [00:16<07:01,  1.46s/it, KL=0.007, avgR=-18.9, beta=0.000347, it_s=1.48]

Iter 0011 | avgR  -18.95 ± 34.04 | KL 0.0066 (β=0.0003468) | Lclip 0.0067 Lkl 0.0066 Lent 1.2100 | steps 47693 eps 384 | time 1.48s


GRPO:   4%|██▍                                                    | 13/300 [00:17<07:06,  1.49s/it, KL=0.011, avgR=-11.0, beta=0.000347, it_s=1.54]

Iter 0012 | avgR  -10.96 ± 39.85 | KL 0.0113 (β=0.0003468) | Lclip 0.0087 Lkl 0.0113 Lent 1.1972 | steps 53062 eps 416 | time 1.54s


GRPO:   5%|██▌                                                     | 14/300 [00:19<07:18,  1.53s/it, KL=0.006, avgR=-4.2, beta=0.000231, it_s=1.63]

Iter 0013 | avgR   -4.24 ± 38.26 | KL 0.0063 (β=0.0002312) | Lclip 0.0028 Lkl 0.0063 Lent 1.1339 | steps 58692 eps 448 | time 1.63s


GRPO:   5%|██▊                                                      | 15/300 [00:20<07:25,  1.56s/it, KL=0.009, avgR=2.4, beta=0.000154, it_s=1.63]

Iter 0014 | avgR    2.39 ± 33.55 | KL 0.0094 (β=0.0001541) | Lclip 0.0051 Lkl 0.0094 Lent 1.1302 | steps 64375 eps 480 | time 1.63s


GRPO:   5%|██▉                                                     | 16/300 [00:22<07:31,  1.59s/it, KL=0.006, avgR=-8.0, beta=0.000103, it_s=1.65]

Iter 0015 | avgR   -8.02 ± 42.30 | KL 0.0061 (β=0.0001028) | Lclip 0.0039 Lkl 0.0061 Lent 1.1873 | steps 75214 eps 512 | time 1.65s


GRPO:   6%|███▎                                                       | 17/300 [00:24<07:55,  1.68s/it, KL=0.005, avgR=0.6, beta=0.0001, it_s=1.89]

Iter 0016 | avgR    0.63 ± 50.71 | KL 0.0052 (β=0.0001) | Lclip 0.0027 Lkl 0.0052 Lent 1.2197 | steps 88656 eps 544 | time 1.89s


GRPO:   6%|███▍                                                      | 18/300 [00:26<08:05,  1.72s/it, KL=0.005, avgR=10.4, beta=0.0001, it_s=1.81]

Iter 0017 | avgR   10.42 ± 72.32 | KL 0.0054 (β=0.0001) | Lclip 0.0034 Lkl 0.0054 Lent 1.2194 | steps 104192 eps 576 | time 1.81s


GRPO:   6%|███▋                                                      | 19/300 [00:27<08:05,  1.73s/it, KL=0.006, avgR=12.4, beta=0.0001, it_s=1.74]

Iter 0018 | avgR   12.43 ± 54.28 | KL 0.0056 (β=0.0001) | Lclip 0.0035 Lkl 0.0056 Lent 1.2267 | steps 116316 eps 608 | time 1.74s


GRPO:   7%|███▊                                                      | 20/300 [00:29<07:59,  1.71s/it, KL=0.009, avgR=-5.7, beta=0.0001, it_s=1.67]

Iter 0019 | avgR   -5.70 ± 30.89 | KL 0.0088 (β=0.0001) | Lclip 0.0059 Lkl 0.0088 Lent 1.0802 | steps 123222 eps 640 | time 1.67s


GRPO:   7%|████▏                                                      | 21/300 [00:31<08:11,  1.76s/it, KL=0.004, avgR=4.4, beta=0.0001, it_s=1.88]

Iter 0020 | avgR    4.43 ± 63.22 | KL 0.0042 (β=0.0001) | Lclip 0.0035 Lkl 0.0042 Lent 1.1557 | steps 139133 eps 672 | time 1.88s


GRPO:   7%|████▎                                                     | 22/300 [00:33<08:22,  1.81s/it, KL=0.005, avgR=11.6, beta=0.0001, it_s=1.91]

Iter 0021 | avgR   11.59 ± 63.61 | KL 0.0054 (β=0.0001) | Lclip 0.0031 Lkl 0.0054 Lent 1.1685 | steps 154254 eps 704 | time 1.91s


GRPO:   8%|████▍                                                     | 23/300 [00:35<08:23,  1.82s/it, KL=0.006, avgR=-2.5, beta=0.0001, it_s=1.83]

Iter 0022 | avgR   -2.48 ± 69.52 | KL 0.0057 (β=0.0001) | Lclip 0.0031 Lkl 0.0057 Lent 1.1555 | steps 168145 eps 736 | time 1.83s


GRPO:   8%|████▋                                                     | 24/300 [00:37<08:18,  1.80s/it, KL=0.005, avgR=13.7, beta=0.0001, it_s=1.77]

Iter 0023 | avgR   13.73 ± 59.77 | KL 0.0052 (β=0.0001) | Lclip 0.0023 Lkl 0.0052 Lent 1.1829 | steps 183074 eps 768 | time 1.77s


GRPO:   8%|████▊                                                     | 25/300 [00:38<08:25,  1.84s/it, KL=0.005, avgR=31.6, beta=0.0001, it_s=1.92]

Iter 0024 | avgR   31.59 ± 62.00 | KL 0.0051 (β=0.0001) | Lclip 0.0021 Lkl 0.0051 Lent 1.1963 | steps 204362 eps 800 | time 1.92s


GRPO:   9%|█████                                                     | 26/300 [00:41<08:44,  1.91s/it, KL=0.006, avgR=45.2, beta=0.0001, it_s=2.09]

Iter 0025 | avgR   45.16 ± 57.16 | KL 0.0064 (β=0.0001) | Lclip 0.0028 Lkl 0.0064 Lent 1.1687 | steps 226721 eps 832 | time 2.09s


GRPO:   9%|█████▏                                                    | 27/300 [00:42<08:44,  1.92s/it, KL=0.004, avgR=39.3, beta=0.0001, it_s=1.93]

Iter 0026 | avgR   39.30 ± 55.51 | KL 0.0040 (β=0.0001) | Lclip 0.0021 Lkl 0.0040 Lent 1.1332 | steps 243972 eps 864 | time 1.93s


GRPO:   9%|█████▍                                                    | 28/300 [00:45<08:52,  1.96s/it, KL=0.005, avgR=49.2, beta=0.0001, it_s=2.04]

Iter 0027 | avgR   49.19 ± 63.17 | KL 0.0054 (β=0.0001) | Lclip 0.0022 Lkl 0.0054 Lent 1.1598 | steps 261799 eps 896 | time 2.04s


GRPO:  10%|█████▌                                                    | 29/300 [00:46<08:45,  1.94s/it, KL=0.005, avgR=40.0, beta=0.0001, it_s=1.90]

Iter 0028 | avgR   40.02 ± 65.86 | KL 0.0054 (β=0.0001) | Lclip 0.0028 Lkl 0.0054 Lent 1.1602 | steps 281526 eps 928 | time 1.90s


GRPO:  10%|█████▊                                                    | 30/300 [00:49<08:57,  1.99s/it, KL=0.006, avgR=21.6, beta=0.0001, it_s=2.10]

Iter 0029 | avgR   21.57 ± 91.33 | KL 0.0063 (β=0.0001) | Lclip 0.0027 Lkl 0.0063 Lent 1.1185 | steps 302568 eps 960 | time 2.10s


GRPO:  10%|█████▉                                                    | 31/300 [00:51<09:05,  2.03s/it, KL=0.004, avgR=35.7, beta=0.0001, it_s=2.11]

Iter 0030 | avgR   35.72 ± 74.60 | KL 0.0036 (β=0.0001) | Lclip 0.0017 Lkl 0.0036 Lent 1.1332 | steps 324790 eps 992 | time 2.11s


GRPO:  11%|██████▏                                                   | 32/300 [00:53<09:36,  2.15s/it, KL=0.004, avgR=83.5, beta=0.0001, it_s=2.44]

Iter 0031 | avgR   83.48 ± 40.15 | KL 0.0037 (β=0.0001) | Lclip 0.0010 Lkl 0.0037 Lent 1.1702 | steps 351221 eps 1024 | time 2.44s


GRPO:  11%|██████▍                                                   | 33/300 [00:55<09:40,  2.18s/it, KL=0.006, avgR=67.5, beta=0.0001, it_s=2.22]

Iter 0032 | avgR   67.48 ± 73.32 | KL 0.0055 (β=0.0001) | Lclip 0.0034 Lkl 0.0055 Lent 1.1457 | steps 377383 eps 1056 | time 2.22s


GRPO:  11%|██████▌                                                   | 34/300 [00:58<09:54,  2.24s/it, KL=0.004, avgR=83.7, beta=0.0001, it_s=2.37]

Iter 0033 | avgR   83.73 ± 45.85 | KL 0.0043 (β=0.0001) | Lclip 0.0019 Lkl 0.0043 Lent 1.1095 | steps 405933 eps 1088 | time 2.37s


GRPO:  12%|██████▊                                                   | 35/300 [01:00<10:14,  2.32s/it, KL=0.005, avgR=61.9, beta=0.0001, it_s=2.51]

Iter 0034 | avgR   61.91 ± 59.46 | KL 0.0046 (β=0.0001) | Lclip 0.0016 Lkl 0.0046 Lent 1.1295 | steps 433864 eps 1120 | time 2.51s


GRPO:  12%|██████▉                                                   | 36/300 [01:02<09:46,  2.22s/it, KL=0.007, avgR=68.5, beta=0.0001, it_s=2.00]

Iter 0035 | avgR   68.53 ± 59.67 | KL 0.0070 (β=0.0001) | Lclip 0.0038 Lkl 0.0070 Lent 1.1413 | steps 456784 eps 1152 | time 2.00s


GRPO:  12%|███████▏                                                  | 37/300 [01:04<09:44,  2.22s/it, KL=0.007, avgR=64.9, beta=0.0001, it_s=2.22]

Iter 0036 | avgR   64.89 ± 53.61 | KL 0.0069 (β=0.0001) | Lclip 0.0026 Lkl 0.0069 Lent 1.1536 | steps 481087 eps 1184 | time 2.22s


GRPO:  13%|███████▎                                                  | 38/300 [01:07<10:00,  2.29s/it, KL=0.004, avgR=72.9, beta=0.0001, it_s=2.45]

Iter 0037 | avgR   72.93 ± 39.73 | KL 0.0038 (β=0.0001) | Lclip 0.0017 Lkl 0.0038 Lent 1.1627 | steps 507448 eps 1216 | time 2.45s


GRPO:  13%|███████▌                                                  | 39/300 [01:09<10:12,  2.35s/it, KL=0.003, avgR=54.2, beta=0.0001, it_s=2.47]

Iter 0038 | avgR   54.20 ± 72.56 | KL 0.0035 (β=0.0001) | Lclip 0.0003 Lkl 0.0035 Lent 1.1393 | steps 534276 eps 1248 | time 2.47s


GRPO:  13%|███████▋                                                  | 40/300 [01:12<10:06,  2.33s/it, KL=0.005, avgR=66.8, beta=0.0001, it_s=2.29]

Iter 0039 | avgR   66.82 ± 66.82 | KL 0.0049 (β=0.0001) | Lclip 0.0028 Lkl 0.0049 Lent 1.0940 | steps 560388 eps 1280 | time 2.29s


GRPO:  14%|███████▉                                                  | 41/300 [01:14<09:59,  2.32s/it, KL=0.003, avgR=52.7, beta=0.0001, it_s=2.28]

Iter 0040 | avgR   52.66 ± 63.56 | KL 0.0032 (β=0.0001) | Lclip 0.0007 Lkl 0.0032 Lent 1.1148 | steps 585389 eps 1312 | time 2.28s


GRPO:  14%|████████                                                  | 42/300 [01:16<09:40,  2.25s/it, KL=0.005, avgR=53.5, beta=0.0001, it_s=2.09]

Iter 0041 | avgR   53.48 ± 60.93 | KL 0.0046 (β=0.0001) | Lclip 0.0021 Lkl 0.0046 Lent 1.1188 | steps 610148 eps 1344 | time 2.09s


GRPO:  14%|████████▎                                                 | 43/300 [01:18<09:54,  2.31s/it, KL=0.005, avgR=60.6, beta=0.0001, it_s=2.46]

Iter 0042 | avgR   60.59 ± 51.16 | KL 0.0047 (β=0.0001) | Lclip 0.0012 Lkl 0.0047 Lent 1.1405 | steps 637490 eps 1376 | time 2.46s


GRPO:  15%|████████▌                                                 | 44/300 [01:21<09:34,  2.24s/it, KL=0.005, avgR=67.3, beta=0.0001, it_s=2.08]

Iter 0043 | avgR   67.32 ± 64.24 | KL 0.0048 (β=0.0001) | Lclip 0.0027 Lkl 0.0048 Lent 1.1276 | steps 659832 eps 1408 | time 2.08s


GRPO:  15%|████████▋                                                 | 45/300 [01:23<09:51,  2.32s/it, KL=0.005, avgR=77.6, beta=0.0001, it_s=2.49]

Iter 0044 | avgR   77.64 ± 55.03 | KL 0.0050 (β=0.0001) | Lclip 0.0027 Lkl 0.0050 Lent 1.1073 | steps 686049 eps 1440 | time 2.49s


GRPO:  15%|████████▉                                                 | 46/300 [01:25<09:27,  2.24s/it, KL=0.004, avgR=38.5, beta=0.0001, it_s=2.04]

Iter 0045 | avgR   38.53 ± 76.07 | KL 0.0039 (β=0.0001) | Lclip 0.0013 Lkl 0.0039 Lent 1.1581 | steps 708015 eps 1472 | time 2.04s


GRPO:  16%|█████████                                                 | 47/300 [01:27<09:27,  2.24s/it, KL=0.004, avgR=96.5, beta=0.0001, it_s=2.26]

Iter 0046 | avgR   96.53 ± 72.70 | KL 0.0044 (β=0.0001) | Lclip 0.0017 Lkl 0.0044 Lent 1.0957 | steps 733019 eps 1504 | time 2.26s


GRPO:  16%|█████████▎                                                | 48/300 [01:30<09:46,  2.33s/it, KL=0.005, avgR=87.4, beta=0.0001, it_s=2.53]

Iter 0047 | avgR   87.42 ± 60.34 | KL 0.0049 (β=0.0001) | Lclip 0.0007 Lkl 0.0049 Lent 1.1494 | steps 760848 eps 1536 | time 2.53s


GRPO:  16%|█████████▍                                                | 49/300 [01:32<09:25,  2.25s/it, KL=0.005, avgR=79.4, beta=0.0001, it_s=2.07]

Iter 0048 | avgR   79.42 ± 64.77 | KL 0.0047 (β=0.0001) | Lclip 0.0023 Lkl 0.0047 Lent 1.0776 | steps 783472 eps 1568 | time 2.07s


GRPO:  17%|█████████▋                                                | 50/300 [01:34<09:12,  2.21s/it, KL=0.005, avgR=83.3, beta=0.0001, it_s=2.10]

Iter 0049 | avgR   83.33 ± 61.84 | KL 0.0049 (β=0.0001) | Lclip 0.0015 Lkl 0.0049 Lent 1.1220 | steps 806745 eps 1600 | time 2.10s


GRPO:  17%|█████████▋                                               | 51/300 [01:36<09:06,  2.19s/it, KL=0.003, avgR=102.1, beta=0.0001, it_s=2.16]

Iter 0050 | avgR  102.10 ± 56.17 | KL 0.0032 (β=0.0001) | Lclip 0.0013 Lkl 0.0032 Lent 1.1587 | steps 832406 eps 1632 | time 2.16s


GRPO:  17%|██████████                                                | 52/300 [01:38<08:53,  2.15s/it, KL=0.004, avgR=72.2, beta=0.0001, it_s=2.05]

Iter 0051 | avgR   72.20 ± 54.67 | KL 0.0040 (β=0.0001) | Lclip 0.0023 Lkl 0.0040 Lent 1.1256 | steps 853958 eps 1664 | time 2.05s


GRPO:  18%|██████████▏                                               | 53/300 [01:41<09:24,  2.29s/it, KL=0.004, avgR=94.0, beta=0.0001, it_s=2.59]

Iter 0052 | avgR   94.05 ± 56.44 | KL 0.0044 (β=0.0001) | Lclip 0.0009 Lkl 0.0044 Lent 1.0882 | steps 881800 eps 1696 | time 2.59s


GRPO:  18%|██████████▎                                              | 54/300 [01:43<09:15,  2.26s/it, KL=0.004, avgR=101.4, beta=0.0001, it_s=2.19]

Iter 0053 | avgR  101.36 ± 57.62 | KL 0.0043 (β=0.0001) | Lclip 0.0010 Lkl 0.0043 Lent 1.0780 | steps 907874 eps 1728 | time 2.19s


GRPO:  18%|██████████▍                                              | 55/300 [01:46<09:25,  2.31s/it, KL=0.004, avgR=101.0, beta=0.0001, it_s=2.41]

Iter 0054 | avgR  101.00 ± 46.67 | KL 0.0041 (β=0.0001) | Lclip 0.0011 Lkl 0.0041 Lent 1.0972 | steps 935695 eps 1760 | time 2.41s


GRPO:  19%|██████████▋                                              | 56/300 [01:48<09:32,  2.34s/it, KL=0.004, avgR=112.6, beta=0.0001, it_s=2.43]

Iter 0055 | avgR  112.56 ± 40.76 | KL 0.0039 (β=0.0001) | Lclip 0.0012 Lkl 0.0039 Lent 1.0556 | steps 965059 eps 1792 | time 2.43s


GRPO:  19%|██████████▊                                              | 57/300 [01:50<09:36,  2.37s/it, KL=0.005, avgR=120.7, beta=0.0001, it_s=2.43]

Iter 0056 | avgR  120.67 ± 41.60 | KL 0.0050 (β=0.0001) | Lclip 0.0023 Lkl 0.0050 Lent 1.0895 | steps 994503 eps 1824 | time 2.43s


GRPO:  19%|███████████▏                                              | 58/300 [01:53<09:35,  2.38s/it, KL=0.006, avgR=99.6, beta=0.0001, it_s=2.39]

Iter 0057 | avgR   99.56 ± 45.14 | KL 0.0059 (β=0.0001) | Lclip 0.0022 Lkl 0.0059 Lent 1.1132 | steps 1022449 eps 1856 | time 2.39s


GRPO:  20%|███████████▍                                              | 59/300 [01:55<09:40,  2.41s/it, KL=0.005, avgR=98.5, beta=0.0001, it_s=2.48]

Iter 0058 | avgR   98.54 ± 44.53 | KL 0.0052 (β=0.0001) | Lclip 0.0016 Lkl 0.0052 Lent 1.0993 | steps 1050171 eps 1888 | time 2.48s


GRPO:  20%|███████████▌                                              | 60/300 [01:58<09:46,  2.44s/it, KL=0.005, avgR=66.4, beta=0.0001, it_s=2.52]

Iter 0059 | avgR   66.39 ± 54.10 | KL 0.0051 (β=0.0001) | Lclip 0.0019 Lkl 0.0051 Lent 1.0977 | steps 1078122 eps 1920 | time 2.52s


GRPO:  20%|███████████▊                                              | 61/300 [02:00<09:43,  2.44s/it, KL=0.005, avgR=81.6, beta=0.0001, it_s=2.43]

Iter 0060 | avgR   81.61 ± 40.35 | KL 0.0049 (β=0.0001) | Lclip 0.0019 Lkl 0.0049 Lent 1.1104 | steps 1106964 eps 1952 | time 2.43s


GRPO:  21%|███████████▉                                              | 62/300 [02:03<09:51,  2.48s/it, KL=0.004, avgR=68.9, beta=0.0001, it_s=2.58]

Iter 0061 | avgR   68.88 ± 48.03 | KL 0.0043 (β=0.0001) | Lclip 0.0012 Lkl 0.0043 Lent 1.0536 | steps 1134142 eps 1984 | time 2.58s


GRPO:  21%|████████████▏                                             | 63/300 [02:05<09:46,  2.48s/it, KL=0.006, avgR=90.9, beta=0.0001, it_s=2.45]

Iter 0062 | avgR   90.87 ± 52.40 | KL 0.0055 (β=0.0001) | Lclip 0.0018 Lkl 0.0055 Lent 1.0713 | steps 1163195 eps 2016 | time 2.45s


GRPO:  21%|████████████▎                                             | 64/300 [02:08<10:02,  2.55s/it, KL=0.005, avgR=83.1, beta=0.0001, it_s=2.73]

Iter 0063 | avgR   83.13 ± 49.68 | KL 0.0050 (β=0.0001) | Lclip 0.0028 Lkl 0.0050 Lent 1.0517 | steps 1191486 eps 2048 | time 2.73s


GRPO:  22%|████████████▌                                             | 65/300 [02:10<09:55,  2.53s/it, KL=0.005, avgR=81.4, beta=0.0001, it_s=2.48]

Iter 0064 | avgR   81.38 ± 50.60 | KL 0.0055 (β=0.0001) | Lclip 0.0019 Lkl 0.0055 Lent 1.1009 | steps 1217831 eps 2080 | time 2.48s


GRPO:  22%|████████████▊                                             | 66/300 [02:13<09:54,  2.54s/it, KL=0.004, avgR=88.9, beta=0.0001, it_s=2.56]

Iter 0065 | avgR   88.86 ± 50.68 | KL 0.0042 (β=0.0001) | Lclip 0.0013 Lkl 0.0042 Lent 1.0396 | steps 1246896 eps 2112 | time 2.56s


GRPO:  22%|████████████▉                                             | 67/300 [02:16<09:58,  2.57s/it, KL=0.004, avgR=83.4, beta=0.0001, it_s=2.63]

Iter 0066 | avgR   83.45 ± 33.74 | KL 0.0036 (β=0.0001) | Lclip 0.0011 Lkl 0.0036 Lent 1.0451 | steps 1277297 eps 2144 | time 2.63s


GRPO:  23%|█████████████▏                                            | 68/300 [02:18<09:49,  2.54s/it, KL=0.004, avgR=89.8, beta=0.0001, it_s=2.48]

Iter 0067 | avgR   89.81 ± 48.32 | KL 0.0043 (β=0.0001) | Lclip 0.0023 Lkl 0.0043 Lent 1.0357 | steps 1304798 eps 2176 | time 2.48s


GRPO:  23%|█████████████▎                                            | 69/300 [02:21<09:39,  2.51s/it, KL=0.005, avgR=86.1, beta=0.0001, it_s=2.43]

Iter 0068 | avgR   86.09 ± 47.94 | KL 0.0048 (β=0.0001) | Lclip 0.0014 Lkl 0.0048 Lent 0.9628 | steps 1333232 eps 2208 | time 2.43s


GRPO:  23%|█████████████▎                                           | 70/300 [02:22<08:53,  2.32s/it, KL=0.005, avgR=101.9, beta=0.0001, it_s=1.87]

Iter 0069 | avgR  101.86 ± 85.16 | KL 0.0051 (β=0.0001) | Lclip 0.0028 Lkl 0.0051 Lent 0.9774 | steps 1352196 eps 2240 | time 1.87s


GRPO:  24%|█████████████▋                                            | 71/300 [02:25<08:46,  2.30s/it, KL=0.004, avgR=96.9, beta=0.0001, it_s=2.25]

Iter 0070 | avgR   96.90 ± 63.40 | KL 0.0040 (β=0.0001) | Lclip 0.0019 Lkl 0.0040 Lent 0.9982 | steps 1377152 eps 2272 | time 2.25s


GRPO:  24%|█████████████▉                                            | 72/300 [02:27<08:50,  2.33s/it, KL=0.004, avgR=99.8, beta=0.0001, it_s=2.38]

Iter 0071 | avgR   99.84 ± 50.89 | KL 0.0045 (β=0.0001) | Lclip 0.0022 Lkl 0.0045 Lent 1.0090 | steps 1404473 eps 2304 | time 2.38s


GRPO:  24%|██████████████                                            | 73/300 [02:30<09:06,  2.41s/it, KL=0.004, avgR=98.7, beta=0.0001, it_s=2.59]

Iter 0072 | avgR   98.67 ± 58.00 | KL 0.0043 (β=0.0001) | Lclip 0.0025 Lkl 0.0043 Lent 1.0049 | steps 1434894 eps 2336 | time 2.59s


GRPO:  25%|██████████████▎                                           | 74/300 [02:32<09:05,  2.41s/it, KL=0.003, avgR=83.3, beta=0.0001, it_s=2.43]

Iter 0073 | avgR   83.28 ± 43.16 | KL 0.0034 (β=0.0001) | Lclip 0.0016 Lkl 0.0034 Lent 1.0217 | steps 1464286 eps 2368 | time 2.43s


GRPO:  25%|██████████████▌                                           | 75/300 [02:35<09:12,  2.46s/it, KL=0.005, avgR=99.0, beta=0.0001, it_s=2.55]

Iter 0074 | avgR   98.96 ± 46.94 | KL 0.0048 (β=0.0001) | Lclip 0.0007 Lkl 0.0048 Lent 1.0362 | steps 1494468 eps 2400 | time 2.55s


GRPO:  25%|██████████████▋                                           | 76/300 [02:37<09:05,  2.43s/it, KL=0.005, avgR=97.4, beta=0.0001, it_s=2.38]

Iter 0075 | avgR   97.36 ± 62.89 | KL 0.0049 (β=0.0001) | Lclip 0.0011 Lkl 0.0049 Lent 1.0319 | steps 1521693 eps 2432 | time 2.38s


GRPO:  26%|██████████████▉                                           | 77/300 [02:39<08:55,  2.40s/it, KL=0.004, avgR=89.8, beta=0.0001, it_s=2.33]

Iter 0076 | avgR   89.84 ± 59.12 | KL 0.0042 (β=0.0001) | Lclip 0.0010 Lkl 0.0042 Lent 1.0131 | steps 1547713 eps 2464 | time 2.33s


GRPO:  26%|███████████████                                           | 78/300 [02:42<08:57,  2.42s/it, KL=0.005, avgR=86.0, beta=0.0001, it_s=2.46]

Iter 0077 | avgR   86.03 ± 44.46 | KL 0.0047 (β=0.0001) | Lclip 0.0011 Lkl 0.0047 Lent 1.0353 | steps 1577219 eps 2496 | time 2.46s


GRPO:  26%|███████████████▎                                          | 79/300 [02:44<08:54,  2.42s/it, KL=0.003, avgR=85.5, beta=0.0001, it_s=2.41]

Iter 0078 | avgR   85.48 ± 48.42 | KL 0.0029 (β=0.0001) | Lclip 0.0002 Lkl 0.0029 Lent 1.0225 | steps 1605248 eps 2528 | time 2.41s


GRPO:  27%|███████████████▍                                          | 80/300 [02:46<08:37,  2.35s/it, KL=0.004, avgR=96.8, beta=0.0001, it_s=2.19]

Iter 0079 | avgR   96.82 ± 66.32 | KL 0.0042 (β=0.0001) | Lclip 0.0025 Lkl 0.0042 Lent 0.9296 | steps 1629298 eps 2560 | time 2.19s


GRPO:  27%|███████████████▋                                          | 81/300 [02:49<08:28,  2.32s/it, KL=0.004, avgR=74.2, beta=0.0001, it_s=2.24]

Iter 0080 | avgR   74.24 ± 67.81 | KL 0.0040 (β=0.0001) | Lclip 0.0009 Lkl 0.0040 Lent 0.9942 | steps 1655342 eps 2592 | time 2.24s


GRPO:  27%|███████████████▊                                          | 82/300 [02:51<08:10,  2.25s/it, KL=0.005, avgR=89.4, beta=0.0001, it_s=2.08]

Iter 0081 | avgR   89.41 ± 62.22 | KL 0.0046 (β=0.0001) | Lclip 0.0017 Lkl 0.0046 Lent 0.9717 | steps 1681020 eps 2624 | time 2.08s


GRPO:  28%|████████████████                                          | 83/300 [02:53<08:29,  2.35s/it, KL=0.004, avgR=77.2, beta=0.0001, it_s=2.58]

Iter 0082 | avgR   77.18 ± 43.26 | KL 0.0041 (β=0.0001) | Lclip 0.0018 Lkl 0.0041 Lent 1.0092 | steps 1710448 eps 2656 | time 2.58s


GRPO:  28%|████████████████▏                                         | 84/300 [02:56<08:35,  2.39s/it, KL=0.004, avgR=97.2, beta=0.0001, it_s=2.47]

Iter 0083 | avgR   97.16 ± 68.03 | KL 0.0037 (β=0.0001) | Lclip 0.0016 Lkl 0.0037 Lent 0.9716 | steps 1738250 eps 2688 | time 2.47s


GRPO:  28%|████████████████▍                                         | 85/300 [02:58<08:48,  2.46s/it, KL=0.005, avgR=90.4, beta=0.0001, it_s=2.62]

Iter 0084 | avgR   90.39 ± 47.16 | KL 0.0053 (β=0.0001) | Lclip 0.0023 Lkl 0.0053 Lent 1.0106 | steps 1768044 eps 2720 | time 2.62s


GRPO:  29%|████████████████▋                                         | 86/300 [03:01<08:56,  2.51s/it, KL=0.004, avgR=82.0, beta=0.0001, it_s=2.62]

Iter 0085 | avgR   82.02 ± 47.46 | KL 0.0039 (β=0.0001) | Lclip 0.0002 Lkl 0.0039 Lent 0.9842 | steps 1796125 eps 2752 | time 2.62s


GRPO:  29%|████████████████▊                                         | 87/300 [03:03<08:37,  2.43s/it, KL=0.004, avgR=87.1, beta=0.0001, it_s=2.24]

Iter 0086 | avgR   87.07 ± 50.12 | KL 0.0040 (β=0.0001) | Lclip 0.0010 Lkl 0.0040 Lent 0.9634 | steps 1822533 eps 2784 | time 2.24s


GRPO:  29%|█████████████████                                         | 88/300 [03:06<08:43,  2.47s/it, KL=0.004, avgR=85.5, beta=0.0001, it_s=2.56]

Iter 0087 | avgR   85.50 ± 51.78 | KL 0.0043 (β=0.0001) | Lclip 0.0009 Lkl 0.0043 Lent 1.0035 | steps 1850802 eps 2816 | time 2.56s


GRPO:  30%|█████████████████▏                                        | 89/300 [03:08<08:22,  2.38s/it, KL=0.005, avgR=91.4, beta=0.0001, it_s=2.18]

Iter 0088 | avgR   91.36 ± 69.36 | KL 0.0046 (β=0.0001) | Lclip 0.0014 Lkl 0.0046 Lent 0.9026 | steps 1876182 eps 2848 | time 2.18s


GRPO:  30%|█████████████████▍                                        | 90/300 [03:11<08:27,  2.41s/it, KL=0.003, avgR=79.7, beta=0.0001, it_s=2.48]

Iter 0089 | avgR   79.68 ± 46.78 | KL 0.0033 (β=0.0001) | Lclip 0.0015 Lkl 0.0033 Lent 0.9834 | steps 1906331 eps 2880 | time 2.48s


GRPO:  30%|█████████████████▌                                        | 91/300 [03:13<08:27,  2.43s/it, KL=0.004, avgR=79.8, beta=0.0001, it_s=2.45]

Iter 0090 | avgR   79.81 ± 44.25 | KL 0.0043 (β=0.0001) | Lclip 0.0018 Lkl 0.0043 Lent 0.9444 | steps 1935567 eps 2912 | time 2.45s


GRPO:  31%|█████████████████▊                                        | 92/300 [03:16<08:27,  2.44s/it, KL=0.004, avgR=84.1, beta=0.0001, it_s=2.46]

Iter 0091 | avgR   84.05 ± 57.86 | KL 0.0043 (β=0.0001) | Lclip 0.0013 Lkl 0.0043 Lent 0.9708 | steps 1962366 eps 2944 | time 2.46s


GRPO:  31%|█████████████████▉                                        | 93/300 [03:18<08:35,  2.49s/it, KL=0.003, avgR=94.3, beta=0.0001, it_s=2.60]

Iter 0092 | avgR   94.26 ± 54.93 | KL 0.0028 (β=0.0001) | Lclip 0.0020 Lkl 0.0028 Lent 0.9187 | steps 1992500 eps 2976 | time 2.60s


GRPO:  31%|██████████████████▏                                       | 94/300 [03:21<08:33,  2.49s/it, KL=0.004, avgR=90.4, beta=0.0001, it_s=2.50]

Iter 0093 | avgR   90.45 ± 47.77 | KL 0.0038 (β=0.0001) | Lclip 0.0020 Lkl 0.0038 Lent 0.9447 | steps 2022033 eps 3008 | time 2.50s


GRPO:  32%|██████████████████▎                                       | 95/300 [03:23<08:27,  2.47s/it, KL=0.003, avgR=82.0, beta=0.0001, it_s=2.43]

Iter 0094 | avgR   81.96 ± 55.38 | KL 0.0032 (β=0.0001) | Lclip 0.0008 Lkl 0.0032 Lent 0.8586 | steps 2050403 eps 3040 | time 2.43s


GRPO:  32%|██████████████████▌                                       | 96/300 [03:25<08:22,  2.46s/it, KL=0.005, avgR=87.9, beta=0.0001, it_s=2.44]

Iter 0095 | avgR   87.91 ± 52.84 | KL 0.0049 (β=0.0001) | Lclip 0.0018 Lkl 0.0049 Lent 0.9990 | steps 2078649 eps 3072 | time 2.44s


GRPO:  32%|██████████████████▊                                       | 97/300 [03:28<08:20,  2.47s/it, KL=0.005, avgR=71.7, beta=0.0001, it_s=2.47]

Iter 0096 | avgR   71.68 ± 49.51 | KL 0.0046 (β=0.0001) | Lclip 0.0013 Lkl 0.0046 Lent 0.9967 | steps 2106738 eps 3104 | time 2.47s


GRPO:  33%|██████████████████▌                                      | 98/300 [03:30<08:04,  2.40s/it, KL=0.004, avgR=108.8, beta=0.0001, it_s=2.24]

Iter 0097 | avgR  108.83 ± 54.40 | KL 0.0039 (β=0.0001) | Lclip 0.0015 Lkl 0.0039 Lent 0.9935 | steps 2133580 eps 3136 | time 2.24s


GRPO:  33%|███████████████████▏                                      | 99/300 [03:33<08:18,  2.48s/it, KL=0.004, avgR=84.3, beta=0.0001, it_s=2.67]

Iter 0098 | avgR   84.30 ± 37.16 | KL 0.0039 (β=0.0001) | Lclip 0.0015 Lkl 0.0039 Lent 1.0062 | steps 2164183 eps 3168 | time 2.67s


GRPO:  33%|███████████████████                                      | 100/300 [03:35<08:24,  2.52s/it, KL=0.004, avgR=82.5, beta=0.0001, it_s=2.61]

Iter 0099 | avgR   82.51 ± 47.56 | KL 0.0042 (β=0.0001) | Lclip 0.0027 Lkl 0.0042 Lent 1.0360 | steps 2194139 eps 3200 | time 2.61s


GRPO:  34%|██████████████████▊                                     | 101/300 [03:38<08:12,  2.47s/it, KL=0.005, avgR=103.6, beta=0.0001, it_s=2.35]

Iter 0100 | avgR  103.64 ± 53.53 | KL 0.0046 (β=0.0001) | Lclip 0.0005 Lkl 0.0046 Lent 0.9647 | steps 2221896 eps 3232 | time 2.35s


GRPO:  34%|███████████████████                                     | 102/300 [03:40<07:55,  2.40s/it, KL=0.005, avgR=113.6, beta=0.0001, it_s=2.23]

Iter 0101 | avgR  113.60 ± 59.61 | KL 0.0047 (β=0.0001) | Lclip 0.0013 Lkl 0.0047 Lent 0.9720 | steps 2247760 eps 3264 | time 2.23s


GRPO:  34%|███████████████████▏                                    | 103/300 [03:43<07:55,  2.42s/it, KL=0.004, avgR=106.3, beta=0.0001, it_s=2.44]

Iter 0102 | avgR  106.27 ± 49.13 | KL 0.0044 (β=0.0001) | Lclip 0.0016 Lkl 0.0044 Lent 0.9513 | steps 2275383 eps 3296 | time 2.44s


GRPO:  35%|███████████████████▍                                    | 104/300 [03:45<07:38,  2.34s/it, KL=0.005, avgR=123.4, beta=0.0001, it_s=2.16]

Iter 0103 | avgR  123.43 ± 60.74 | KL 0.0046 (β=0.0001) | Lclip 0.0010 Lkl 0.0046 Lent 0.9570 | steps 2300182 eps 3328 | time 2.16s


GRPO:  35%|███████████████████▉                                     | 105/300 [03:47<07:41,  2.37s/it, KL=0.005, avgR=97.4, beta=0.0001, it_s=2.43]

Iter 0104 | avgR   97.44 ± 52.43 | KL 0.0047 (β=0.0001) | Lclip 0.0003 Lkl 0.0047 Lent 0.9910 | steps 2326948 eps 3360 | time 2.43s


GRPO:  35%|███████████████████▊                                    | 106/300 [03:49<07:29,  2.32s/it, KL=0.003, avgR=109.6, beta=0.0001, it_s=2.19]

Iter 0105 | avgR  109.55 ± 68.96 | KL 0.0027 (β=0.0001) | Lclip 0.0006 Lkl 0.0027 Lent 0.9499 | steps 2351060 eps 3392 | time 2.19s


GRPO:  36%|███████████████████▉                                    | 107/300 [03:52<07:26,  2.32s/it, KL=0.004, avgR=138.8, beta=0.0001, it_s=2.31]

Iter 0106 | avgR  138.80 ± 59.70 | KL 0.0042 (β=0.0001) | Lclip 0.0015 Lkl 0.0042 Lent 0.9048 | steps 2376015 eps 3424 | time 2.31s


GRPO:  36%|████████████████████▏                                   | 108/300 [03:54<07:20,  2.30s/it, KL=0.004, avgR=106.2, beta=0.0001, it_s=2.25]

Iter 0107 | avgR  106.16 ± 61.42 | KL 0.0038 (β=0.0001) | Lclip 0.0006 Lkl 0.0038 Lent 0.9799 | steps 2400710 eps 3456 | time 2.25s


GRPO:  36%|████████████████████▋                                    | 109/300 [03:56<07:13,  2.27s/it, KL=0.005, avgR=95.1, beta=0.0001, it_s=2.21]

Iter 0108 | avgR   95.14 ± 75.46 | KL 0.0045 (β=0.0001) | Lclip 0.0010 Lkl 0.0045 Lent 0.9733 | steps 2423611 eps 3488 | time 2.21s


GRPO:  37%|████████████████████▌                                   | 110/300 [03:59<07:28,  2.36s/it, KL=0.003, avgR=108.0, beta=0.0001, it_s=2.57]

Iter 0109 | avgR  108.04 ± 55.76 | KL 0.0029 (β=0.0001) | Lclip 0.0016 Lkl 0.0029 Lent 1.0180 | steps 2450166 eps 3520 | time 2.57s


GRPO:  37%|████████████████████▋                                   | 111/300 [04:01<07:19,  2.33s/it, KL=0.005, avgR=115.1, beta=0.0001, it_s=2.24]

Iter 0110 | avgR  115.15 ± 71.95 | KL 0.0051 (β=0.0001) | Lclip 0.0020 Lkl 0.0051 Lent 1.0030 | steps 2474358 eps 3552 | time 2.24s


GRPO:  37%|████████████████████▉                                   | 112/300 [04:03<07:11,  2.29s/it, KL=0.003, avgR=100.5, beta=0.0001, it_s=2.22]

Iter 0111 | avgR  100.50 ± 72.98 | KL 0.0031 (β=0.0001) | Lclip -0.0003 Lkl 0.0031 Lent 0.9854 | steps 2500570 eps 3584 | time 2.22s


GRPO:  38%|█████████████████████▍                                   | 113/300 [04:06<07:23,  2.37s/it, KL=0.003, avgR=96.1, beta=0.0001, it_s=2.55]

Iter 0112 | avgR   96.09 ± 51.93 | KL 0.0025 (β=0.0001) | Lclip 0.0008 Lkl 0.0025 Lent 0.9960 | steps 2528644 eps 3616 | time 2.55s


GRPO:  38%|█████████████████████▋                                   | 114/300 [04:08<07:16,  2.34s/it, KL=0.005, avgR=92.1, beta=0.0001, it_s=2.27]

Iter 0113 | avgR   92.07 ± 79.29 | KL 0.0050 (β=0.0001) | Lclip 0.0011 Lkl 0.0050 Lent 0.9862 | steps 2552596 eps 3648 | time 2.27s


GRPO:  38%|█████████████████████▍                                  | 115/300 [04:10<06:59,  2.27s/it, KL=0.005, avgR=136.1, beta=0.0001, it_s=2.08]

Iter 0114 | avgR  136.07 ± 63.13 | KL 0.0048 (β=0.0001) | Lclip 0.0013 Lkl 0.0048 Lent 0.9153 | steps 2574854 eps 3680 | time 2.08s


GRPO:  39%|██████████████████████                                   | 116/300 [04:13<07:09,  2.34s/it, KL=0.005, avgR=93.3, beta=0.0001, it_s=2.49]

Iter 0115 | avgR   93.32 ± 53.76 | KL 0.0050 (β=0.0001) | Lclip 0.0013 Lkl 0.0050 Lent 0.9630 | steps 2602912 eps 3712 | time 2.49s


GRPO:  39%|██████████████████████▏                                  | 117/300 [04:15<07:07,  2.33s/it, KL=0.004, avgR=94.0, beta=0.0001, it_s=2.33]

Iter 0116 | avgR   93.99 ± 72.90 | KL 0.0044 (β=0.0001) | Lclip 0.0009 Lkl 0.0044 Lent 0.9674 | steps 2629397 eps 3744 | time 2.33s


GRPO:  39%|██████████████████████                                  | 118/300 [04:17<07:06,  2.34s/it, KL=0.005, avgR=126.6, beta=0.0001, it_s=2.37]

Iter 0117 | avgR  126.57 ± 65.32 | KL 0.0048 (β=0.0001) | Lclip 0.0029 Lkl 0.0048 Lent 0.9024 | steps 2652513 eps 3776 | time 2.37s


GRPO:  40%|██████████████████████▏                                 | 119/300 [04:19<06:56,  2.30s/it, KL=0.003, avgR=134.9, beta=0.0001, it_s=2.19]

Iter 0118 | avgR  134.94 ± 54.56 | KL 0.0032 (β=0.0001) | Lclip 0.0002 Lkl 0.0032 Lent 0.9171 | steps 2675987 eps 3808 | time 2.19s


GRPO:  40%|██████████████████████▍                                 | 120/300 [04:22<07:03,  2.35s/it, KL=0.004, avgR=106.6, beta=0.0001, it_s=2.47]

Iter 0119 | avgR  106.56 ± 56.14 | KL 0.0042 (β=0.0001) | Lclip 0.0021 Lkl 0.0042 Lent 0.9288 | steps 2701959 eps 3840 | time 2.47s


GRPO:  40%|██████████████████████▌                                 | 121/300 [04:24<06:43,  2.25s/it, KL=0.005, avgR=117.1, beta=0.0001, it_s=2.02]

Iter 0120 | avgR  117.12 ± 67.49 | KL 0.0046 (β=0.0001) | Lclip -0.0003 Lkl 0.0046 Lent 0.9364 | steps 2725320 eps 3872 | time 2.02s


GRPO:  41%|██████████████████████▊                                 | 122/300 [04:26<06:44,  2.27s/it, KL=0.004, avgR=108.9, beta=0.0001, it_s=2.32]

Iter 0121 | avgR  108.85 ± 65.44 | KL 0.0044 (β=0.0001) | Lclip 0.0008 Lkl 0.0044 Lent 0.9469 | steps 2749694 eps 3904 | time 2.32s


GRPO:  41%|██████████████████████▉                                 | 123/300 [04:29<06:53,  2.34s/it, KL=0.005, avgR=106.7, beta=0.0001, it_s=2.48]

Iter 0122 | avgR  106.66 ± 56.13 | KL 0.0049 (β=0.0001) | Lclip 0.0013 Lkl 0.0049 Lent 0.9546 | steps 2776547 eps 3936 | time 2.48s


GRPO:  41%|███████████████████████▌                                 | 124/300 [04:31<06:43,  2.29s/it, KL=0.005, avgR=95.3, beta=0.0001, it_s=2.18]

Iter 0123 | avgR   95.30 ± 80.79 | KL 0.0053 (β=0.0001) | Lclip 0.0019 Lkl 0.0053 Lent 0.9200 | steps 2802442 eps 3968 | time 2.18s


GRPO:  42%|███████████████████████▎                                | 125/300 [04:33<06:45,  2.32s/it, KL=0.005, avgR=114.6, beta=0.0001, it_s=2.38]

Iter 0124 | avgR  114.57 ± 63.38 | KL 0.0046 (β=0.0001) | Lclip 0.0007 Lkl 0.0046 Lent 0.9368 | steps 2826475 eps 4000 | time 2.38s


GRPO:  42%|███████████████████████▌                                | 126/300 [04:36<06:58,  2.40s/it, KL=0.004, avgR=105.1, beta=0.0001, it_s=2.60]

Iter 0125 | avgR  105.10 ± 68.51 | KL 0.0041 (β=0.0001) | Lclip 0.0010 Lkl 0.0041 Lent 0.9074 | steps 2853343 eps 4032 | time 2.60s


GRPO:  42%|███████████████████████▋                                | 127/300 [04:38<06:49,  2.37s/it, KL=0.004, avgR=126.6, beta=0.0001, it_s=2.29]

Iter 0126 | avgR  126.62 ± 62.37 | KL 0.0039 (β=0.0001) | Lclip 0.0011 Lkl 0.0039 Lent 0.9140 | steps 2879911 eps 4064 | time 2.29s


GRPO:  43%|███████████████████████▉                                | 128/300 [04:40<06:33,  2.29s/it, KL=0.004, avgR=111.1, beta=0.0001, it_s=2.09]

Iter 0127 | avgR  111.07 ± 67.85 | KL 0.0042 (β=0.0001) | Lclip 0.0002 Lkl 0.0042 Lent 0.9018 | steps 2903093 eps 4096 | time 2.09s


GRPO:  43%|████████████████████████▌                                | 129/300 [04:43<06:29,  2.28s/it, KL=0.004, avgR=85.2, beta=0.0001, it_s=2.25]

Iter 0128 | avgR   85.20 ± 67.36 | KL 0.0039 (β=0.0001) | Lclip 0.0003 Lkl 0.0039 Lent 0.9184 | steps 2928639 eps 4128 | time 2.25s


GRPO:  43%|████████████████████████▎                               | 130/300 [04:45<06:30,  2.30s/it, KL=0.004, avgR=110.4, beta=0.0001, it_s=2.35]

Iter 0129 | avgR  110.37 ± 72.73 | KL 0.0042 (β=0.0001) | Lclip 0.0018 Lkl 0.0042 Lent 0.8720 | steps 2952883 eps 4160 | time 2.35s


GRPO:  44%|████████████████████████▍                               | 131/300 [04:47<06:35,  2.34s/it, KL=0.004, avgR=109.6, beta=0.0001, it_s=2.43]

Iter 0130 | avgR  109.58 ± 62.85 | KL 0.0043 (β=0.0001) | Lclip 0.0008 Lkl 0.0043 Lent 0.9238 | steps 2979022 eps 4192 | time 2.43s


GRPO:  44%|█████████████████████████                                | 132/300 [04:50<06:30,  2.32s/it, KL=0.003, avgR=92.8, beta=0.0001, it_s=2.28]

Iter 0131 | avgR   92.78 ± 65.45 | KL 0.0035 (β=0.0001) | Lclip 0.0003 Lkl 0.0035 Lent 0.9281 | steps 3005227 eps 4224 | time 2.28s


GRPO:  44%|█████████████████████████▎                               | 133/300 [04:52<06:35,  2.37s/it, KL=0.004, avgR=91.4, beta=0.0001, it_s=2.46]

Iter 0132 | avgR   91.44 ± 57.25 | KL 0.0035 (β=0.0001) | Lclip 0.0011 Lkl 0.0035 Lent 0.8993 | steps 3033450 eps 4256 | time 2.46s


GRPO:  45%|█████████████████████████▍                               | 134/300 [04:55<06:35,  2.38s/it, KL=0.003, avgR=95.9, beta=0.0001, it_s=2.42]

Iter 0133 | avgR   95.93 ± 58.35 | KL 0.0032 (β=0.0001) | Lclip 0.0007 Lkl 0.0032 Lent 0.9305 | steps 3060844 eps 4288 | time 2.42s


GRPO:  45%|█████████████████████████▋                               | 135/300 [04:57<06:25,  2.34s/it, KL=0.004, avgR=92.8, beta=0.0001, it_s=2.22]

Iter 0134 | avgR   92.81 ± 78.14 | KL 0.0043 (β=0.0001) | Lclip 0.0011 Lkl 0.0043 Lent 0.8934 | steps 3083589 eps 4320 | time 2.22s


GRPO:  45%|█████████████████████████▍                              | 136/300 [04:59<06:19,  2.31s/it, KL=0.006, avgR=105.5, beta=0.0001, it_s=2.26]

Iter 0135 | avgR  105.51 ± 57.74 | KL 0.0057 (β=0.0001) | Lclip 0.0016 Lkl 0.0057 Lent 0.9623 | steps 3108878 eps 4352 | time 2.26s


GRPO:  46%|██████████████████████████                               | 137/300 [05:01<06:11,  2.28s/it, KL=0.004, avgR=85.9, beta=0.0001, it_s=2.20]

Iter 0136 | avgR   85.91 ± 66.77 | KL 0.0035 (β=0.0001) | Lclip 0.0001 Lkl 0.0035 Lent 0.8904 | steps 3133252 eps 4384 | time 2.20s


GRPO:  46%|██████████████████████████▏                              | 138/300 [05:04<06:18,  2.34s/it, KL=0.004, avgR=71.6, beta=0.0001, it_s=2.47]

Iter 0137 | avgR   71.58 ± 54.15 | KL 0.0041 (β=0.0001) | Lclip 0.0012 Lkl 0.0041 Lent 0.9279 | steps 3160010 eps 4416 | time 2.47s


GRPO:  46%|█████████████████████████▉                              | 139/300 [05:06<06:13,  2.32s/it, KL=0.004, avgR=105.7, beta=0.0001, it_s=2.27]

Iter 0138 | avgR  105.70 ± 66.10 | KL 0.0043 (β=0.0001) | Lclip 0.0013 Lkl 0.0043 Lent 0.8973 | steps 3183914 eps 4448 | time 2.27s


GRPO:  47%|██████████████████████████▌                              | 140/300 [05:08<06:11,  2.32s/it, KL=0.005, avgR=95.4, beta=0.0001, it_s=2.32]

Iter 0139 | avgR   95.43 ± 81.34 | KL 0.0045 (β=0.0001) | Lclip 0.0021 Lkl 0.0045 Lent 0.8918 | steps 3209254 eps 4480 | time 2.32s


GRPO:  47%|██████████████████████████▊                              | 141/300 [05:11<06:07,  2.31s/it, KL=0.003, avgR=77.0, beta=0.0001, it_s=2.29]

Iter 0140 | avgR   76.98 ± 80.52 | KL 0.0035 (β=0.0001) | Lclip 0.0009 Lkl 0.0035 Lent 0.8883 | steps 3233339 eps 4512 | time 2.29s


GRPO:  47%|██████████████████████████▌                             | 142/300 [05:13<06:02,  2.30s/it, KL=0.004, avgR=103.4, beta=0.0001, it_s=2.26]

Iter 0141 | avgR  103.37 ± 73.99 | KL 0.0036 (β=0.0001) | Lclip 0.0024 Lkl 0.0036 Lent 0.9067 | steps 3257357 eps 4544 | time 2.26s


GRPO:  48%|██████████████████████████▋                             | 143/300 [05:15<06:00,  2.30s/it, KL=0.003, avgR=100.1, beta=0.0001, it_s=2.29]

Iter 0142 | avgR  100.13 ± 61.27 | KL 0.0034 (β=0.0001) | Lclip 0.0014 Lkl 0.0034 Lent 0.9142 | steps 3284238 eps 4576 | time 2.29s


GRPO:  48%|██████████████████████████▉                             | 144/300 [05:17<05:51,  2.26s/it, KL=0.003, avgR=101.9, beta=0.0001, it_s=2.16]

Iter 0143 | avgR  101.87 ± 61.13 | KL 0.0034 (β=0.0001) | Lclip 0.0015 Lkl 0.0034 Lent 0.8730 | steps 3308660 eps 4608 | time 2.16s


GRPO:  48%|███████████████████████████▌                             | 145/300 [05:20<05:49,  2.26s/it, KL=0.005, avgR=98.3, beta=0.0001, it_s=2.26]

Iter 0144 | avgR   98.25 ± 63.69 | KL 0.0046 (β=0.0001) | Lclip 0.0019 Lkl 0.0046 Lent 0.9521 | steps 3334138 eps 4640 | time 2.26s


GRPO:  49%|███████████████████████████▎                            | 146/300 [05:22<05:42,  2.23s/it, KL=0.003, avgR=111.0, beta=0.0001, it_s=2.15]

Iter 0145 | avgR  111.01 ± 67.35 | KL 0.0027 (β=0.0001) | Lclip -0.0001 Lkl 0.0027 Lent 0.9581 | steps 3358263 eps 4672 | time 2.15s


GRPO:  49%|███████████████████████████▉                             | 147/300 [05:24<05:49,  2.28s/it, KL=0.004, avgR=81.9, beta=0.0001, it_s=2.42]

Iter 0146 | avgR   81.92 ± 53.03 | KL 0.0044 (β=0.0001) | Lclip 0.0015 Lkl 0.0044 Lent 0.9470 | steps 3385584 eps 4704 | time 2.42s


GRPO:  49%|███████████████████████████▋                            | 148/300 [05:26<05:48,  2.29s/it, KL=0.004, avgR=105.2, beta=0.0001, it_s=2.30]

Iter 0147 | avgR  105.16 ± 65.76 | KL 0.0040 (β=0.0001) | Lclip 0.0004 Lkl 0.0040 Lent 0.9338 | steps 3411701 eps 4736 | time 2.30s


GRPO:  50%|████████████████████████████▎                            | 149/300 [05:29<05:53,  2.34s/it, KL=0.004, avgR=93.2, beta=0.0001, it_s=2.46]

Iter 0148 | avgR   93.24 ± 62.48 | KL 0.0043 (β=0.0001) | Lclip 0.0015 Lkl 0.0043 Lent 0.9726 | steps 3439372 eps 4768 | time 2.46s


GRPO:  50%|████████████████████████████▌                            | 150/300 [05:31<05:49,  2.33s/it, KL=0.003, avgR=91.8, beta=0.0001, it_s=2.29]

Iter 0149 | avgR   91.84 ± 57.12 | KL 0.0033 (β=0.0001) | Lclip 0.0013 Lkl 0.0033 Lent 0.9354 | steps 3465437 eps 4800 | time 2.29s


GRPO:  50%|████████████████████████████▋                            | 151/300 [05:34<05:52,  2.36s/it, KL=0.004, avgR=91.4, beta=0.0001, it_s=2.45]

Iter 0150 | avgR   91.38 ± 60.66 | KL 0.0041 (β=0.0001) | Lclip 0.0020 Lkl 0.0041 Lent 0.9520 | steps 3492873 eps 4832 | time 2.45s


GRPO:  51%|████████████████████████████▉                            | 152/300 [05:36<05:53,  2.39s/it, KL=0.004, avgR=96.7, beta=0.0001, it_s=2.44]

Iter 0151 | avgR   96.70 ± 34.35 | KL 0.0044 (β=0.0001) | Lclip 0.0021 Lkl 0.0044 Lent 1.0070 | steps 3523403 eps 4864 | time 2.44s


GRPO:  51%|█████████████████████████████                            | 153/300 [05:39<05:53,  2.41s/it, KL=0.003, avgR=90.0, beta=0.0001, it_s=2.44]

Iter 0152 | avgR   89.99 ± 43.84 | KL 0.0033 (β=0.0001) | Lclip 0.0010 Lkl 0.0033 Lent 0.9686 | steps 3552522 eps 4896 | time 2.44s


GRPO:  51%|█████████████████████████████▎                           | 154/300 [05:41<05:57,  2.45s/it, KL=0.004, avgR=84.2, beta=0.0001, it_s=2.54]

Iter 0153 | avgR   84.16 ± 41.03 | KL 0.0036 (β=0.0001) | Lclip 0.0011 Lkl 0.0036 Lent 0.9484 | steps 3582973 eps 4928 | time 2.54s


GRPO:  52%|█████████████████████████████▍                           | 155/300 [05:43<05:53,  2.44s/it, KL=0.005, avgR=85.0, beta=0.0001, it_s=2.42]

Iter 0154 | avgR   84.96 ± 69.32 | KL 0.0045 (β=0.0001) | Lclip 0.0005 Lkl 0.0045 Lent 0.9321 | steps 3610122 eps 4960 | time 2.42s


GRPO:  52%|█████████████████████████████▋                           | 156/300 [05:46<05:45,  2.40s/it, KL=0.005, avgR=84.2, beta=0.0001, it_s=2.29]

Iter 0155 | avgR   84.16 ± 51.87 | KL 0.0050 (β=0.0001) | Lclip 0.0029 Lkl 0.0050 Lent 0.9629 | steps 3636688 eps 4992 | time 2.29s


GRPO:  52%|█████████████████████████████▎                          | 157/300 [05:48<05:37,  2.36s/it, KL=0.004, avgR=114.4, beta=0.0001, it_s=2.28]

Iter 0156 | avgR  114.39 ± 57.53 | KL 0.0037 (β=0.0001) | Lclip 0.0025 Lkl 0.0037 Lent 0.8912 | steps 3663690 eps 5024 | time 2.28s


GRPO:  53%|█████████████████████████████▍                          | 158/300 [05:50<05:34,  2.36s/it, KL=0.005, avgR=100.4, beta=0.0001, it_s=2.34]

Iter 0157 | avgR  100.40 ± 69.06 | KL 0.0045 (β=0.0001) | Lclip 0.0024 Lkl 0.0045 Lent 0.9314 | steps 3689999 eps 5056 | time 2.34s


GRPO:  53%|██████████████████████████████▏                          | 159/300 [05:53<05:30,  2.35s/it, KL=0.003, avgR=97.9, beta=0.0001, it_s=2.31]

Iter 0158 | avgR   97.89 ± 57.91 | KL 0.0035 (β=0.0001) | Lclip 0.0005 Lkl 0.0035 Lent 0.8948 | steps 3716455 eps 5088 | time 2.31s


GRPO:  53%|██████████████████████████████▍                          | 160/300 [05:55<05:24,  2.32s/it, KL=0.005, avgR=83.0, beta=0.0001, it_s=2.25]

Iter 0159 | avgR   82.97 ± 58.20 | KL 0.0046 (β=0.0001) | Lclip 0.0037 Lkl 0.0046 Lent 0.9024 | steps 3742031 eps 5120 | time 2.25s


GRPO:  54%|██████████████████████████████▌                          | 161/300 [05:57<05:20,  2.31s/it, KL=0.005, avgR=91.1, beta=0.0001, it_s=2.28]

Iter 0160 | avgR   91.06 ± 60.92 | KL 0.0046 (β=0.0001) | Lclip 0.0017 Lkl 0.0046 Lent 0.9205 | steps 3768293 eps 5152 | time 2.28s


GRPO:  54%|██████████████████████████████▊                          | 162/300 [06:00<05:29,  2.39s/it, KL=0.005, avgR=81.8, beta=0.0001, it_s=2.57]

Iter 0161 | avgR   81.77 ± 54.03 | KL 0.0047 (β=0.0001) | Lclip 0.0009 Lkl 0.0047 Lent 0.9222 | steps 3796028 eps 5184 | time 2.57s


GRPO:  54%|██████████████████████████████▉                          | 163/300 [06:02<05:20,  2.34s/it, KL=0.004, avgR=96.4, beta=0.0001, it_s=2.22]

Iter 0162 | avgR   96.35 ± 56.56 | KL 0.0037 (β=0.0001) | Lclip 0.0009 Lkl 0.0037 Lent 0.8574 | steps 3820983 eps 5216 | time 2.22s


GRPO:  55%|███████████████████████████████▏                         | 164/300 [06:04<05:21,  2.37s/it, KL=0.004, avgR=78.6, beta=0.0001, it_s=2.42]

Iter 0163 | avgR   78.63 ± 52.23 | KL 0.0040 (β=0.0001) | Lclip 0.0010 Lkl 0.0040 Lent 0.9387 | steps 3850219 eps 5248 | time 2.42s


GRPO:  55%|██████████████████████████████▊                         | 165/300 [06:07<05:12,  2.31s/it, KL=0.003, avgR=108.0, beta=0.0001, it_s=2.18]

Iter 0164 | avgR  108.00 ± 62.60 | KL 0.0033 (β=0.0001) | Lclip 0.0015 Lkl 0.0033 Lent 0.8431 | steps 3875169 eps 5280 | time 2.18s


GRPO:  55%|███████████████████████████████▌                         | 166/300 [06:09<05:12,  2.33s/it, KL=0.003, avgR=79.2, beta=0.0001, it_s=2.37]

Iter 0165 | avgR   79.24 ± 46.69 | KL 0.0033 (β=0.0001) | Lclip 0.0004 Lkl 0.0033 Lent 0.8556 | steps 3902353 eps 5312 | time 2.37s


GRPO:  56%|███████████████████████████████▏                        | 167/300 [06:11<05:11,  2.34s/it, KL=0.004, avgR=110.3, beta=0.0001, it_s=2.36]

Iter 0166 | avgR  110.31 ± 57.61 | KL 0.0043 (β=0.0001) | Lclip 0.0011 Lkl 0.0043 Lent 0.8724 | steps 3928566 eps 5344 | time 2.36s


GRPO:  56%|███████████████████████████████▉                         | 168/300 [06:14<05:04,  2.30s/it, KL=0.005, avgR=86.0, beta=0.0001, it_s=2.22]

Iter 0167 | avgR   86.02 ± 66.30 | KL 0.0047 (β=0.0001) | Lclip 0.0022 Lkl 0.0047 Lent 0.8142 | steps 3953632 eps 5376 | time 2.22s


GRPO:  56%|████████████████████████████████                         | 169/300 [06:16<05:01,  2.30s/it, KL=0.005, avgR=94.7, beta=0.0001, it_s=2.29]

Iter 0168 | avgR   94.73 ± 62.14 | KL 0.0046 (β=0.0001) | Lclip 0.0010 Lkl 0.0046 Lent 0.9247 | steps 3979006 eps 5408 | time 2.29s


GRPO:  57%|███████████████████████████████▋                        | 170/300 [06:18<04:49,  2.23s/it, KL=0.007, avgR=100.7, beta=0.0001, it_s=2.06]

Iter 0169 | avgR  100.74 ± 67.10 | KL 0.0065 (β=0.0001) | Lclip 0.0005 Lkl 0.0065 Lent 0.9291 | steps 4002894 eps 5440 | time 2.06s


GRPO:  57%|████████████████████████████████▍                        | 171/300 [06:20<04:49,  2.25s/it, KL=0.004, avgR=89.6, beta=0.0001, it_s=2.29]

Iter 0170 | avgR   89.62 ± 58.09 | KL 0.0039 (β=0.0001) | Lclip 0.0017 Lkl 0.0039 Lent 0.9070 | steps 4029677 eps 5472 | time 2.29s


GRPO:  57%|████████████████████████████████▋                        | 172/300 [06:23<04:58,  2.33s/it, KL=0.004, avgR=89.0, beta=0.0001, it_s=2.51]

Iter 0171 | avgR   88.98 ± 54.95 | KL 0.0041 (β=0.0001) | Lclip 0.0018 Lkl 0.0041 Lent 0.9098 | steps 4057680 eps 5504 | time 2.51s


GRPO:  58%|████████████████████████████████▊                        | 173/300 [06:25<04:47,  2.26s/it, KL=0.006, avgR=82.9, beta=0.0001, it_s=2.11]

Iter 0172 | avgR   82.85 ± 70.20 | KL 0.0058 (β=0.0001) | Lclip 0.0019 Lkl 0.0058 Lent 0.9277 | steps 4079101 eps 5536 | time 2.11s


GRPO:  58%|█████████████████████████████████                        | 174/300 [06:27<04:44,  2.26s/it, KL=0.004, avgR=86.8, beta=0.0001, it_s=2.24]

Iter 0173 | avgR   86.77 ± 55.23 | KL 0.0040 (β=0.0001) | Lclip 0.0011 Lkl 0.0040 Lent 0.9154 | steps 4106116 eps 5568 | time 2.24s


GRPO:  58%|█████████████████████████████████▎                       | 175/300 [06:30<04:50,  2.32s/it, KL=0.003, avgR=83.2, beta=0.0001, it_s=2.47]

Iter 0174 | avgR   83.22 ± 49.65 | KL 0.0026 (β=0.0001) | Lclip 0.0008 Lkl 0.0026 Lent 0.9262 | steps 4134532 eps 5600 | time 2.47s


GRPO:  59%|████████████████████████████████▊                       | 176/300 [06:32<04:48,  2.32s/it, KL=0.004, avgR=103.8, beta=0.0001, it_s=2.32]

Iter 0175 | avgR  103.84 ± 54.17 | KL 0.0037 (β=0.0001) | Lclip 0.0017 Lkl 0.0037 Lent 0.9649 | steps 4161542 eps 5632 | time 2.32s


GRPO:  59%|█████████████████████████████████                       | 177/300 [06:34<04:43,  2.30s/it, KL=0.004, avgR=118.4, beta=0.0001, it_s=2.25]

Iter 0176 | avgR  118.42 ± 60.44 | KL 0.0040 (β=0.0001) | Lclip 0.0017 Lkl 0.0040 Lent 0.9706 | steps 4186617 eps 5664 | time 2.25s


GRPO:  59%|█████████████████████████████████▏                      | 178/300 [06:37<04:43,  2.32s/it, KL=0.005, avgR=105.7, beta=0.0001, it_s=2.36]

Iter 0177 | avgR  105.74 ± 54.34 | KL 0.0050 (β=0.0001) | Lclip 0.0009 Lkl 0.0050 Lent 1.0034 | steps 4213649 eps 5696 | time 2.36s


GRPO:  60%|██████████████████████████████████                       | 179/300 [06:39<04:44,  2.35s/it, KL=0.004, avgR=85.1, beta=0.0001, it_s=2.41]

Iter 0178 | avgR   85.05 ± 53.37 | KL 0.0038 (β=0.0001) | Lclip 0.0010 Lkl 0.0038 Lent 0.9584 | steps 4239790 eps 5728 | time 2.41s


GRPO:  60%|█████████████████████████████████▌                      | 180/300 [06:41<04:40,  2.34s/it, KL=0.004, avgR=105.5, beta=0.0001, it_s=2.31]

Iter 0179 | avgR  105.47 ± 60.41 | KL 0.0035 (β=0.0001) | Lclip 0.0015 Lkl 0.0035 Lent 0.9337 | steps 4265785 eps 5760 | time 2.31s


GRPO:  60%|█████████████████████████████████▊                      | 181/300 [06:44<04:38,  2.34s/it, KL=0.005, avgR=123.0, beta=0.0001, it_s=2.33]

Iter 0180 | avgR  123.04 ± 62.04 | KL 0.0052 (β=0.0001) | Lclip 0.0016 Lkl 0.0052 Lent 0.9748 | steps 4291390 eps 5792 | time 2.33s


GRPO:  61%|█████████████████████████████████▉                      | 182/300 [06:46<04:27,  2.26s/it, KL=0.004, avgR=133.7, beta=0.0001, it_s=2.08]

Iter 0181 | avgR  133.75 ± 66.07 | KL 0.0037 (β=0.0001) | Lclip 0.0022 Lkl 0.0037 Lent 0.8794 | steps 4314903 eps 5824 | time 2.08s


GRPO:  61%|██████████████████████████████████▏                     | 183/300 [06:48<04:24,  2.26s/it, KL=0.003, avgR=112.8, beta=0.0001, it_s=2.25]

Iter 0182 | avgR  112.75 ± 67.56 | KL 0.0033 (β=0.0001) | Lclip 0.0006 Lkl 0.0033 Lent 0.9430 | steps 4338534 eps 5856 | time 2.25s


GRPO:  61%|██████████████████████████████████▎                     | 184/300 [06:50<04:19,  2.24s/it, KL=0.004, avgR=128.0, beta=0.0001, it_s=2.18]

Iter 0183 | avgR  128.02 ± 57.81 | KL 0.0043 (β=0.0001) | Lclip 0.0015 Lkl 0.0043 Lent 0.9056 | steps 4363480 eps 5888 | time 2.18s


GRPO:  62%|██████████████████████████████████▌                     | 185/300 [06:53<04:24,  2.30s/it, KL=0.004, avgR=101.3, beta=0.0001, it_s=2.43]

Iter 0184 | avgR  101.33 ± 65.69 | KL 0.0040 (β=0.0001) | Lclip 0.0006 Lkl 0.0040 Lent 0.9855 | steps 4389268 eps 5920 | time 2.43s


GRPO:  62%|███████████████████████████████████▎                     | 186/300 [06:55<04:27,  2.34s/it, KL=0.004, avgR=86.9, beta=0.0001, it_s=2.45]

Iter 0185 | avgR   86.90 ± 62.65 | KL 0.0040 (β=0.0001) | Lclip 0.0009 Lkl 0.0040 Lent 1.0090 | steps 4416339 eps 5952 | time 2.45s


GRPO:  62%|███████████████████████████████████▌                     | 187/300 [06:57<04:21,  2.31s/it, KL=0.005, avgR=99.0, beta=0.0001, it_s=2.23]

Iter 0186 | avgR   99.02 ± 57.88 | KL 0.0051 (β=0.0001) | Lclip 0.0024 Lkl 0.0051 Lent 0.9997 | steps 4442094 eps 5984 | time 2.23s


GRPO:  63%|███████████████████████████████████                     | 188/300 [07:00<04:25,  2.37s/it, KL=0.005, avgR=102.8, beta=0.0001, it_s=2.51]

Iter 0187 | avgR  102.77 ± 59.79 | KL 0.0052 (β=0.0001) | Lclip 0.0023 Lkl 0.0052 Lent 1.0071 | steps 4469169 eps 6016 | time 2.51s


GRPO:  63%|███████████████████████████████████▎                    | 189/300 [07:02<04:21,  2.35s/it, KL=0.005, avgR=119.4, beta=0.0001, it_s=2.32]

Iter 0188 | avgR  119.40 ± 61.93 | KL 0.0047 (β=0.0001) | Lclip 0.0022 Lkl 0.0047 Lent 0.9601 | steps 4492728 eps 6048 | time 2.32s


GRPO:  63%|███████████████████████████████████▍                    | 190/300 [07:04<04:09,  2.27s/it, KL=0.004, avgR=120.0, beta=0.0001, it_s=2.07]

Iter 0189 | avgR  120.04 ± 62.21 | KL 0.0036 (β=0.0001) | Lclip 0.0016 Lkl 0.0036 Lent 0.9483 | steps 4517622 eps 6080 | time 2.07s


GRPO:  64%|███████████████████████████████████▋                    | 191/300 [07:06<04:03,  2.23s/it, KL=0.005, avgR=139.4, beta=0.0001, it_s=2.14]

Iter 0190 | avgR  139.43 ± 67.35 | KL 0.0046 (β=0.0001) | Lclip 0.0027 Lkl 0.0046 Lent 0.9160 | steps 4539274 eps 6112 | time 2.14s


GRPO:  64%|███████████████████████████████████▊                    | 192/300 [07:09<04:10,  2.32s/it, KL=0.004, avgR=114.1, beta=0.0001, it_s=2.51]

Iter 0191 | avgR  114.07 ± 57.15 | KL 0.0035 (β=0.0001) | Lclip 0.0017 Lkl 0.0035 Lent 0.9352 | steps 4565362 eps 6144 | time 2.51s


GRPO:  64%|████████████████████████████████████▋                    | 193/300 [07:11<04:01,  2.26s/it, KL=0.005, avgR=91.7, beta=0.0001, it_s=2.11]

Iter 0192 | avgR   91.69 ± 67.05 | KL 0.0047 (β=0.0001) | Lclip 0.0016 Lkl 0.0047 Lent 0.9687 | steps 4588507 eps 6176 | time 2.11s


GRPO:  65%|████████████████████████████████████▏                   | 194/300 [07:13<03:52,  2.20s/it, KL=0.003, avgR=119.5, beta=0.0001, it_s=2.05]

Iter 0193 | avgR  119.46 ± 69.51 | KL 0.0033 (β=0.0001) | Lclip 0.0009 Lkl 0.0033 Lent 0.9761 | steps 4611180 eps 6208 | time 2.05s


GRPO:  65%|████████████████████████████████████▍                   | 195/300 [07:15<03:47,  2.17s/it, KL=0.004, avgR=119.7, beta=0.0001, it_s=2.10]

Iter 0194 | avgR  119.66 ± 68.05 | KL 0.0038 (β=0.0001) | Lclip 0.0007 Lkl 0.0038 Lent 0.9057 | steps 4633863 eps 6240 | time 2.10s


GRPO:  65%|████████████████████████████████████▌                   | 196/300 [07:18<03:52,  2.23s/it, KL=0.004, avgR=106.3, beta=0.0001, it_s=2.38]

Iter 0195 | avgR  106.31 ± 60.28 | KL 0.0040 (β=0.0001) | Lclip 0.0016 Lkl 0.0040 Lent 0.9178 | steps 4659899 eps 6272 | time 2.38s


GRPO:  66%|████████████████████████████████████▊                   | 197/300 [07:20<03:47,  2.21s/it, KL=0.005, avgR=114.7, beta=0.0001, it_s=2.16]

Iter 0196 | avgR  114.69 ± 75.98 | KL 0.0048 (β=0.0001) | Lclip 0.0012 Lkl 0.0048 Lent 0.8939 | steps 4683677 eps 6304 | time 2.16s


GRPO:  66%|█████████████████████████████████████▌                   | 198/300 [07:22<03:44,  2.20s/it, KL=0.004, avgR=99.8, beta=0.0001, it_s=2.16]

Iter 0197 | avgR   99.80 ± 63.65 | KL 0.0042 (β=0.0001) | Lclip 0.0020 Lkl 0.0042 Lent 0.8797 | steps 4708770 eps 6336 | time 2.16s


GRPO:  66%|█████████████████████████████████████▏                  | 199/300 [07:24<03:41,  2.19s/it, KL=0.004, avgR=106.8, beta=0.0001, it_s=2.17]

Iter 0198 | avgR  106.76 ± 63.32 | KL 0.0037 (β=0.0001) | Lclip 0.0016 Lkl 0.0037 Lent 0.9400 | steps 4732480 eps 6368 | time 2.17s


GRPO:  67%|█████████████████████████████████████▎                  | 200/300 [07:26<03:38,  2.19s/it, KL=0.004, avgR=110.7, beta=0.0001, it_s=2.18]

Iter 0199 | avgR  110.72 ± 77.34 | KL 0.0041 (β=0.0001) | Lclip 0.0019 Lkl 0.0041 Lent 0.9579 | steps 4756205 eps 6400 | time 2.18s


GRPO:  67%|█████████████████████████████████████▌                  | 201/300 [07:28<03:38,  2.21s/it, KL=0.004, avgR=108.8, beta=0.0001, it_s=2.25]

Iter 0200 | avgR  108.76 ± 52.98 | KL 0.0039 (β=0.0001) | Lclip 0.0024 Lkl 0.0039 Lent 0.9270 | steps 4781601 eps 6432 | time 2.25s


GRPO:  67%|█████████████████████████████████████▋                  | 202/300 [07:31<03:35,  2.20s/it, KL=0.004, avgR=127.9, beta=0.0001, it_s=2.18]

Iter 0201 | avgR  127.94 ± 59.51 | KL 0.0038 (β=0.0001) | Lclip 0.0007 Lkl 0.0038 Lent 0.8375 | steps 4804351 eps 6464 | time 2.18s


GRPO:  68%|█████████████████████████████████████▉                  | 203/300 [07:33<03:31,  2.18s/it, KL=0.003, avgR=120.7, beta=0.0001, it_s=2.11]

Iter 0202 | avgR  120.67 ± 61.57 | KL 0.0033 (β=0.0001) | Lclip 0.0012 Lkl 0.0033 Lent 0.9275 | steps 4828110 eps 6496 | time 2.11s


GRPO:  68%|██████████████████████████████████████▊                  | 204/300 [07:35<03:36,  2.26s/it, KL=0.004, avgR=97.8, beta=0.0001, it_s=2.45]

Iter 0203 | avgR   97.81 ± 50.72 | KL 0.0039 (β=0.0001) | Lclip 0.0016 Lkl 0.0039 Lent 0.9534 | steps 4855855 eps 6528 | time 2.45s


GRPO:  68%|██████████████████████████████████████▎                 | 205/300 [07:37<03:32,  2.23s/it, KL=0.003, avgR=122.0, beta=0.0001, it_s=2.17]

Iter 0204 | avgR  122.01 ± 61.49 | KL 0.0033 (β=0.0001) | Lclip 0.0019 Lkl 0.0033 Lent 0.8871 | steps 4880102 eps 6560 | time 2.17s


GRPO:  69%|██████████████████████████████████████▍                 | 206/300 [07:40<03:35,  2.29s/it, KL=0.004, avgR=119.0, beta=0.0001, it_s=2.43]

Iter 0205 | avgR  118.96 ± 58.49 | KL 0.0042 (β=0.0001) | Lclip 0.0020 Lkl 0.0042 Lent 0.9150 | steps 4906580 eps 6592 | time 2.43s


GRPO:  69%|██████████████████████████████████████▋                 | 207/300 [07:42<03:29,  2.25s/it, KL=0.003, avgR=109.6, beta=0.0001, it_s=2.15]

Iter 0206 | avgR  109.61 ± 64.26 | KL 0.0031 (β=0.0001) | Lclip 0.0014 Lkl 0.0031 Lent 0.9064 | steps 4929704 eps 6624 | time 2.15s


GRPO:  69%|██████████████████████████████████████▊                 | 208/300 [07:44<03:26,  2.25s/it, KL=0.004, avgR=126.9, beta=0.0001, it_s=2.24]

Iter 0207 | avgR  126.92 ± 52.92 | KL 0.0040 (β=0.0001) | Lclip 0.0006 Lkl 0.0040 Lent 0.9083 | steps 4956075 eps 6656 | time 2.24s


GRPO:  70%|███████████████████████████████████████                 | 209/300 [07:47<03:30,  2.31s/it, KL=0.005, avgR=119.1, beta=0.0001, it_s=2.45]

Iter 0208 | avgR  119.14 ± 54.98 | KL 0.0048 (β=0.0001) | Lclip 0.0022 Lkl 0.0048 Lent 0.9487 | steps 4983504 eps 6688 | time 2.45s


GRPO:  70%|███████████████████████████████████████▏                | 210/300 [07:49<03:34,  2.39s/it, KL=0.005, avgR=105.3, beta=0.0001, it_s=2.56]

Iter 0209 | avgR  105.30 ± 73.66 | KL 0.0050 (β=0.0001) | Lclip 0.0020 Lkl 0.0050 Lent 0.9490 | steps 5010537 eps 6720 | time 2.56s


GRPO:  70%|███████████████████████████████████████▍                | 211/300 [07:52<03:31,  2.38s/it, KL=0.004, avgR=114.8, beta=0.0001, it_s=2.35]

Iter 0210 | avgR  114.79 ± 53.29 | KL 0.0041 (β=0.0001) | Lclip 0.0009 Lkl 0.0041 Lent 0.9255 | steps 5036461 eps 6752 | time 2.35s


GRPO:  71%|███████████████████████████████████████▌                | 212/300 [07:54<03:25,  2.34s/it, KL=0.004, avgR=115.7, beta=0.0001, it_s=2.24]

Iter 0211 | avgR  115.67 ± 63.32 | KL 0.0041 (β=0.0001) | Lclip 0.0028 Lkl 0.0041 Lent 0.9139 | steps 5060401 eps 6784 | time 2.24s


GRPO:  71%|███████████████████████████████████████▊                | 213/300 [07:56<03:17,  2.27s/it, KL=0.005, avgR=134.5, beta=0.0001, it_s=2.11]

Iter 0212 | avgR  134.55 ± 59.67 | KL 0.0050 (β=0.0001) | Lclip 0.0027 Lkl 0.0050 Lent 0.9309 | steps 5082563 eps 6816 | time 2.11s


GRPO:  71%|███████████████████████████████████████▉                | 214/300 [07:58<03:20,  2.33s/it, KL=0.004, avgR=120.9, beta=0.0001, it_s=2.47]

Iter 0213 | avgR  120.95 ± 53.08 | KL 0.0042 (β=0.0001) | Lclip 0.0015 Lkl 0.0042 Lent 0.9161 | steps 5108700 eps 6848 | time 2.47s


GRPO:  72%|████████████████████████████████████████▏               | 215/300 [08:00<03:08,  2.21s/it, KL=0.005, avgR=145.0, beta=0.0001, it_s=1.94]

Iter 0214 | avgR  145.00 ± 68.63 | KL 0.0048 (β=0.0001) | Lclip 0.0012 Lkl 0.0048 Lent 0.9451 | steps 5128548 eps 6880 | time 1.94s


GRPO:  72%|█████████████████████████████████████████                | 216/300 [08:02<03:01,  2.17s/it, KL=0.003, avgR=96.6, beta=0.0001, it_s=2.05]

Iter 0215 | avgR   96.57 ± 64.12 | KL 0.0035 (β=0.0001) | Lclip 0.0020 Lkl 0.0035 Lent 0.9110 | steps 5152464 eps 6912 | time 2.05s


GRPO:  72%|████████████████████████████████████████▌               | 217/300 [08:05<02:59,  2.16s/it, KL=0.005, avgR=115.8, beta=0.0001, it_s=2.15]

Iter 0216 | avgR  115.84 ± 67.59 | KL 0.0048 (β=0.0001) | Lclip 0.0013 Lkl 0.0048 Lent 0.9361 | steps 5176308 eps 6944 | time 2.15s


GRPO:  73%|████████████████████████████████████████▋               | 218/300 [08:07<02:56,  2.15s/it, KL=0.004, avgR=117.9, beta=0.0001, it_s=2.12]

Iter 0217 | avgR  117.86 ± 64.49 | KL 0.0043 (β=0.0001) | Lclip 0.0025 Lkl 0.0043 Lent 0.9180 | steps 5199989 eps 6976 | time 2.12s


GRPO:  73%|█████████████████████████████████████████▌               | 219/300 [08:09<02:53,  2.14s/it, KL=0.004, avgR=98.9, beta=0.0001, it_s=2.12]

Iter 0218 | avgR   98.94 ± 72.20 | KL 0.0040 (β=0.0001) | Lclip 0.0009 Lkl 0.0040 Lent 0.9439 | steps 5222761 eps 7008 | time 2.12s


GRPO:  73%|█████████████████████████████████████████               | 220/300 [08:11<03:01,  2.27s/it, KL=0.005, avgR=107.7, beta=0.0001, it_s=2.57]

Iter 0219 | avgR  107.68 ± 59.19 | KL 0.0047 (β=0.0001) | Lclip 0.0029 Lkl 0.0047 Lent 0.9939 | steps 5249558 eps 7040 | time 2.57s


GRPO:  74%|█████████████████████████████████████████▎              | 221/300 [08:14<03:03,  2.33s/it, KL=0.004, avgR=115.6, beta=0.0001, it_s=2.46]

Iter 0220 | avgR  115.62 ± 64.52 | KL 0.0042 (β=0.0001) | Lclip 0.0014 Lkl 0.0042 Lent 0.8927 | steps 5275832 eps 7072 | time 2.46s


GRPO:  74%|█████████████████████████████████████████▍              | 222/300 [08:16<02:56,  2.26s/it, KL=0.005, avgR=144.6, beta=0.0001, it_s=2.11]

Iter 0221 | avgR  144.60 ± 68.85 | KL 0.0051 (β=0.0001) | Lclip 0.0022 Lkl 0.0051 Lent 0.9278 | steps 5297295 eps 7104 | time 2.11s


GRPO:  74%|█████████████████████████████████████████▋              | 223/300 [08:18<02:50,  2.22s/it, KL=0.005, avgR=127.2, beta=0.0001, it_s=2.10]

Iter 0222 | avgR  127.20 ± 65.63 | KL 0.0054 (β=0.0001) | Lclip 0.0011 Lkl 0.0054 Lent 0.8996 | steps 5319141 eps 7136 | time 2.10s


GRPO:  75%|██████████████████████████████████████████▌              | 224/300 [08:20<02:48,  2.22s/it, KL=0.003, avgR=93.6, beta=0.0001, it_s=2.22]

Iter 0223 | avgR   93.57 ± 57.39 | KL 0.0033 (β=0.0001) | Lclip 0.0018 Lkl 0.0033 Lent 0.9098 | steps 5345794 eps 7168 | time 2.22s


GRPO:  75%|██████████████████████████████████████████              | 225/300 [08:22<02:44,  2.19s/it, KL=0.005, avgR=127.6, beta=0.0001, it_s=2.13]

Iter 0224 | avgR  127.60 ± 69.38 | KL 0.0052 (β=0.0001) | Lclip 0.0019 Lkl 0.0052 Lent 0.9520 | steps 5368729 eps 7200 | time 2.13s


GRPO:  75%|██████████████████████████████████████████▏             | 226/300 [08:25<02:45,  2.24s/it, KL=0.003, avgR=116.2, beta=0.0001, it_s=2.35]

Iter 0225 | avgR  116.24 ± 60.52 | KL 0.0031 (β=0.0001) | Lclip 0.0012 Lkl 0.0031 Lent 0.9027 | steps 5393101 eps 7232 | time 2.35s


GRPO:  76%|██████████████████████████████████████████▎             | 227/300 [08:27<02:40,  2.20s/it, KL=0.004, avgR=139.3, beta=0.0001, it_s=2.10]

Iter 0226 | avgR  139.27 ± 57.49 | KL 0.0036 (β=0.0001) | Lclip 0.0014 Lkl 0.0036 Lent 0.8568 | steps 5416074 eps 7264 | time 2.10s


GRPO:  76%|██████████████████████████████████████████▌             | 228/300 [08:29<02:37,  2.19s/it, KL=0.003, avgR=101.1, beta=0.0001, it_s=2.15]

Iter 0227 | avgR  101.15 ± 67.19 | KL 0.0033 (β=0.0001) | Lclip 0.0017 Lkl 0.0033 Lent 0.9250 | steps 5441191 eps 7296 | time 2.15s


GRPO:  76%|██████████████████████████████████████████▋             | 229/300 [08:31<02:40,  2.26s/it, KL=0.004, avgR=118.5, beta=0.0001, it_s=2.42]

Iter 0228 | avgR  118.54 ± 53.93 | KL 0.0037 (β=0.0001) | Lclip 0.0004 Lkl 0.0037 Lent 0.8532 | steps 5468105 eps 7328 | time 2.42s


GRPO:  77%|██████████████████████████████████████████▉             | 230/300 [08:34<02:34,  2.21s/it, KL=0.003, avgR=128.5, beta=0.0001, it_s=2.10]

Iter 0229 | avgR  128.50 ± 78.28 | KL 0.0034 (β=0.0001) | Lclip 0.0022 Lkl 0.0034 Lent 0.8119 | steps 5491492 eps 7360 | time 2.10s


GRPO:  77%|███████████████████████████████████████████▉             | 231/300 [08:36<02:34,  2.24s/it, KL=0.004, avgR=89.0, beta=0.0001, it_s=2.30]

Iter 0230 | avgR   89.03 ± 62.12 | KL 0.0041 (β=0.0001) | Lclip 0.0014 Lkl 0.0041 Lent 0.9387 | steps 5517228 eps 7392 | time 2.30s


GRPO:  77%|███████████████████████████████████████████▎            | 232/300 [08:38<02:31,  2.23s/it, KL=0.004, avgR=101.9, beta=0.0001, it_s=2.20]

Iter 0231 | avgR  101.92 ± 72.91 | KL 0.0039 (β=0.0001) | Lclip 0.0013 Lkl 0.0039 Lent 0.9166 | steps 5541032 eps 7424 | time 2.20s


GRPO:  78%|███████████████████████████████████████████▍            | 233/300 [08:40<02:29,  2.23s/it, KL=0.004, avgR=131.0, beta=0.0001, it_s=2.22]

Iter 0232 | avgR  130.97 ± 61.28 | KL 0.0042 (β=0.0001) | Lclip 0.0010 Lkl 0.0042 Lent 0.8587 | steps 5564453 eps 7456 | time 2.22s


GRPO:  78%|███████████████████████████████████████████▋            | 234/300 [08:42<02:26,  2.22s/it, KL=0.005, avgR=116.7, beta=0.0001, it_s=2.20]

Iter 0233 | avgR  116.65 ± 52.77 | KL 0.0051 (β=0.0001) | Lclip 0.0020 Lkl 0.0051 Lent 0.9086 | steps 5590128 eps 7488 | time 2.20s


GRPO:  78%|███████████████████████████████████████████▊            | 235/300 [08:45<02:24,  2.22s/it, KL=0.006, avgR=102.8, beta=0.0001, it_s=2.20]

Iter 0234 | avgR  102.80 ± 69.35 | KL 0.0055 (β=0.0001) | Lclip 0.0020 Lkl 0.0055 Lent 0.9635 | steps 5615316 eps 7520 | time 2.20s


GRPO:  79%|████████████████████████████████████████████            | 236/300 [08:47<02:22,  2.23s/it, KL=0.004, avgR=117.4, beta=0.0001, it_s=2.26]

Iter 0235 | avgR  117.41 ± 57.10 | KL 0.0035 (β=0.0001) | Lclip 0.0011 Lkl 0.0035 Lent 0.9054 | steps 5639924 eps 7552 | time 2.26s


GRPO:  79%|████████████████████████████████████████████▏           | 237/300 [08:49<02:16,  2.16s/it, KL=0.005, avgR=137.9, beta=0.0001, it_s=2.00]

Iter 0236 | avgR  137.85 ± 70.78 | KL 0.0051 (β=0.0001) | Lclip 0.0021 Lkl 0.0051 Lent 0.9016 | steps 5660160 eps 7584 | time 2.00s


GRPO:  79%|████████████████████████████████████████████▍           | 238/300 [08:51<02:14,  2.17s/it, KL=0.004, avgR=105.8, beta=0.0001, it_s=2.17]

Iter 0237 | avgR  105.77 ± 62.65 | KL 0.0043 (β=0.0001) | Lclip 0.0013 Lkl 0.0043 Lent 0.9618 | steps 5683360 eps 7616 | time 2.17s


GRPO:  80%|████████████████████████████████████████████▌           | 239/300 [08:53<02:14,  2.20s/it, KL=0.005, avgR=102.4, beta=0.0001, it_s=2.27]

Iter 0238 | avgR  102.45 ± 76.19 | KL 0.0047 (β=0.0001) | Lclip 0.0021 Lkl 0.0047 Lent 0.9050 | steps 5708303 eps 7648 | time 2.27s


GRPO:  80%|████████████████████████████████████████████▊           | 240/300 [08:56<02:16,  2.28s/it, KL=0.004, avgR=120.7, beta=0.0001, it_s=2.45]

Iter 0239 | avgR  120.71 ± 69.19 | KL 0.0040 (β=0.0001) | Lclip 0.0023 Lkl 0.0040 Lent 0.8414 | steps 5730789 eps 7680 | time 2.45s


GRPO:  80%|████████████████████████████████████████████▉           | 241/300 [08:58<02:18,  2.34s/it, KL=0.004, avgR=120.6, beta=0.0001, it_s=2.50]

Iter 0240 | avgR  120.55 ± 58.73 | KL 0.0040 (β=0.0001) | Lclip 0.0002 Lkl 0.0040 Lent 0.9606 | steps 5757606 eps 7712 | time 2.50s


GRPO:  81%|█████████████████████████████████████████████▏          | 242/300 [09:01<02:18,  2.39s/it, KL=0.004, avgR=118.5, beta=0.0001, it_s=2.51]

Iter 0241 | avgR  118.54 ± 48.66 | KL 0.0042 (β=0.0001) | Lclip 0.0012 Lkl 0.0042 Lent 0.9204 | steps 5785043 eps 7744 | time 2.51s


GRPO:  81%|█████████████████████████████████████████████▎          | 243/300 [09:03<02:11,  2.31s/it, KL=0.003, avgR=140.2, beta=0.0001, it_s=2.11]

Iter 0242 | avgR  140.20 ± 62.45 | KL 0.0033 (β=0.0001) | Lclip 0.0007 Lkl 0.0033 Lent 0.9105 | steps 5806814 eps 7776 | time 2.11s


GRPO:  81%|█████████████████████████████████████████████▌          | 244/300 [09:05<02:07,  2.28s/it, KL=0.004, avgR=122.7, beta=0.0001, it_s=2.22]

Iter 0243 | avgR  122.68 ± 71.36 | KL 0.0036 (β=0.0001) | Lclip 0.0005 Lkl 0.0036 Lent 0.8923 | steps 5830791 eps 7808 | time 2.22s


GRPO:  82%|█████████████████████████████████████████████▋          | 245/300 [09:07<02:03,  2.25s/it, KL=0.003, avgR=134.6, beta=0.0001, it_s=2.18]

Iter 0244 | avgR  134.58 ± 66.27 | KL 0.0032 (β=0.0001) | Lclip 0.0009 Lkl 0.0032 Lent 0.8768 | steps 5854761 eps 7840 | time 2.18s


GRPO:  82%|█████████████████████████████████████████████▉          | 246/300 [09:09<01:58,  2.20s/it, KL=0.003, avgR=118.1, beta=0.0001, it_s=2.07]

Iter 0245 | avgR  118.10 ± 66.33 | KL 0.0034 (β=0.0001) | Lclip 0.0018 Lkl 0.0034 Lent 0.8856 | steps 5875965 eps 7872 | time 2.07s


GRPO:  82%|██████████████████████████████████████████████          | 247/300 [09:12<01:57,  2.21s/it, KL=0.006, avgR=130.1, beta=0.0001, it_s=2.23]

Iter 0246 | avgR  130.15 ± 56.93 | KL 0.0063 (β=0.0001) | Lclip 0.0032 Lkl 0.0063 Lent 0.9453 | steps 5901592 eps 7904 | time 2.23s


GRPO:  83%|██████████████████████████████████████████████▎         | 248/300 [09:14<01:52,  2.16s/it, KL=0.004, avgR=121.3, beta=0.0001, it_s=2.06]

Iter 0247 | avgR  121.28 ± 74.80 | KL 0.0043 (β=0.0001) | Lclip 0.0016 Lkl 0.0043 Lent 0.9746 | steps 5923560 eps 7936 | time 2.06s


GRPO:  83%|██████████████████████████████████████████████▍         | 249/300 [09:16<01:51,  2.18s/it, KL=0.003, avgR=132.9, beta=0.0001, it_s=2.21]

Iter 0248 | avgR  132.89 ± 56.85 | KL 0.0027 (β=0.0001) | Lclip 0.0008 Lkl 0.0027 Lent 0.8984 | steps 5949215 eps 7968 | time 2.21s


GRPO:  83%|██████████████████████████████████████████████▋         | 250/300 [09:18<01:51,  2.23s/it, KL=0.005, avgR=137.6, beta=0.0001, it_s=2.36]

Iter 0249 | avgR  137.61 ± 45.42 | KL 0.0046 (β=0.0001) | Lclip 0.0016 Lkl 0.0046 Lent 0.9128 | steps 5976275 eps 8000 | time 2.36s


GRPO:  84%|██████████████████████████████████████████████▊         | 251/300 [09:21<01:48,  2.22s/it, KL=0.004, avgR=111.4, beta=0.0001, it_s=2.20]

Iter 0250 | avgR  111.37 ± 66.72 | KL 0.0041 (β=0.0001) | Lclip 0.0013 Lkl 0.0041 Lent 0.9276 | steps 6000463 eps 8032 | time 2.20s


GRPO:  84%|███████████████████████████████████████████████         | 252/300 [09:23<01:45,  2.20s/it, KL=0.006, avgR=122.8, beta=0.0001, it_s=2.13]

Iter 0251 | avgR  122.78 ± 63.99 | KL 0.0056 (β=0.0001) | Lclip 0.0011 Lkl 0.0056 Lent 0.9027 | steps 6024606 eps 8064 | time 2.13s


GRPO:  84%|███████████████████████████████████████████████▏        | 253/300 [09:25<01:45,  2.24s/it, KL=0.005, avgR=116.2, beta=0.0001, it_s=2.35]

Iter 0252 | avgR  116.18 ± 50.94 | KL 0.0045 (β=0.0001) | Lclip 0.0011 Lkl 0.0045 Lent 0.9021 | steps 6052799 eps 8096 | time 2.35s


GRPO:  85%|███████████████████████████████████████████████▍        | 254/300 [09:27<01:45,  2.29s/it, KL=0.003, avgR=101.3, beta=0.0001, it_s=2.41]

Iter 0253 | avgR  101.35 ± 50.70 | KL 0.0031 (β=0.0001) | Lclip 0.0007 Lkl 0.0031 Lent 0.9230 | steps 6079527 eps 8128 | time 2.41s


GRPO:  85%|███████████████████████████████████████████████▌        | 255/300 [09:30<01:40,  2.24s/it, KL=0.005, avgR=128.3, beta=0.0001, it_s=2.11]

Iter 0254 | avgR  128.28 ± 64.80 | KL 0.0049 (β=0.0001) | Lclip 0.0023 Lkl 0.0049 Lent 0.9196 | steps 6103454 eps 8160 | time 2.11s


GRPO:  85%|███████████████████████████████████████████████▊        | 256/300 [09:32<01:36,  2.20s/it, KL=0.003, avgR=133.9, beta=0.0001, it_s=2.11]

Iter 0255 | avgR  133.93 ± 60.91 | KL 0.0033 (β=0.0001) | Lclip 0.0003 Lkl 0.0033 Lent 0.8803 | steps 6127351 eps 8192 | time 2.11s


GRPO:  86%|███████████████████████████████████████████████▉        | 257/300 [09:34<01:37,  2.26s/it, KL=0.003, avgR=128.3, beta=0.0001, it_s=2.39]

Iter 0256 | avgR  128.34 ± 56.73 | KL 0.0035 (β=0.0001) | Lclip 0.0012 Lkl 0.0035 Lent 0.9418 | steps 6153110 eps 8224 | time 2.39s


GRPO:  86%|████████████████████████████████████████████████▏       | 258/300 [09:37<01:38,  2.34s/it, KL=0.005, avgR=126.6, beta=0.0001, it_s=2.53]

Iter 0257 | avgR  126.63 ± 53.98 | KL 0.0047 (β=0.0001) | Lclip 0.0007 Lkl 0.0047 Lent 0.9947 | steps 6180110 eps 8256 | time 2.53s


GRPO:  86%|████████████████████████████████████████████████▎       | 259/300 [09:39<01:31,  2.23s/it, KL=0.004, avgR=150.9, beta=0.0001, it_s=1.96]

Iter 0258 | avgR  150.88 ± 73.40 | KL 0.0040 (β=0.0001) | Lclip 0.0006 Lkl 0.0040 Lent 0.8879 | steps 6201202 eps 8288 | time 1.96s


GRPO:  87%|████████████████████████████████████████████████▌       | 260/300 [09:41<01:31,  2.30s/it, KL=0.003, avgR=146.3, beta=0.0001, it_s=2.46]

Iter 0259 | avgR  146.28 ± 51.98 | KL 0.0027 (β=0.0001) | Lclip 0.0013 Lkl 0.0027 Lent 0.9035 | steps 6228650 eps 8320 | time 2.46s


GRPO:  87%|████████████████████████████████████████████████▋       | 261/300 [09:43<01:27,  2.26s/it, KL=0.004, avgR=136.6, beta=0.0001, it_s=2.15]

Iter 0260 | avgR  136.59 ± 73.67 | KL 0.0043 (β=0.0001) | Lclip 0.0020 Lkl 0.0043 Lent 0.8897 | steps 6249981 eps 8352 | time 2.15s


GRPO:  87%|████████████████████████████████████████████████▉       | 262/300 [09:45<01:23,  2.19s/it, KL=0.004, avgR=127.6, beta=0.0001, it_s=2.04]

Iter 0261 | avgR  127.62 ± 67.94 | KL 0.0039 (β=0.0001) | Lclip 0.0007 Lkl 0.0039 Lent 0.8838 | steps 6271323 eps 8384 | time 2.04s


GRPO:  88%|█████████████████████████████████████████████████       | 263/300 [09:47<01:20,  2.17s/it, KL=0.004, avgR=134.1, beta=0.0001, it_s=2.11]

Iter 0262 | avgR  134.09 ± 59.72 | KL 0.0043 (β=0.0001) | Lclip 0.0008 Lkl 0.0043 Lent 0.9091 | steps 6294658 eps 8416 | time 2.11s


GRPO:  88%|█████████████████████████████████████████████████▎      | 264/300 [09:49<01:17,  2.15s/it, KL=0.003, avgR=111.9, beta=0.0001, it_s=2.11]

Iter 0263 | avgR  111.90 ± 67.56 | KL 0.0034 (β=0.0001) | Lclip 0.0003 Lkl 0.0034 Lent 0.8890 | steps 6319109 eps 8448 | time 2.11s


GRPO:  88%|█████████████████████████████████████████████████▍      | 265/300 [09:52<01:15,  2.15s/it, KL=0.005, avgR=137.7, beta=0.0001, it_s=2.14]

Iter 0264 | avgR  137.68 ± 60.17 | KL 0.0046 (β=0.0001) | Lclip 0.0015 Lkl 0.0046 Lent 0.9193 | steps 6343399 eps 8480 | time 2.14s


GRPO:  89%|█████████████████████████████████████████████████▋      | 266/300 [09:54<01:13,  2.17s/it, KL=0.003, avgR=125.5, beta=0.0001, it_s=2.22]

Iter 0265 | avgR  125.49 ± 57.53 | KL 0.0026 (β=0.0001) | Lclip 0.0012 Lkl 0.0026 Lent 0.8989 | steps 6368934 eps 8512 | time 2.22s


GRPO:  89%|█████████████████████████████████████████████████▊      | 267/300 [09:56<01:14,  2.26s/it, KL=0.004, avgR=116.3, beta=0.0001, it_s=2.46]

Iter 0266 | avgR  116.26 ± 51.44 | KL 0.0039 (β=0.0001) | Lclip 0.0006 Lkl 0.0039 Lent 0.9816 | steps 6395089 eps 8544 | time 2.46s


GRPO:  89%|██████████████████████████████████████████████████      | 268/300 [09:59<01:12,  2.26s/it, KL=0.003, avgR=154.1, beta=0.0001, it_s=2.26]

Iter 0267 | avgR  154.12 ± 56.96 | KL 0.0031 (β=0.0001) | Lclip 0.0015 Lkl 0.0031 Lent 0.9227 | steps 6418311 eps 8576 | time 2.26s


GRPO:  90%|██████████████████████████████████████████████████▏     | 269/300 [10:01<01:08,  2.21s/it, KL=0.005, avgR=134.1, beta=0.0001, it_s=2.08]

Iter 0268 | avgR  134.07 ± 59.16 | KL 0.0046 (β=0.0001) | Lclip 0.0002 Lkl 0.0046 Lent 0.9292 | steps 6442996 eps 8608 | time 2.08s


GRPO:  90%|██████████████████████████████████████████████████▍     | 270/300 [10:03<01:05,  2.18s/it, KL=0.004, avgR=134.3, beta=0.0001, it_s=2.12]

Iter 0269 | avgR  134.28 ± 57.18 | KL 0.0041 (β=0.0001) | Lclip 0.0017 Lkl 0.0041 Lent 0.9726 | steps 6467768 eps 8640 | time 2.12s


GRPO:  90%|██████████████████████████████████████████████████▌     | 271/300 [10:05<01:01,  2.12s/it, KL=0.004, avgR=118.3, beta=0.0001, it_s=1.99]

Iter 0270 | avgR  118.27 ± 74.68 | KL 0.0041 (β=0.0001) | Lclip 0.0014 Lkl 0.0041 Lent 0.9436 | steps 6489394 eps 8672 | time 1.99s


GRPO:  91%|██████████████████████████████████████████████████▊     | 272/300 [10:07<00:58,  2.10s/it, KL=0.004, avgR=112.6, beta=0.0001, it_s=2.05]

Iter 0271 | avgR  112.64 ± 74.90 | KL 0.0039 (β=0.0001) | Lclip 0.0015 Lkl 0.0039 Lent 0.8724 | steps 6511777 eps 8704 | time 2.05s


GRPO:  91%|██████████████████████████████████████████████████▉     | 273/300 [10:09<00:57,  2.13s/it, KL=0.005, avgR=126.6, beta=0.0001, it_s=2.18]

Iter 0272 | avgR  126.57 ± 67.56 | KL 0.0054 (β=0.0001) | Lclip 0.0020 Lkl 0.0054 Lent 0.9291 | steps 6535357 eps 8736 | time 2.18s


GRPO:  91%|███████████████████████████████████████████████████▏    | 274/300 [10:12<00:58,  2.26s/it, KL=0.006, avgR=130.6, beta=0.0001, it_s=2.57]

Iter 0273 | avgR  130.59 ± 39.67 | KL 0.0062 (β=0.0001) | Lclip 0.0026 Lkl 0.0062 Lent 0.9909 | steps 6564599 eps 8768 | time 2.57s


GRPO:  92%|███████████████████████████████████████████████████▎    | 275/300 [10:14<01:00,  2.42s/it, KL=0.005, avgR=123.5, beta=0.0001, it_s=2.78]

Iter 0274 | avgR  123.49 ± 43.59 | KL 0.0052 (β=0.0001) | Lclip 0.0017 Lkl 0.0052 Lent 0.9602 | steps 6594212 eps 8800 | time 2.78s


GRPO:  92%|███████████████████████████████████████████████████▌    | 276/300 [10:17<00:57,  2.38s/it, KL=0.004, avgR=140.3, beta=0.0001, it_s=2.29]

Iter 0275 | avgR  140.29 ± 58.45 | KL 0.0041 (β=0.0001) | Lclip 0.0009 Lkl 0.0041 Lent 0.9211 | steps 6619905 eps 8832 | time 2.29s


GRPO:  92%|███████████████████████████████████████████████████▋    | 277/300 [10:19<00:53,  2.33s/it, KL=0.005, avgR=128.2, beta=0.0001, it_s=2.19]

Iter 0276 | avgR  128.23 ± 58.51 | KL 0.0052 (β=0.0001) | Lclip 0.0019 Lkl 0.0052 Lent 0.9578 | steps 6646049 eps 8864 | time 2.19s


GRPO:  93%|███████████████████████████████████████████████████▉    | 278/300 [10:21<00:51,  2.35s/it, KL=0.003, avgR=148.8, beta=0.0001, it_s=2.41]

Iter 0277 | avgR  148.79 ± 54.01 | KL 0.0035 (β=0.0001) | Lclip 0.0013 Lkl 0.0035 Lent 0.8981 | steps 6673819 eps 8896 | time 2.41s


GRPO:  93%|████████████████████████████████████████████████████    | 279/300 [10:23<00:48,  2.32s/it, KL=0.005, avgR=135.6, beta=0.0001, it_s=2.23]

Iter 0278 | avgR  135.62 ± 66.58 | KL 0.0046 (β=0.0001) | Lclip 0.0025 Lkl 0.0046 Lent 0.9351 | steps 6698921 eps 8928 | time 2.23s


GRPO:  93%|████████████████████████████████████████████████████▎   | 280/300 [10:26<00:47,  2.39s/it, KL=0.005, avgR=137.4, beta=0.0001, it_s=2.57]

Iter 0279 | avgR  137.41 ± 55.92 | KL 0.0052 (β=0.0001) | Lclip 0.0020 Lkl 0.0052 Lent 0.9317 | steps 6726142 eps 8960 | time 2.57s


GRPO:  94%|████████████████████████████████████████████████████▍   | 281/300 [10:28<00:45,  2.40s/it, KL=0.005, avgR=127.1, beta=0.0001, it_s=2.40]

Iter 0280 | avgR  127.06 ± 53.08 | KL 0.0050 (β=0.0001) | Lclip 0.0022 Lkl 0.0050 Lent 0.9642 | steps 6752799 eps 8992 | time 2.40s


GRPO:  94%|████████████████████████████████████████████████████▋   | 282/300 [10:31<00:43,  2.41s/it, KL=0.003, avgR=139.1, beta=0.0001, it_s=2.42]

Iter 0281 | avgR  139.09 ± 42.04 | KL 0.0031 (β=0.0001) | Lclip 0.0012 Lkl 0.0031 Lent 0.9277 | steps 6781605 eps 9024 | time 2.42s


GRPO:  94%|████████████████████████████████████████████████████▊   | 283/300 [10:33<00:40,  2.41s/it, KL=0.006, avgR=155.2, beta=0.0001, it_s=2.40]

Iter 0282 | avgR  155.24 ± 52.30 | KL 0.0055 (β=0.0001) | Lclip 0.0031 Lkl 0.0055 Lent 0.9062 | steps 6808793 eps 9056 | time 2.40s


GRPO:  95%|█████████████████████████████████████████████████████   | 284/300 [10:36<00:38,  2.40s/it, KL=0.003, avgR=120.6, beta=0.0001, it_s=2.37]

Iter 0283 | avgR  120.62 ± 52.80 | KL 0.0031 (β=0.0001) | Lclip 0.0011 Lkl 0.0031 Lent 0.9766 | steps 6835776 eps 9088 | time 2.37s


GRPO:  95%|█████████████████████████████████████████████████████▏  | 285/300 [10:38<00:36,  2.44s/it, KL=0.003, avgR=131.8, beta=0.0001, it_s=2.54]

Iter 0284 | avgR  131.81 ± 42.38 | KL 0.0030 (β=0.0001) | Lclip 0.0013 Lkl 0.0030 Lent 0.9338 | steps 6865273 eps 9120 | time 2.54s


GRPO:  95%|█████████████████████████████████████████████████████▍  | 286/300 [10:41<00:34,  2.45s/it, KL=0.005, avgR=138.9, beta=0.0001, it_s=2.47]

Iter 0285 | avgR  138.93 ± 38.07 | KL 0.0049 (β=0.0001) | Lclip 0.0022 Lkl 0.0049 Lent 0.9211 | steps 6895447 eps 9152 | time 2.47s


GRPO:  96%|█████████████████████████████████████████████████████▌  | 287/300 [10:43<00:31,  2.43s/it, KL=0.003, avgR=139.4, beta=0.0001, it_s=2.37]

Iter 0286 | avgR  139.41 ± 49.90 | KL 0.0035 (β=0.0001) | Lclip 0.0013 Lkl 0.0035 Lent 0.9092 | steps 6922735 eps 9184 | time 2.37s


GRPO:  96%|█████████████████████████████████████████████████████▊  | 288/300 [10:46<00:29,  2.47s/it, KL=0.006, avgR=118.4, beta=0.0001, it_s=2.58]

Iter 0287 | avgR  118.39 ± 44.28 | KL 0.0063 (β=0.0001) | Lclip 0.0018 Lkl 0.0063 Lent 0.9129 | steps 6950358 eps 9216 | time 2.58s


GRPO:  96%|█████████████████████████████████████████████████████▉  | 289/300 [10:48<00:27,  2.52s/it, KL=0.004, avgR=118.5, beta=0.0001, it_s=2.61]

Iter 0288 | avgR  118.54 ± 40.27 | KL 0.0039 (β=0.0001) | Lclip 0.0007 Lkl 0.0039 Lent 0.8580 | steps 6979868 eps 9248 | time 2.61s


GRPO:  97%|██████████████████████████████████████████████████████▏ | 290/300 [10:51<00:25,  2.51s/it, KL=0.005, avgR=143.4, beta=0.0001, it_s=2.48]

Iter 0289 | avgR  143.38 ± 50.69 | KL 0.0055 (β=0.0001) | Lclip 0.0023 Lkl 0.0055 Lent 0.8771 | steps 7007164 eps 9280 | time 2.48s


GRPO:  97%|██████████████████████████████████████████████████████▎ | 291/300 [10:53<00:22,  2.49s/it, KL=0.004, avgR=139.6, beta=0.0001, it_s=2.46]

Iter 0290 | avgR  139.64 ± 38.55 | KL 0.0039 (β=0.0001) | Lclip 0.0013 Lkl 0.0039 Lent 0.8213 | steps 7037085 eps 9312 | time 2.46s


GRPO:  97%|██████████████████████████████████████████████████████▌ | 292/300 [10:56<00:19,  2.47s/it, KL=0.003, avgR=142.0, beta=0.0001, it_s=2.42]

Iter 0291 | avgR  142.04 ± 48.73 | KL 0.0034 (β=0.0001) | Lclip 0.0029 Lkl 0.0034 Lent 0.8811 | steps 7064465 eps 9344 | time 2.42s


GRPO:  98%|██████████████████████████████████████████████████████▋ | 293/300 [10:58<00:17,  2.52s/it, KL=0.003, avgR=141.1, beta=0.0001, it_s=2.63]

Iter 0292 | avgR  141.09 ± 40.69 | KL 0.0028 (β=0.0001) | Lclip 0.0013 Lkl 0.0028 Lent 0.9090 | steps 7094071 eps 9376 | time 2.63s


GRPO:  98%|██████████████████████████████████████████████████████▉ | 294/300 [11:01<00:15,  2.56s/it, KL=0.005, avgR=124.0, beta=0.0001, it_s=2.66]

Iter 0293 | avgR  124.02 ± 40.52 | KL 0.0046 (β=0.0001) | Lclip 0.0015 Lkl 0.0046 Lent 0.9614 | steps 7123146 eps 9408 | time 2.66s


GRPO:  98%|███████████████████████████████████████████████████████ | 295/300 [11:03<00:12,  2.56s/it, KL=0.004, avgR=141.8, beta=0.0001, it_s=2.54]

Iter 0294 | avgR  141.77 ± 35.53 | KL 0.0045 (β=0.0001) | Lclip 0.0028 Lkl 0.0045 Lent 0.8895 | steps 7152451 eps 9440 | time 2.54s


GRPO:  99%|███████████████████████████████████████████████████████▎| 296/300 [11:06<00:10,  2.52s/it, KL=0.005, avgR=130.3, beta=0.0001, it_s=2.43]

Iter 0295 | avgR  130.35 ± 43.67 | KL 0.0050 (β=0.0001) | Lclip 0.0013 Lkl 0.0050 Lent 0.9321 | steps 7182200 eps 9472 | time 2.43s


GRPO:  99%|███████████████████████████████████████████████████████▍| 297/300 [11:08<00:07,  2.51s/it, KL=0.004, avgR=135.7, beta=0.0001, it_s=2.49]

Iter 0296 | avgR  135.68 ± 55.21 | KL 0.0040 (β=0.0001) | Lclip 0.0026 Lkl 0.0040 Lent 0.8765 | steps 7208985 eps 9504 | time 2.49s


GRPO:  99%|███████████████████████████████████████████████████████▋| 298/300 [11:11<00:05,  2.51s/it, KL=0.006, avgR=119.4, beta=0.0001, it_s=2.50]

Iter 0297 | avgR  119.44 ± 24.35 | KL 0.0056 (β=0.0001) | Lclip 0.0025 Lkl 0.0056 Lent 0.9622 | steps 7239616 eps 9536 | time 2.50s


GRPO: 100%|███████████████████████████████████████████████████████▊| 299/300 [11:13<00:02,  2.50s/it, KL=0.004, avgR=116.2, beta=0.0001, it_s=2.48]

Iter 0298 | avgR  116.21 ± 54.36 | KL 0.0040 (β=0.0001) | Lclip 0.0010 Lkl 0.0040 Lent 0.9259 | steps 7267715 eps 9568 | time 2.48s


GRPO: 100%|████████████████████████████████████████████████████████| 300/300 [11:16<00:00,  2.25s/it, KL=0.005, avgR=121.9, beta=0.0001, it_s=2.39]

Iter 0299 | avgR  121.93 ± 49.22 | KL 0.0046 (β=0.0001) | Lclip 0.0018 Lkl 0.0046 Lent 0.9266 | steps 7296579 eps 9600 | time 2.39s





✅ Saved MP4 video: videos/GRPO/gamma_0.98_32/LunarLander-v3_ep01_R242.3.mp4 | Reward: 242.3




✅ Saved MP4 video: videos/GRPO/gamma_0.98_32/LunarLander-v3_ep02_R259.9.mp4 | Reward: 259.9




✅ Saved MP4 video: videos/GRPO/gamma_0.98_32/LunarLander-v3_ep03_R247.9.mp4 | Reward: 247.9




✅ Saved MP4 video: videos/GRPO/gamma_0.98_32/LunarLander-v3_ep04_R243.4.mp4 | Reward: 243.4




✅ Saved MP4 video: videos/GRPO/gamma_0.98_32/LunarLander-v3_ep05_R257.9.mp4 | Reward: 257.9




✅ Saved MP4 video: videos/GRPO/gamma_0.98_32/LunarLander-v3_ep06_R235.7.mp4 | Reward: 235.7




✅ Saved MP4 video: videos/GRPO/gamma_0.98_32/LunarLander-v3_ep07_R223.5.mp4 | Reward: 223.5




✅ Saved MP4 video: videos/GRPO/gamma_0.98_32/LunarLander-v3_ep08_R275.8.mp4 | Reward: 275.8




✅ Saved MP4 video: videos/GRPO/gamma_0.98_32/LunarLander-v3_ep09_R212.2.mp4 | Reward: 212.2




✅ Saved MP4 video: videos/GRPO/gamma_0.98_32/LunarLander-v3_ep10_R243.9.mp4 | Reward: 243.9




✅ Saved MP4 video: videos/GRPO/gamma_0.98_32/LunarLander-v3_ep11_R134.0.mp4 | Reward: 134.0




✅ Saved MP4 video: videos/GRPO/gamma_0.98_32/LunarLander-v3_ep12_R97.9.mp4 | Reward: 97.9


In [11]:
record_videos(pi_star, env_id, video_dir=f"videos/GRPO/gamma_{gamma}", episodes=6, device=device)



✅ Saved MP4 video: videos/GRPO/gamma_0.975/LunarLander-v3_ep01_R209.8.mp4 | Reward: 209.8




✅ Saved MP4 video: videos/GRPO/gamma_0.975/LunarLander-v3_ep02_R193.2.mp4 | Reward: 193.2




✅ Saved MP4 video: videos/GRPO/gamma_0.975/LunarLander-v3_ep03_R197.3.mp4 | Reward: 197.3




✅ Saved MP4 video: videos/GRPO/gamma_0.975/LunarLander-v3_ep04_R210.8.mp4 | Reward: 210.8




✅ Saved MP4 video: videos/GRPO/gamma_0.975/LunarLander-v3_ep05_R169.3.mp4 | Reward: 169.3




✅ Saved MP4 video: videos/GRPO/gamma_0.975/LunarLander-v3_ep06_R217.5.mp4 | Reward: 217.5


In [5]:
from dense_scripts.utils import record_videos
device = "cuda"
env_id = "LunarLander-v3"
pi_star.to(device)
record_videos(pi_star, env_id, video_dir=f"videos/GRPO/gamma_{gamma}_{G}", episodes=12, device=device)



✅ Saved MP4 video: videos/GRPO/gamma_0.975_64/LunarLander-v3_ep01_R195.2.mp4 | Reward: 195.2




✅ Saved MP4 video: videos/GRPO/gamma_0.975_64/LunarLander-v3_ep02_R159.3.mp4 | Reward: 159.3




✅ Saved MP4 video: videos/GRPO/gamma_0.975_64/LunarLander-v3_ep03_R159.2.mp4 | Reward: 159.2




✅ Saved MP4 video: videos/GRPO/gamma_0.975_64/LunarLander-v3_ep04_R187.3.mp4 | Reward: 187.3




✅ Saved MP4 video: videos/GRPO/gamma_0.975_64/LunarLander-v3_ep05_R206.7.mp4 | Reward: 206.7




✅ Saved MP4 video: videos/GRPO/gamma_0.975_64/LunarLander-v3_ep06_R218.9.mp4 | Reward: 218.9




✅ Saved MP4 video: videos/GRPO/gamma_0.975_64/LunarLander-v3_ep07_R227.3.mp4 | Reward: 227.3




✅ Saved MP4 video: videos/GRPO/gamma_0.975_64/LunarLander-v3_ep08_R233.3.mp4 | Reward: 233.3




✅ Saved MP4 video: videos/GRPO/gamma_0.975_64/LunarLander-v3_ep09_R220.4.mp4 | Reward: 220.4




✅ Saved MP4 video: videos/GRPO/gamma_0.975_64/LunarLander-v3_ep10_R149.5.mp4 | Reward: 149.5




✅ Saved MP4 video: videos/GRPO/gamma_0.975_64/LunarLander-v3_ep11_R221.1.mp4 | Reward: 221.1




✅ Saved MP4 video: videos/GRPO/gamma_0.975_64/LunarLander-v3_ep12_R167.5.mp4 | Reward: 167.5


In [None]:
from dense_scripts.GRPO.grpo import PerStepAdvGRPOTrainer, GRPOConfig
from dense_scripts.utils.policies import SimpleGRPOPolicy

gamma = 0.98
G = 32
cfg = GRPOConfig(
    env_id="LunarLander-v3",
    G=G, T=1024, epochs=8, minibatches=32, gamma=gamma,
    ent_coef=0.01, beta_kl=0.02, target_kl=0.015,
    n_workers=24, log_dir="./runs/GRPO"
)

pi = SimpleGRPOPolicy(obs_dim=8, act_dim=4, hidden=128)
trainer = PerStepAdvGRPOTrainer(policy=pi, config=cfg, device="cpu")

# automatically logs + saves config + records video at the end
pi_star = trainer.train(
    iters=30,
    video_dir="videos/GRPO",
    video_fps=30,
    video_format="mp4",
    video_episodes=2
)


  from pkg_resources import resource_stream, resource_exists
  from .autonotebook import tqdm as notebook_tqdm


📝 Saved hyperparameters to runs/GRPO/grpo_01h23_08112025/grpo_01h23_08112025.txt


  S.append(torch.tensor(ep["s"], dtype=torch.float32))
GRPO (01h23_08112025):   3%|█▍                                         | 1/30 [00:01<00:37,  1.30s/it, KL=0.011, avgR=-137.2, beta=0.02, it_s=1.30]

Iter 0000 | avgR -137.21 ± 64.06 | KL 0.0110 (β=0.02) | Lclip 0.0076 Lkl 0.0110 Lent 1.3626 | steps 3053 eps 32 | time 1.30s


GRPO (01h23_08112025):   7%|██▊                                        | 2/30 [00:02<00:32,  1.15s/it, KL=0.014, avgR=-255.0, beta=0.02, it_s=1.05]

Iter 0001 | avgR -255.04 ±111.17 | KL 0.0137 (β=0.02) | Lclip 0.0140 Lkl 0.0137 Lent 1.3447 | steps 6025 eps 64 | time 1.05s


In [6]:
from dense_scripts.PPO import PPOTrainer, PPOConfig
from dense_scripts.utils.policies import SimpleActorCriticPolicy

cfg = PPOConfig(
    env_id="LunarLander-v3",
    G=24, T=1024,
    epochs=8, minibatches=32,
    gamma=0.99, lam=0.95,
    lr=3e-4,
    vf_coef=0.5, ent_coef=0.01,
    objective="clip",        # or "klpen"
    target_kl=0.015,
    beta_kl=0.9,
    n_workers=24,
    log_dir="./runs/PPO"
)

pi = SimpleActorCriticPolicy(obs_dim=8, act_dim=4)

trainer = PPOTrainer(policy=pi, config=cfg, device="cpu")

pi_star, v_star = trainer.train(
    iters=50,
    video_dir="videos/PPO",
    video_fps=30,
    video_format="mp4",
    video_episodes=6
)

PPO (03h04_08112025):   2%|█▏                                                       | 1/50 [00:02<02:18,  2.82s/it, KL=0.011, avgR=-178, it_s=2.82]

Iter 000 | avgR -178.5 ±116.1 | KL 0.0105 β=0.900 (clip) | Lπ -0.0223 Lv 798.36 H 1.377 | steps 24576 | 2.82s


PPO (03h04_08112025):   4%|██▎                                                      | 2/50 [00:05<02:17,  2.87s/it, KL=0.020, avgR=-158, it_s=2.90]

Iter 001 | avgR -158.5 ±110.1 | KL 0.0203 β=0.900 (clip) | Lπ -0.0241 Lv 615.02 H 1.371 | steps 49152 | 2.90s


PPO (03h04_08112025):   6%|███▍                                                     | 3/50 [00:07<01:55,  2.45s/it, KL=0.030, avgR=-120, it_s=1.94]

Iter 002 | avgR -119.9 ±66.3 | KL 0.0297 β=0.900 (clip) | Lπ -0.0172 Lv 321.39 H 1.366 | steps 73728 | 1.94s


PPO (03h04_08112025):   8%|████▌                                                    | 4/50 [00:09<01:41,  2.20s/it, KL=0.059, avgR=-112, it_s=1.82]

Iter 003 | avgR -112.0 ±74.1 | KL 0.0586 β=0.900 (clip) | Lπ -0.0193 Lv 325.60 H 1.349 | steps 98304 | 1.82s


PPO (03h04_08112025):  10%|█████▊                                                    | 5/50 [00:11<01:33,  2.07s/it, KL=0.087, avgR=-94, it_s=1.83]

Iter 004 | avgR -93.6 ±60.3 | KL 0.0867 β=0.900 (clip) | Lπ -0.0185 Lv 239.15 H 1.313 | steps 122880 | 1.83s


PPO (03h04_08112025):  12%|██████▉                                                   | 6/50 [00:13<01:26,  1.97s/it, KL=0.130, avgR=-92, it_s=1.77]

Iter 005 | avgR -92.3 ±68.1 | KL 0.1301 β=0.900 (clip) | Lπ -0.0170 Lv 253.32 H 1.295 | steps 147456 | 1.77s


PPO (03h04_08112025):  14%|███████▉                                                 | 7/50 [00:14<01:22,  1.92s/it, KL=0.140, avgR=-106, it_s=1.81]

Iter 006 | avgR -105.9 ±95.8 | KL 0.1397 β=0.900 (clip) | Lπ -0.0181 Lv 331.66 H 1.252 | steps 172032 | 1.81s


PPO (03h04_08112025):  16%|█████████▎                                                | 8/50 [00:16<01:18,  1.86s/it, KL=0.200, avgR=-94, it_s=1.74]

Iter 007 | avgR -94.0 ±96.1 | KL 0.2003 β=0.900 (clip) | Lπ -0.0158 Lv 289.38 H 1.232 | steps 196608 | 1.74s


PPO (03h04_08112025):  18%|██████████▍                                               | 9/50 [00:18<01:14,  1.82s/it, KL=0.183, avgR=-82, it_s=1.72]

Iter 008 | avgR -81.9 ±86.0 | KL 0.1827 β=0.900 (clip) | Lπ -0.0142 Lv 229.72 H 1.202 | steps 221184 | 1.72s


PPO (03h04_08112025):  20%|███████████▍                                             | 10/50 [00:20<01:12,  1.82s/it, KL=0.240, avgR=-65, it_s=1.82]

Iter 009 | avgR -64.7 ±74.7 | KL 0.2404 β=0.900 (clip) | Lπ -0.0158 Lv 206.73 H 1.155 | steps 245760 | 1.82s


PPO (03h04_08112025):  22%|████████████▌                                            | 11/50 [00:21<01:09,  1.79s/it, KL=0.232, avgR=-63, it_s=1.70]

Iter 010 | avgR -63.2 ±93.7 | KL 0.2316 β=0.900 (clip) | Lπ -0.0134 Lv 198.67 H 1.164 | steps 270336 | 1.70s


PPO (03h04_08112025):  24%|█████████████▋                                           | 12/50 [00:23<01:08,  1.80s/it, KL=0.247, avgR=-54, it_s=1.82]

Iter 011 | avgR -54.4 ±100.9 | KL 0.2469 β=0.900 (clip) | Lπ -0.0133 Lv 208.88 H 1.145 | steps 294912 | 1.82s


PPO (03h04_08112025):  26%|██████████████▊                                          | 13/50 [00:25<01:07,  1.82s/it, KL=0.284, avgR=-56, it_s=1.86]

Iter 012 | avgR -55.5 ±91.1 | KL 0.2837 β=0.900 (clip) | Lπ -0.0125 Lv 210.61 H 1.102 | steps 319488 | 1.86s


PPO (03h04_08112025):  28%|███████████████▉                                         | 14/50 [00:27<01:04,  1.80s/it, KL=0.347, avgR=-57, it_s=1.76]

Iter 013 | avgR -56.5 ±87.4 | KL 0.3466 β=0.900 (clip) | Lπ -0.0135 Lv 197.15 H 1.079 | steps 344064 | 1.76s


PPO (03h04_08112025):  30%|█████████████████                                        | 15/50 [00:29<01:01,  1.76s/it, KL=0.303, avgR=-71, it_s=1.65]

Iter 014 | avgR -70.9 ±98.1 | KL 0.3029 β=0.900 (clip) | Lπ -0.0118 Lv 207.04 H 1.089 | steps 368640 | 1.65s


PPO (03h04_08112025):  32%|██████████████████▏                                      | 16/50 [00:30<00:59,  1.75s/it, KL=0.320, avgR=-34, it_s=1.73]

Iter 015 | avgR -34.2 ±82.2 | KL 0.3195 β=0.900 (clip) | Lπ -0.0122 Lv 153.17 H 1.075 | steps 393216 | 1.73s


PPO (03h04_08112025):  34%|███████████████████▍                                     | 17/50 [00:32<00:57,  1.74s/it, KL=0.393, avgR=-65, it_s=1.71]

Iter 016 | avgR -65.2 ±109.9 | KL 0.3933 β=0.900 (clip) | Lπ -0.0129 Lv 196.03 H 1.013 | steps 417792 | 1.71s


PPO (03h04_08112025):  36%|████████████████████▌                                    | 18/50 [00:34<00:55,  1.73s/it, KL=0.382, avgR=-90, it_s=1.71]

Iter 017 | avgR -90.2 ±100.4 | KL 0.3818 β=0.900 (clip) | Lπ -0.0126 Lv 176.43 H 1.021 | steps 442368 | 1.71s


PPO (03h04_08112025):  38%|█████████████████████▋                                   | 19/50 [00:35<00:53,  1.72s/it, KL=0.356, avgR=-24, it_s=1.71]

Iter 018 | avgR -23.5 ±87.3 | KL 0.3559 β=0.900 (clip) | Lπ -0.0114 Lv 136.66 H 1.014 | steps 466944 | 1.71s


PPO (03h04_08112025):  40%|██████████████████████▊                                  | 20/50 [00:37<00:52,  1.74s/it, KL=0.405, avgR=-16, it_s=1.77]

Iter 019 | avgR -16.2 ±78.6 | KL 0.4054 β=0.900 (clip) | Lπ -0.0117 Lv 108.11 H 0.992 | steps 491520 | 1.77s


PPO (03h04_08112025):  42%|███████████████████████▉                                 | 21/50 [00:39<00:49,  1.71s/it, KL=0.392, avgR=-27, it_s=1.65]

Iter 020 | avgR -27.5 ±80.9 | KL 0.3921 β=0.900 (clip) | Lπ -0.0114 Lv 117.12 H 0.994 | steps 516096 | 1.65s


PPO (03h04_08112025):  44%|█████████████████████████                                | 22/50 [00:41<00:49,  1.76s/it, KL=0.442, avgR=-24, it_s=1.87]

Iter 021 | avgR -24.2 ±70.4 | KL 0.4425 β=0.900 (clip) | Lπ -0.0106 Lv 106.18 H 0.960 | steps 540672 | 1.87s


PPO (03h04_08112025):  46%|██████████████████████████▏                              | 23/50 [00:42<00:47,  1.76s/it, KL=0.391, avgR=-24, it_s=1.74]

Iter 022 | avgR -24.3 ±92.6 | KL 0.3915 β=0.900 (clip) | Lπ -0.0116 Lv 91.50 H 0.977 | steps 565248 | 1.74s


PPO (03h04_08112025):  48%|████████████████████████████▎                              | 24/50 [00:44<00:46,  1.77s/it, KL=0.416, avgR=4, it_s=1.80]

Iter 023 | avgR 4.3 ±59.8 | KL 0.4156 β=0.900 (clip) | Lπ -0.0108 Lv 83.73 H 1.003 | steps 589824 | 1.80s


PPO (03h04_08112025):  50%|█████████████████████████████                             | 25/50 [00:46<00:44,  1.77s/it, KL=0.391, avgR=33, it_s=1.76]

Iter 024 | avgR 33.4 ±66.1 | KL 0.3906 β=0.900 (clip) | Lπ -0.0112 Lv 65.92 H 0.986 | steps 614400 | 1.76s


PPO (03h04_08112025):  52%|██████████████████████████████▏                           | 26/50 [00:48<00:43,  1.82s/it, KL=0.389, avgR=-3, it_s=1.94]

Iter 025 | avgR -3.1 ±73.4 | KL 0.3891 β=0.900 (clip) | Lπ -0.0111 Lv 65.87 H 0.970 | steps 638976 | 1.94s


PPO (03h04_08112025):  54%|███████████████████████████████▎                          | 27/50 [00:50<00:42,  1.86s/it, KL=0.444, avgR=11, it_s=1.94]

Iter 026 | avgR 10.8 ±73.9 | KL 0.4441 β=0.900 (clip) | Lπ -0.0110 Lv 76.10 H 0.943 | steps 663552 | 1.94s


PPO (03h04_08112025):  56%|█████████████████████████████████                          | 28/50 [00:52<00:40,  1.86s/it, KL=0.391, avgR=2, it_s=1.85]

Iter 027 | avgR 2.5 ±73.4 | KL 0.3908 β=0.900 (clip) | Lπ -0.0100 Lv 63.18 H 0.954 | steps 688128 | 1.85s


PPO (03h04_08112025):  58%|█████████████████████████████████▋                        | 29/50 [00:53<00:38,  1.83s/it, KL=0.403, avgR=16, it_s=1.76]

Iter 028 | avgR 16.0 ±64.4 | KL 0.4030 β=0.900 (clip) | Lπ -0.0131 Lv 52.91 H 0.962 | steps 712704 | 1.76s


PPO (03h04_08112025):  60%|██████████████████████████████████▊                       | 30/50 [00:55<00:36,  1.82s/it, KL=0.439, avgR=22, it_s=1.79]

Iter 029 | avgR 22.3 ±58.2 | KL 0.4388 β=0.900 (clip) | Lπ -0.0122 Lv 39.87 H 0.980 | steps 737280 | 1.79s


PPO (03h04_08112025):  62%|████████████████████████████████████▌                      | 31/50 [00:57<00:34,  1.82s/it, KL=0.404, avgR=1, it_s=1.84]

Iter 030 | avgR 1.0 ±50.6 | KL 0.4042 β=0.900 (clip) | Lπ -0.0121 Lv 39.25 H 0.963 | steps 761856 | 1.84s


PPO (03h04_08112025):  64%|█████████████████████████████████████                     | 32/50 [00:59<00:32,  1.81s/it, KL=0.407, avgR=26, it_s=1.78]

Iter 031 | avgR 26.4 ±49.5 | KL 0.4071 β=0.900 (clip) | Lπ -0.0113 Lv 39.86 H 0.968 | steps 786432 | 1.78s


PPO (03h04_08112025):  66%|██████████████████████████████████████▎                   | 33/50 [01:01<00:30,  1.79s/it, KL=0.442, avgR=35, it_s=1.73]

Iter 032 | avgR 34.9 ±52.1 | KL 0.4416 β=0.900 (clip) | Lπ -0.0111 Lv 28.25 H 0.968 | steps 811008 | 1.73s


PPO (03h04_08112025):  68%|███████████████████████████████████████▍                  | 34/50 [01:02<00:28,  1.78s/it, KL=0.377, avgR=-7, it_s=1.77]

Iter 033 | avgR -6.7 ±46.7 | KL 0.3770 β=0.900 (clip) | Lπ -0.0111 Lv 23.32 H 0.989 | steps 835584 | 1.77s


PPO (03h04_08112025):  70%|████████████████████████████████████████▌                 | 35/50 [01:04<00:27,  1.81s/it, KL=0.380, avgR=10, it_s=1.86]

Iter 034 | avgR 9.8 ±44.6 | KL 0.3802 β=0.900 (clip) | Lπ -0.0115 Lv 26.94 H 0.998 | steps 860160 | 1.86s


PPO (03h04_08112025):  72%|█████████████████████████████████████████▊                | 36/50 [01:06<00:25,  1.81s/it, KL=0.431, avgR=50, it_s=1.81]

Iter 035 | avgR 50.3 ±61.3 | KL 0.4309 β=0.900 (clip) | Lπ -0.0104 Lv 31.67 H 0.912 | steps 884736 | 1.81s


PPO (03h04_08112025):  74%|██████████████████████████████████████████▉               | 37/50 [01:08<00:22,  1.76s/it, KL=0.381, avgR=37, it_s=1.65]

Iter 036 | avgR 37.1 ±59.4 | KL 0.3814 β=0.900 (clip) | Lπ -0.0122 Lv 33.37 H 0.981 | steps 909312 | 1.65s


PPO (03h04_08112025):  76%|████████████████████████████████████████████              | 38/50 [01:09<00:20,  1.74s/it, KL=0.406, avgR=23, it_s=1.70]

Iter 037 | avgR 23.4 ±81.7 | KL 0.4057 β=0.900 (clip) | Lπ -0.0108 Lv 47.70 H 0.957 | steps 933888 | 1.70s


PPO (03h04_08112025):  78%|█████████████████████████████████████████████▏            | 39/50 [01:11<00:19,  1.76s/it, KL=0.468, avgR=55, it_s=1.78]

Iter 038 | avgR 55.3 ±70.6 | KL 0.4684 β=0.900 (clip) | Lπ -0.0104 Lv 45.98 H 0.903 | steps 958464 | 1.78s


PPO (03h04_08112025):  80%|██████████████████████████████████████████████▍           | 40/50 [01:13<00:17,  1.76s/it, KL=0.439, avgR=50, it_s=1.77]

Iter 039 | avgR 49.6 ±71.4 | KL 0.4390 β=0.900 (clip) | Lπ -0.0103 Lv 40.59 H 0.913 | steps 983040 | 1.77s


PPO (03h04_08112025):  82%|███████████████████████████████████████████████▌          | 41/50 [01:15<00:15,  1.75s/it, KL=0.424, avgR=57, it_s=1.71]

Iter 040 | avgR 56.9 ±63.7 | KL 0.4236 β=0.900 (clip) | Lπ -0.0100 Lv 37.63 H 0.951 | steps 1007616 | 1.71s


PPO (03h04_08112025):  84%|████████████████████████████████████████████████▋         | 42/50 [01:17<00:14,  1.77s/it, KL=0.413, avgR=51, it_s=1.82]

Iter 041 | avgR 51.0 ±68.8 | KL 0.4135 β=0.900 (clip) | Lπ -0.0099 Lv 35.48 H 0.922 | steps 1032192 | 1.82s


PPO (03h04_08112025):  86%|█████████████████████████████████████████████████▉        | 43/50 [01:18<00:12,  1.80s/it, KL=0.435, avgR=50, it_s=1.86]

Iter 042 | avgR 50.3 ±72.2 | KL 0.4346 β=0.900 (clip) | Lπ -0.0092 Lv 44.84 H 0.925 | steps 1056768 | 1.86s


PPO (03h04_08112025):  88%|███████████████████████████████████████████████████       | 44/50 [01:20<00:10,  1.82s/it, KL=0.450, avgR=44, it_s=1.86]

Iter 043 | avgR 43.7 ±61.5 | KL 0.4497 β=0.900 (clip) | Lπ -0.0092 Lv 32.96 H 0.908 | steps 1081344 | 1.86s


PPO (03h04_08112025):  90%|████████████████████████████████████████████████████▏     | 45/50 [01:22<00:09,  1.82s/it, KL=0.484, avgR=68, it_s=1.83]

Iter 044 | avgR 68.1 ±72.5 | KL 0.4841 β=0.900 (clip) | Lπ -0.0090 Lv 42.48 H 0.887 | steps 1105920 | 1.83s


PPO (03h04_08112025):  92%|█████████████████████████████████████████████████████▎    | 46/50 [01:24<00:07,  1.81s/it, KL=0.420, avgR=51, it_s=1.78]

Iter 045 | avgR 51.1 ±57.0 | KL 0.4195 β=0.900 (clip) | Lπ -0.0093 Lv 27.00 H 0.921 | steps 1130496 | 1.78s


PPO (03h04_08112025):  94%|██████████████████████████████████████████████████████▌   | 47/50 [01:26<00:05,  1.80s/it, KL=0.406, avgR=54, it_s=1.76]

Iter 046 | avgR 54.0 ±56.2 | KL 0.4063 β=0.900 (clip) | Lπ -0.0105 Lv 30.97 H 0.975 | steps 1155072 | 1.76s


PPO (03h04_08112025):  96%|███████████████████████████████████████████████████████▋  | 48/50 [01:27<00:03,  1.76s/it, KL=0.474, avgR=57, it_s=1.66]

Iter 047 | avgR 57.4 ±62.8 | KL 0.4738 β=0.900 (clip) | Lπ -0.0101 Lv 28.76 H 0.897 | steps 1179648 | 1.66s


PPO (03h04_08112025):  98%|████████████████████████████████████████████████████████▊ | 49/50 [01:29<00:01,  1.77s/it, KL=0.404, avgR=35, it_s=1.81]

Iter 048 | avgR 35.0 ±69.7 | KL 0.4035 β=0.900 (clip) | Lπ -0.0115 Lv 36.11 H 0.980 | steps 1204224 | 1.81s


PPO (03h04_08112025): 100%|██████████████████████████████████████████████████████████| 50/50 [01:31<00:00,  1.83s/it, KL=0.397, avgR=34, it_s=1.71]

Iter 049 | avgR 34.5 ±67.0 | KL 0.3969 β=0.900 (clip) | Lπ -0.0100 Lv 44.23 H 0.958 | steps 1228800 | 1.71s
✅ PPO training complete — logs at runs/PPO/ppo_03h04_08112025
🎥 Recording videos to videos/PPO/G24_γ0.99_03h04_08112025





✅ Saved MP4 video: videos/PPO/G24_γ0.99_03h04_08112025/LunarLander-v3_ep01_R-58.6.mp4 | Reward: -58.6




✅ Saved MP4 video: videos/PPO/G24_γ0.99_03h04_08112025/LunarLander-v3_ep02_R-45.7.mp4 | Reward: -45.7




✅ Saved MP4 video: videos/PPO/G24_γ0.99_03h04_08112025/LunarLander-v3_ep03_R-72.9.mp4 | Reward: -72.9




✅ Saved MP4 video: videos/PPO/G24_γ0.99_03h04_08112025/LunarLander-v3_ep04_R-69.3.mp4 | Reward: -69.3




✅ Saved MP4 video: videos/PPO/G24_γ0.99_03h04_08112025/LunarLander-v3_ep05_R-61.0.mp4 | Reward: -61.0




✅ Saved MP4 video: videos/PPO/G24_γ0.99_03h04_08112025/LunarLander-v3_ep06_R-28.6.mp4 | Reward: -28.6
