In [12]:
import sys
import os
from pathlib import Path
# The notebook lives one level below the project root; expose the root on
# sys.path so the local `cs336_alignment` package can be imported.
project_dir = Path(os.path.abspath('')).parent
project_root = project_dir.as_posix()
# Keep the project root first (highest import priority) but present only once,
# so re-running this cell does not keep growing sys.path with duplicates.
sys.path[:] = [project_root] + [entry for entry in sys.path if entry != project_root]

from tqdm import trange, tqdm
import numpy as np
import torch
from matplotlib import pyplot as plt
import wandb

# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (here, the local cs336_alignment package) are picked up live
# without restarting the kernel.
%load_ext autoreload
%autoreload 2
from cs336_alignment import train_llm

# Use the GPU when PyTorch reports CUDA as available; otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
# Evaluation files are read for epochs 1..NUM_EPOCHS. Each file name carries
# both the epoch and the step, where step = epoch * STEPS_PER_EPOCH.
NUM_EPOCHS = 5
STEPS_PER_EPOCH = 16


def _mean_token_count(counts):
    """Return the mean of ``counts`` rounded to two decimals as a Python float.

    An empty list yields NaN directly. That is the same value ``np.mean([])``
    produces, but without NumPy's "Mean of empty slice" RuntimeWarning. The
    empty case occurs when an epoch has no right (or no wrong) samples.
    """
    if not counts:
        return float("nan")
    return np.mean(counts).round(2).item()


tokenizer = train_llm.load_tokenizer()
eval_path = project_dir / "evaluation"
logs = []  # one metrics dict per epoch, consumed by the W&B upload cell
for epoch in range(1, NUM_EPOCHS + 1):
    step = epoch * STEPS_PER_EPOCH
    df = train_llm.load_jsonl(
        eval_path / f"Qwen2.5-Math-1.5B-MATH-sample256-epoch{epoch}-step{step}.jsonl"
    )

    # Token counts per sample. Iterating the columns walks the rows in order
    # and does not depend on the index labels being exactly 0..n-1.
    problem_lens = [len(tokenizer.encode(text)) for text in df.problem]
    response_lens = [len(tokenizer.encode(text)) for text in df.response]
    # A sample counts as "right" when its reward is strictly positive.
    is_right = [value > 0 for value in df.reward]

    right_problem_lens = [n for n, ok in zip(problem_lens, is_right) if ok]
    wrong_problem_lens = [n for n, ok in zip(problem_lens, is_right) if not ok]
    right_response_lens = [n for n, ok in zip(response_lens, is_right) if ok]
    wrong_response_lens = [n for n, ok in zip(response_lens, is_right) if not ok]

    # Key names and their order are kept as-is: they are the metric names
    # sent to W&B and the order shown by print(log).
    log = {
        "eval_step": step,
        "eval/epoch": epoch,
        "eval/reward": df.reward.mean().item(),
        "eval/format_reward": df.format_reward.mean().item(),
        "eval/answer_reward": df.answer_reward.mean().item(),
        "eval/num_tokens_problem": _mean_token_count(problem_lens),
        "eval/num_tokens_right_problem": _mean_token_count(right_problem_lens),
        "eval/num_tokens_wrong_problem": _mean_token_count(wrong_problem_lens),
        "eval/num_tokens_response": _mean_token_count(response_lens),
        "eval/num_tokens_right_response": _mean_token_count(right_response_lens),
        "eval/num_tokens_wrong_response": _mean_token_count(wrong_response_lens),
    }
    logs.append(log)
    print(log)

{'eval_step': 16, 'eval/epoch': 1, 'eval/reward': 0.43, 'eval/format_reward': 0.94, 'eval/answer_reward': 0.43, 'eval/num_tokens_problem': 51.34, 'eval/num_tokens_right_problem': 36.26, 'eval/num_tokens_wrong_problem': 62.72, 'eval/num_tokens_response': 353.36, 'eval/num_tokens_right_response': 289.88, 'eval/num_tokens_wrong_response': 401.25}
{'eval_step': 32, 'eval/epoch': 2, 'eval/reward': 0.63, 'eval/format_reward': 0.97, 'eval/answer_reward': 0.63, 'eval/num_tokens_problem': 51.34, 'eval/num_tokens_right_problem': 46.0, 'eval/num_tokens_wrong_problem': 60.43, 'eval/num_tokens_response': 403.62, 'eval/num_tokens_right_response': 327.33, 'eval/num_tokens_wrong_response': 533.51}
{'eval_step': 48, 'eval/epoch': 3, 'eval/reward': 0.66, 'eval/format_reward': 0.98, 'eval/answer_reward': 0.66, 'eval/num_tokens_problem': 51.34, 'eval/num_tokens_right_problem': 44.74, 'eval/num_tokens_wrong_problem': 64.15, 'eval/num_tokens_response': 354.28, 'eval/num_tokens_right_response': 299.92, 'eval

In [14]:
# Re-attach to the existing W&B run (resume="must" with a fixed run id) and
# append the per-epoch evaluation metrics collected in `logs`.
# Using the run as a context manager ensures it is always finished, even if a
# log call raises, instead of leaving the resumed run open in the kernel.
with wandb.init(
    entity="ztzhu11",
    project="FT_Qwen2.5-Math-1.5B",
    resume="must",
    id="uerwbrkf",
) as run:
    for log in logs:
        run.log(log)

[34m[1mwandb[0m: Currently logged in as: [33mztzhu1[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [openai] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


0,1
eval/answer_reward,▁▅▆▇█
eval/epoch,▁▃▅▆█
eval/format_reward,▁▆██▆
eval/num_tokens_problem,▁▁▁▁▁
eval/num_tokens_response,▁█▁▄▄
eval/num_tokens_right_problem,▁▇▆█▅
eval/num_tokens_right_response,▁█▃▆█
eval/num_tokens_wrong_problem,▂▁▃▁█
eval/num_tokens_wrong_response,▁█▄▇█
eval/reward,▁▅▆▇█

0,1
eval/answer_reward,0.77
eval/epoch,5
eval/format_reward,0.97
eval/num_tokens_problem,51.34
eval/num_tokens_response,375.44
eval/num_tokens_right_problem,42.94
eval/num_tokens_right_response,328.82
eval/num_tokens_wrong_problem,79.48
eval/num_tokens_wrong_response,531.52
eval/reward,0.77
