# Poker Bot Evaluation

Evaluate the fine-tuned model `YiPz/Qwen3-4B-pokerbench-sft` against its base model `unsloth/Qwen3-4B-Thinking-2507`.

**Quick Start:** Run all cells in order. Results are saved to `/content/eval_results/`.

## 1. Setup

In [None]:
# Mount Google Drive for caching
# NOTE(review): no other cell in this notebook visibly reads from or writes to
# /content/drive, so what is being cached there is not evident here - confirm.
from google.colab import drive
drive.mount('/content/drive')

# Install dependencies
# `-q` keeps pip's output quiet.
!pip install -q transformers accelerate bitsandbytes torch pokerkit tqdm pandas matplotlib

# Clone and install poker bot package
# stderr is discarded (`2>/dev/null`) and `|| true` forces a zero exit status,
# so ANY clone failure is silenced, not only the "directory already exists"
# case on a re-run. If the clone genuinely failed, the first visible error
# will come from the pip install below.
!git clone https://github.com/yilenpan/player_poker_bot.git /content/player_poker_bot 2>/dev/null || true
# `-e` installs the cloned checkout in editable (development) mode.
!pip install -q -e /content/player_poker_bot

In [None]:
# Standard library
import os
import random
import json

# Third-party: model loading, tabular results, plotting, notebook progress bars
import torch
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

# Project-local evaluation helpers.
# NOTE(review): `src` is presumably made importable by the editable install of
# /content/player_poker_bot in the setup cell - confirm if this import fails.
from src.eval import (
    HardwareConfig,
    EvalConfig,
    ModelConfig,
    TransformersPlayer,
    MetricsCollector,
    EvalPokerGame,
)

# Simple confirmation that every import above resolved.
print("Imports loaded!")

## 2. Configuration

In [None]:
# Detect hardware
# `hw` is reused by later cells for the quantization config, the VRAM report
# and the exported run metadata.
hw = HardwareConfig.detect()
print(f"GPU: {hw.gpu_name} ({hw.vram_gb:.0f}GB)")
print(f"Quantization: {hw.quantization.value}")

# Experiment config
NUM_HANDS = 100         # hands requested per session (passed to play_session)
NUM_SESSIONS = 3        # number of independent sessions to run
STARTING_STACK = 10000  # passed to EvalPokerGame as starting_stack
SMALL_BLIND = 50        # passed to EvalPokerGame as small_blind
BIG_BLIND = 100         # passed to EvalPokerGame as big_blind; also the BB/100 divisor
SEED = 42               # RNG seed applied in the evaluation cell

# Display label -> Hugging Face model id. The labels double as player names
# and as keys into the loaded model/tokenizer dicts.
MODELS = {
    "SFT": "YiPz/Qwen3-4B-pokerbench-sft",
    "Base": "unsloth/Qwen3-4B-Thinking-2507",
}

# All result artefacts are written under this directory.
os.makedirs("/content/eval_results", exist_ok=True)
# The multiplication sign is written as an escape (\u00d7) so the message
# cannot be corrupted again by a mis-decoded source file.
print(f"\nConfig: {NUM_HANDS} hands \u00d7 {NUM_SESSIONS} sessions = {NUM_HANDS * NUM_SESSIONS} total")

## 3. Load Models

In [None]:
# Load each entry of MODELS (tokenizer first, then weights) and report the
# CUDA memory allocated after every load.
bnb_config = hw.get_bnb_config()
tokenizers = {}
loaded_models = {}

BYTES_PER_GIB = 1024**3
BASE_LOAD_KWARGS = {"device_map": "auto", "trust_remote_code": True, "torch_dtype": torch.float16}

for label, repo_id in MODELS.items():
    print(f"Loading {label}: {repo_id}...")
    tokenizers[label] = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)

    # Start from the shared kwargs; attach a quantization config only when
    # the hardware profile supplied one.
    model_kwargs = {**BASE_LOAD_KWARGS}
    if bnb_config:
        model_kwargs["quantization_config"] = bnb_config

    loaded_models[label] = AutoModelForCausalLM.from_pretrained(repo_id, **model_kwargs)
    allocated_gb = torch.cuda.memory_allocated() / BYTES_PER_GIB
    print(f"  VRAM: {allocated_gb:.1f}GB")

total_allocated_gb = torch.cuda.memory_allocated() / BYTES_PER_GIB
print(f"\nTotal VRAM: {total_allocated_gb:.1f}GB / {hw.vram_gb:.0f}GB")

## 4. Run Evaluation

In [None]:
# Run NUM_SESSIONS sessions of NUM_HANDS hands between the SFT and Base
# models, collecting one result object per session in `all_results`.

# Seed both RNGs once, before the first session. Python's `random` alone does
# not cover torch; whether TransformersPlayer samples during generation is
# not visible from this notebook, but if it does, torch must be seeded too
# for runs to be repeatable.
random.seed(SEED)
torch.manual_seed(SEED)
all_results = []

for session_idx in range(NUM_SESSIONS):
    print(f"\n{'='*50}")
    print(f"Session {session_idx + 1}/{NUM_SESSIONS}")

    # Fresh player wrappers per session; the underlying models and tokenizers
    # are the ones loaded earlier and are shared across sessions.
    players = [
        TransformersPlayer("SFT", loaded_models["SFT"], tokenizers["SFT"]),
        TransformersPlayer("Base", loaded_models["Base"], tokenizers["Base"]),
    ]

    metrics = MetricsCollector(f"session_{session_idx}")

    # The `with` block closes the progress bar even when play_session raises,
    # so an aborted session does not leave a dangling bar in the notebook.
    with tqdm(total=NUM_HANDS, desc=f"Session {session_idx+1}") as pbar:

        def _report_progress(cur, tot, _bar=pbar):
            """Move the bar to the absolute position `cur` and redraw it."""
            _bar.n = cur
            _bar.refresh()

        game = EvalPokerGame(
            players=players,
            starting_stack=STARTING_STACK,
            small_blind=SMALL_BLIND,
            big_blind=BIG_BLIND,
            metrics=metrics,
            progress_callback=_report_progress,
        )

        result = game.play_session(NUM_HANDS)

    all_results.append(result)

    # Per-session summary line for each player.
    for name, stats in result.player_summaries.items():
        print(f"  {name}: {stats['hands_won']}/{stats['hands_played']} wins, BB/100: {stats['bb_per_100']:+.2f}")

print("\n" + "="*50 + "\nALL SESSIONS COMPLETE")

## 5. Results

In [None]:
# Aggregate results
# Sum the per-session player summaries into one running total per player name.
aggregate = {}
for result in all_results:
    for name, stats in result.player_summaries.items():
        agg = aggregate.setdefault(
            name,
            {"hands_played": 0, "hands_won": 0, "total_chip_delta": 0,
             "vpip_sum": 0, "pfr_sum": 0, "sessions": 0},
        )
        agg["hands_played"] += stats["hands_played"]
        agg["hands_won"] += stats["hands_won"]
        agg["total_chip_delta"] += stats["total_chip_delta"]
        # vpip / pfr are optional in a session summary; a missing value counts as 0.
        agg["vpip_sum"] += stats.get("vpip", 0)
        agg["pfr_sum"] += stats.get("pfr", 0)
        agg["sessions"] += 1


def _bb_per_100(agg):
    """Return big blinds won per 100 hands for one aggregate entry.

    Returns 0 when no hands were played, so callers never divide by zero.
    """
    hands = agg["hands_played"]
    return agg["total_chip_delta"] / hands * 100 / BIG_BLIND if hands else 0


rows = []
for name, agg in aggregate.items():
    hp = agg["hands_played"]
    sessions = agg["sessions"]
    rows.append({
        "Model": name,
        "Hands": hp,
        "Win%": agg["hands_won"] / hp * 100 if hp else 0,
        "BB/100": _bb_per_100(agg),
        # VPIP / PFR are unweighted means of the per-session values, scaled by
        # 100 (presumably the session values are fractions - confirm).
        "VPIP%": agg["vpip_sum"] / sessions * 100 if sessions else 0,
        "PFR%": agg["pfr_sum"] / sessions * 100 if sessions else 0,
    })

# Explicit columns keep sort_values working when there are no rows at all
# (e.g. NUM_SESSIONS == 0); without them the lookup of "BB/100" raises KeyError.
SUMMARY_COLUMNS = ["Model", "Hands", "Win%", "BB/100", "VPIP%", "PFR%"]
df = pd.DataFrame(rows, columns=SUMMARY_COLUMNS).sort_values("BB/100", ascending=False)
print(df.to_string(index=False, float_format="%.2f"))

# SFT improvement
# Uses the same zero-safe BB/100 computation as the table above.
if "SFT" in aggregate and "Base" in aggregate:
    sft_bb = _bb_per_100(aggregate["SFT"])
    base_bb = _bb_per_100(aggregate["Base"])
    print(f"\nSFT improvement: {sft_bb - base_bb:+.2f} BB/100")

In [None]:
# Visualization: three side-by-side bar charts drawn from the summary table
# `df`, saved as a PNG and then displayed.
fig, (ax_profit, ax_winrate, ax_style) = plt.subplots(1, 3, figsize=(14, 4))
model_labels = df["Model"]

# BB/100 - bars are green when the value is non-negative, red otherwise
profit_colors = ["green" if bb >= 0 else "red" for bb in df["BB/100"]]
ax_profit.bar(model_labels, df["BB/100"], color=profit_colors)
ax_profit.axhline(y=0, color="black", linestyle="-", linewidth=0.5)
ax_profit.set_title("Profitability (BB/100)")

# Win rate - dashed reference line at 50%
ax_winrate.bar(model_labels, df["Win%"], color="steelblue")
ax_winrate.axhline(y=50, color="black", linestyle="--", linewidth=0.5)
ax_winrate.set_title("Win Rate (%)")

# Playing style - VPIP and PFR as paired bars around each model's tick
BAR_WIDTH = 0.35
BAR_OFFSET = 0.175
tick_positions = range(len(df))
vpip_positions = [pos - BAR_OFFSET for pos in tick_positions]
pfr_positions = [pos + BAR_OFFSET for pos in tick_positions]
ax_style.bar(vpip_positions, df["VPIP%"], BAR_WIDTH, label="VPIP", color="orange")
ax_style.bar(pfr_positions, df["PFR%"], BAR_WIDTH, label="PFR", color="purple")
ax_style.set_xticks(tick_positions)
ax_style.set_xticklabels(model_labels)
ax_style.set_title("Playing Style")
ax_style.legend()

plt.tight_layout()
plt.savefig("/content/eval_results/comparison.png", dpi=150)
plt.show()

## 6. Export

In [None]:
# Export the summary table as CSV, and the run configuration, aggregate rows
# and per-session hand counts as JSON.
df.to_csv("/content/eval_results/summary.csv", index=False)

run_config = {
    "num_hands": NUM_HANDS,
    "num_sessions": NUM_SESSIONS,
    "starting_stack": STARTING_STACK,
    "blinds": f"{SMALL_BLIND}/{BIG_BLIND}",
    "gpu": hw.gpu_name,
    "quantization": hw.quantization.value,
}
aggregate_records = df.to_dict(orient="records")
session_records = [
    {"session_id": session.session_id, "hands": session.total_hands}
    for session in all_results
]

results_json = {
    "config": run_config,
    "aggregate": aggregate_records,
    "sessions": session_records,
}

with open("/content/eval_results/results.json", "w") as out_file:
    json.dump(results_json, out_file, indent=2)

print("Saved to /content/eval_results/: summary.csv, results.json, comparison.png")