# Backgammon Training Debug Notebook

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/wmhowell18/claude-code/blob/claude/update-claude-md-c6hNw/transformer-backgammon/debug_training.ipynb)

**Purpose:** Systematically debug the 30% win rate issue before retraining.

**The Mystery:** 
- Pip count vs pip count: 30% white win rate (should be ~50%)
- This persists through all training phases
- Neural network vs pip count: 25% win rate

**Hypothesis:** There's a fundamental bug in game logic, board representation, or starting position.

**Tests:**
1. Pip count self-play (should be 50%)
2. Swapped colors test (should mirror)
3. Starting position symmetry check
4. Random agent baseline (should be 50%)
5. First-move advantage analysis
6. Win counting verification

## Setup: Install and Import

In [None]:
# Bootstrap cell: fetch the repository, install it in editable mode, and
# confirm that the `backgammon` package imports before any test cell runs.
# NOTE: lines starting with `!` are IPython shell escapes, so this cell only
# runs inside a notebook kernel (paths assume a Colab-style /content root).
import os
import sys

print("=" * 70)
print("SETUP: Installing transformer-backgammon")
print("=" * 70)

# Clone repo with specific branch
print("\nüì¶ Step 1: Cloning repository...")
branch = "claude/update-claude-md-c6hNw"
if not os.path.exists('/content/claude-code'):
    # IPython expands {branch} from the Python variable; `2>&1 | tail -5`
    # merges stderr into stdout and keeps only the last lines of git output.
    !git clone -b {branch} https://github.com/wmhowell18/claude-code.git /content/claude-code 2>&1 | tail -5
    print(f"‚úÖ Repository cloned (branch: {branch})")
else:
    print(f"‚úÖ Repository already exists")
    # Pull latest changes
    # Each `!` line runs in its own shell, hence the repeated `cd`.
    !cd /content/claude-code && git fetch origin {branch} 2>&1 | tail -3
    !cd /content/claude-code && git checkout {branch} 2>&1 | tail -3
    !cd /content/claude-code && git pull origin {branch} 2>&1 | tail -3
    print(f"‚úÖ Updated to latest version")

# Change directory
# The relative 'src' check and `pip install -e .` below both rely on this cwd.
os.chdir('/content/claude-code/transformer-backgammon')
print(f"\nüìÇ Step 2: Changed to directory: {os.getcwd()}")

# Verify src directory exists
if os.path.exists('src'):
    print(f"‚úÖ src/ directory found")
else:
    print(f"‚ùå ERROR: src/ directory not found!")
    # Show at most ten directory entries to help diagnose a wrong checkout.
    print(f"   Available files: {os.listdir('.')[:10]}")
    raise FileNotFoundError("src/ directory missing from repository")

# Install package
print("\n‚öôÔ∏è  Step 3: Installing package (this may take 30-60 seconds)...")
!pip install -e . 2>&1 | tail -20

# Add src directory to path (where backgammon module actually lives)
# Inserted at position 0 so this checkout is found before other sys.path entries.
src_path = '/content/claude-code/transformer-backgammon/src'
if src_path not in sys.path:
    sys.path.insert(0, src_path)
    print(f"\n‚úÖ Added {src_path} to Python path")

# Verify installation
print("\nüîç Step 4: Verifying installation...")
try:
    import backgammon
    from backgammon.core.board import initial_board
    print("‚úÖ backgammon module imported successfully!")
    print(f"   Module location: {backgammon.__file__}")
except ImportError as e:
    # Print enough context to debug the import, then re-raise so the cell
    # fails visibly instead of letting later cells run without the package.
    print(f"‚ùå FAILED: {e}")
    print("\nDebugging info:")
    print(f"   sys.path: {sys.path[:3]}")
    print(f"   Current dir: {os.getcwd()}")
    print(f"   src/ exists: {os.path.exists('src')}")
    if os.path.exists('src'):
        print(f"   Contents of src/: {os.listdir('src')}")
    raise

print("\n" + "=" * 70)
print("‚úÖ SETUP COMPLETE - Ready to run tests!")
print("=" * 70)

In [None]:
# Import the names shared by the test cells below: board helpers, the Player
# enum, the two baseline agents, and the self-play driver plus its statistics
# helper. Run the setup cell first so that `backgammon` is importable.
import numpy as np
from backgammon.core.board import initial_board, flip_board, pip_count
from backgammon.core.types import Player
from backgammon.evaluation.agents import pip_count_agent, random_agent
from backgammon.training.self_play import play_game, compute_game_statistics

print("‚úÖ All imports successful! Ready to run debugging tests.")

---
## Test 1: Pip Count Self-Play

**Expected:** ~50% win rate for White

**Why:** Both players use an identical strategy and the game is symmetric, so neither colour should be favoured.

In [None]:
# Test 1: the same pip-count agent plays both colours from the standard
# starting position. Defines `pip_agent`, `rng`, `results`, `stats`,
# `test1_bug` and `test1_white_wr`, which later cells read.

# Single source of truth for the loop bound and every "/N" denominator below.
n_games = 100

print("=" * 70)
print(f"TEST 1: Pip Count vs Pip Count ({n_games} games)")
print("=" * 70)
print("Expected: ~50% win rate for White (symmetric game, identical agents)")
print("\nRunning...\n")

pip_agent = pip_count_agent()
# Seeded so a fresh top-to-bottom run is reproducible; later cells keep
# drawing from this same generator.
rng = np.random.default_rng(42)

results = []
for i in range(n_games):
    result = play_game(pip_agent, pip_agent, initial_board(), rng=rng)
    results.append(result)
    if (i + 1) % 20 == 0:
        print(f"  ‚úì Completed {i+1}/{n_games} games")

stats = compute_game_statistics(results)

# Derive Black's share from Black's own win count. `1 - white_win_rate` would
# silently credit drawn games to Black and disagree with the count beside it.
black_win_rate = stats['black_wins'] / n_games

print(f"\nüìä Results:")
print(f"   White wins: {stats['white_wins']}/{n_games} ({stats['white_win_rate']*100:.1f}%)")
print(f"   Black wins: {stats['black_wins']}/{n_games} ({black_win_rate*100:.1f}%)")
print(f"   Draws:      {stats['draws']}/{n_games}")
print(f"   Avg moves:  {stats['avg_moves']:.1f}")

print(f"\nüí° Analysis:")
# More than 10 percentage points away from 50% is treated as a systematic bias.
if abs(stats['white_win_rate'] - 0.5) > 0.1:
    print(f"   üö® BUG CONFIRMED! Win rate is {stats['white_win_rate']*100:.0f}%, not 50%!")
    print(f"   This proves the game logic has a systematic bias.")
    test1_bug = True
else:
    print(f"   ‚úÖ Win rate is close to 50% - pip count agent seems fair!")
    test1_bug = False

# Store for later
test1_white_wr = stats['white_win_rate']

---
## Test 2: Swapped Colors

**Expected:** If Test 1 showed 30% White, this should show ~70% White (or 30% Black)

**Why:** Flipping the board swaps the advantage

In [None]:
# Test 2: repeat the pip-count self-play from a flipped starting board and
# compare against Test 1. Reads `pip_agent`, `rng`, `stats`, `results` and
# `test1_white_wr` from the Test 1 cell; defines `results_flipped`,
# `stats_flipped` and `test2_mirrors` (True / False / None) for later cells.

n_flipped_games = 100
tolerance = 0.1  # two win rates within 10 percentage points count as "matching"

print("=" * 70)
print("TEST 2: Swapped Colors (Starting from Flipped Board)")
print("=" * 70)
print(f"Test 1 showed: White {test1_white_wr*100:.0f}%")
print(f"Expected here: White {(1-test1_white_wr)*100:.0f}% (if position bias)")
print(f"              OR similar {test1_white_wr*100:.0f}% (if agent/logic bias)")
print("\nRunning...\n")

results_flipped = []
for i in range(n_flipped_games):
    # Build a fresh flipped board for every game rather than sharing one
    # board object between games.
    flipped_start = flip_board(initial_board())
    result = play_game(pip_agent, pip_agent, flipped_start, rng=rng)
    results_flipped.append(result)
    if (i + 1) % 20 == 0:
        print(f"  ‚úì Completed {i+1}/{n_flipped_games} games")

stats_flipped = compute_game_statistics(results_flipped)

flipped_white_wr = stats_flipped['white_win_rate']
# Black's share comes from Black's own win count so that drawn games are not
# credited to Black (which `1 - white_win_rate` would do).
flipped_black_wr = stats_flipped['black_wins'] / n_flipped_games
test1_black_wr = stats['black_wins'] / len(results)

print(f"\nüìä Results:")
print(f"   White wins: {stats_flipped['white_wins']}/{n_flipped_games} ({flipped_white_wr*100:.1f}%)")
print(f"   Black wins: {stats_flipped['black_wins']}/{n_flipped_games} ({flipped_black_wr*100:.1f}%)")

print(f"\nüîç Comparison:")
print(f"   Normal board:  White {test1_white_wr*100:.0f}%, Black {test1_black_wr*100:.0f}%")
print(f"   Flipped board: White {flipped_white_wr*100:.0f}%, Black {flipped_black_wr*100:.0f}%")

looks_mirrored = abs(flipped_white_wr - (1 - test1_white_wr)) < tolerance
looks_similar = abs(flipped_white_wr - test1_white_wr) < tolerance

print(f"\nüí° Analysis:")
if looks_mirrored and looks_similar:
    # Both can only hold when Test 1 is itself near 50%: "mirrored" and
    # "similar" then describe the same outcome, so this test cannot point at
    # any particular component.
    print("   ‚úÖ Results are both mirrored and similar (Test 1 was already near 50%)")
    print("   ‚Üí No color-dependent bias for this test to localize")
    test2_mirrors = True
elif looks_mirrored:
    print("   ‚úÖ Results mirror each other!")
    print("   ‚Üí Bug is likely in STARTING POSITION (asymmetric setup)")
    test2_mirrors = True
elif looks_similar:
    print("   ‚ö†Ô∏è  Results are similar, not mirrored!")
    print("   ‚Üí Bug is in MOVE GENERATION or EVALUATION LOGIC")
    test2_mirrors = False
else:
    print("   ‚ö†Ô∏è  Results are inconsistent")
    print("   ‚Üí Multiple bugs or high variance")
    test2_mirrors = None

---
## Test 3: Starting Position Symmetry

**Expected:** White and Black should have identical pip counts and mirror-image setups

**Why:** The backgammon starting position is perfectly symmetric

In [None]:
# Test 3: inspect the starting board. Compares both players' pip counts and
# lists where each colour's checkers sit. Defines `test3_symmetric`, which the
# summary cell reads.
rule = "=" * 70
print(rule)
print("TEST 3: Starting Position Symmetry Check")
print(rule)

board = initial_board()

print("\nüìã Board Arrays:")
print(f"   White checkers: {board.white_checkers}")
print(f"   Black checkers: {board.black_checkers}")

white_pip = pip_count(board, Player.WHITE)
black_pip = pip_count(board, Player.BLACK)

print(f"\nüé≤ Pip Counts:")
print(f"   White: {white_pip}")
print(f"   Black: {black_pip}")

# Symmetry is judged on pip counts alone; the listing below is for eyeballing.
test3_symmetric = bool(white_pip == black_pip)
if test3_symmetric:
    print(f"   ‚úÖ Pip counts are equal - starting position is symmetric")
else:
    print(f"   üö® BUG! Pip counts differ by {abs(white_pip - black_pip)}")

# One pass per colour: indices 1..24 are printed as points, index 0 as the
# bar, and index 25 as borne-off checkers.
for heading, attr_name in (
    (f"\nüìç White Checker Positions:", "white_checkers"),
    (f"\nüìç Black Checker Positions:", "black_checkers"),
):
    print(heading)
    checkers = getattr(board, attr_name)
    for point in range(1, 25):
        count = checkers[point]
        if count > 0:
            print(f"   Point {point:2d}: {count} checkers")
    if checkers[0] > 0:
        print(f"   Bar:      {checkers[0]} checkers")
    if checkers[25] > 0:
        print(f"   Off:      {checkers[25]} checkers")

print(f"\nüí° Standard Backgammon Setup (for reference):")
print(f"   White: 2 on 24, 5 on 13, 3 on 8, 5 on 6")
print(f"   Black: 2 on 1, 5 on 12, 3 on 17, 5 on 19")

---
## Test 4: Random Agent Baseline

**Expected:** ~50% win rate (random play should be fair)

**Why:** If even random agents show bias, the bug is in core game logic

In [None]:
# Test 4: two independently seeded random agents play each other from the
# standard starting position. Reads `rng` from the Test 1 cell; defines
# `results_random`, `stats_random`, `test4_random_fair` and `test4_white_wr`
# for later cells.

# Single source of truth for the loop bound and every "/N" denominator below.
n_random_games = 100

print("=" * 70)
print(f"TEST 4: Random Agent vs Random Agent ({n_random_games} games)")
print("=" * 70)
print("Expected: ~50% win rate (random moves should have no bias)")
print("\nRunning...\n")

# Different seeds so the two agents do not make identical choices.
random_agent1 = random_agent(seed=42)
random_agent2 = random_agent(seed=43)

results_random = []
for i in range(n_random_games):
    result = play_game(random_agent1, random_agent2, initial_board(), rng=rng)
    results_random.append(result)
    if (i + 1) % 20 == 0:
        print(f"  ‚úì Completed {i+1}/{n_random_games} games")

stats_random = compute_game_statistics(results_random)

# Derive Black's share from Black's own win count. `1 - white_win_rate` would
# silently credit drawn games to Black and disagree with the count beside it.
random_black_wr = stats_random['black_wins'] / n_random_games

print(f"\nüìä Results:")
print(f"   White wins: {stats_random['white_wins']}/{n_random_games} ({stats_random['white_win_rate']*100:.1f}%)")
print(f"   Black wins: {stats_random['black_wins']}/{n_random_games} ({random_black_wr*100:.1f}%)")

print(f"\nüí° Analysis:")
# More than 10 percentage points away from 50% is treated as a systematic bias.
if abs(stats_random['white_win_rate'] - 0.5) > 0.1:
    print(f"   üö® CRITICAL BUG! Even random agents show bias!")
    print(f"   ‚Üí Bug is in CORE GAME LOGIC (move generation, win detection, etc.)")
    test4_random_fair = False
else:
    print(f"   ‚úÖ Random agents are fair!")
    print(f"   ‚Üí Bug is in PIP COUNT AGENT logic, not core game")
    test4_random_fair = True

test4_white_wr = stats_random['white_win_rate']

---
## Test 5: First-Move Advantage

**Expected:** Small advantage for White (moves first) - maybe 51-52%

**Why:** In backgammon, the player who moves first has a slight edge

In [None]:
# Test 5: summarise White's win rate across the earlier tests and compare the
# average with the expected first-move edge. Reads `test1_white_wr`,
# `stats_flipped` and `test4_white_wr` from earlier cells; plays no new games.
print("=" * 70)
print("TEST 5: First-Move Advantage Analysis")
print("=" * 70)

print(f"\nüìä Summary of Win Rates (White perspective):")
print(f"   Test 1 (Pip vs Pip, Normal):   {test1_white_wr*100:.1f}%")
print(f"   Test 2 (Pip vs Pip, Flipped):  {stats_flipped['white_win_rate']*100:.1f}%")
print(f"   Test 4 (Random vs Random):     {test4_white_wr*100:.1f}%")

# Only Tests 1 and 4 enter the average; Test 2 is listed above for reference
# but is not averaged.
avg_white_wr = np.mean([test1_white_wr, test4_white_wr])
print(f"\n   Average across tests: {avg_white_wr*100:.1f}%")

print(f"\nüí° Expected First-Move Advantage:")
print(f"   In real backgammon: ~51-52% for player moving first")
print(f"   Your data shows:    {avg_white_wr*100:.1f}% for White (moves first)")

# The band is inclusive at both ends. With strict comparisons an average of
# exactly 48% fell through to the final branch and was reported as an
# "excessive advantage", and exactly 52% (the stated expectation) was rejected.
# Rounding removes float noise from the mean before the boundary comparison.
lower_bound, upper_bound = 0.48, 0.52
avg_for_check = round(float(avg_white_wr), 6)

if lower_bound <= avg_for_check <= upper_bound:
    print(f"   ‚úÖ Within expected range!")
elif avg_for_check < lower_bound:
    print(f"   ‚ö†Ô∏è  White (first mover) is disadvantaged!")
else:
    print(f"   ‚ö†Ô∏è  White (first mover) has excessive advantage!")

---
## Test 6: Win Detection Verification

Check whether wins are being counted correctly.

In [None]:
# Test 6: re-derive the outcome breakdown of the Test 1 games by hand and
# compare it with compute_game_statistics. Reads `results` and `stats` from
# the Test 1 cell.
rule = "=" * 70
print(rule)
print("TEST 6: Win Detection Verification")
print(rule)

print("\nüìä Checking game outcomes from Test 1:")

# Count outcome types
# Games with a truthy outcome are the decided ones; `None` marks a draw.
decided = [game.outcome for game in results if game.outcome]
normal_wins = sum(1 for outcome in decided if outcome.points == 1)
gammons = sum(1 for outcome in decided if outcome.points == 2)
backgammons = sum(1 for outcome in decided if outcome.points == 3)
draws = sum(1 for game in results if game.outcome is None)

print(f"   Normal wins (1 pt):   {normal_wins}/100 ({normal_wins}%)")
print(f"   Gammons (2 pts):      {gammons}/100 ({gammons}%)")
print(f"   Backgammons (3 pts):  {backgammons}/100 ({backgammons}%)")
print(f"   Draws (timeout):      {draws}/100 ({draws}%)")

total_outcomes = sum((normal_wins, gammons, backgammons, draws))
print(f"\n   Total outcomes: {total_outcomes}/100")

# Any game not covered by the four categories above shows up as a shortfall.
print(
    f"   ‚úÖ All games have valid outcomes"
    if total_outcomes == 100
    else f"   ‚ö†Ô∏è  {100 - total_outcomes} games have missing/invalid outcomes!"
)

# Check winner consistency
white_wins_manual = sum(1 for outcome in decided if outcome.winner == Player.WHITE)
black_wins_manual = sum(1 for outcome in decided if outcome.winner == Player.BLACK)

print(f"\nüîç Manual count verification:")
print(f"   compute_game_statistics: White {stats['white_wins']}, Black {stats['black_wins']}")
print(f"   Manual count:            White {white_wins_manual}, Black {black_wins_manual}")

counts_agree = (
    white_wins_manual == stats['white_wins']
    and black_wins_manual == stats['black_wins']
)
print(
    f"   ‚úÖ Counts match - win detection seems correct"
    if counts_agree
    else f"   üö® Counts don't match - bug in compute_game_statistics!"
)

---
## Summary & Diagnosis

In [None]:
# Summary cell: restate the flags collected by Tests 1-4 and print a single
# diagnosis. Reads `test1_white_wr`, `test1_bug`, `test2_mirrors`,
# `test3_symmetric`, `test4_white_wr` and `test4_random_fair`.
print("=" * 70)
print("SUMMARY & DIAGNOSIS")
print("=" * 70)

# `test2_mirrors` is tri-state: True (mirrors), False (similar, not mirrored)
# or None (inconsistent). None must not be reported as "does not mirror".
if test2_mirrors is None:
    test2_label = '‚ö†Ô∏è  Inconclusive'
elif test2_mirrors:
    test2_label = '‚úÖ Mirrors'
else:
    test2_label = '‚ùå Does not mirror'

print("\nüîç Test Results:")
print(f"   Test 1 (Pip Self-Play):        White {test1_white_wr*100:.0f}% - {'‚ùå BIASED' if abs(test1_white_wr - 0.5) > 0.1 else '‚úÖ Fair'}")
print(f"   Test 2 (Swapped Colors):       {test2_label}")
print(f"   Test 3 (Position Symmetry):    {'‚úÖ Symmetric' if test3_symmetric else '‚ùå Asymmetric'}")
print(f"   Test 4 (Random Agents):        White {test4_white_wr*100:.0f}% - {'‚ùå BIASED' if abs(test4_white_wr - 0.5) > 0.1 else '‚úÖ Fair'}")

print("\nüí° Diagnosis:")

# Branches are ordered from the most fundamental failure to the most specific;
# only the first matching one is reported.
if not test4_random_fair:
    print("\nüö® CRITICAL: Random agents show bias!")
    print("   ‚Üí Bug is in CORE GAME LOGIC")
    print("   ‚Üí Check: move generation, board.player_to_move, win detection")
    print("\n   Recommended actions:")
    print("   1. Inspect move generation for White vs Black")
    print("   2. Check if player_to_move switches correctly")
    print("   3. Verify win detection logic")

elif not test3_symmetric:
    print("\nüö® Starting position is asymmetric!")
    print("   ‚Üí Bug is in initial_board() function")
    print("\n   Recommended actions:")
    print("   1. Fix initial_board() to be symmetric")
    print("   2. White: 2 on 24, 5 on 13, 3 on 8, 5 on 6")
    print("   3. Black: 2 on 1, 5 on 12, 3 on 17, 5 on 19")

elif test1_bug and test2_mirrors is None:
    # Test 2 neither mirrored nor matched Test 1, so it cannot tell a position
    # bias from an agent bias; say so instead of guessing.
    print("\n‚ö†Ô∏è  Pip count bias detected, but Test 2 was inconclusive")
    print("   ‚Üí Cannot separate position bias from agent/evaluation bias")
    print("   ‚Üí Multiple bugs or high variance")
    print("\n   Recommended actions:")
    print("   1. Re-run Tests 1 and 2 with more games to reduce variance")
    print("   2. Review pip_count() and pip_count_agent for White/Black asymmetry")
    print("   3. Test with simplified starting positions")

elif test1_bug and test2_mirrors:
    print("\n‚ö†Ô∏è  Position bias detected")
    print("   ‚Üí Starting position favors one side")
    print("   ‚Üí OR pip count evaluation is directionally biased")
    print("\n   Recommended actions:")
    print("   1. Review pip_count() calculation")
    print("   2. Check if pip count formula treats White/Black differently")
    print("   3. Test with simplified starting positions")

elif test1_bug and not test2_mirrors:
    print("\n‚ö†Ô∏è  Agent or evaluation logic bias")
    print("   ‚Üí Pip count agent evaluates positions differently for White/Black")
    print("\n   Recommended actions:")
    print("   1. Review pip_count_agent evaluate_position()")
    print("   2. Check if bonuses/penalties are symmetric")
    print("   3. Test simplified agent (pure pip count, no heuristics)")

else:
    print("\n‚úÖ No obvious bugs detected!")
    print("   ‚Üí Win rates are within expected range")
    print("   ‚Üí 30% in training might be due to complex variants")
    print("\n   Recommended actions:")
    print("   1. Simplify training to standard backgammon only")
    print("   2. Remove hypergammon, micro gammon variants")
    print("   3. Train longer (10K+ games)")

print("\n" + "=" * 70)
print("Debug complete! Review results above.")
print("=" * 70)