# 🤖 Poker Bot RL Training - Overnight Run (8 Hours)

This notebook trains your ensemble agents using Actor-Critic reinforcement learning.

**Estimated training time:** 7-8 hours on Colab GPU  
**Expected hands:** 500K-800K  
**Expected improvement:** +10-15% win rate

---

## Setup Instructions

1. **Upload to Google Colab**
2. **Runtime → Change runtime type → GPU (T4)**
3. **Run all cells**
4. **Let it train overnight**
5. **Download trained models in the morning**

The notebook will save checkpoints every 10K hands, so you won't lose progress if it disconnects.

## Step 1: Setup Environment

In [None]:
# Report the PyTorch build and choose the device used by the rest of the notebook.
import torch

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    # GPU runtime: describe device 0 and train on it.
    device = torch.device("cuda")
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
else:
    # No CUDA device visible: fall back to the CPU.
    device = torch.device("cpu")

print(f"\n‚úÖ Using device: {device}")

In [None]:
# Clone repository (or upload files manually)
import os

if not os.path.exists('AI-Texas-Holdem-CSC4444'):
    !git clone https://github.com/vekoLSU/AI-Texas-Holdem-CSC4444.git
    %cd AI-Texas-Holdem-CSC4444
else:
    %cd AI-Texas-Holdem-CSC4444
    !git pull

print("‚úÖ Repository ready")

In [None]:
# Install dependencies.
# torch, numpy and tqdm are imported by the cells below. websockets is not
# imported by any cell shown here -- presumably the poker_bot package needs
# it (TODO confirm). `-q` keeps pip's output short.
!pip install -q torch numpy websockets tqdm
print("‚úÖ Dependencies installed")

## Step 2: Import and Setup Training

In [None]:
import sys
sys.path.insert(0, 'src')

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from tqdm.auto import tqdm
import random
from collections import deque
import time
import json
from datetime import datetime

# Import poker bot components
from poker_bot.training.networks import ActorCriticAgent
from poker_bot.evaluation import HandEvaluator

print("‚úÖ Imports successful")

## Step 3: Simplified Poker Environment for Self-Play

In [None]:
class SimplePokerEnv:
    """Simplified heads-up (2-player) poker environment for self-play training.

    Simplifications made by this environment:
      * no blinds are posted: a hand starts with an empty pot and a bet of
        ``BIG_BLIND`` to call;
      * a check is always accepted, even when facing a bet;
      * a betting round is complete as soon as both players' bets for the
        round are equal;
      * showdown ties are awarded to player 1.

    Rewards are in chips, from the point of view of the player who acted:
    the whole pot on a win, minus the chips that player put into the pot
    over the entire hand on a fold or a showdown loss.
    """

    STARTING_CHIPS = 1000
    BIG_BLIND = 10  # Also used as the minimum raise increment
    PHASES = ('PREFLOP', 'FLOP', 'TURN', 'RIVER', 'SHOWDOWN')
    ACTOR_STATE_DIM = 50
    CRITIC_STATE_DIM = 70

    def __init__(self):
        self.hand_evaluator = HandEvaluator()
        self.reset()

    def reset(self):
        """Start a new hand and return player 0's observation."""
        # 2 players, heads-up, fresh stacks every hand
        self.players = [
            {'chips': self.STARTING_CHIPS, 'bet': 0, 'folded': False}
            for _ in range(2)
        ]
        self.pot = 0
        self.current_bet = self.BIG_BLIND
        self.phase = 'PREFLOP'

        # Deal cards (simplified): the full board is drawn up front and
        # revealed progressively by _get_visible_community().
        deck = self._create_deck()
        random.shuffle(deck)
        self.player_cards = [deck[:2], deck[2:4]]
        self.community_cards = deck[4:9]

        return self._get_state(0)

    def _create_deck(self):
        """Create a standard 52-card deck of rank+suit strings (e.g. 'Ah')."""
        ranks = ['2', '3', '4', '5', '6', '7', '8', '9', 'T', 'J', 'Q', 'K', 'A']
        suits = ['h', 'd', 'c', 's']
        return [r + s for r in ranks for s in suits]

    def _get_state(self, player_id, full_info=False):
        """Build the observation vector for ``player_id``.

        Args:
            player_id: 0 or 1, the player whose point of view is encoded.
            full_info: when True, append the opponent's hand strength
                (critic input); otherwise return the actor input only.

        Returns:
            float32 numpy array of length 50 (actor) or 70 (critic). The
            first 50 entries are identical in both cases.
        """
        visible_community = self._get_visible_community()

        # Own hand strength
        hand_eval = self.hand_evaluator.evaluate_hand_strength(
            self.player_cards[player_id],
            visible_community,
            self.phase
        )
        state = [hand_eval.get('strength', 0.5)]

        # Pot and bet info (normalized)
        state.append(self.pot / 2000)
        state.append(self.current_bet / 1000)
        state.append(self.players[player_id]['chips'] / 1000)
        state.append(self.players[1 - player_id]['chips'] / 1000)

        # Phase encoding (one-hot over the four betting phases)
        state.extend(
            1.0 if self.phase == p else 0.0
            for p in ('PREFLOP', 'FLOP', 'TURN', 'RIVER')
        )

        # Zero-pad to the actor dimension
        state.extend([0.0] * (self.ACTOR_STATE_DIM - len(state)))

        # For critic (full info), add the opponent's hand strength,
        # repeated to fill the 20 extra dims.
        if full_info:
            opp_hand_eval = self.hand_evaluator.evaluate_hand_strength(
                self.player_cards[1 - player_id],
                visible_community,
                self.phase
            )
            state.extend([opp_hand_eval.get('strength', 0.5)] * 20)

        size = self.CRITIC_STATE_DIM if full_info else self.ACTOR_STATE_DIM
        return np.array(state[:size], dtype=np.float32)

    def _get_visible_community(self):
        """Return the community cards revealed in the current phase."""
        revealed = {'PREFLOP': 0, 'FLOP': 3, 'TURN': 4}.get(self.phase, 5)
        return self.community_cards[:revealed]

    def _invested(self, player_id):
        """Total chips ``player_id`` has put into the pot this hand."""
        return self.STARTING_CHIPS - self.players[player_id]['chips']

    def _commit(self, player_id, chips):
        """Move ``chips`` from the player's stack into the pot."""
        player = self.players[player_id]
        player['bet'] += chips
        player['chips'] -= chips
        self.pot += chips

    def step(self, player_id, action, amount_ratio):
        """Apply one action for ``player_id``.

        Args:
            player_id: 0 or 1, the acting player.
            action: 0=fold, 1=call, 2=check, 3=raise. Any other value moves
                no chips (same as a check).
            amount_ratio: raise size as a fraction of the pot; only used
                when ``action == 3``.

        Returns:
            ``(next_state, reward, done)``. ``next_state`` is the acting
            player's new observation, or None when the hand is over.
        """
        player = self.players[player_id]

        if action == 0:  # Fold
            player['folded'] = True
            # Per-round bets are zeroed on every phase change, so the loss is
            # measured against the starting stack rather than player['bet'].
            return None, -self._invested(player_id), True

        if action == 1:  # Call
            to_call = self.current_bet - player['bet']
            # Never take more than the stack holds
            to_call = max(0, min(to_call, player['chips']))
            self._commit(player_id, to_call)

        elif action == 3:  # Raise
            target = int(amount_ratio * self.pot + self.current_bet)
            # Enforce the minimum raise first, then cap at all-in; the
            # reverse order could push the stack below zero.
            target = max(target, self.current_bet + self.BIG_BLIND)
            target = min(target, player['bet'] + player['chips'])
            self._commit(player_id, max(0, target - player['bet']))
            self.current_bet = max(self.current_bet, player['bet'])

        # action == 2 (check): no chips change

        # Check if hand is over
        done, reward = self._check_hand_end(player_id)

        # Advance phase if the betting round is complete
        if not done and self._betting_round_complete():
            self._advance_phase()
            done, reward = self._check_hand_end(player_id)

        next_state = None if done else self._get_state(player_id)
        return next_state, reward, done

    def _betting_round_complete(self):
        """Check if betting round is complete."""
        # Simplified: assume complete if bets are equal
        return self.players[0]['bet'] == self.players[1]['bet']

    def _advance_phase(self):
        """Move to the next phase and clear the per-round bets."""
        idx = self.PHASES.index(self.phase)
        if idx < len(self.PHASES) - 1:
            self.phase = self.PHASES[idx + 1]
            # Reset bets for new round
            for player in self.players:
                player['bet'] = 0
            self.current_bet = 0

    def _check_hand_end(self, player_id):
        """Return ``(done, reward)`` for ``player_id`` in the current state."""
        # Opponent folded
        if self.players[1 - player_id]['folded']:
            return True, self.pot

        # Showdown
        if self.phase == 'SHOWDOWN':
            if self._determine_winner() == player_id:
                return True, self.pot
            # Bets were already zeroed when entering SHOWDOWN, so charge the
            # loser the chips invested over the whole hand.
            return True, -self._invested(player_id)

        return False, 0

    def _determine_winner(self):
        """Return the id of the showdown winner (player 1 wins ties)."""
        board = self.community_cards[:5]
        strength_0 = self.hand_evaluator.evaluate_hand_strength(
            self.player_cards[0], board, 'RIVER'
        )['strength']
        strength_1 = self.hand_evaluator.evaluate_hand_strength(
            self.player_cards[1], board, 'RIVER'
        )['strength']
        return 0 if strength_0 > strength_1 else 1

print("‚úÖ Poker environment ready")

## Step 4: Training Loop with PPO

In [None]:
def train_agent_selfplay(num_hands=500000, save_every=10000, checkpoint_dir='checkpoints'):
    """Train an actor-critic agent via self-play with a PPO-style clipped update.

    One policy plays both seats of ``SimplePokerEnv``. After every hand the
    collected transitions are used for a few epochs of clipped-surrogate
    updates. Checkpoints and a small JSON stats file are written every
    ``save_every`` hands, and a final model is written at the end.

    Args:
        num_hands: number of hands (episodes) to play.
        save_every: checkpoint interval, in hands.
        checkpoint_dir: directory for checkpoints and stats (created if missing).

    Returns:
        The trained ``ActorCriticAgent``.

    NOTE(review): rewards from both seats are accumulated into one trajectory
    and one ``episode_reward``, each from the acting player's point of view,
    so the reported "win rate" is not a per-seat statistic. The raise-amount
    head is sampled during play but receives no loss term here.
    """
    # Imported locally: the update step needs torch.nn.functional, which the
    # notebook's import cell does not bring in under the name ``F``.
    import torch.nn.functional as F

    os.makedirs(checkpoint_dir, exist_ok=True)

    # Initialize agent
    agent = ActorCriticAgent(actor_state_dim=50, critic_state_dim=70).to(device)
    optimizer = optim.Adam(agent.parameters(), lr=3e-4)

    # Training params
    gamma = 0.99  # Discount factor
    clip_epsilon = 0.2  # PPO clip parameter
    ppo_epochs = 4  # Gradient steps per collected hand

    # Stats tracking (rolling window of the last 1000 hands)
    episode_rewards = deque(maxlen=1000)
    win_rate = deque(maxlen=1000)
    start_time = time.time()

    env = SimplePokerEnv()
    pbar = tqdm(total=num_hands, desc="Training")

    try:
        for hand in range(num_hands):
            env.reset()

            # Storage for episode
            actor_states, critic_states = [], []
            actions, rewards, values, log_probs = [], [], [], []

            player_id = 0
            done = False
            episode_reward = 0

            while not done:
                # Observe from the seat that is about to act. The critic
                # vector starts with the same 50 features the actor sees, so
                # one environment call yields both inputs.
                critic_obs = env._get_state(player_id, full_info=True)
                actor_obs = critic_obs[:50]

                actor_state = torch.as_tensor(
                    actor_obs, dtype=torch.float32, device=device
                ).unsqueeze(0)
                critic_state = torch.as_tensor(
                    critic_obs, dtype=torch.float32, device=device
                ).unsqueeze(0)

                # Get action from policy
                with torch.no_grad():
                    action_probs, amount, value = agent(actor_state, critic_state)
                    dist = torch.distributions.Categorical(action_probs)
                    action = dist.sample()
                    log_prob = dist.log_prob(action)

                action_idx = action.item()

                # Execute action
                _, reward, done = env.step(player_id, action_idx, amount.item())

                # Store the transition exactly as it was seen, so the update
                # below re-evaluates the same inputs.
                actor_states.append(actor_obs)
                critic_states.append(critic_obs)
                actions.append(action_idx)
                rewards.append(reward)
                values.append(value.item())
                log_probs.append(log_prob.item())

                episode_reward += reward

                # Alternate seats; the same policy plays both
                player_id = 1 - player_id

            # Update policy
            if rewards:
                # Discounted returns, computed back to front
                returns = []
                running = 0.0
                for r in reversed(rewards):
                    running = r + gamma * running
                    returns.append(running)
                returns.reverse()

                returns_t = torch.tensor(returns, dtype=torch.float32, device=device)
                values_t = torch.tensor(values, dtype=torch.float32, device=device)

                advantages = returns_t - values_t
                # std() of a single element is NaN, which would poison every
                # weight on the first one-step hand (e.g. an immediate fold).
                if advantages.numel() > 1:
                    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

                # Loop-invariant batch tensors
                batch_actor_states = torch.as_tensor(
                    np.stack(actor_states), dtype=torch.float32, device=device
                )
                batch_critic_states = torch.as_tensor(
                    np.stack(critic_states), dtype=torch.float32, device=device
                )
                actions_tensor = torch.tensor(actions, dtype=torch.long, device=device)
                old_log_probs = torch.tensor(log_probs, dtype=torch.float32, device=device)

                for _ in range(ppo_epochs):
                    optimizer.zero_grad()

                    # Recalculate log probs and values under the current weights
                    action_probs, _, values_new = agent(batch_actor_states, batch_critic_states)
                    dist = torch.distributions.Categorical(action_probs)
                    log_probs_new = dist.log_prob(actions_tensor)

                    # PPO clipped surrogate
                    ratio = torch.exp(log_probs_new - old_log_probs)
                    surr1 = ratio * advantages
                    surr2 = torch.clamp(ratio, 1 - clip_epsilon, 1 + clip_epsilon) * advantages

                    actor_loss = -torch.min(surr1, surr2).mean()
                    # reshape(-1) keeps a one-step batch 1-D so it matches
                    # returns_t; squeeze() would collapse it to 0-D.
                    critic_loss = F.mse_loss(values_new.reshape(-1), returns_t)
                    entropy = dist.entropy().mean()

                    loss = actor_loss + 0.5 * critic_loss - 0.01 * entropy

                    loss.backward()
                    optimizer.step()

            # Track stats
            episode_rewards.append(episode_reward)
            win_rate.append(1.0 if episode_reward > 0 else 0.0)

            # Update progress
            pbar.update(1)
            if hand % 100 == 0:
                avg_reward = np.mean(episode_rewards)
                avg_win_rate = np.mean(win_rate)
                pbar.set_postfix({
                    'reward': f'{avg_reward:.1f}',
                    'win_rate': f'{avg_win_rate:.2%}',
                    'elapsed': f'{(time.time() - start_time) / 3600:.1f}h'
                })

            # Save checkpoint
            if (hand + 1) % save_every == 0:
                checkpoint_path = f"{checkpoint_dir}/agent_checkpoint_{hand+1}.pt"
                agent.save(checkpoint_path)
                print(f"\n‚úÖ Checkpoint saved: {checkpoint_path}")

                # Save training stats
                stats = {
                    'hands': hand + 1,
                    'avg_reward': float(np.mean(episode_rewards)),
                    'win_rate': float(np.mean(win_rate)),
                    'elapsed_hours': (time.time() - start_time) / 3600
                }
                with open(f"{checkpoint_dir}/stats_{hand+1}.json", 'w') as f:
                    json.dump(stats, f, indent=2)
    finally:
        # Release the progress bar even if training is interrupted
        pbar.close()

    # Save final model
    final_path = f"{checkpoint_dir}/agent_final.pt"
    agent.save(final_path)
    print(f"\nüéâ Training complete! Final model saved: {final_path}")

    return agent

print("‚úÖ Training function ready")

## Step 5: Start Training (Run Overnight)

In [None]:
# Run configuration
NUM_HANDS = 500000  # total hands to play (~7-8 hours on GPU)
SAVE_EVERY = 10000  # checkpoint interval, in hands
CHECKPOINT_DIR = 'trained_models'

# Announce the run
print(
    f"üöÄ Starting training for {NUM_HANDS:,} hands",
    f"üíæ Checkpoints will be saved every {SAVE_EVERY:,} hands",
    f"üìÅ Output directory: {CHECKPOINT_DIR}",
    f"\n‚è∞ Started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    "\n" + "=" * 60,
    sep="\n",
)

# Kick off the (long-running) training loop
trained_agent = train_agent_selfplay(
    checkpoint_dir=CHECKPOINT_DIR,
    save_every=SAVE_EVERY,
    num_hands=NUM_HANDS,
)

# Closing banner
print(
    "\n" + "=" * 60,
    f"‚úÖ Training finished at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    "\nüéâ Your trained models are ready to download!",
    sep="\n",
)

## Step 6: Download Trained Models

In [None]:
# Download final model and checkpoints
import glob
import re

from google.colab import files


def _hands_in_name(path):
    """Return the hand count embedded in a file name like ``..._90000.pt`` (-1 if absent)."""
    match = re.search(r'_(\d+)\.\w+$', path)
    return int(match.group(1)) if match else -1


print("üì• Downloading trained models...")

# Download final model
files.download(f'{CHECKPOINT_DIR}/agent_final.pt')

# Download latest checkpoint.
# Sort on the numeric hand count: a plain string sort puts "..._90000.pt"
# after "..._500000.pt" and would pick the wrong file as "latest".
checkpoints = sorted(glob.glob(f'{CHECKPOINT_DIR}/agent_checkpoint_*.pt'), key=_hands_in_name)
if checkpoints:
    files.download(checkpoints[-1])

# Download training stats (same numeric ordering)
stats_files = sorted(glob.glob(f'{CHECKPOINT_DIR}/stats_*.json'), key=_hands_in_name)
if stats_files:
    files.download(stats_files[-1])

print("\n‚úÖ Downloads complete!")
print("\nüìã Next steps:")
print("1. Upload agent_final.pt to your project")
print("2. Load it into your ensemble agents")
print("3. Test against infrastructure")
print("4. DOMINATE THE COMPETITION! üèÜ")

## Optional: Visualize Training Progress

In [None]:
import glob
import json

import matplotlib.pyplot as plt

# Load and plot training stats
stats_files = glob.glob(f'{CHECKPOINT_DIR}/stats_*.json')

if stats_files:
    records = []
    for stats_file in stats_files:
        with open(stats_file) as f:
            records.append(json.load(f))

    # Order by hand count. File names sort as strings ("90000" after
    # "500000"), which would make the curves zigzag back and forth.
    records.sort(key=lambda record: record['hands'])

    hands_list = [record['hands'] for record in records]
    rewards_list = [record['avg_reward'] for record in records]
    win_rates_list = [record['win_rate'] for record in records]

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

    # Plot rewards
    ax1.plot(hands_list, rewards_list, linewidth=2)
    ax1.set_xlabel('Hands Played')
    ax1.set_ylabel('Average Reward')
    ax1.set_title('Training Progress: Rewards')
    ax1.grid(True, alpha=0.3)

    # Plot win rate (stored as a fraction, shown as a percentage)
    ax2.plot(hands_list, [w * 100 for w in win_rates_list], linewidth=2, color='green')
    ax2.set_xlabel('Hands Played')
    ax2.set_ylabel('Win Rate (%)')
    ax2.set_title('Training Progress: Win Rate')
    ax2.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig(f'{CHECKPOINT_DIR}/training_progress.png', dpi=150, bbox_inches='tight')
    plt.show()

    # Imported here so this optional cell also works when the download cell
    # above has not been run in the current session.
    from google.colab import files

    files.download(f'{CHECKPOINT_DIR}/training_progress.png')
    print("‚úÖ Training visualization saved and downloaded!")
else:
    print("‚ö†Ô∏è  No training stats found. Run training first.")