# Poker Agent V8: Realistic Opponent Modeling

This notebook expands the V7 Adaptive Agent to train against **Realistic Opponents**. 

### New Opponent Types (Heuristic Bots)
Humans don't play like pure Maniacs or pure Nits. New bots use **Monte Carlo Hand Evaluation** to make decisions:

1. **ValueBot (Tight-Aggressive)**: Only bets/raises with strong hands (high win probability). Folds weak hands. 
   * *Strategy*: If Win% > 70%, Raise. If Win% > 40%, Call. Else Fold.
2. **BluffBot (Loose-Aggressive)**: Plays like ValueBot but mixes in bluffs.
   * *Strategy*: Similar to ValueBot, but 30% of the time with weak hands, it raises to bluff.
3. **BalancedBot (Basic Strategy)**: Considers **Pot Odds**.
   * *Strategy*: Calls if Win% > Pot Odds. Raises with very strong hands.

### Architecture (Same as V7)
- **Dual-Branch DRQN**: (MLP for State + LSTM for History)
- **Persistent Memory**: Remembers action history across hands to identify opponent style.

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import gymnasium as gym
from gymnasium import spaces
from collections import deque
import random
import matplotlib.pyplot as plt
from typing import Optional, Tuple, List, Dict, Any
import copy
from itertools import combinations

from pokerkit import Automation, NoLimitTexasHoldem, Card, StandardHighHand, Deck

# Constants
SEED = 42
MAX_HISTORY_LEN = 100
ACTION_EMBED_DIM = 16
HIDDEN_DIM_LSTM = 128

# Actions
ENV_FOLD = 0
ENV_CHECK_CALL = 1
ENV_BET_RAISE = 2
NUM_ACTIONS = 3

# History Tokens
ACT_PAD = 0
ACT_V_FOLD = 1
ACT_V_CHECK_CALL = 2
ACT_V_BET_RAISE = 3
OPP_FOLD = 4
OPP_CHECK_CALL = 5
OPP_BET_RAISE = 6
OUT_AGENT_WIN = 7
OUT_AGENT_LOSS = 8
OUT_TIE = 9
OUT_NEW_HAND = 10
HISTORY_VOCAB_SIZE = 11

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [11]:
# Re-using V7 Environment and Model Components

class PersistentPokerEnv(gym.Env):
    def __init__(self, num_players: int = 2, starting_stack: int = 1000, 
                 small_blind: int = 5, big_blind: int = 10):
        super().__init__()
        self.num_players = num_players
        self.starting_stack = starting_stack
        self.small_blind = small_blind
        self.big_blind = big_blind
        self.game_state_dim = 52*2 + 52*5 + num_players + 1 + 1 + 4 + 1 
        self.observation_space = spaces.Dict({
            'game_state': spaces.Box(low=0, high=1, shape=(self.game_state_dim,), dtype=np.float32),
            'history': spaces.Box(low=0, high=HISTORY_VOCAB_SIZE-1, shape=(MAX_HISTORY_LEN,), dtype=np.int64)
        })
        self.action_space = spaces.Discrete(NUM_ACTIONS)
        self.state = None
        self.agent_player_index = 0
        self.global_history = deque(maxlen=MAX_HISTORY_LEN)
        for _ in range(MAX_HISTORY_LEN): self.global_history.append(ACT_PAD)
        
    def _card_to_index(self, card: Card) -> int:
        ranks = '23456789TJQKA'
        suits = 'cdhs'
        rank_idx = ranks.index(card.rank)
        suit_idx = suits.index(card.suit)
        return rank_idx * 4 + suit_idx
    
    def _encode_card(self, card: Optional[Card]) -> np.ndarray:
        encoding = np.zeros(52, dtype=np.float32)
        if card is not None: encoding[self._card_to_index(card)] = 1.0
        return encoding
    
    def _flatten_cards(self, cards) -> List:
        flat = []
        # Handle both list of cards and single card case safely
        if isinstance(cards, Card): return [cards]
        for item in cards:
            if hasattr(item, 'rank'): flat.append(item)
            else: flat.extend(self._flatten_cards(item))
        return flat
    
    def _get_observation(self) -> Dict[str, Any]:
        state_vector = []
        hole_cards = self._flatten_cards(self.state.hole_cards[self.agent_player_index])
        for i in range(2):
            if i < len(hole_cards): state_vector.extend(self._encode_card(hole_cards[i]))
            else: state_vector.extend(np.zeros(52, dtype=np.float32))
        board_cards = self._flatten_cards(self.state.board_cards)
        for i in range(5):
            if i < len(board_cards): state_vector.extend(self._encode_card(board_cards[i]))
            else: state_vector.extend(np.zeros(52, dtype=np.float32))
        for i in range(self.num_players):
            stack = self.state.stacks[i] / self.starting_stack
            state_vector.append(min(stack, 2.0))
        total_pot = sum(self.state.bets)
        state_vector.append(total_pot / (self.starting_stack * self.num_players))
        if self.state.actor_index is not None:
            state_vector.append(self.state.actor_index / max(1, self.num_players - 1))
        else: state_vector.append(0.0)
        street = [0.0, 0.0, 0.0, 0.0]
        num_board = len(board_cards)
        if num_board == 0: street[0] = 1.0
        elif num_board == 3: street[1] = 1.0
        elif num_board == 4: street[2] = 1.0
        else: street[3] = 1.0
        state_vector.extend(street)
        state_vector.append(float(self.agent_player_index))
        
        return {
            'game_state': np.array(state_vector, dtype=np.float32),
            'history': np.array(list(self.global_history), dtype=np.int64)
        }
    
    def _update_history(self, player_idx: int, action: int):
        if player_idx == self.agent_player_index:
            if action == ENV_FOLD: token = ACT_V_FOLD
            elif action == ENV_CHECK_CALL: token = ACT_V_CHECK_CALL
            else: token = ACT_V_BET_RAISE
        else:
            if action == ENV_FOLD: token = OPP_FOLD
            elif action == ENV_CHECK_CALL: token = OPP_CHECK_CALL
            else: token = OPP_BET_RAISE
        self.global_history.append(token)

    def append_outcome_token(self, final_reward: float):
        if final_reward > 0: self.global_history.append(OUT_AGENT_WIN)
        elif final_reward < 0: self.global_history.append(OUT_AGENT_LOSS)
        else: self.global_history.append(OUT_TIE)

    def _get_legal_actions(self) -> List[int]:
        legal = []
        if self.state.can_fold(): legal.append(ENV_FOLD)
        if self.state.can_check_or_call(): legal.append(ENV_CHECK_CALL)
        if self.state.can_complete_bet_or_raise_to(): legal.append(ENV_BET_RAISE)
        return legal if legal else [ENV_CHECK_CALL]
    
    def _execute_action(self, action: int) -> None:
        if action == ENV_FOLD:
            if self.state.can_fold(): self.state.fold()
            elif self.state.can_check_or_call(): self.state.check_or_call()
        elif action == ENV_CHECK_CALL:
            if self.state.can_check_or_call(): self.state.check_or_call()
            elif self.state.can_fold(): self.state.fold()
        elif action == ENV_BET_RAISE:
            if self.state.can_complete_bet_or_raise_to():
                min_r = self.state.min_completion_betting_or_raising_to_amount
                max_r = self.state.max_completion_betting_or_raising_to_amount
                self.state.complete_bet_or_raise_to(min(min_r * 2, max_r))
            elif self.state.can_check_or_call():
                self.state.check_or_call()
    
    def _run_automations(self) -> None:
        while self.state.can_burn_card(): self.state.burn_card('??')
        while self.state.can_deal_board(): self.state.deal_board()
        while self.state.can_push_chips(): self.state.push_chips()
        while self.state.can_pull_chips(): self.state.pull_chips()
    
    def reset(self, seed=None, options=None) -> Tuple[Dict, Dict]:
        self.global_history.append(OUT_NEW_HAND)
        super().reset(seed=seed)
        self.state = NoLimitTexasHoldem.create_state(
            automations=(Automation.ANTE_POSTING, Automation.BET_COLLECTION, Automation.BLIND_OR_STRADDLE_POSTING, Automation.HOLE_CARDS_SHOWING_OR_MUCKING, Automation.HAND_KILLING, Automation.CHIPS_PUSHING, Automation.CHIPS_PULLING),
            ante_trimming_status=True,
            raw_antes={-1: 0},
            raw_blinds_or_straddles=(self.small_blind, self.big_blind),
            min_bet=self.big_blind,
            raw_starting_stacks=[self.starting_stack] * self.num_players,
            player_count=self.num_players,
        )
        while self.state.can_deal_hole(): self.state.deal_hole()
        self._run_automations()
        return self._get_observation(), {'legal_actions': self._get_legal_actions()}
    
    def step(self, action: int) -> Tuple[Dict, float, bool, bool, Dict]:
        if self.state.actor_index is not None:
             self._update_history(self.state.actor_index, action)
        self._execute_action(action)
        self._run_automations()
        done = self.state.status is False
        reward = 0.0
        if done:
            reward = (self.state.stacks[self.agent_player_index] - self.starting_stack) / self.big_blind
        obs = self._get_observation()
        info = {'legal_actions': self._get_legal_actions() if not done else []}
        return obs, reward, done, False, info
    
    def get_final_reward(self) -> float:
        return (self.state.stacks[self.agent_player_index] - self.starting_stack) / self.big_blind
    
    def update_opponent_history(self, action: int):
        opp_idx = 1 - self.agent_player_index
        self._update_history(opp_idx, action)

class DualBranchV8(nn.Module):
    def __init__(self, state_dim: int, action_dim: int):
        super().__init__()
        self.state_net = nn.Sequential(
            nn.Linear(state_dim, 256), nn.LayerNorm(256), nn.ReLU(),
            nn.Linear(256, 128), nn.ReLU()
        )
        self.action_embedding = nn.Embedding(HISTORY_VOCAB_SIZE, ACTION_EMBED_DIM)
        self.lstm = nn.LSTM(input_size=ACTION_EMBED_DIM, hidden_size=HIDDEN_DIM_LSTM, batch_first=True)
        self.value_head = nn.Sequential(
            nn.Linear(128 + HIDDEN_DIM_LSTM, 256), nn.ReLU(),
            nn.Linear(256, action_dim)
        )
        
    def forward(self, state, history):
        s_feat = self.state_net(state)
        h_embed = self.action_embedding(history)
        lstm_out, (hn, cn) = self.lstm(h_embed)
        h_context = hn[-1]
        combined = torch.cat([s_feat, h_context], dim=1)
        return self.value_head(combined)

class ReplayBufferV8:
    def __init__(self, capacity=50000):
        self.buffer = deque(maxlen=capacity)
    def push(self, transition):
        self.buffer.append(transition)
    def sample(self, batch_size):
        return random.sample(self.buffer, min(len(self.buffer), batch_size))
    def __len__(self): return len(self.buffer)

class AdaptiveAgentV8:
    def __init__(self, state_dim, action_dim=NUM_ACTIONS, lr=1e-4):
        self.model = DualBranchV8(state_dim, action_dim).to(device)
        self.target_model = DualBranchV8(state_dim, action_dim).to(device)
        self.target_model.load_state_dict(self.model.state_dict())
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.gamma = 0.99
        self.epsilon = 1.0
        self.epsilon_min = 0.05
        self.epsilon_decay = 0.99995
        
    def select_action(self, obs, legal_actions, eval_mode=False):
        if not eval_mode and random.random() < self.epsilon:
            return random.choice(legal_actions)
        state_t = torch.FloatTensor(obs['game_state']).unsqueeze(0).to(device)
        h_t = torch.LongTensor(obs['history']).unsqueeze(0).to(device)
        with torch.no_grad():
            q_values = self.model(state_t, h_t)
        q_numpy = q_values.cpu().numpy().flatten()
        masked_q = np.full(NUM_ACTIONS, -np.inf)
        for a in legal_actions: masked_q[a] = q_numpy[a]
        return int(np.argmax(masked_q))

    def train(self, buffer, batch_size=64):
        if len(buffer) < batch_size: return None
        batch = buffer.sample(batch_size)
        states = torch.FloatTensor(np.array([t[0] for t in batch])).to(device)
        histories = torch.LongTensor(np.array([t[1] for t in batch])).to(device)
        actions = torch.LongTensor(np.array([t[2] for t in batch])).to(device)
        rewards = torch.FloatTensor(np.array([t[3] for t in batch])).to(device)
        next_states = torch.FloatTensor(np.array([t[4] for t in batch])).to(device)
        next_histories = torch.LongTensor(np.array([t[5] for t in batch])).to(device)
        dones = torch.FloatTensor(np.array([t[6] for t in batch])).to(device)
        
        current_q = self.model(states, histories).gather(1, actions.unsqueeze(1)).squeeze(1)
        with torch.no_grad():
            next_actions = self.model(next_states, next_histories).argmax(1).unsqueeze(1)
            target_q_next = self.target_model(next_states, next_histories).gather(1, next_actions).squeeze(1)
            target = rewards + (1 - dones) * self.gamma * target_q_next
        loss = F.mse_loss(current_q, target)
        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
        self.optimizer.step()
        return loss.item()
    
    def update_target(self): self.target_model.load_state_dict(self.model.state_dict())

In [12]:
# Heuristic Hand Evaluator

def monte_carlo_equity(hole_cards: List[Card], board_cards: List[Card], iterations=50) -> float:
    """Simulate hand outcome against a random hand to estimate equity."""
    if not hole_cards: return 0.5 
    
    wins = 0
    known_cards = set(hole_cards + board_cards)
    
    for _ in range(iterations):
        deck_cards = [c for c in Deck.STANDARD if c not in known_cards]
        random.shuffle(deck_cards)
        
        opp_hole = deck_cards[:2]
        
        needed_board = 5 - len(board_cards)
        sim_board = board_cards + deck_cards[2:2+needed_board]
        
        # Total 7 cards for each
        my_total = hole_cards + sim_board
        opp_total = opp_hole + sim_board
        
        # FIX: StandardHighHand takes exactly 5 cards. 
        # We must find the best 5-card subset from the 7 available cards.
        my_hand = max(StandardHighHand(c) for c in combinations(my_total, 5))
        opp_hand = max(StandardHighHand(c) for c in combinations(opp_total, 5))
        
        if my_hand > opp_hand:
            wins += 1
        elif my_hand == opp_hand:
            wins += 0.5
            
    return wins / iterations

# Flattening helper for extracting cards safely
def flatten_cards_list(items):
    out = []
    # Handle single card or list
    if isinstance(items, Card): return [items]
    for x in items:
        if isinstance(x, (list, tuple)):
            out.extend(flatten_cards_list(x))
        else:
            out.append(x)
    return out

class HeuristicBot:
    def __init__(self, player_idx=1):
        self.player_idx = player_idx
        
    def get_equity(self, state):
        # Extract cards from state and flatten them to ensure simple list of Cards
        # state.hole_cards[idx] might be [c1, c2]
        hole = flatten_cards_list(state.hole_cards[self.player_idx])
        board = flatten_cards_list(state.board_cards)
        return monte_carlo_equity(hole, board, iterations=40)
        
    def select_action(self, state, legal_actions):
        raise NotImplementedError

class ValueBot(HeuristicBot):
    """Bets only with strong hands."""
    def select_action(self, state, legal_actions):
        equity = self.get_equity(state)
        
        if equity > 0.70:
            if ENV_BET_RAISE in legal_actions: return ENV_BET_RAISE
        
        if equity > 0.45:
            if ENV_CHECK_CALL in legal_actions: return ENV_CHECK_CALL
        
        # Fold if weak
        if ENV_FOLD in legal_actions: return ENV_FOLD
        return ENV_CHECK_CALL

class BluffBot(HeuristicBot):
    """Mixes value bets with bluffs."""
    def select_action(self, state, legal_actions):
        equity = self.get_equity(state)
        
        # Strong: Raise 
        if equity > 0.70:
             if ENV_BET_RAISE in legal_actions: return ENV_BET_RAISE
        
        # Weak: Bluff chance (30%)
        if equity < 0.40:
            if random.random() < 0.30:
                 if ENV_BET_RAISE in legal_actions: return ENV_BET_RAISE
        
        # Medium: Check/Call
        if equity > 0.45:
             if ENV_CHECK_CALL in legal_actions: return ENV_CHECK_CALL
             
        if ENV_FOLD in legal_actions: return ENV_FOLD
        return ENV_CHECK_CALL

class BalancedBot(HeuristicBot):
    """Considers Pot Odds."""
    def select_action(self, state, legal_actions):
        equity = self.get_equity(state)
        
        # Calculate Pot Odds
        # Pot odds = Call Amount / (Total Pot + Call Amount)
        # We approximate relative to pot size
        
        # If equity is huge, raise
        if equity > 0.8:
             if ENV_BET_RAISE in legal_actions: return ENV_BET_RAISE
             
        # Call logic based on Odds
        if ENV_CHECK_CALL in legal_actions:
            # Simple threshold: if > 50% equity always call
            if equity > 0.5: return ENV_CHECK_CALL
            
            # If equity > 0.3 and it's just a check or small bet, call
            current_bet = max(state.bets)
            my_bet = state.bets[self.player_idx]
            to_call = current_bet - my_bet
            total_pot = sum(state.bets)
            
            if to_call == 0: return ENV_CHECK_CALL # Check
            
            pot_odds = to_call / (total_pot + to_call + 1e-5)
            if equity > pot_odds:
                return ENV_CHECK_CALL
        
        if ENV_FOLD in legal_actions: return ENV_FOLD
        return ENV_CHECK_CALL

# Original Opponents
class ManiacAgent:
    def select_action(self, state, legal_actions):
        if ENV_BET_RAISE in legal_actions: return ENV_BET_RAISE
        if ENV_CHECK_CALL in legal_actions: return ENV_CHECK_CALL
        return ENV_FOLD
class NitAgent:
    def select_action(self, state, legal_actions):
        if ENV_FOLD in legal_actions and ENV_CHECK_CALL in legal_actions:
            if random.random() < 0.9: return ENV_FOLD
        if ENV_CHECK_CALL in legal_actions: return ENV_CHECK_CALL
        return ENV_FOLD
class RandomAgent:
    def select_action(self, state, legal_actions): return random.choice(legal_actions)

In [13]:
def train_v8(num_hands=50000):
    env = PersistentPokerEnv()
    agent = AdaptiveAgentV8(env.game_state_dim)
    buffer = ReplayBufferV8(capacity=50000)
    
    # Expanded Opponent Pool
    opps = {
        'Maniac': ManiacAgent(),
        'Nit': NitAgent(),
        'Random': RandomAgent(),
        'ValueBot': ValueBot(),
        'BluffBot': BluffBot(),
        'Balanced': BalancedBot()
    }
    opp_names = list(opps.keys())
    
    stats = {name: {'rewards': [], 'wins': 0, 'hands': 0} for name in opp_names}
    
    print(f"Training V8 for {num_hands} hands against full realistic opponent pool...")
    current_opp_name = 'Random'
    
    for hand in range(num_hands):
        if hand % 20 == 0:
            current_opp_name = random.choice(opp_names)
            
        opponent = opps[current_opp_name]
        
        obs, info = env.reset()
        done = False
        episode_transitions = []
        pending_agent_obs = None
        pending_agent_action = None
        
        while not done:
            if env.state.actor_index == env.agent_player_index:
                if pending_agent_obs is not None:
                    episode_transitions.append((pending_agent_obs['game_state'], pending_agent_obs['history'], pending_agent_action, 0.0, obs['game_state'], obs['history'], False, info['legal_actions']))
                
                action = agent.select_action(obs, info['legal_actions'])
                pending_agent_obs = obs
                pending_agent_action = action
                obs, reward, done, _, info = env.step(action)
                
                if done:
                    pass
            else:
                # Pass 'state' to opponent logic for hand evaluation
                action = opponent.select_action(env.state, info['legal_actions'])
                env.update_opponent_history(action)
                env._execute_action(action)
                env._run_automations()
                done = env.state.status is False
                if not done:
                    obs = env._get_observation()
                    info['legal_actions'] = env._get_legal_actions()
        
        final_reward = env.get_final_reward()
        env.append_outcome_token(final_reward)
        term_obs = env._get_observation()
        
        if pending_agent_obs is not None:
             episode_transitions.append((pending_agent_obs['game_state'], pending_agent_obs['history'], pending_agent_action, 0.0, term_obs['game_state'], term_obs['history'], True, []))
        
        stats[current_opp_name]['rewards'].append(final_reward)
        stats[current_opp_name]['hands'] += 1
        if final_reward > 0: stats[current_opp_name]['wins'] += 1
        
        for i in range(len(episode_transitions)):
            s, h, a, r, ns, nh, d, l = episode_transitions[i]
            buffer.push((s, h, a, final_reward, ns, nh, d, l))
            
        if len(buffer) > 1000:
            agent.train(buffer)
        agent.epsilon = max(agent.epsilon_min, agent.epsilon * agent.epsilon_decay)
        if hand % 500 == 0: agent.update_target()
        
        if hand % 2500 == 0 and hand > 0:
            print(f"\n=== Checkpoint Hand {hand} (Eps: {agent.epsilon:.2f}) ===")
            # Split output for readability
            print("--- Extreme Opponents ---")
            for name in ['Maniac', 'Nit', 'Random']:
                 if stats[name]['hands'] > 0:
                     print(f"{name}: Avg {np.mean(stats[name]['rewards'][-200:]):.2f} BB | Win {stats[name]['wins']/stats[name]['hands']:.1%}")
            print("--- Realistic Opponents ---")
            for name in ['ValueBot', 'BluffBot', 'Balanced']:
                 if stats[name]['hands'] > 0:
                     print(f"{name}: Avg {np.mean(stats[name]['rewards'][-200:]):.2f} BB | Win {stats[name]['wins']/stats[name]['hands']:.1%}")

    return agent, stats

def plot_v8_results(stats):
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
    
    # Ax1: Extreme
    for name in ['Maniac', 'Nit', 'Random']:
        data = stats[name]['rewards']
        if len(data) > 100:
            ma = np.convolve(data, np.ones(100)/100, mode='valid')
            ax1.plot(ma, label=name)
    ax1.set_title("Performance vs Extreme Bots")
    ax1.legend()
    
    # Ax2: Realistic
    for name in ['ValueBot', 'BluffBot', 'Balanced']:
        data = stats[name]['rewards']
        if len(data) > 100:
            ma = np.convolve(data, np.ones(100)/100, mode='valid')
            ax2.plot(ma, label=name)
    ax2.set_title("Performance vs Realistic Bots")
    ax2.legend()
    
    plt.show()
    plt.savefig("v8_results.png")

agent, stats = train_v8(50000)
plot_v8_results(stats)

Training V8 for 50000 hands against full realistic opponent pool...

=== Checkpoint Hand 2500 (Eps: 0.88) ===
--- Extreme Opponents ---
Maniac: Avg -10.44 BB | Win 11.7%
Nit: Avg 0.50 BB | Win 98.8%
Random: Avg -2.23 BB | Win 66.4%
--- Realistic Opponents ---
ValueBot: Avg -3.32 BB | Win 74.2%
BluffBot: Avg -4.68 BB | Win 67.5%
Balanced: Avg -4.46 BB | Win 66.3%

=== Checkpoint Hand 5000 (Eps: 0.78) ===
--- Extreme Opponents ---
Maniac: Avg -14.62 BB | Win 9.5%
Nit: Avg 0.58 BB | Win 99.2%
Random: Avg 2.06 BB | Win 65.5%
--- Realistic Opponents ---
ValueBot: Avg -5.74 BB | Win 73.3%
BluffBot: Avg -0.81 BB | Win 67.4%
Balanced: Avg -2.36 BB | Win 66.5%

=== Checkpoint Hand 7500 (Eps: 0.69) ===
--- Extreme Opponents ---
Maniac: Avg -8.07 BB | Win 8.5%
Nit: Avg 0.53 BB | Win 99.0%
Random: Avg -0.35 BB | Win 64.4%
--- Realistic Opponents ---
ValueBot: Avg -0.27 BB | Win 72.4%
BluffBot: Avg -2.23 BB | Win 64.9%
Balanced: Avg -2.78 BB | Win 64.5%

=== Checkpoint Hand 10000 (Eps: 0.61) ===
--

KeyboardInterrupt: 