In [None]:
import torch
import random
import numpy as np
import math
from collections import deque
from pypokerengine.api.emulator import Emulator
from pypokerengine.utils.card_utils import gen_cards, estimate_hole_card_win_rate
from pypokerengine.players import BasePokerPlayer
import datetime
from torch.utils.tensorboard import SummaryWriter




# Module-level accumulators and constants shared by the training code below.
# Functions that modify them declare them with `global`.

# Running sums of the per-update loss of each player (added to in optimize_model).
total_loss_p1 = 0
total_loss_p2 = 0
# Running sums of the rewards computed for each player (added to in train_poker_bot).
total_rewards_p1 = 0
total_rewards_p2 = 0
# Chips each player starts a game with; also the divisor used to scale
# pot/stack/bet features in DQNPlayer.encode_state.
starting_stack = 1000
# Round-win counters updated in train_poker_bot.
num_wins_p1 = 0
num_wins_p2 = 0
# Sums of (stack - starting_stack) taken after every finished round.
total_profits_p1 = 0
total_profits_p2 = 0
# Sums of the round winner's reported stack, used for the "Average Win Size" scalars.
amount_won_p1 = 0
amount_won_p2 = 0



# Define Experience Replay class
class ReplayMemory:
    """Fixed-capacity FIFO store of transitions used for experience replay.

    Once ``capacity`` transitions are held, each new one evicts the oldest.
    """

    def __init__(self, capacity):
        # A bounded deque discards from the left automatically when full.
        self.memory = deque([], maxlen=capacity)

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.memory)

    def push(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` transition."""
        transition = (state, action, reward, next_state, done)
        self.memory.append(transition)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions picked at random.

        Raises ``ValueError`` (from ``random.sample``) when fewer than
        ``batch_size`` transitions are stored.
        """
        return random.sample(self.memory, batch_size)

# Define Poker DQN
class PokerDQN(torch.nn.Module):
    """Fully connected Q-network: ``n_states`` inputs -> 128 -> 128 -> ``n_actions``.

    The output row holds one Q-value per action (Fold, Call, Raise).
    """

    def __init__(self, n_states, n_actions):
        super().__init__()
        hidden_units = 128
        # Attribute names are part of the state_dict keys, so they stay as-is.
        self.layer1 = torch.nn.Linear(n_states, hidden_units)
        self.layer2 = torch.nn.Linear(hidden_units, hidden_units)
        self.layer3 = torch.nn.Linear(hidden_units, n_actions)

    def forward(self, x):
        """Map a batch of state vectors to a batch of Q-value rows."""
        first_hidden = torch.relu(self.layer1(x))
        second_hidden = torch.relu(self.layer2(first_hidden))
        # No activation on the output layer: Q-values are unbounded.
        return self.layer3(second_hidden)

# Define a custom player class
class DQNPlayer(BasePokerPlayer):
    """Poker player whose actions come from a DQN with epsilon-greedy exploration.

    The class relies on module-level names defined elsewhere in this file:
    ``writer`` (TensorBoard writer), ``device``, ``starting_stack`` and the
    ``EPS_START`` / ``EPS_END`` / ``EPS_DECAY`` exploration constants.
    """

    # Numeric codes for the opponent's last action inside the state vector.
    # They follow the fold/call/raise index order used for the Q-values.
    # Anything else (blinds, ante, no action yet) is encoded as 0.
    _ACTION_CODES = {"FOLD": 0, "CALL": 1, "RAISE": 2}

    def __init__(self, uuid, policy_net, target_net, optimizer, memory):
        super().__init__()
        self.uuid = uuid
        self.policy_net = policy_net
        self.target_net = target_net
        self.optimizer = optimizer
        self.memory = memory
        # Number of action selections made so far; drives the epsilon decay
        # and serves as the TensorBoard step for this player's scalars.
        self.steps_done = 0

    def declare_action(self, valid_actions, hole_card, game_state, round_state):
        """Pick an action for the current decision point.

        Args:
            valid_actions: list of action dicts, indexed 0 = fold, 1 = call,
                2 = raise; the raise entry's "amount" is a dict with
                "min"/"max".
            hole_card: this player's two hole cards as card strings.
            game_state: unused here; kept for the caller's signature.
            round_state: round-state dict passed to ``encode_state``.

        Returns:
            ``(action_name, amount)``.
        """
        state = self.encode_state(hole_card, round_state, starting_stack)

        action_index = self.select_action(state, valid_actions)  # 0 = fold, 1 = call, 2 = raise
        chosen = valid_actions[action_index]
        action = chosen["action"]

        # A raise carries {"min": ..., "max": ...}; always raise the minimum.
        amount = chosen.get("amount", 0)
        if isinstance(amount, dict):
            amount = amount.get("min", 0)

        call_amount = valid_actions[1]["amount"]

        # Never fold when staying in the hand is free (check instead).
        if action_index == 0 and call_amount == 0:
            return "call", 0

        # A negative minimum raise marks raising as unavailable. Sending it
        # would be an illegal action (which, as far as I can tell from the
        # engine, is converted to a fold), so call instead.
        if action == "raise" and amount < 0:
            return "call", call_amount

        return action, amount

    def get_hole_cards(self, game_state):
        """Return this player's two hole cards as strings, read from the table seats.

        Returns ``None`` when no seated player has this uuid.
        """
        for player in game_state["table"].seats.players:
            if player.uuid == self.uuid:
                card1 = str(player.hole_card[0])
                card2 = str(player.hole_card[1])
                return card1, card2
        return None

    def select_action(self, state, valid_actions):
        """Epsilon-greedy selection: random index or the arg-max Q-value index.

        Epsilon decays exponentially from ``EPS_START`` towards ``EPS_END``
        as ``steps_done`` grows.
        """
        eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp(-1.0 * self.steps_done / EPS_DECAY)
        self.steps_done += 1
        state = state.to(torch.float32)

        writer.add_scalar(f"Exploration/{self.uuid}_Epsilon", eps_threshold, self.steps_done)

        if random.random() > eps_threshold:
            with torch.no_grad():
                q_values = self.policy_net(state)
            # Output order matches valid_actions: 0 = fold, 1 = call, 2 = raise.
            writer.add_scalar(f"Fold Q-Values/{self.uuid}", q_values[0][0].item(), self.steps_done)
            writer.add_scalar(f"Call Q-Values/{self.uuid}", q_values[0][1].item(), self.steps_done)
            writer.add_scalar(f"Raise Q-Values/{self.uuid}", q_values[0][2].item(), self.steps_done)
            return q_values.argmax(1).item()

        return random.choice(range(len(valid_actions)))

    def encode_state(self, hole_card, round_state, starting_stack):
        """Convert the round state into a ``(1, 6)`` float32 tensor on ``device``.

        Features, in order: Monte-Carlo hand strength, pot / starting stack,
        own stack / starting stack, big-blind-at-seat-0 flag, code of the
        opponent's last action on the current street, and that action's
        amount / starting stack.
        """
        hand_strength = estimate_hole_card_win_rate(
            nb_simulation=50,
            nb_player=2,
            hole_card=gen_cards([hole_card[0], hole_card[1]]),
            community_card=gen_cards(round_state["community_card"]),
        )

        pot_size = round_state["pot"]["main"]["amount"] / starting_stack

        seats = round_state["seats"]
        own_seat = seats[0] if seats[0]["uuid"] == self.uuid else seats[1]
        stack_size = own_seat["stack"] / starting_stack

        bb_position = int(round_state["big_blind_pos"] == 0)

        # "action_histories" is keyed by street name, so look up the current
        # street and walk backwards to the most recent entry that is not ours.
        opponent_action = 0
        opponent_bet_size = 0
        street_history = round_state["action_histories"].get(round_state["street"]) or []
        for entry in reversed(street_history):
            if entry.get("uuid") != self.uuid:
                opponent_action = self._ACTION_CODES.get(entry.get("action"), 0)
                # Some entries (e.g. folds) may carry no "amount".
                opponent_bet_size = entry.get("amount", 0) / starting_stack
                break

        state_vector = np.array(
            [hand_strength, pot_size, stack_size, bb_position, opponent_action, opponent_bet_size],
            dtype=np.float32,
        )
        return torch.tensor(state_vector, device=device).unsqueeze(0)

    # The engine callbacks below are intentionally no-ops: training is driven
    # from the outside through the emulator, not through these notifications.

    def receive_game_start_message(self, game_info):
        pass

    def receive_round_start_message(self, round_count, hole_card, seats):
        pass

    def receive_street_start_message(self, street, round_state):
        pass

    def receive_game_update_message(self, action, round_state):
        pass

    def receive_round_result_message(self, winners, hand_info, round_state):
        pass



def soft_update(target_net, policy_net, tau):
    """Blend ``policy_net`` weights into ``target_net`` in place.

    Every target parameter becomes ``tau * policy + (1 - tau) * target``;
    ``policy_net`` is left unchanged.
    """
    parameter_pairs = zip(target_net.parameters(), policy_net.parameters())
    for slow_param, fast_param in parameter_pairs:
        blended = tau * fast_param.data + (1 - tau) * slow_param.data
        slow_param.data.copy_(blended)


def process_batch(batch):
    """Turn a list of replay transitions into batched tensors on ``device``.

    Args:
        batch: sequence of ``(state, action, reward, next_state, done)``
            tuples as stored by ``ReplayMemory.push``; ``state`` and
            ``next_state`` are tensors that are concatenated along dim 0.

    Returns:
        ``(state_batch, action_batch, reward_batch, non_final_next_states,
        non_final_mask)``. ``action_batch`` (int64) and ``reward_batch``
        (float32) are column tensors. ``non_final_mask`` is a bool vector that
        is True for transitions that have a next state and are not marked
        done. ``non_final_next_states`` holds the next states of exactly
        those transitions, or is ``None`` when there are none.
    """
    state_batch = torch.cat([transition[0] for transition in batch]).to(device)
    action_batch = torch.tensor(
        [transition[1] for transition in batch], device=device, dtype=torch.long
    ).unsqueeze(1)
    # Rewards may arrive as numpy float64 scalars; force float32 so the
    # targets built from them match the networks' output dtype.
    reward_batch = torch.tensor(
        [float(transition[2]) for transition in batch], device=device, dtype=torch.float32
    ).unsqueeze(1)

    # A transition flagged done is terminal even though a next-state tensor
    # was stored with it, so it must not contribute a bootstrapped value.
    non_final = [transition[3] is not None and not transition[4] for transition in batch]
    non_final_mask = torch.tensor(non_final, device=device, dtype=torch.bool)
    if any(non_final):
        non_final_next_states = torch.cat(
            [transition[3] for transition, keep in zip(batch, non_final) if keep]
        ).to(device)
    else:
        non_final_next_states = None

    return state_batch, action_batch, reward_batch, non_final_next_states, non_final_mask


def _optimize_player(policy_net, target_net, optimizer, memory, tag, step):
    """Run one Double-DQN update for a single player and return the loss value.

    ``tag`` ("P1" / "P2") selects the TensorBoard scalar names and ``step`` is
    the x-axis value used for them.
    """
    batch = memory.sample(BATCH_SIZE)
    states, taken_actions, rewards, next_states, non_final_mask = process_batch(batch)

    # Q(s, a) of the actions stored in the sampled transitions.
    q_values = policy_net(states).gather(1, taken_actions)

    with torch.no_grad():
        # Terminal transitions keep the bare reward as their target.
        targets = rewards.to(q_values.dtype).clone()
        if non_final_mask.any():
            # Policy net selects the best next action, target net evaluates it.
            best_next_actions = policy_net(next_states).argmax(1, keepdim=True)
            next_q_values = target_net(next_states).gather(1, best_next_actions)
            targets[non_final_mask] += GAMMA * next_q_values

    loss = torch.nn.functional.smooth_l1_loss(q_values, targets)

    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(policy_net.parameters(), 1.0)
    optimizer.step()

    soft_update(target_net, policy_net, TAU)

    loss_value = loss.item()
    writer.add_scalar(f"Loss/{tag}_Loss", loss_value, step)
    writer.add_scalar(f"Q-Values/{tag}_Max", q_values.max().item(), step)
    writer.add_scalar(f"Q-Values/{tag}_Mean", q_values.mean().item(), step)
    return loss_value


def optimize_model():
    """Optimizes both players separately using their own replay memory.

    A player is updated as soon as its own memory holds at least
    ``BATCH_SIZE`` transitions, independently of the other player's memory.
    Each update's loss is added to ``total_loss_p1`` / ``total_loss_p2``.
    """
    global total_loss_p1, total_loss_p2

    if len(memory1) >= BATCH_SIZE:
        total_loss_p1 += _optimize_player(
            p1_policy_net, p1_target_net, optimizer_p1, memory1, "P1", dqn_player1.steps_done
        )

    if len(memory2) >= BATCH_SIZE:
        total_loss_p2 += _optimize_player(
            p2_policy_net, p2_target_net, optimizer_p2, memory2, "P2", dqn_player2.steps_done
        )


def calculate_reward(player_uuid, events, prev_stack, action, prev_hand_strength, new_hand_strength, winners):
    """Return the shaped reward for ``player_uuid``, clipped to ``[-1, 1]``.

    The reward is the sum of five terms taken from the latest event:
    stack change / 10, a 0.02 "free card" bonus, half the hand-strength
    change, +/-1 for winning/losing (only when ``winners`` is truthy) and a
    -0.05 penalty when ``action`` is "fold".
    """
    latest_event = events[-1]
    round_state = latest_event["round_state"]

    # Stack of the rewarded player after the action was applied.
    current_stack = next(
        seat["stack"] for seat in round_state["seats"] if seat["uuid"] == player_uuid
    )

    # Stack delta, scaled by 10 (described in the original as the big blind).
    stack_reward = (current_stack - prev_stack) / 10

    # "Free card" bonus.
    if action == "call" and round_state["pot"]["main"]["amount"] == prev_stack:
        check_reward = 0.02
    else:
        check_reward = 0

    # Reward improvement of the estimated hand strength.
    hand_strength_reward = (new_hand_strength - prev_hand_strength) * 0.5

    # Outcome term, only evaluated once the round has produced winners.
    if not winners:
        win_reward = 0
    elif player_uuid == latest_event["winners"][0]["uuid"]:
        win_reward = 1.0
    else:
        win_reward = -1.0

    # Small discouragement for folding.
    if action == "fold":
        fold_penalty = -0.05
    else:
        fold_penalty = 0

    combined = stack_reward + check_reward + hand_strength_reward + win_reward + fold_penalty

    # Keep the reward inside [-1, 1].
    return np.clip(combined, -1, 1)


from pypokerengine.utils.game_state_utils import restore_game_state, attach_hole_card_from_deck


def train_poker_bot():
    """Play ``num_episodes`` self-play games and train both DQN players.

    Each loop iteration lets the player named by the latest event act through
    the emulator, then stores a transition in the *other* player's replay
    memory (that player's state before and after the action) together with a
    reward from ``calculate_reward``. ``optimize_model`` is called after every
    step and whenever a game ends. An episode ends when the emulator reports
    that the game has finished.

    NOTE(review): the action stored with each transition (and passed to
    ``calculate_reward``) is the acting player's action, i.e. the opponent's
    from the point of view of the memory that receives it. This is kept from
    the original design — confirm it is intended.
    """
    global total_profits_p1, total_profits_p2
    global amount_won_p1, amount_won_p2
    global total_rewards_p1, total_rewards_p2
    global num_wins_p1, num_wins_p2

    def seat_stack(round_state, uuid):
        """Return the stack of the seat with ``uuid`` in ``round_state``."""
        return next(seat["stack"] for seat in round_state["seats"] if seat["uuid"] == uuid)

    game_state, events = emulator.start_new_round(initial_game_state)

    x_axis = 0  # step counter used as the TensorBoard x-axis
    for episode in range(num_episodes):

        while True:
            # Without a "uuid" in the latest event there is nobody to ask for
            # an action: finish the episode and start over from a fresh game.
            if not events or "uuid" not in events[-1]:
                optimize_model()
                game_state, events = emulator.start_new_round(initial_game_state)
                break

            ask = events[-1]
            if ask["uuid"] == "p1_uuid":
                actor, observer, observer_memory = dqn_player1, dqn_player2, memory2
                reward_tag, action_tag = "P2 Rewards", "Actions/Player1_Action"
            else:
                actor, observer, observer_memory = dqn_player2, dqn_player1, memory1
                reward_tag, action_tag = "P1 Rewards", "Actions/Player2_Action"

            # Snapshot the observer's view before the actor moves. The state
            # is encoded by the observer itself so it carries the observer's
            # own stack, and the pre-action stack is the baseline for the
            # stack-delta part of the reward.
            round_before = ask["round_state"]
            observer_cards = observer.get_hole_cards(game_state)
            observer_state = observer.encode_state(observer_cards, round_before, starting_stack)
            observer_prev_stack = seat_stack(round_before, observer.uuid)

            action, amount = actor.declare_action(
                ask["valid_actions"],
                actor.get_hole_cards(game_state),
                game_state,
                round_before,
            )
            game_state, events = emulator.apply_action(game_state, action, amount)
            outcome = events[-1]

            if outcome["type"] == "event_game_finish":
                optimize_model()
                game_state, events = emulator.start_new_round(initial_game_state)
                break

            round_finished = outcome["type"] == "event_round_finish"
            observer_next_state = observer.encode_state(
                observer_cards, outcome["round_state"], starting_stack
            )

            # Feature 0 of the encoded state is the Monte-Carlo hand strength,
            # so the before/after estimates are read from the two states
            # instead of running another simulation.
            reward = calculate_reward(
                observer.uuid,
                events,
                observer_prev_stack,
                action,
                observer_state[0, 0].item(),
                observer_next_state[0, 0].item(),
                round_finished,
            )
            writer.add_scalar(reward_tag, reward, x_axis)
            if observer is dqn_player2:
                total_rewards_p2 += reward
            else:
                total_rewards_p1 += reward

            action_index = actions.index(action.upper())
            # The game-finish case already left the loop above, so the
            # transition is terminal exactly when the round has finished.
            observer_memory.push(
                observer_state, action_index, reward, observer_next_state, round_finished
            )
            writer.add_scalar(action_tag, action_index, episode)

            if round_finished:
                winner = outcome["winners"][0]
                # NOTE(review): "stack" is the winner's reported stack, not
                # the size of the pot that was won — confirm this is the
                # quantity wanted for the "Average Win Size" scalars.
                if winner["uuid"] == "p1_uuid":
                    num_wins_p1 += 1
                    amount_won_p1 += winner["stack"]
                else:
                    num_wins_p2 += 1
                    amount_won_p2 += winner["stack"]

                round_seats = outcome["round_state"]["seats"]
                p1_current_stack = round_seats[0]["stack"]
                total_profits_p1 += (p1_current_stack - starting_stack)
                writer.add_scalar("Game/P1 Profits", total_profits_p1, x_axis)

                p2_current_stack = round_seats[1]["stack"]
                total_profits_p2 += (p2_current_stack - starting_stack)
                writer.add_scalar("Game/P2 Profits", total_profits_p2, x_axis)

                # Continue the same game with its next round.
                game_state, events = emulator.start_new_round(game_state)

            x_axis += 1

            optimize_model()

        print(f"Finished episode {episode}")
    
           
        
    


# TensorBoard run directory, named after the current month-day_hour-minute.
log_dir = "runs/poker_dqn_" + datetime.datetime.now().strftime("%m-%d_%H-%M")
print(log_dir)
writer = SummaryWriter(log_dir)

# Use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

emulator = Emulator()

###############################
#ADJUST NUMBER OF ROUNDS HERE
###############################
emulator.set_game_rule(player_num=2, max_round=100, small_blind_amount=5, ante_amount=0)


# Both players start every game with `starting_stack` chips.
players_info = {
    "p1_uuid": {"name": "Player 1", "stack": starting_stack},
    "p2_uuid": {"name": "Player 2", "stack": starting_stack}
}

# Fresh-game state that train_poker_bot restarts from whenever a game ends.
initial_game_state = emulator.generate_initial_game_state(players_info)


# Hyperparameters
num_episodes = 50                       # number of games played by train_poker_bot
BATCH_SIZE = 128                         # transitions sampled per update; no update happens until the replay memory holds this many
GAMMA = 0.99                            # discount factor; values close to 1 weight long-term rewards more
EPS_START = 1.0                         # initial exploration rate (100% random actions)
EPS_END = 0.2                           # exploration rate the decay converges to (20% random actions)
EPS_DECAY = 2000                         # time constant of the exponential epsilon decay, in action selections
TAU = 0.005                             # soft-update factor: share of the policy weights blended into the target net per update
LR = 1e-5                               # learning rate of the AdamW optimizers
MEMORY_CAPACITY = 10000                 # maximum number of transitions kept per replay memory


# Hyperparameters recorded alongside the run's final metrics via writer.add_hparams.
hparams = {
    "batch_size": BATCH_SIZE,
    "gamma": GAMMA,
    "eps_start": EPS_START,
    "eps_end": EPS_END,
    "eps_decay": EPS_DECAY,
    "tau": TAU,
    "learning_rate": LR,
    "memory_capacity": MEMORY_CAPACITY
}


###############################
#ADJUST NUMBER OF ROUNDS ABOVE
###############################

# Action names in network-output order; train_poker_bot maps action strings to indices with actions.index(...).
actions = ["FOLD", "CALL", "RAISE"]
n_actions = len(actions)
n_states = 6  # Hand strength, pot size, stack size, BB position, opp last action, opp bet size

# Each player gets a policy net plus a target net that starts as an exact copy
# and is kept in eval mode.
p1_policy_net = PokerDQN(n_states, n_actions).to(device)
p1_target_net = PokerDQN(n_states, n_actions).to(device)
p1_target_net.load_state_dict(p1_policy_net.state_dict())
p1_target_net.eval()

p2_policy_net = PokerDQN(n_states, n_actions).to(device)
p2_target_net = PokerDQN(n_states, n_actions).to(device)
p2_target_net.load_state_dict(p2_policy_net.state_dict())
p2_target_net.eval()

# Only the policy nets are optimized directly; the target nets follow through soft_update.
optimizer_p1 = torch.optim.AdamW(p1_policy_net.parameters(), lr=LR)
optimizer_p2 = torch.optim.AdamW(p2_policy_net.parameters(), lr=LR)

memory1 = ReplayMemory(MEMORY_CAPACITY)
memory2 = ReplayMemory(MEMORY_CAPACITY)

dqn_player1 = DQNPlayer("p1_uuid", p1_policy_net, p1_target_net, optimizer_p1, memory1)
dqn_player2 = DQNPlayer("p2_uuid", p2_policy_net, p2_target_net, optimizer_p2, memory2)

# The uuids must match the keys of players_info above.
emulator.register_player("p1_uuid", dqn_player1)
emulator.register_player("p2_uuid", dqn_player2)



            


train_poker_bot()

# Run-level summary stored next to the hyperparameters. The totals are sums
# over all updates / rewards of the run, divided by the number of episodes.
final_metrics = {
    "final_loss_p1": total_loss_p1 / num_episodes,
    "final_loss_p2": total_loss_p2 / num_episodes,
    "avg_reward_p1": total_rewards_p1 / num_episodes,
    "avg_reward_p2": total_rewards_p2 / num_episodes,
}

# Average over the rounds actually won; report 0 for a player without any win
# instead of skewing every average by dividing by (wins + 1).
avg_win_p1 = amount_won_p1 / num_wins_p1 if num_wins_p1 else 0.0
avg_win_p2 = amount_won_p2 / num_wins_p2 if num_wins_p2 else 0.0
writer.add_scalar("P1 Average Win Size", avg_win_p1, 0)
writer.add_scalar("P2 Average Win Size", avg_win_p2, 0)

writer.add_hparams(hparams, final_metrics)
writer.close()

runs/poker_dqn_03-14_22-13
cuda
Finished episode 0
Finished episode 1
Finished episode 2
Finished episode 3
Finished episode 4
Finished episode 5
Finished episode 6
Finished episode 7
Finished episode 8
Finished episode 9
Finished episode 10
Finished episode 11
Finished episode 12
Finished episode 13
Finished episode 14
Finished episode 15
Finished episode 16
Finished episode 17
Finished episode 18
Finished episode 19
Finished episode 20
Finished episode 21
Finished episode 22
Finished episode 23
Finished episode 24
Finished episode 25
Finished episode 26
Finished episode 27
Finished episode 28
Finished episode 29
Finished episode 30
Finished episode 31
Finished episode 32
Finished episode 33
Finished episode 34
Finished episode 35
Finished episode 36
Finished episode 37
Finished episode 38
Finished episode 39
Finished episode 40
Finished episode 41
Finished episode 42
Finished episode 43
Finished episode 44
Finished episode 45
Finished episode 46
Finished episode 47
Finished episode 4

In [57]:
import os

# Folder that receives the saved weights; created on first use.
model_dir = "saved_models"
os.makedirs(model_dir, exist_ok=True)

# Store the policy-network weights of both players, one file per player.
checkpoints = (
    (p1_policy_net, "poker_p1_dqn.pth decay 3000"),
    (p2_policy_net, "poker_p2_dqn.pth decay 3000"),
)
for net, file_name in checkpoints:
    torch.save(net.state_dict(), os.path.join(model_dir, file_name))

print("✅ Model weights saved successfully!")


✅ Model weights saved successfully!


In [None]:
# Restore saved policy weights when the checkpoint files exist.
# map_location makes a checkpoint written on another device (e.g. saved on
# GPU, loaded on a CPU-only machine) loadable, and each target net is
# re-synchronised so it does not keep the weights it had before loading.

# Load Player 1's Model (if available)
p1_model_path = os.path.join(model_dir, "poker_p1_dqn.pth loser")
if os.path.exists(p1_model_path):
    p1_policy_net.load_state_dict(torch.load(p1_model_path, map_location=device))
    p1_target_net.load_state_dict(p1_policy_net.state_dict())
    print("✅ Loaded Player 1's model weights.")

# Load Player 2's Model (if available)
# NOTE(review): this path points at the same Player 1 checkpoint as above, so
# both players start from identical weights — confirm this is intended and
# not a copy-paste slip for a Player 2 file.
p2_model_path = os.path.join(model_dir, "poker_p1_dqn.pth loser")
if os.path.exists(p2_model_path):
    p2_policy_net.load_state_dict(torch.load(p2_model_path, map_location=device))
    p2_target_net.load_state_dict(p2_policy_net.state_dict())
    print("✅ Loaded Player 2's model weights.")
    

✅ Loaded Player 1's model weights.
✅ Loaded Player 2's model weights.
