In [None]:
import os
from typing import *
from collections import deque, namedtuple

import numpy as np
from tqdm.auto import tqdm, trange

import torch
import torch.nn as nn
import torch.optim as optim

from game.api import BlackjackWrapper
from game.models.model import GameState
from training.agent import BlackjackDQN

In [None]:
# Use the first CUDA GPU when PyTorch reports one is available, otherwise the CPU.
# `device` is read as a module-level global by train(), the training loop and
# evaluate_agent() further down.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Bare expression so the notebook displays which device was selected.
device

In [None]:
# One replay-memory record. train() concatenates each field across a batch with
# torch.cat, and tests `next_state is None` to detect final states.
Transition = namedtuple("Transition", ("state", "action", "next_state", "reward", "mask"))


class ReplayBuffer(deque):
    """Bounded replay memory with reward-magnitude-weighted sampling.

    The buffer is a ``deque`` with ``maxlen=capacity``, so pushing into a full
    buffer silently discards the oldest transition.
    """

    def __init__(self, capacity):
        """Create an empty buffer holding at most ``capacity`` transitions."""
        super().__init__([], maxlen=capacity)

    def push(self, transition: Transition):
        """Append ``transition``, evicting the oldest entry when full."""
        self.append(transition)

    def sample(self, batch_size: int):
        """Draw ``batch_size`` distinct transitions without replacement.

        Each transition is weighted by ``max(|reward|, |mean reward|)`` so that
        high-magnitude rewards are drawn more often while every other entry
        keeps at least the mean-based floor.  When those weights cannot be
        turned into a usable distribution (they sum to zero, are not finite, or
        fewer entries have a non-zero weight than ``batch_size``) the draw
        falls back to uniform sampling instead of failing.

        Args:
            batch_size: number of transitions to return.

        Returns:
            A list of ``batch_size`` ``Transition`` objects.

        Raises:
            ValueError: if ``batch_size`` is negative or larger than the
                number of stored transitions.
        """
        # Snapshot once: indexing into the middle of a deque is O(n) per access.
        stored = list(self)
        population = len(stored)
        if not 0 <= batch_size <= population:
            raise ValueError(
                f"cannot sample {batch_size} transitions from a buffer of size {population}"
            )
        if batch_size == 0:
            return []

        rewards = np.array([t.reward.item() for t in stored], dtype=np.float64)
        floor = abs(rewards.mean())
        weights = np.maximum(np.abs(rewards), floor)
        total = weights.sum()

        # np.random.choice(replace=False, p=...) needs finite probabilities and
        # at least `batch_size` non-zero entries; otherwise sample uniformly.
        if not np.isfinite(total) or np.count_nonzero(weights) < batch_size:
            probabilities = None
        else:
            probabilities = weights / total

        picked = np.random.choice(population, size=batch_size, replace=False, p=probabilities)
        return [stored[i] for i in picked]

In [None]:
def train(
    policy_model: BlackjackDQN,
    target_model: BlackjackDQN,
    optimizer: optim.Optimizer,
    replay_buffer: ReplayBuffer,
    batch_size: int,
    gamma: float,
):
    """Run one DQN optimisation step on ``policy_model`` from a replay batch.

    Returns immediately, without touching the models, when the buffer holds
    fewer than ``batch_size`` transitions.  Otherwise it samples a batch,
    regresses Q(s, a) of the policy network towards
    ``reward + gamma * max_a' Q_target(s', a')`` with a Smooth-L1 (Huber)
    loss, clips gradients element-wise to [-100, 100] and steps the optimizer.
    Transitions whose ``next_state`` is ``None`` are treated as final and
    contribute a next-state value of 0.

    Args:
        policy_model: network being optimised.
        target_model: network used (under ``no_grad``) for next-state values.
        optimizer: optimizer over ``policy_model``'s parameters.
        replay_buffer: source of ``Transition`` samples.
        batch_size: number of transitions per optimisation step.
        gamma: discount factor applied to next-state values.

    Reads the module-level ``device`` for tensor placement.
    """
    if len(replay_buffer) < batch_size:
        return
    transitions = replay_buffer.sample(batch_size)
    # Transpose batch of transitions to get transitions with batches
    batch = Transition(*zip(*transitions))

    # Flag the transitions that have a successor state (a final state is one
    # after which the simulation ended and is stored as None).
    non_final_flags = [s is not None for s in batch.next_state]
    non_final_mask = torch.tensor(non_final_flags, device=device, dtype=torch.bool)
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action).unsqueeze(-1)
    reward_batch = torch.cat(batch.reward)
    mask_batch = torch.cat(batch.mask)

    # Compute Q(s_t, a): the model computes Q(s_t) for every action, then we
    # pick the column of the action that was actually taken.
    q_batch = policy_model.batched_forward_with_concat(state_batch)
    masked_q_batch = q_batch * mask_batch
    state_action_values = masked_q_batch.gather(dim=1, index=action_batch)

    # Compute V(s_{t+1}) from the "older" target network; final states keep 0.
    next_state_values = torch.zeros(batch_size, device=device)
    # torch.cat raises on an empty list, so skip the target-network pass when
    # every sampled transition is final.
    if any(non_final_flags):
        non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])
        with torch.no_grad():
            next_state_q_batch = target_model.batched_forward_with_concat(non_final_next_states)
            # NOTE(review): this applies the mask recorded with the *current*
            # state to the next state's Q-values, and (assuming a 0/1 mask)
            # multiplicative masking lets a masked action win the max whenever
            # all valid Q-values are negative - confirm this is intended.
            masked_next_state_q_batch = next_state_q_batch * mask_batch[non_final_mask]
            next_state_values[non_final_mask] = masked_next_state_q_batch.max(dim=1)[0]
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * gamma) + reward_batch

    # Compute Huber loss
    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    # In-place gradient clipping
    torch.nn.utils.clip_grad_value_(policy_model.parameters(), 100)
    optimizer.step()

In [None]:
# Discount factor applied to next-state values inside train().
gamma = 0.9
# Initial Adam learning rate.
learning_rate = 1e-2
# Multiplicative decay used by the ExponentialLR scheduler on each scheduler step.
lr_gamma = 0.99
# Number of training episodes.
num_eps = 100_000
# Number of transitions sampled from the replay buffer per optimisation step.
batch_size = 128
# Upper bound on environment steps within one episode.
max_steps = 10
# Call train() every `train_step` environment steps.
train_step = 10
# Step the LR scheduler every `schedule_every_train` train calls.
schedule_every_train = 100
# Blend the policy weights into the target network every `update_step` environment steps.
update_step = 500

# Passed to BlackjackDQN; presumably the initial and minimum exploration
# rates - confirm against training.agent.
epsilon = 0.9
min_epsilon = 0.05
# Interpolation weight of the policy network in the soft target update.
tau = 0.01

# Constructor arguments for BlackjackWrapper.
initial_cash = 10000
deck_num = 8

In [None]:
# Discrete choices handed to the DQN: bet fractions 0.1 .. 1.0 in steps of 0.1,
# and the two card decisions.
bet_choices = [tenth / 10 for tenth in range(1, 11)]
card_choices = [True, False]

# Both networks are built from the same configuration; the policy network is
# created first and the target network then copies its weights.
dqn_kwargs = dict(
    in_features=GameState.get_state_size(),
    bet_choices=bet_choices,
    card_choices=card_choices,
    epsilon=epsilon,
    min_epsilon=min_epsilon,
)
policy_model = BlackjackDQN(**dqn_kwargs).to(device)
target_model = BlackjackDQN(**dqn_kwargs).to(device)
target_model.load_state_dict(policy_model.state_dict())

# Environment, optimizer (policy network only), LR schedule and replay memory.
game_wrapper = BlackjackWrapper(initial_cash, deck_num)
optimizer = optim.Adam(policy_model.parameters(), lr=learning_rate)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=lr_gamma)
replay_buffer = ReplayBuffer(10000)

In [None]:
# Output locations, resolved relative to the parent of the current working
# directory (so results depend on where the kernel was started).
model_name = "blackjack_dqn"
proj_path = os.path.join(os.getcwd(), "..")
# File that receives the trained policy network's state dict.
model_path = os.path.join(proj_path, "models", model_name)
# Text file that receives the per-episode step rewards, one episode per line.
scores_path = os.path.join(proj_path, "models", f"{model_name}_training_scores")

In [None]:
# Training loop: plays `num_eps` episodes, stores every transition in the
# replay buffer, periodically optimises the policy network and soft-updates
# the target network, then saves the weights and the per-step rewards.
all_scores: List[List[float]] = []  # step rewards, one inner list per episode
eps_scores: List[float] = []  # total reward of every episode so far
last_logged_eps_scores: List[float] = []  # total rewards since the last log line
total_steps = 0  # environment steps across all episodes
train_steps = 0  # optimisation steps actually performed
# Log roughly 100 times per run; max() avoids a modulo-by-zero when num_eps < 100.
log_every = max(1, num_eps // 100)

policy_model.train()

for i_episode in trange(num_eps):
    game_wrapper = game_wrapper.reset()
    game_state = game_wrapper.get_state()
    state = game_state.torch_flatten(device)
    rewards: List[float] = []
    for i_step in range(max_steps):
        # The first step of an episode places the bet; every later step is a
        # card decision.
        if i_step == 0:
            bet_percent, action, mask = policy_model.get_bet_percent(
                normalized_state=state, allow_explore=True, num_steps=total_steps
            )
            outcome = game_wrapper.bet_step(bet_percent)
        else:
            card_action, action, mask = policy_model.get_card_action(
                normalized_state=state, allow_explore=True, num_steps=total_steps
            )
            outcome = game_wrapper.card_step(take_card=card_action)
        terminated = outcome.terminated
        reward = outcome.reward
        # torch.tensor (not the legacy torch.Tensor constructor, which rejects
        # a non-CPU `device`) so the tensors are created directly on `device`.
        reward_tensor = torch.tensor([reward], dtype=torch.float32, device=device)
        action_tensor = torch.tensor([action], dtype=torch.int64, device=device)
        rewards.append(reward)
        next_state = outcome.new_state.torch_flatten(device)

        # Store the transition in memory
        # NOTE(review): next_state is never None here, so train()'s non-final
        # mask is always all-True and terminal transitions still bootstrap from
        # the target network - confirm whether terminated steps should store
        # None instead.
        replay_buffer.push(Transition(state, action_tensor, next_state, reward_tensor, mask))

        # Move to the next state
        state = next_state

        total_steps += 1

        # train() is a no-op until the buffer holds a full batch, so only count
        # (and schedule on) calls that really perform an optimizer step.
        if total_steps % train_step == 0 and len(replay_buffer) >= batch_size:
            # Perform one step of the optimization (on the policy network)
            train(
                policy_model=policy_model,
                target_model=target_model,
                optimizer=optimizer,
                replay_buffer=replay_buffer,
                batch_size=batch_size,
                gamma=gamma,
            )
            train_steps += 1
            # Decay the learning rate only after optimizer steps have happened.
            if train_steps % schedule_every_train == 0:
                scheduler.step()

        if total_steps % update_step == 0:
            # Update target model to weighted sum of policy and target model
            target_state_dict = target_model.state_dict()
            policy_state_dict = policy_model.state_dict()
            for key in target_state_dict:
                target_state_dict[key] = tau * policy_state_dict[key] + (1 - tau) * target_state_dict[key]
            target_model.load_state_dict(target_state_dict)

        if terminated:
            break

    all_scores.append(rewards)
    eps_reward = sum(rewards)
    last_logged_eps_scores.append(eps_reward)
    eps_scores.append(eps_reward)

    if i_episode % log_every == 0:
        tqdm.write(
            f"Episode {i_episode}"
            f"\t\tLast Logged Average Score: {round(np.mean(last_logged_eps_scores).item(), 3)}"
            f"\t\tRunning Average Score: {round(np.mean(eps_scores).item(), 3)}"
        )
        # Start a fresh window so the next "Last Logged" average only covers
        # episodes played since this log line.
        last_logged_eps_scores.clear()

# Make sure the output directory exists before writing the artefacts.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
torch.save(policy_model.state_dict(), model_path)
with open(scores_path, "w") as f:
    f.writelines(", ".join(str(x) for x in arr) + "\n" for arr in all_scores)

In [None]:
def evaluate_agent(
    game_wrapper: BlackjackWrapper,
    policy_model: BlackjackDQN,
    num_eps: int,
    max_steps: int = 10,
) -> Tuple[float, float]:
    """Play ``num_eps`` episodes without exploration and summarise the rewards.

    Each episode resets the wrapper, places a bet on the first step and makes
    card decisions on later steps until the outcome reports termination or
    ``max_steps`` steps have been taken.  The function does not switch the
    model to eval mode itself; the caller is expected to do so.  Reads the
    module-level ``device`` when flattening states.

    Args:
        game_wrapper: environment to play in; it is reset at every episode.
        policy_model: agent queried with ``allow_explore=False``.
        num_eps: number of evaluation episodes.
        max_steps: maximum number of steps per episode.

    Returns:
        ``(mean, std)`` of the total reward per episode.
    """
    rewards: List[float] = []

    # Evaluation never back-propagates, so skip autograd bookkeeping for every
    # forward pass made while choosing actions.
    with torch.no_grad():
        for i_eps in range(num_eps):
            game_wrapper = game_wrapper.reset()
            state = game_wrapper.get_state()
            eps_reward = 0.0
            for i_step in range(max_steps):
                if i_step == 0:
                    bet_percent, _, _ = policy_model.get_bet_percent(
                        state.torch_flatten(device), allow_explore=False, num_steps=0
                    )
                    outcome = game_wrapper.bet_step(bet_percent)
                else:
                    card_action, _, _ = policy_model.get_card_action(
                        state.torch_flatten(device), allow_explore=False, num_steps=0
                    )
                    outcome = game_wrapper.card_step(take_card=card_action)
                state = outcome.new_state
                eps_reward += outcome.reward
                if outcome.terminated:
                    break
            rewards.append(eps_reward)

    mean_reward = np.mean(rewards).item()
    std_reward = np.std(rewards).item()
    return mean_reward, std_reward

In [None]:
# Switch the policy network to evaluation mode before scoring it.
policy_model.eval()
# Bare call as the last expression so the notebook displays the returned
# (mean reward, reward standard deviation) tuple over 1000 episodes.
evaluate_agent(
    game_wrapper=game_wrapper,
    policy_model=policy_model,
    num_eps=1000,
)