In [None]:
import os
import random
from typing import *
from collections import deque

import numpy as np
from tqdm.auto import tqdm, trange

import torch
import torch.optim as optim

from game.api import BlackjackWrapper
from game.models.model import GameState
from training.agent import BlackjackPolicyModel

In [None]:
# Run on the first CUDA GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
# Bare expression so the notebook displays the selected device.
device

In [None]:
def _discounted_returns(rewards: List[float], gamma: float) -> List[float]:
    """Compute the discounted return ``G_t = r_t + gamma * G_{t+1}`` for each step of one episode."""
    discounted: List[float] = []
    running_return = 0.0
    # Walk the episode backwards so each return can reuse the one that follows it.
    for reward in reversed(rewards):
        running_return = reward + gamma * running_return
        discounted.append(running_return)
    discounted.reverse()
    return discounted


def reinforce(
    game_wrapper: BlackjackWrapper,
    policy_model: BlackjackPolicyModel,
    optimizer: optim.Optimizer,
    scheduler: optim.lr_scheduler._LRScheduler,
    scheduler_step_every: int,
    num_eps: int,
    batch_size: int,
    gamma: float,
    max_steps: int = 10,
    add_step: bool = False,
    add_card_counting: bool = True,
    log_every: int = 10,
):
    """Train ``policy_model`` with a batched REINFORCE policy-gradient loop.

    Each of the ``num_eps`` iterations plays ``batch_size`` games, computes the
    discounted return of every step, normalizes the returns across the batch
    and performs a single optimizer step on the summed loss
    ``-return * log(output)``.

    Within a game, step 0 is the betting step (``policy_model.get_bet_percent``)
    and every later step is a hit/stand decision
    (``policy_model.get_card_action``). A card is taken when the model output
    exceeds a uniform random draw, i.e. the output is used as the probability
    of taking a card.

    Args:
        game_wrapper: Game environment; reset at the start of every game.
        policy_model: Model providing ``get_bet_percent`` and ``get_card_action``.
        optimizer: Optimizer stepped once per iteration.
        scheduler: LR scheduler stepped every ``scheduler_step_every`` iterations.
        scheduler_step_every: Interval (in iterations) between scheduler steps;
            values below 1 are treated as 1.
        num_eps: Number of training iterations (policy updates).
        batch_size: Number of games played per iteration. Must be >= 1.
        gamma: Discount factor applied to future rewards.
        max_steps: Maximum number of steps per game. Must be >= 1.
        add_step: If True, pass the normalized step index to ``state.flatten``.
        add_card_counting: Forwarded to ``state.flatten`` as ``include_discarded``.
        log_every: Interval (in iterations) between progress messages; values
            below 1 are treated as 1.

    Returns:
        A list with one entry per game played, each being that game's list of
        per-step rewards.

    Raises:
        ValueError: If ``batch_size`` or ``max_steps`` is smaller than 1.
    """
    if batch_size < 1 or max_steps < 1:
        # Without at least one step there is nothing to build a loss from.
        raise ValueError("batch_size and max_steps must both be at least 1")
    # A zero interval would make the modulo checks below raise ZeroDivisionError.
    scheduler_step_every = max(1, scheduler_step_every)
    log_every = max(1, log_every)

    print("Starting RL training process...")
    all_scores: List[List[float]] = []
    eps_scores: List[float] = []
    last_logged_eps_scores: List[float] = []

    for i_eps in trange(num_eps):
        batch_returns: List[float] = []
        batch_outputs = []

        for _ in range(batch_size):
            outputs = []
            rewards: List[float] = []
            game_wrapper = game_wrapper.reset()
            state = game_wrapper.get_state()
            for i_step in range(max_steps):
                step_num = i_step / max_steps if add_step else None
                state_features = state.flatten(include_discarded=add_card_counting, step_num=step_num)
                if i_step == 0:
                    bet_percent = policy_model.get_bet_percent(state_features)
                    outcome = game_wrapper.bet_step(bet_percent)
                    # NOTE(review): the bet output is used directly inside the log
                    # term of the loss, as in the original code; confirm that
                    # treating it like a likelihood is intended.
                    outputs.append(bet_percent)
                else:
                    card_action = policy_model.get_card_action(state_features)
                    take_card = card_action.item() > random.random()
                    # REINFORCE needs the probability of the action that was
                    # actually sampled: p when a card is taken, 1 - p otherwise.
                    outputs.append(card_action if take_card else 1.0 - card_action)
                    outcome = game_wrapper.card_step(take_card=take_card)
                state = outcome.new_state
                rewards.append(outcome.reward)
                if outcome.terminated:
                    break

            eps_reward = sum(rewards)
            all_scores.append(rewards)
            eps_scores.append(eps_reward)
            last_logged_eps_scores.append(eps_reward)

            batch_returns.extend(_discounted_returns(rewards, gamma))
            batch_outputs.extend(outputs)

        # Normalize returns across the whole batch to reduce gradient variance.
        normalized_returns = np.array(batch_returns)
        normalized_returns = (normalized_returns - normalized_returns.mean()) / (normalized_returns.std() + 1e-8)

        # float() keeps NumPy scalars out of the tensor arithmetic.
        model_loss = torch.cat([
            -float(step_return) * (output + 1e-8).log()
            for step_return, output in zip(normalized_returns, batch_outputs)
        ]).sum()

        optimizer.zero_grad()
        model_loss.backward()
        optimizer.step()

        if i_eps % scheduler_step_every == 0:
            scheduler.step()

        if i_eps % log_every == 0:
            tqdm.write(
                f"Episode {i_eps}"
                f"\t\tLast Logged Average Score: {round(np.mean(last_logged_eps_scores).item(), 3)}"
                f"\t\tRunning Average Score: {round(np.mean(eps_scores).item(), 3)}"
            )
            last_logged_eps_scores = []

    return all_scores

In [None]:
# --- Reproducibility --------------------------------------------------------
# Training samples actions with `random.random()` and the model weights are
# initialized by torch, so seed every RNG this notebook uses before anything
# stochastic runs.
# NOTE(review): if the game package uses its own RNG source, seed it as well.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# --- Training hyperparameters -----------------------------------------------
gamma = 0.9             # discount factor passed to reinforce()
learning_rate = 1e-2    # initial Adam learning rate
batch_size = 8          # games played per policy update
num_eps = 10_000        # number of policy updates
lr_gamma = 0.99         # multiplicative factor for ExponentialLR
num_lr_decay = 100      # how many scheduler steps to spread over the run
# Log roughly 100 times per run; never 0, which would break the modulo check.
log_eps = max(1, num_eps // 100)
add_steps_info = True   # include the normalized step index in the features
add_card_counting = True  # forwarded as include_discarded when flattening states

# --- Game settings (positional arguments of BlackjackWrapper) ----------------
initial_cash = 1000
deck_num = 8
min_bet = 10

In [None]:
# Environment: built from the game settings defined in the config cell.
game_wrapper = BlackjackWrapper(initial_cash, deck_num, min_bet)

# Policy network: its input width is whatever GameState reports for the
# chosen feature options; the model is then moved onto the selected device.
in_features = GameState.get_state_size(add_steps=add_steps_info)
model = BlackjackPolicyModel(in_features=in_features, device=device)
model = model.to(device)

# Adam with an exponentially decaying learning rate.
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=lr_gamma)

In [None]:
model_name = "blackjack_policy_model_eps10000_batch8"

# Artifacts are stored in a "models" folder one level above the working
# directory. If the notebook is launched from the project root instead, use
# `proj_path = os.getcwd()`.
proj_path = os.path.join(os.getcwd(), "..")
model_dir = os.path.join(proj_path, "models")

# Weights and the per-episode training scores sit side by side in model_dir.
model_path, scores_path = (
    os.path.join(model_dir, file_name)
    for file_name in (model_name, f"{model_name}_training_scores")
)

In [None]:
# Create the output folder before training so a missing directory is found
# immediately instead of after the whole run, when the results would be lost.
os.makedirs(model_dir, exist_ok=True)

model.train()
scores = reinforce(
    game_wrapper=game_wrapper,
    policy_model=model,
    optimizer=optimizer,
    scheduler=scheduler,
    # At least 1 so short runs (num_eps < num_lr_decay) do not request a zero interval.
    scheduler_step_every=max(1, num_eps // num_lr_decay),
    num_eps=num_eps,
    batch_size=batch_size,
    gamma=gamma,
    add_step=add_steps_info,
    add_card_counting=add_card_counting,
    log_every=log_eps,
)

# Persist the trained weights and the raw training rewards.
torch.save(model.state_dict(), model_path)
with open(scores_path, "w") as f:
    # One line per game: its per-step rewards, comma separated.
    f.writelines(", ".join(str(x) for x in arr) + "\n" for arr in scores)

In [None]:
def evaluate_agent(
    game_wrapper: BlackjackWrapper,
    policy_model: BlackjackPolicyModel,
    num_eps: int,
    max_steps: int = 10,
    add_step: bool = False,
    add_card_counting: bool = True,
) -> Tuple[float, float]:
    """Play ``num_eps`` games with the policy and summarize the total rewards.

    Step 0 of each game is the betting step; later steps are hit/stand
    decisions sampled exactly as during training (a card is taken when the
    model output exceeds a uniform random draw). No gradients are tracked.

    Args:
        game_wrapper: Game environment; reset at the start of every game.
        policy_model: Model providing ``get_bet_percent`` and ``get_card_action``.
        num_eps: Number of games to play.
        max_steps: Maximum number of steps per game.
        add_step: If True, pass the normalized step index to ``state.flatten``.
        add_card_counting: Forwarded to ``state.flatten`` as ``include_discarded``.

    Returns:
        ``(mean_reward, std_reward)`` of the per-game total reward.
    """
    rewards: List[float] = []

    # Evaluation never calls backward(), so skip building autograd graphs.
    with torch.no_grad():
        for _ in range(num_eps):
            game_wrapper = game_wrapper.reset()
            state = game_wrapper.get_state()
            eps_reward = 0.0
            for i_step in range(max_steps):
                step_num = i_step / max_steps if add_step else None
                state_features = state.flatten(include_discarded=add_card_counting, step_num=step_num)
                if i_step == 0:
                    bet_percent = policy_model.get_bet_percent(state_features)
                    outcome = game_wrapper.bet_step(bet_percent)
                else:
                    card_action = policy_model.get_card_action(state_features)
                    outcome = game_wrapper.card_step(take_card=card_action.item() > random.random())
                state = outcome.new_state
                eps_reward += outcome.reward
                if outcome.terminated:
                    break
            rewards.append(eps_reward)

    mean_reward = np.mean(rewards).item()
    std_reward = np.std(rewards).item()
    return mean_reward, std_reward

In [None]:
model.eval()

# Evaluate with the same feature options that were used for training, so the
# two cannot silently diverge if the config cell is changed.
evaluate_agent(
    game_wrapper=game_wrapper,
    policy_model=model,
    num_eps=1000,
    add_step=add_steps_info,
    add_card_counting=add_card_counting,
)