In [None]:
import os
import random
from typing import *
from collections import deque

import numpy as np
from tqdm.auto import tqdm, trange

import torch
import torch.optim as optim

from game.api import BlackjackWrapper
from game.models.model import GameState
from training.agent import BlackjackPolicyModel

In [None]:
# Pick the compute device once: the first CUDA GPU when PyTorch can see one,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Bare last expression so the notebook displays which device was selected.
device

In [None]:
def reinforce(
    game_wrapper: BlackjackWrapper,
    policy_model: BlackjackPolicyModel,
    optimizer: optim.Optimizer,
    scheduler: optim.lr_scheduler._LRScheduler,
    scheduler_step_every: int,
    num_eps: int,
    gamma: float,
    max_steps: int = 1000,
    log_every: int = 10,
):
    """Train ``policy_model`` with the REINFORCE (Monte-Carlo policy gradient) update.

    Each episode is one game: the first step places a bet
    (``policy_model.get_bet_percent`` -> ``game_wrapper.bet_step``) and every
    later step decides whether to draw a card
    (``policy_model.get_card_action`` -> ``game_wrapper.card_step``) until the
    environment reports ``terminated`` or ``max_steps`` is reached. After the
    episode, the discounted return of every step weights the log-likelihood of
    the action taken at that step, and one optimizer step is applied.

    Args:
        game_wrapper: Environment; must provide ``reset``, ``get_state``,
            ``bet_step`` and ``card_step`` as used below.
        policy_model: Policy providing ``get_bet_percent`` and
            ``get_card_action``; both receive ``state.flatten()``.
        optimizer: Optimizer updating the policy parameters.
        scheduler: Learning-rate scheduler tied to ``optimizer``.
        scheduler_step_every: Step the scheduler on every episode whose index
            is a multiple of this value. Values ``<= 0`` disable scheduling.
        num_eps: Number of training episodes.
        gamma: Discount factor used for the per-step returns.
        max_steps: Upper bound on the number of steps in one episode.
        log_every: Log running averages on every episode whose index is a
            multiple of this value. Values ``<= 0`` disable logging.

    Returns:
        The undiscounted total reward of every episode, in order.
    """
    print("Starting RL training process...")
    eps_scores: List[float] = []
    last_logged_eps_scores: List[float] = []
    log_eps = 1e-8  # keeps log() finite when a probability is exactly 0

    for i_eps in trange(num_eps):
        log_probs = []
        rewards: List[float] = []
        game_wrapper = game_wrapper.reset()
        state = game_wrapper.get_state()

        for i_step in range(max_steps):
            if i_step == 0:
                bet_percent = policy_model.get_bet_percent(state.flatten())
                outcome = game_wrapper.bet_step(bet_percent)
                # NOTE(review): the bet is applied deterministically (it is never
                # sampled), so log(bet_percent) is not a true log-likelihood.
                # Kept as in the original objective; consider sampling the bet
                # from a distribution parameterised by the model.
                log_probs.append((bet_percent + log_eps).log())
            else:
                card_action = policy_model.get_card_action(state.flatten())
                # The action is a Bernoulli draw: take a card with probability
                # `card_action`, stand with probability `1 - card_action`.
                take_card = card_action.item() > random.random()
                # Score the action that was actually sampled. Using
                # log(card_action) for a "stand" would reinforce the opposite
                # action whenever standing is rewarded.
                action_prob = card_action if take_card else 1.0 - card_action
                log_probs.append((action_prob + log_eps).log())
                outcome = game_wrapper.card_step(take_card=take_card)
            state = outcome.new_state
            rewards.append(outcome.reward)
            if outcome.terminated:
                break

        eps_reward = sum(rewards)
        eps_scores.append(eps_reward)
        last_logged_eps_scores.append(eps_reward)

        # Discounted return G_t = r_t + gamma * G_{t+1}, accumulated backwards.
        returns = []
        running_return = 0
        for reward in reversed(rewards):
            running_return = gamma * running_return + reward
            returns.append(running_return)
        returns.reverse()

        # An episode without any step (max_steps == 0) has nothing to learn from.
        if log_probs:
            returns_t = torch.tensor(returns)
            model_loss = torch.cat(
                [-step_return * log_prob for log_prob, step_return in zip(log_probs, returns_t)]
            ).sum()

            optimizer.zero_grad()
            model_loss.backward()
            optimizer.step()

        # Guard the modulo: callers derive these intervals with integer
        # division, which yields 0 for short runs.
        if scheduler_step_every > 0 and i_eps % scheduler_step_every == 0:
            scheduler.step()

        if log_every > 0 and i_eps % log_every == 0:
            tqdm.write(
                f"Episode {i_eps}"
                f"\t\tLast Logged Average Score: {round(np.mean(last_logged_eps_scores).item(), 3)}"
                f"\t\tRunning Average Score: {round(np.mean(eps_scores).item(), 3)}"
            )
            last_logged_eps_scores = []

    return eps_scores

In [None]:
# --- Reproducibility ---
# Seed every RNG this notebook touches so Restart & Run All repeats the same run.
# Which of these the game wrapper draws from is not visible here, so all three are seeded.
seed = 0
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# --- REINFORCE / optimiser settings ---
gamma = 0.75          # discount factor for per-step returns
learning_rate = 1e-2  # initial Adam learning rate
num_eps = 10_000      # number of training episodes
lr_gamma = 0.99       # multiplicative factor for the exponential LR schedule
num_lr_decay = 100    # how many LR decay steps to spread over training
# Log roughly 100 times per run; never 0, which would break `i_eps % log_every`.
log_eps = max(1, num_eps // 100)

# --- Game settings ---
initial_cash = 1000
deck_num = 8
min_bet = 10

In [None]:
# Environment configured from the game settings defined earlier.
game_wrapper = BlackjackWrapper(initial_cash, deck_num, min_bet)

# Policy network sized to the flattened game state, then moved to the chosen device.
model = BlackjackPolicyModel(
    in_features=GameState.get_state_size(),
    device=device,
)
model = model.to(device)

# Adam optimiser with an exponentially decaying learning rate.
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=lr_gamma)

In [None]:
# Output locations: artefacts go to "<cwd>/../models/", named after the model.
model_name = "blackjack_policy_model"
proj_path = os.path.join(os.getcwd(), os.pardir)
model_path = os.path.join(proj_path, "models", model_name)
# The per-episode training scores sit next to the weights, with a suffix.
scores_path = f"{model_path}_training_scores"

In [None]:
# Create the output directories before training so that a long run cannot be
# lost to a missing "models" folder when the results are written.
os.makedirs(os.path.dirname(model_path) or ".", exist_ok=True)
os.makedirs(os.path.dirname(scores_path) or ".", exist_ok=True)

model.train()
scores = reinforce(
    game_wrapper=game_wrapper,
    policy_model=model,
    optimizer=optimizer,
    scheduler=scheduler,
    # At least 1: integer division gives 0 when num_eps < num_lr_decay, which
    # would make the `i_eps % scheduler_step_every` check divide by zero.
    scheduler_step_every=max(1, num_eps // num_lr_decay),
    num_eps=num_eps,
    gamma=gamma,
    log_every=log_eps,
)

# Persist the trained weights and the per-episode scores (one score per line).
torch.save(model.state_dict(), model_path)
with open(scores_path, "w") as f:
    f.writelines([str(x) + "\n" for x in scores])

In [None]:
def evaluate_agent(
    game_wrapper: BlackjackWrapper,
    policy_model: BlackjackPolicyModel,
    num_eps: int,
    max_steps: int = 1000,
) -> Tuple[float, float]:
    """Play ``num_eps`` episodes with the current policy and summarise the scores.

    Each episode follows the same protocol as training: the first step places
    a bet via ``policy_model.get_bet_percent``; every later step samples
    "take a card" with the probability returned by
    ``policy_model.get_card_action``. The episode ends when the environment
    reports ``terminated`` or after ``max_steps`` steps.

    Args:
        game_wrapper: Environment; must provide ``reset``, ``get_state``,
            ``bet_step`` and ``card_step`` as used below.
        policy_model: Policy to evaluate.
        num_eps: Number of evaluation episodes.
        max_steps: Upper bound on the number of steps in one episode.

    Returns:
        ``(mean, std)`` of the undiscounted total reward per episode.
    """
    rewards: List[float] = []

    # Nothing is back-propagated during evaluation, so disable autograd to
    # avoid building (and holding on to) a graph for every forward pass.
    with torch.no_grad():
        for _ in range(num_eps):
            game_wrapper = game_wrapper.reset()
            state = game_wrapper.get_state()
            eps_reward = 0.0
            for i_step in range(max_steps):
                if i_step == 0:
                    bet_percent = policy_model.get_bet_percent(state.flatten())
                    outcome = game_wrapper.bet_step(bet_percent)
                else:
                    card_action = policy_model.get_card_action(state.flatten())
                    # Stochastic policy: draw with probability `card_action`.
                    take_card = card_action.item() > random.random()
                    outcome = game_wrapper.card_step(take_card=take_card)
                state = outcome.new_state
                eps_reward += outcome.reward
                if outcome.terminated:
                    break
            rewards.append(eps_reward)

    mean_reward = np.mean(rewards).item()
    std_reward = np.std(rewards).item()
    return mean_reward, std_reward

In [None]:
# Switch the policy network to evaluation mode before scoring it.
model.eval()

# Play 1000 evaluation games; as the cell's last expression, the returned
# (mean reward, std reward) tuple is displayed by the notebook.
evaluate_agent(game_wrapper, model, num_eps=1000)