### Every-visit Monte Carlo with Exploring Starts for estimating an optimal Blackjack policy

This notebook follows the Monte Carlo ES (Exploring Starts) control algorithm described in Section 5.3 of [_Reinforcement Learning: An Introduction_](http://incompleteideas.net/book/the-book-2nd.html), by Sutton and Barto.

In [31]:
# Dependencies
import inspect
import pickle
import time
from pathlib import Path

import gym
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm

from rl_agents.blackjack.MonteCarloAgent import MonteCarloAgent

### Initialize Blackjack agent and environment

In [32]:
RANDOM_SEED = 2

# Create the Blackjack environment and seed it with the notebook-wide seed.
env = gym.make('Blackjack-v0')
env.seed(seed=RANDOM_SEED)

# The agent is built from the environment's action and observation spaces and
# receives the same seed as the environment.
agent = MonteCarloAgent(
    action_space=env.action_space,
    obs_space=env.observation_space,
    seed=RANDOM_SEED,
)

### Run Experiment

In [61]:
NUM_EPISODES = 100000

for i_episode in tqdm(range(NUM_EPISODES)):
    # Start a new episode. No reward exists before the first action, so the
    # agent is handed 0 on its first step.
    observation = env.reset()
    reward = 0
    done = False
    # Per-step history passed to the agent once the episode is over.
    episode_ts = []
    while not done:
        # The agent is given the current observation together with the reward
        # returned by the previous environment step.
        action = agent.agent_step(reward=reward,
                                  observation=observation)

        observation, reward, done, info = env.step(action)

        # NOTE(review): the observation stored here is the one returned
        # *after* taking `action`, not the one the action was chosen in —
        # confirm this is the pairing agent_end expects.
        episode_ts.append((action, observation, reward))

    agent.agent_end(episode_ts=episode_ts)

100%|██████████| 100000/100000 [00:08<00:00, 11829.09it/s]


In [62]:
# Save the agent
# The pickle goes into the "agents" directory that sits next to the
# MonteCarloAgent module rather than a machine-specific absolute path, so the
# notebook can run from any checkout of the package.
# NOTE(review): presumably this resolves to the same directory the previous
# hard-coded path pointed at on the original author's machine — confirm.
AGENT_DIR = Path(inspect.getfile(MonteCarloAgent)).parent / "agents"
AGENT_DIR.mkdir(parents=True, exist_ok=True)

# The file name records the save time (Unix seconds) and the episode count.
agent_path = AGENT_DIR / "{}_mc_episodes={}.pkl".format(int(time.time()), NUM_EPISODES)
with open(agent_path, "wb") as fp:
    pickle.dump(agent, fp)

### Analyze agent's policy

In [41]:
# Tick labels and axis titles, presumably for the policy plots in the cells
# below. matplotlib is imported in the dependencies cell at the top.
x_labels = ["A"] + [str(card) for card in range(2, 11)]
x_title = "Dealer showing"
y_labels = [str(total) for total in range(11, 22)]
y_title = "Player Sum"

In [58]:
# Print the action values stored for one state, then the shape of the policy
# table.
# NOTE(review): the index names assume the Blackjack observation layout
# (player sum, dealer's showing card, usable-ace flag) — confirm against
# MonteCarloAgent.
player_sum, dealer_card, usable_ace = 18, 3, 0
print(agent.action_values[player_sum][dealer_card][usable_ace])
print(agent.policy.shape)


[ 0.06185567 -0.03333333]
(32, 11, 2)


In [60]:
# Largest entry across the whole action-value table (numpy is imported in the
# dependencies cell at the top).
np.max(agent.action_values)

1.0

In [10]:
# Has Usable Ace


In [None]:
# No Usable Ace