In [1]:
import numpy as np
import pandas as pd
from typing import Tuple, Dict
from enum import Enum
from itertools import product

# Blackjack

The cards to be used in Blackjack are:

In [2]:
class Card(Enum):
    ACE = 1
    TWO = 2
    THREE = 3
    FOUR = 4
    FIVE = 5
    SIX = 6
    SEVEN = 7
    EIGHT = 8
    NINE = 9
    FACE = 10   # This groups 10, Jack, Queen and King whose value is the same

card_values = {
    Card.ACE: [1, 11],
    Card.TWO: 2,
    Card.THREE: 3,
    Card.FOUR: 4,
    Card.FIVE: 5,
    Card.SIX: 6,
    Card.SEVEN: 7,
    Card.EIGHT: 8,
    Card.NINE: 9,
    Card.FACE: 10   
}

Both the player and the dealer are given 2 cards. The first one is shown, the second is face down. Then, the player starts requesting cards and what matters is the sum of the values. Therefore, a state in our system is made by:
- The current sum of cards of the player. Note that below 11 we don't need to make decisions: we simply draw the next card. So this sum is valuable between 12 and 21.
- The card shown by the dealer. It can be an ace or any number up to 10.

We also add an indicator for **usable aces**, which are aces the player can use as a 11 without going bust

In [3]:
states = [{
        'player_sum': player_sum, 
        'dealer_card': dealer_card, 
        'usable_ace': usable_ace
    }
    for player_sum in range(12,21+1)
    for dealer_card in Card
    for usable_ace in [True, False]
]

print(len(states))

200


Player's actions are to hit or stick

In [4]:
class Action(Enum):
    HIT = 0
    STICK = 1

The policy for the player is to only stick at 20 or 21.

In [5]:
player_policy = {
    (s['player_sum'], s['dealer_card'], s['usable_ace']): {
        Action.HIT: 1*(s['player_sum'] < 18),
        Action.STICK: 1*(s['player_sum'] >= 18)
    }
    for s in states
}

We define a function for playing an episode:

In [6]:
def select_action_according_to_policy(current_state: Tuple[int,int], 
        policy:Dict[Tuple[int,int], Dict[Action, float]])-> Action:
    max_val = max(policy[current_state].values())
    # Randomly break ties
    keys = [key for key, value in policy[current_state].items() if value == max_val]
    choice = np.random.choice(keys)
    return choice

def select_random_action(current_state: Tuple[int,int], 
        policy:Dict[Tuple[int,int], Dict[Action, float]])-> Action:
    return np.random.choice(list(policy[current_state].keys()))

def deal_cards(num=1):
    return list(np.random.choice(Card, num)) if num > 1 else np.random.choice(Card, num)[0]

def get_value(cards, aces_as_11):
    total_sum = 0
    for i, c in enumerate(cards):
        if c == Card.ACE:
            if len(aces_as_11) > 0 and i in aces_as_11:
                total_sum += 11
            else:
                total_sum += 1
        else:
            total_sum += card_values[c]
    return total_sum

def play_episode(policy:Dict[Tuple[int, int], Dict[Action,float]], epsilon=0.1, verbose=False):
    player_cards = deal_cards(num=2)
    dealer_cards = deal_cards(num=2)
    player_aces_as_11 = np.where(np.array(player_cards) == Card.ACE)[0]
    dealer_aces_as_11 = np.where(np.array(dealer_cards) == Card.ACE)[0]
    # Note: if there are two aces, only the first one counts as 11
    if len(player_aces_as_11) > 1:
        player_aces_as_11 = [player_aces_as_11[0]]
    if len(dealer_aces_as_11) > 1:
        dealer_aces_as_11 = [dealer_aces_as_11[0]]
    dealer_visible = dealer_cards[0]
    states = []
    actions = []
    rewards = []
    # If the player has 21, it has won unless the dealer also has 21
    player_sum = get_value(player_cards, aces_as_11=player_aces_as_11)
    if player_sum == 21:
        states.append((21, dealer_visible, True))
        actions.append(Action.STICK)
        if not get_value(dealer_cards, aces_as_11=dealer_aces_as_11):
            rewards.append(-1)
        else:
            rewards.append(1)
        if verbose:
            print("Someone has 21 in two cards")
            print("Player cards: {}".format(player_cards))
            print("Dealer cards: {}".format(dealer_cards))
        return states, actions, rewards
    # If the player has less than 12, he hits until he reaches it.
    while player_sum < 12:
        new_card = deal_cards(num=1)
        player_cards.append(new_card)
        player_sum = get_value(player_cards, aces_as_11=player_aces_as_11)
    # At this point, the player has >= 12. We can let the agent start making choices.
    current_state = (player_sum, dealer_visible, len(player_aces_as_11) > 0)
    states.append(current_state)
    action = None
    while not action == Action.STICK:
        # Epsilon-greedy selection
        if np.random.random() <= epsilon:
            action = select_random_action(current_state, policy)
        else:
            action = select_action_according_to_policy(current_state, policy)
        actions.append(action)
        # If the action is to hit, get a new card
        if action == Action.HIT:
            new_card = deal_cards(num=1)
            player_cards.append(new_card)
            player_sum = get_value(player_cards, 
                aces_as_11=player_aces_as_11
            )
            current_state = (player_sum, dealer_visible, len(player_aces_as_11) > 0)
            # To get the reward, check what is the player's sum at this point to see if we have lost:
            if player_sum > 21:
                # Player has lost
                rewards.append(-1)
                if verbose:
                    print("Player has gone bust!!")
                    print("Player cards: {}".format(player_cards))
                    print("Dealer cards: {}".format(dealer_cards))
                return states, actions, rewards
            else:
                # Go on with the choices
                states.append(current_state)
                rewards.append(0)
    # Here we have chosen to stick and not have bust
    # Now it's the dealer's turn. The dealer always stops when he goes over 17.
    # He wins if he has 21 with two cards, or if his score goes over the player's.
    # He loses if he stops before the player's score.
    dealer_score = get_value(dealer_cards,
        aces_as_11=dealer_aces_as_11
    )
    if dealer_score == 21:
        # Player has lost
        rewards.append(-1)
        return states, actions, rewards
    else:
        while dealer_score < 17:
            new_card = deal_cards(num=1)
            dealer_cards.append(new_card)
            dealer_score = get_value(dealer_cards,
                aces_as_11=dealer_aces_as_11
            )
            # Check if the new card has made the dealer bust
            if dealer_score > 21:
                # Player has won
                rewards.append(1)
                if verbose:
                    print("Dealer has bust")
                    print("Player cards: {}".format(player_cards))
                    print("Dealer cards: {}".format(dealer_cards))
                return states, actions, rewards
        # Here the dealer has not bust and has decided to stick. We need to compare
        # his score with the player's score.
        if verbose:
            print("Final comparison...")
            print("Player cards: {}".format(player_cards))
            print("Dealer cards: {}".format(dealer_cards))
        if dealer_score >= player_sum:
            # The player has lost
            rewards.append(-1)
            return states, actions, rewards
        else:
            # The player has won
            rewards.append(1)
            return states, actions, rewards

play_episode(player_policy, verbose=True)

Final comparison...
Player cards: [<Card.NINE: 9>, <Card.FACE: 10>]
Dealer cards: [<Card.TWO: 2>, <Card.SEVEN: 7>, <Card.FACE: 10>]


([(19, <Card.TWO: 2>, False)], [<Action.STICK: 1>], [-1])

Now we need to calculate the Q value for each state-action pair. We can do that using a first-visit Monte Carlo method. 

In [7]:
def first_visit_MC_Q(policy:Dict[Tuple[int, int], Dict[Action,float]], 
        episodes_to_play:int=1000, discount=0.9):
    state_action_pairs = set(product(policy.keys(), Action))
    Q = {st: np.random.random() for st in state_action_pairs}
    Returns = {st: [] for st in state_action_pairs}
    for i in range(episodes_to_play):
        sts, acts, rews = play_episode(policy)
        cumulative_reward = 0
        # For each state-action pair
        for step in range(len(rews)-1, 0, -1):
            cumulative_reward = discount*cumulative_reward + rews[step]
            state_action = (sts[step], acts[step])
            if not state_action in [(sts[s], acts[s]) for s in range(step)]:
                Returns[state_action].append(cumulative_reward)     # The cumulative reward is appended only once for the first visit method
                Q[state_action] = np.mean(Returns[state_action])    # Compute average of rewards for Q
    return Q

Q = first_visit_MC_Q(player_policy, episodes_to_play=100000, discount=1)

If q* is well approximated by Q, an agent that always follows the action whose Q value is highest should behave optimally (or at least, be quite good in general).

In [8]:
new_agent_policy = {
    s : {
        Action.HIT: Q[(s, Action.HIT)],
        Action.STICK: Q[(s, Action.STICK)]
    }
    for s in player_policy.keys()
}

Play some games!

In [9]:
for _ in range(5):
    play_episode(new_agent_policy, verbose=True)
    print()

Player has gone bust!!
Player cards: [<Card.TWO: 2>, <Card.THREE: 3>, <Card.EIGHT: 8>, <Card.THREE: 3>, <Card.SEVEN: 7>]
Dealer cards: [<Card.EIGHT: 8>, <Card.SEVEN: 7>]

Final comparison...
Player cards: [<Card.ACE: 1>, <Card.SEVEN: 7>]
Dealer cards: [<Card.FACE: 10>, <Card.SIX: 6>, <Card.TWO: 2>]

Dealer has bust
Player cards: [<Card.THREE: 3>, <Card.FACE: 10>]
Dealer cards: [<Card.SEVEN: 7>, <Card.NINE: 9>, <Card.EIGHT: 8>]


Final comparison...
Player cards: [<Card.SEVEN: 7>, <Card.SEVEN: 7>, <Card.FIVE: 5>]
Dealer cards: [<Card.SEVEN: 7>, <Card.FACE: 10>]



We could use the Monte Carlo strategy to update the strategy as we go along, making the agent greedy on the Q value it's computing.

In [12]:
def first_visit_MC_Q_update(initial_policy:Dict[Tuple[int, int], Dict[Action,float]], 
        episodes_to_play:int=1000, discount=0.9):
    state_action_pairs = set(product(initial_policy.keys(), Action))
    Q = {st: np.random.random() for st in state_action_pairs}
    Returns = {st: [] for st in state_action_pairs}
    policy = initial_policy
    for i in range(episodes_to_play):
        sts, acts, rews = play_episode(policy)
        cumulative_reward = 0
        # For each state-action pair
        for step in range(len(rews)-1, 0, -1):
            cumulative_reward = discount*cumulative_reward + rews[step]
            state_action = (sts[step], acts[step])
            if not state_action in [(sts[s], acts[s]) for s in range(step)]:
                Returns[state_action].append(cumulative_reward)     # The cumulative reward is appended only once for the first visit method
                Q[state_action] = np.mean(Returns[state_action])    # Compute average of rewards for Q
                # UPDATE THE POLICY by making it greedy based on the Q value
                policy = {
                    s : {
                        Action.HIT: 1 if Q[(s, Action.HIT)] >= Q[(s, Action.STICK)] else 0,
                        Action.STICK: 1 if Q[(s, Action.STICK)] > Q[(s, Action.HIT)] else 0
                    }
                    for s in policy.keys()
                }
    return Q

Q = first_visit_MC_Q(player_policy, episodes_to_play=100000, discount=1)

Play some more games with this updated policy

In [13]:
for _ in range(5):
    play_episode(new_agent_policy, verbose=True)
    print()

Player has gone bust!!
Player cards: [<Card.SIX: 6>, <Card.ACE: 1>, <Card.SIX: 6>]
Dealer cards: [<Card.SIX: 6>, <Card.ACE: 1>]

Dealer has bust
Player cards: [<Card.FIVE: 5>, <Card.SEVEN: 7>, <Card.NINE: 9>]
Dealer cards: [<Card.NINE: 9>, <Card.FIVE: 5>, <Card.TWO: 2>, <Card.FACE: 10>]

Dealer has bust
Player cards: [<Card.NINE: 9>, <Card.ACE: 1>]
Dealer cards: [<Card.THREE: 3>, <Card.FOUR: 4>, <Card.EIGHT: 8>, <Card.EIGHT: 8>]

Player has gone bust!!
Player cards: [<Card.ACE: 1>, <Card.SIX: 6>, <Card.NINE: 9>]
Dealer cards: [<Card.FOUR: 4>, <Card.FOUR: 4>]

Final comparison...
Player cards: [<Card.SIX: 6>, <Card.THREE: 3>, <Card.SEVEN: 7>]
Dealer cards: [<Card.TWO: 2>, <Card.FOUR: 4>, <Card.THREE: 3>, <Card.EIGHT: 8>]

