In [2]:
import numpy as np
import pandas as pd
from typing import Tuple, Dict
from enum import Enum
from itertools import product

# Blackjack

The cards to be used in Blackjack are:

In [19]:
class Card(Enum):
    ACE = 1,
    TWO = 2,
    THREE = 3,
    FOUR = 4,
    FIVE = 5,
    SIX = 6,
    SEVEN = 7,
    EIGHT = 8,
    NINE = 9,
    FACE = 10   # This groups 10, Jack, Queen and King whose value is the same

card_values = {
    Card.ACE: [1, 11],
    Card.TWO: 2,
    Card.THREE: 3,
    Card.FOUR: 4,
    Card.FIVE: 5,
    Card.SIX: 6,
    Card.SEVEN: 7,
    Card.EIGHT: 8,
    Card.NINE: 9,
    Card.FACE: 10   
}

Both the player and the dealer are given 2 cards. The first one is shown, the second is face down. Then, the player starts requesting cards and what matters is the sum of the values. Therefore, a state in our system is made by:
- The current sum of cards of the player. Note that below 11 we don't need to make decisions: we simply draw the next card. So this sum is valuable between 12 and 21.
- The card shown by the dealer. It can be an ace or any number up to 10.

We also add an indicator for **usable aces**, which are aces the player can use as a 11 without going bust

In [22]:
states = [{
        'player_sum': player_sum, 
        'dealer_card': dealer_card, 
        'usable_ace': usable_ace
    }
    for player_sum in range(12,21+1)
    for dealer_card in Card
    for usable_ace in [True, False]
]

print(len(states))

200


Player's actions are to hit or stick

In [23]:
class Action(Enum):
    HIT = 0,
    STICK = 1

The policy for the player is to only stick at 20 or 21.

In [25]:
player_policy = {
    (s['player_sum'], s['dealer_card'], s['usable_ace']): {
        Action.HIT: 1*s['player_sum'] < 20,
        Action.STICK: 1*s['player_sum'] >= 20
    }
    for s in states
}

We define a function for playing an episode:

In [45]:
def select_action_according_to_policy(current_state: Tuple[int,int], 
        policy:Dict[Tuple[int,int], Dict[Action, float]])-> Action:
    max_val = max(policy[current_state].values())
    # Randomly break ties
    keys = [key for key, value in policy[current_state].items() if value == max_val]
    choice = np.random.choice(keys)
    return choice

def select_random_action(current_state: Tuple[int,int], 
        policy:Dict[Tuple[int,int], Dict[Action, float]])-> Action:
    return np.random.choice(list(policy[current_state].keys()))

def deal_cards(num=1):
    return list(np.random.choice(Card, num))

def get_value(cards, aces_as_11=set()):
    sum_no_aces = 0
    for i, c in enumerate(cards):
        if c == Card.ACE:
            if i in aces_as_11:
                sum_no_aces += 11
            else:
                sum_no_aces += 1
        else:
            sum_no_aces += card_values[c]
    return sum_no_aces

def play_episode(policy:Dict[Tuple[int, int], Dict[Action,float]], 
        epsilon:float=0.1):
    player_cards = deal_cards(num=2)
    dealer_cards = deal_cards(num=2)
    dealer_visible = dealer_cards[0]
    states = []
    actions = []
    rewards = []
    # If the player has 21, it has won unless the dealer also has 21
    player_sum = get_value(player_cards, aces_as_11=np.where(np.array(player_cards) == Card.ACE))
    if player_sum == 21:
        states.append((21, dealer_visible, True))
        actions.append(Action.STICK)
        if not get_value(dealer_cards, aces_as_11=np.where(np.array(dealer_cards) == Card.ACE)):
            rewards.append(-1)
        else:
            rewards.append(1)
        return states, actions, rewards
    # If the player has less than 12, he hits until he reaches it.
    while player_sum < 12:
        new_card = deal_cards(num=1)
        player_cards.append(new_card)
        player_sum = get_value(player_cards, aces_as_11=np.where(np.array(player_cards) == Card.ACE))
    print(player_sum)
    print(player_cards)
    # states = [START_POS]
    # actions = []
    # rewards = []
    # current_state = start_state
    # while not reached_target(current_state):
    #     # Epsilon-greedy selection
    #     if np.random.random() <= epsilon:
    #         action = select_random_action(current_state, policy)
    #     else:
    #         action = select_action_according_to_policy(current_state, policy)
    #     next_state, reward = execute_action(current_state, action)
    #     current_state = next_state
    #     states.append(next_state)
    #     actions.append(action)
    #     rewards.append(reward)
    # return states, actions, rewards

play_episode(player_policy)

  player_sum = get_value(player_cards, aces_as_11=np.where(np.array(player_cards) == Card.ACE))


TypeError: unhashable type: 'list'