In [3]:
# Sutton's book Chapter 5: Blackjack with Monte Carlo Exploring Start
import random
import numpy as np
from scipy.stats import beta
import matplotlib.pyplot as plt

DECK_OF_CARDS = [1,2,3,4,5,6,7,8,9,10,10,10,10] * 4 #J, Q, K count as 10.
DEALDER_STICK_THRESHOLD = 17 # same as Sutton's and Vega's rule: https://www.youtube.com/watch?v=SWdPf21v5Ak

def sum_hand(hand):
    min_s = sum(hand)
    max_s = sum(hand)
    if 1 in hand:
        max_s = min_s  + 10
    if max_s <= 21 and max_s > min_s:
        ## Has a usable ACE.
        return (max_s, 1)
    else:
        return (min_s, 0)
    
def busted(hand):
    (s, _) = sum_hand(hand)
    return s > 21    
    
def convert_cards_to_state(my_cards, dealer_card):
    (s, usable_ace) = sum_hand(my_cards)
    # state is 0-based, for easy indexing with ndarray.
    return (s-1, dealer_card-1, usable_ace)

In [212]:
class BasePlayer(object):
    # Return 1 if hit, 0 if stick.
    def action(self, cards, rival_card):
        raise NotImplementedError("action() is not implemented.")

class Dealer(BasePlayer):
    def __init__(self):
        self.stick_threshold = DEALDER_STICK_THRESHOLD
    def action(self, cards):
        (s, _) = sum_hand(cards) #s>=1 
        return int(self.stick_threshold > s) #stick for 17 or greater, same as Sutton's 

In [5]:
class Policy(object):
    def __init__(self):
        # stats is a 5D tensor. 
        # The first 3 dimensions describe state. The 4th dimension describes
        # action (0 for stick, 1 for hit). The 5th dimension describes the
        # reward: {-1, 0, 1}.
        # self.states[i][j][k][a][r] saves counts of how many times we have
        # reward `r` when we take action `a` for state s = (i, j, k) (s is 0-based).
        self.stats = np.zeros((21, 10, 2, 2, 3), dtype='int64') # Init to 1 instead of 0 to smooth out.
        # q is a 4D tensor that represents q(s, a).
        self.q = np.zeros((21, 10, 2, 2), dtype='float64')
        # pi is a 4D tensor that represents pi(a|s), last dimension is the action
        self.pi = np.zeros((21, 10, 2, 2), dtype='float64')
        #pi[:,:,:,1] is the probability of hit while pi[:,:,:,0] is the probability of stick
        self.pi[0:11,:,:,:] = [0., 1.] #Always hit if the cards sum to [1,11]; note that this part of pi should never get updated
        self.pi[11:19,:,:,:]=[0.,1.] #same as Sutton's initial policy, hit if [12,19]; stick if 20,21
        self.pi[19,:,:,:] = [1., 0.] 
        
    def update(self, state_action_seq, final_reward):
        #update q(s,a) snd pi(a|s) using the episode(ie., state_action_seq) and the reward 
        #For blackjack, the states in the state_action_seq are all different; first visit MC and every visit MC are the same
        for state_action in state_action_seq:  
            (i,j,k,a) = state_action #i=sum(play_cards), 0<=i<=20, 0<=j<=9, k=0 or 1,  
            # reward is in {-1, 0, 1}    
            self.stats[i,j,k,a,final_reward + 1] += 1
            # q stores average reward so far.
            # Denominator is the # of samples; the numerator is the net reward accumulated from +1 and -1 cases.
            self.q[i,j,k,a] = (self.stats[i,j,k,a,2] - self.stats[i,j,k,a,0]) * 1.0 / np.sum(self.stats[i,j,k,a,:])
                        
            #greedy update pi(a|s) based on estimated q(s,a)
            if i <= 10:
                # No update: we always hit.
                continue
            if i == 20:
                # No update: we always stick.
                continue                     
            # If q[i,i,k,0] > q[i,j,k,1], argsort returns [1,0].            
            self.pi[i,j,k,:] = np.argsort(self.q[i,j,k,:]) 
  
    def action(self, state):#greedy 
        return np.argmax(self.pi[state])
    
    # Visualize matrix as a grayscale image, assuming entries are in [0,1].
    def imshow(self, ax, matrix, title):
        ax.set_title(title)
        ax.imshow(matrix, cmap='gray',vmin=0., vmax=1.)        

        ax.set_xlabel('Dealer Card')
        ax.set_xticks(np.arange(10))
        ax.set_xticklabels(np.arange(1,11,1).astype('S2'))
        
        ax.set_ylabel('Player Card')
        ax.set_yticks(np.arange(9))
        ax.set_yticklabels(np.arange(20,11,-1).astype('S2'))
        
        ax.spines['right'].set_color('none')
        ax.spines['top'].set_color('none')      
    
    # Visualize the hard decision matrix and the soft (Bayesian) decision matrix.
    def visualize(self):
        fig = plt.figure(figsize=(12, 9), dpi=80)
        
        usable_ace = 0
        ax = plt.subplot(221)
        decision = (self.pi[11:20,:,usable_ace,1] > self.pi[11:20,:,usable_ace,0]).astype(int)
        self.imshow(ax, np.flipud(decision), 'Decision (No usable Ace)')
        
        ax = plt.subplot(222)
        bayesian_decision = self.bayesian_decision(0)
        self.imshow(ax, bayesian_decision, 'Bayesian Decision (No usable Ace)')
        
        usable_ace = 1
        ax = plt.subplot(223)
        decision = (self.pi[11:,:,usable_ace,1] > self.pi[11:,:,usable_ace,0]).astype(int)
        self.imshow(ax, np.flipud(decision), 'Decision (with usable Ace)')

        ax = plt.subplot(224)
        bayesian_decision = self.bayesian_decision(usable_ace)
        self.imshow(ax, bayesian_decision, 'Bayesian Decision (with usable Ace)')

        plt.show()


In [78]:
class PolicyPlayer(BasePlayer):
    def __init__(self, policy):
        self.policy = policy
        self.state_action_pairs = []
        
    def action(self, cards, rival_card,a=None):
        s = convert_cards_to_state(cards, rival_card) #s is 0-based 
        if a is None: 
            a = self.policy.action(s)
        (i,j,k) = s  # 
        self.state_action_pairs.append((i,j,k,a))
        return a
    
    #def update_policy(self, reward): #this combines the policy evaulation and improvement step;  
       # self.policy.update(self.state_action_pairs, reward)
    def reset_states(self):
        self.state_action_pairs = []

In [79]:
# A simple debugger (to avoid clutter in main code.)
class Dbg(object):
    # level can be 0 or 1. (0 means silent.)
    def __init__(self, level):
        self.level = level
    def print_hands(self, game):
        if self.level == 0:
            return
        print 'dealer cards: ', game.dealer_cards 
        print 'player cards: ', game.player_cards
        
    def print_hand(self,cards):
        if self.level == 0:
            return
        print 'cards: ', cards 
        
    def on_dealer_action(self, action):
        if self.level == 0:
            return
        print 'dealer action: ', action
    def on_player_action(self, action):
        if self.level == 0:
            return
        print 'player action: ', action
    def print_bust_status(self, dealer, player):
        if self.level == 0:
            return
        if player and dealer:
            print 'both busted'
        elif player:
            print 'player busted'
        elif dealer:
            print 'dealer busted'
        else:
            pass
    def print_sum_of_hands(self, dealer_sum, player_sum):
        if self.level == 0:
            return
        print 'dealer sum: {}, player sum: {}'.format(dealer_sum, player_sum)
        

In [215]:
class Game(object):
    """
    Blackjack shoe game as in https://www.youtube.com/watch?v=SWdPf21v5Ak
    """
    def __init__(self, dealer, player, debug_level=0):
        self.dealer = dealer
        self.player = player
        self.cards = DECK_OF_CARDS[:]
        #random.shuffle(self.cards) #Sutton assumes infinite cards. Use sample with replacement  instead. 
        self.dbg = Dbg(debug_level)
        
    def on_player_action(self, action):
        if action == 1:
           #Sutton assumes infinite cards.
            self.player_cards.extend(np.random.choice(self.cards,size=1))
        self.dbg.on_player_action(action)
    def on_dealer_action(self, action):
        if action == 1:
            self.dealer_cards.extend(np.random.choice(self.cards,size=1))
        self.dbg.on_dealer_action(action)
        
    # Returns 1 iff player wins.    
    def play(self,dealer_init_cards, player_init_cards, a):
        """
        each play generates an episode starting from state the initial states and action 
        dealer_init_cards, player_init_cards: List[Int] of size 2; must be non-busted state; 
        a: player first action,  1 for hit ; 0 for stick, 
        return: 1 if the player wins, -1 if the player loses, 0 if draw
        """ 
        self.player.reset_states() #reset the player's memory of state_action pair
        self.dealer_cards = list(dealer_init_cards)
        self.player_cards = list(player_init_cards)
        
        assert not busted(dealer_init_cards) and not busted(player_init_cards), "Error: Invalid initial state!"
        
        #player's turn
        self.player.action(self.player_cards,self.dealer_cards[0],a)
        while a:
            self.on_player_action(a)  # update the states 
            if busted(self.player_cards): 
                return -1 
            a = self.player.action(self.player_cards, self.dealer_cards[0])
    
        self.on_player_action(a)
        
        #dealer's turn
        a = self.dealer.action(self.dealer_cards)   
        while a:
            self.on_dealer_action(a)  # update the states 
            if busted(self.dealer_cards): 
                return 1
            a = self.dealer.action(self.dealer_cards) #dealer sticks to the same rule regardless of the player's cards
            
        #both dealer and player stick 
        d_sum, _ = sum_hand(self.dealer_cards)
        p_sum, _ = sum_hand(self.player_cards)
        return np.sign(p_sum - d_sum)
        
  

In [216]:
class Learner:
    def __init__(self):
        self.player_policy = Policy()
        self.player = PolicyPlayer(self.player_policy)
        self.dealer = Dealer()
        self.game = Game(self.dealer,self.player)
    def train(self):
        """
        learn the optimal policy by MC with exploring starts 
        return: optimal policy pi(a|s)
        """
        np.random.seed(4)
        max_iter = int(10)
        for i in range(max_iter): 
            #dealer_init_cards = np.random.choice(self.game.cards,size=2,replace=True)
            #player_init_cards = np.random.choice(self.game.cards,size=2, replace=True)
            dealer_init_cards = np.array([10,2])
            player_init_cards = np.array([10,3])
            if not busted(dealer_init_cards) and not busted(player_init_cards):
                for a in [0,1]:
                    rt = self.game.play(dealer_init_cards, player_init_cards,a)
                    self.player_policy.update(self.player.state_action_pairs,rt) 
                    print "i=%d,a=%d,rt=%d,dealer_cards=%s,player_cards=%s, player_mem=%s" % (i,a,rt,self.game.dealer_cards,self.game.player_cards,self.game.player.state_action_pairs)
                    print "\t self.player_policy.stats[12,9,0,%d]:%s" % (a,self.player_policy.stats[12,9,0,a])
        return self.player_policy

In [217]:
learner = Learner()
optimal_policy = learner.train()


i=0,a=0,rt=-1,dealer_cards=[10, 2, 8],player_cards=[10, 3], player_mem=[(12, 9, 0, 0)]
	 self.player_policy.stats[12,9,0,0]:[1 0 0]
i=0,a=1,rt=1,dealer_cards=[10, 2, 2, 10],player_cards=[10, 3, 6, 2], player_mem=[(12, 9, 0, 1), (18, 9, 0, 1), (20, 9, 0, 0)]
	 self.player_policy.stats[12,9,0,1]:[0 0 1]
i=1,a=0,rt=-1,dealer_cards=[10, 2, 9],player_cards=[10, 3], player_mem=[(12, 9, 0, 0)]
	 self.player_policy.stats[12,9,0,0]:[2 0 0]
i=1,a=1,rt=-1,dealer_cards=[10, 2],player_cards=[10, 3, 10], player_mem=[(12, 9, 0, 1)]
	 self.player_policy.stats[12,9,0,1]:[1 0 1]
i=2,a=0,rt=1,dealer_cards=[10, 2, 10],player_cards=[10, 3], player_mem=[(12, 9, 0, 0)]
	 self.player_policy.stats[12,9,0,0]:[2 0 1]
i=2,a=1,rt=1,dealer_cards=[10, 2, 5],player_cards=[10, 3, 1, 7], player_mem=[(12, 9, 0, 1), (13, 9, 0, 1), (20, 9, 0, 0)]
	 self.player_policy.stats[12,9,0,1]:[1 0 2]
i=3,a=0,rt=1,dealer_cards=[10, 2, 2, 10],player_cards=[10, 3], player_mem=[(12, 9, 0, 0)]
	 self.player_policy.stats[12,9,0,0]:[2 0 2

In [140]:
optimal_policy.pi.shape


(21, 10, 2, 2)

In [192]:
optimal_policy.pi[11:,:,0,1]

array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]])

In [190]:
i,j,k = 12,9,0
optimal_policy.pi[i,j,k,0], optimal_policy.pi[i,j,k,1]

(0.0, 1.0)

In [193]:
optimal_policy.q[i,j,k]

array([-0.29866836, -0.38576631])

In [195]:
(21394-40234)*0.1/(40234+1452+21394)

-0.02986683576410907

In [191]:
optimal_policy.stats[i,j,k,0],optimal_policy.stats[i,j,k,1]

(array([40234,  1452, 21394]), array([16779,  1452,  7033]))

In [189]:
np.sum(optimal_policy.stats[11:,:,:,:,1]), np.sum(optimal_policy.stats[11:,:,:,0,1]==optimal_policy.stats[11:,:,:,1,1])

(158744, 72)

In [157]:
optimal_policy.pi[i,j,k,:]=[0.0,1.0]

In [176]:
learner.game.prepare_new()
learner.game.play([10,2],[10,3],1)

1

In [178]:
learner.game.dealer_cards, learner.game.player_cards

([10, 2, 6], [10, 3, 6])

In [179]:
learner.player.state_action_pairs

[(12, 9, 0, 1), (18, 9, 0, 0)]

In [127]:
learner.game.prepare_new()
play2(learner.game,[7,2],[10,7],1)
#learner.game.play([9,5],[10,10],0)

a= [(20, 6, 0, 0)]


1

In [126]:
def play2(self,dealer_init_cards, player_init_cards, a):
        """
        each play generates an episode starting from state the initial states and action 
        dealer_init_cards, player_init_cards: List[Int] of size 2; must be non-busted state; 
        a: player first action,  1 for hit ; 0 for stick, 
        return: 1 if the player wins, -1 if the player loses, 0 if draw
        """ 
        self.dealer_cards = list(dealer_init_cards)
        self.player_cards = list(player_init_cards)
        
        assert not busted(dealer_init_cards) and not busted(player_init_cards), "Error: Invalid initial state!"
        
        #player's turn
        if a == 0: 
            #player memorize the action-state 
            self.player.action(self.player_cards,self.dealer_cards[0],a=0)
        while a:
            self.on_player_action(a)  # update the states 
            if busted(self.player_cards): 
                return -1 
            a = self.player.action(self.player_cards, self.dealer_cards[0])
            print 'a=',self.player.state_action_pairs
    
        self.on_player_action(a)
        
        #dealer's turn
        a = self.dealer.action(self.dealer_cards, self.player_cards[0])   
        while a:
            self.on_dealer_action(a)  # update the states 
            if busted(self.dealer_cards): 
                return 1
            a = self.dealer.action(self.player_cards, self.dealer_cards[0]) 
            
        #both dealer and player stick 
        d_sum, _ = sum_hand(self.dealer_cards)
        p_sum, _ = sum_hand(self.player_cards)
        return np.sign(p_sum - d_sum)

In [128]:
learner.game.player.state_action_pairs

[(20, 6, 0, 0)]