# MDP with value iteration

In [1]:
import numpy as np

'''==================================================
Initial set up
=================================================='''

#Hyperparameters
SMALL_ENOUGH = 0.005
GAMMA = 0.9         
NOISE = 0.10  

#Define all states
all_states=[]
for i in range(2):
    for j in range(2):
            all_states.append((i,j))
print(f"All states: {all_states}")

#Define rewards for all states
rewards = {}
for i in all_states:
    if i == (0,1):
        rewards[i] = -1
    elif i == (1,1):
        rewards[i] = 1
    else:
        rewards[i] = 0
print(f"All rewards: {rewards}")

#Dictionnary of possible actions. We have two "end" states (1,2 and 2,2)
actions = {
    (0,0):('D', 'R'), 
    (0,1):('D', 'L'),    
    (1,0):('U', 'R'),
    (1,1):('L', 'U'),
    }
print(f"All actions: {actions}")

#Define an initial policy
policy={}
for s in actions.keys():
    policy[s] = np.random.choice(actions[s])
print(f"Initial policy: {policy}")

#Define initial value function 
V={}
for s in all_states:
    if s in actions.keys():
        V[s] = 0
    if s ==(2,2):
        V[s]=-1
    if s == (1,2):
        V[s]=-1
    if s == (2,3):
        V[s]=1
print(f"Initial value function : {V}")

'''==================================================
Value Iteration
=================================================='''

iteration = 0

while True:
    biggest_change = 0
    print(f"Iteration : {V}")
    for s in all_states:            
        if s in policy:
            
            old_v = V[s]
            new_v = 0
            
            for a in actions[s]:
                if a == 'U':
                    nxt = [s[0]-1, s[1]]
                if a == 'D':
                    nxt = [s[0]+1, s[1]]
                if a == 'L':
                    nxt = [s[0], s[1]-1]
                if a == 'R':
                    nxt = [s[0], s[1]+1]

                #Choose a new random action to do (transition probability)
                random_1=np.random.choice([i for i in actions[s] if i != a])
                if random_1 == 'U':
                    act = [s[0]-1, s[1]]
                if random_1 == 'D':
                    act = [s[0]+1, s[1]]
                if random_1 == 'L':
                    act = [s[0], s[1]-1]
                if random_1 == 'R':
                    act = [s[0], s[1]+1]

                #Calculate the value
                nxt = tuple(nxt)
                act = tuple(act)
                v = rewards[s] + (GAMMA * ((1-NOISE)* V[nxt] + (NOISE * V[act]))) 
                print(f"v = r(s) + gamma ((1-noise))")
                if v > new_v: #Is this the best action so far? If so, keep it
                    new_v = v
                    policy[s] = a

       #Save the best of all actions for the state                                
            V[s] = new_v
            biggest_change = max(biggest_change, np.abs(old_v - V[s]))

            
   #See if the loop should stop now         
    if biggest_change < SMALL_ENOUGH:
        break
    iteration += 1

All states: [(0, 0), (0, 1), (1, 0), (1, 1)]
All rewards: {(0, 0): 0, (0, 1): -1, (1, 0): 0, (1, 1): 1}
All actions: {(0, 0): ('D', 'R'), (0, 1): ('D', 'L'), (1, 0): ('U', 'R'), (1, 1): ('L', 'U')}
Initial policy: {(0, 0): 'D', (0, 1): 'D', (1, 0): 'R', (1, 1): 'L'}
Initial value function : {(0, 0): 0, (0, 1): 0, (1, 0): 0, (1, 1): 0}
Iteration : {(0, 0): 0, (0, 1): 0, (1, 0): 0, (1, 1): 0}
v = r(s) + gamma ((1-noise))
v = r(s) + gamma ((1-noise))
v = r(s) + gamma ((1-noise))
v = r(s) + gamma ((1-noise))
v = r(s) + gamma ((1-noise))
v = r(s) + gamma ((1-noise))
v = r(s) + gamma ((1-noise))
v = r(s) + gamma ((1-noise))
Iteration : {(0, 0): 0, (0, 1): 0, (1, 0): 0, (1, 1): 1.0}
v = r(s) + gamma ((1-noise))
v = r(s) + gamma ((1-noise))
v = r(s) + gamma ((1-noise))
v = r(s) + gamma ((1-noise))
v = r(s) + gamma ((1-noise))
v = r(s) + gamma ((1-noise))
v = r(s) + gamma ((1-noise))
v = r(s) + gamma ((1-noise))
Iteration : {(0, 0): 0, (0, 1): 0, (1, 0): 0.81, (1, 1): 1.6561000000000001}
v = r(

In [1]:
import numpy as np
import pickle


class BlackJackSolution:

    def __init__(self, lr=0.1, exp_rate=0.3):
        self.player_Q_Values = {}  # key: [(player_value, show_card, usable_ace)][action] = value
        # initialise Q values | (12-21) x (1-10) x (True, False) x (1, 0) 400 in total
        for i in range(12, 22):
            for j in range(1, 11):
                for k in [True, False]:
                    self.player_Q_Values[(i, j, k)] = {}
                    for a in [1, 0]:
                        if (i == 21) and (a == 0):
                            self.player_Q_Values[(i, j, k)][a] = 1
                        else:
                            self.player_Q_Values[(i, j, k)][a] = 0

        self.player_state_action = []
        self.state = (0, 0, False)  # initial state
        self.actions = [1, 0]  # 1: HIT  0: STAND
        self.end = False
        self.lr = lr
        self.exp_rate = exp_rate

    # give card
    @staticmethod
    def giveCard():
        # 1 stands for ace
        c_list = list(range(1, 11)) + [10, 10, 10]
        return np.random.choice(c_list)

    def dealerPolicy(self, current_value, usable_ace, is_end):
        if current_value > 21:
            if usable_ace:
                current_value -= 10
                usable_ace = False
            else:
                return current_value, usable_ace, True
        # HIT17
        if current_value >= 17:
            return current_value, usable_ace, True
        else:
            card = self.giveCard()
            if card == 1:
                if current_value <= 10:
                    return current_value + 11, True, False
                return current_value + 1, usable_ace, False
            else:
                return current_value + card, usable_ace, False

    def chooseAction(self):
        # if current value <= 11, always hit
        current_value = self.state[0]
        if current_value <= 11:
            return 1

        if np.random.uniform(0, 1) <= self.exp_rate:
            action = np.random.choice(self.actions)
        #             print("random action", action)
        else:
            # greedy action
            v = -999
            action = 0
            for a in self.player_Q_Values[self.state]:
                if self.player_Q_Values[self.state][a] > v:
                    action = a
                    v = self.player_Q_Values[self.state][a]
        #             print("greedy action", action)
        return action

    # one can only has 1 usable ace
    # return next state
    def playerNxtState(self, action):
        current_value = self.state[0]
        show_card = self.state[1]
        usable_ace = self.state[2]

        if action:
            # action hit
            card = self.giveCard()
            if card == 1:
                if current_value <= 10:
                    current_value += 11
                    usable_ace = True
                else:
                    current_value += 1
            else:
                current_value += card
        else:
            # action stand
            self.end = True
            return (current_value, show_card, usable_ace)

        if current_value > 21:
            if usable_ace:
                current_value -= 10
                usable_ace = False
            else:
                self.end = True
                return (current_value, show_card, usable_ace)

        return (current_value, show_card, usable_ace)

    def winner(self, player_value, dealer_value):
        # player 1 | draw 0 | dealer -1
        winner = 0
        if player_value > 21:
            if dealer_value > 21:
                # draw
                winner = 0
            else:
                winner = -1
        else:
            if dealer_value > 21:
                winner = 1
            else:
                if player_value < dealer_value:
                    winner = -1
                elif player_value > dealer_value:
                    winner = 1
                else:
                    # draw
                    winner = 0
        return winner

    def _giveCredit(self, player_value, dealer_value):
        reward = self.winner(player_value, dealer_value)
        # backpropagate reward
        for s in reversed(self.player_state_action):
            state, action = s[0], s[1]
            reward = self.player_Q_Values[state][action] + self.lr*(reward - self.player_Q_Values[state][action])
            self.player_Q_Values[state][action] = round(reward, 3)

    def reset(self):
        self.player_state_action = []
        self.state = (0, 0, False)  # initial state
        self.end = False

    def deal2cards(self, show=False):
        # return value after 2 cards and usable ace
        value, usable_ace = 0, False
        cards = [self.giveCard(), self.giveCard()]
        if 1 in cards:
            value = sum(cards) + 10
            usable_ace = True
        else:
            value = sum(cards)
            usable_ace = False

        if show:
            return value, usable_ace, cards[0]
        else:
            return value, usable_ace

    def play(self, rounds=1000):
        for i in range(rounds):
            if i % 1000 == 0:
                print("round", i)

            # give 2 cards
            dealer_value, d_usable_ace, show_card = self.deal2cards(show=True)
            player_value, p_usable_ace = self.deal2cards(show=False)

            self.state = (player_value, show_card, p_usable_ace)
            print("init", self.state)

            # judge winner after 2 cards
            if player_value == 21 or dealer_value == 21:
                # game end
                next
            else:
                while True:
                    action = self.chooseAction()  # state -> action
                    if self.state[0] >= 12:
                        state_action_pair = [self.state, action]
                        self.player_state_action.append(state_action_pair)
                    # update next state
                    self.state = self.playerNxtState(action)
                    if self.end:
                        break

                        # dealer's turn
                is_end = False
                while not is_end:
                    dealer_value, d_usable_ace, is_end = self.dealerPolicy(dealer_value, d_usable_ace, is_end)

                # judge winner
                # give reward and update Q value
                player_value = self.state[0]
                print("player value {} | dealer value {}".format(player_value, dealer_value))
                self._giveCredit(player_value, dealer_value)
  
            self.reset()

    def savePolicy(self, file="policy"):
        fw = open(file, 'wb')
        pickle.dump(self.player_Q_Values, fw)
        fw.close()

    def loadPolicy(self, file="policy"):
        fr = open(file, 'rb')
        self.player_Q_Values = pickle.load(fr)
        fr.close()

    # trained robot play against dealer
    def playWithDealer(self, rounds=1000):
        self.reset()
        self.loadPolicy()
        self.exp_rate = 0

        result = np.zeros(3)  # player [win, draw, lose]
        for _ in range(rounds):
            # hit 2 cards each
            # give 2 cards
            dealer_value, d_usable_ace, show_card = self.deal2cards(show=True)
            player_value, p_usable_ace = self.deal2cards(show=False)

            self.state = (player_value, show_card, p_usable_ace)

            # judge winner after 2 cards
            if player_value == 21 or dealer_value == 21:
                if player_value == dealer_value:
                    result[1] += 1
                elif player_value > dealer_value:
                    result[0] += 1
                else:
                    result[2] += 1
            else:
                # player's turn
                while True:
                    action = self.chooseAction()
                    # update next state
                    self.state = self.playerNxtState(action)
                    if self.end:
                        break

                        # dealer's turn
                is_end = False
                while not is_end:
                    dealer_value, d_usable_ace, is_end = self.dealerPolicy(dealer_value, d_usable_ace, is_end)

                # judge
                player_value = self.state[0]
                # print("player value {} | dealer value {}".format(player_value, dealer_value))
                w = self.winner(player_value, dealer_value)
                if w == 1:
                    result[0] += 1
                elif w == 0:
                    result[1] += 1
                else:
                    result[2] += 1
            self.reset()
        return result


if __name__ == "__main__":
    # training
    b = BlackJackSolution()
    b.play(10000)
    print("Done training")

    # save policy
    b.savePolicy()

    # play
    result = b.playWithDealer(rounds=1000)
    print(result)

round 0
init (11, 10, False)
player value 21 | dealer value 18
init (18, 9, False)
player value 28 | dealer value 19
init (7, 10, False)
player value 18 | dealer value 17
init (17, 7, False)
player value 21 | dealer value 24
init (15, 5, False)
player value 25 | dealer value 25
init (12, 4, False)
player value 22 | dealer value 24
init (19, 10, False)
player value 29 | dealer value 17
init (20, 10, False)
player value 30 | dealer value 18
init (12, 1, False)
player value 22 | dealer value 19
init (18, 1, False)
player value 27 | dealer value 17
init (20, 7, False)
player value 20 | dealer value 18
init (17, 2, False)
player value 21 | dealer value 20
init (19, 1, False)
init (11, 5, False)
player value 20 | dealer value 18
init (18, 10, False)
player value 18 | dealer value 20
init (18, 3, False)
player value 25 | dealer value 18
init (7, 7, False)
player value 21 | dealer value 18
init (12, 10, False)
player value 19 | dealer value 21
init (17, 4, False)
player value 27 | dealer value