In [1]:
import numpy as np
from blackjack_env import BlackJackEnv, CARD_INDX

# Fixed policy to evaluate, mimicking the dealer's rule: stick on 17 or more,
# otherwise hit. Indexed as [player sum - 12][dealer card index][usable ace flag];
# a stored 1 means stick and 0 means hit.
policy = np.zeros(shape=(10, 10, 2))

for player_sum_idx in range(10):
    actual_player_sum = player_sum_idx + 12
    # The action depends only on the player's sum, so the whole
    # (dealer card, usable ace) slice receives the same value.
    policy[player_sum_idx, :, :] = 1 if actual_player_sum >= 17 else 0
    
def firstvisit_mc_eval(policy, runs=10000, gamma=1):
    """Estimate state values for ``policy`` with first-visit Monte Carlo.

    Each run asks the environment for one episode via
    ``BlackJackEnv.run_episode(policy)``. An episode is consumed as a sequence
    of ``(state_dict, action, reward)`` steps, where ``state_dict`` provides
    the keys ``'hand_sum'``, ``'dealer_card'`` and ``'usable_aces'``.

    Args:
        policy: Policy object handed unchanged to ``run_episode``.
        runs: Number of episodes to sample.
        gamma: Discount factor applied to future rewards.

    Returns:
        A ``(10, 10, 2)`` array indexed as
        ``[hand_sum - 12][CARD_INDX[dealer_card]][usable ace flag]`` holding
        the mean first-visit return of every state; states that were never
        visited stay at 0.

    NOTE(review): assumes the environment only reports hand sums in 12..21.
    A smaller sum would yield a negative index, which silently wraps around
    instead of raising -- confirm against ``BlackJackEnv``.
    """
    state_values = np.zeros(shape=(10, 10, 2))
    # Running sum and count per state replace per-state lists of returns, so
    # an update costs O(1) instead of re-averaging an ever-growing list.
    return_sums = np.zeros(shape=(10, 10, 2))
    visit_counts = np.zeros(shape=(10, 10, 2), dtype=int)

    bj = BlackJackEnv()

    for _ in range(runs):
        episode = bj.run_episode(policy)

        # Map every step to its index triple in the value table.
        state_keys = [
            (
                state_dict['hand_sum'] - 12,
                CARD_INDX[state_dict['dealer_card']],
                1 if state_dict['usable_aces'] > 0 else 0,
            )
            for state_dict, _, _ in episode
        ]

        # Earliest time step at which each state occurs. The returns are
        # accumulated backwards, so marking states as "seen" during that
        # backward pass would pick the LAST visit rather than the first.
        first_visit = {}
        for t, key in enumerate(state_keys):
            first_visit.setdefault(key, t)

        g = 0
        for t in reversed(range(len(episode))):
            _, _, reward = episode[t]
            g = reward + gamma * g

            key = state_keys[t]
            if first_visit[key] != t:
                continue  # a later visit of a state that occurred earlier

            return_sums[key] += g
            visit_counts[key] += 1
            state_values[key] = return_sums[key] / visit_counts[key]

    return state_values

# Evaluate the fixed policy over 100,000 sampled episodes. As the last
# expression of the notebook cell, the returned value table is displayed.
firstvisit_mc_eval(policy, 100000)

1
1
1
1
0
2
3
2
1
3
0
7
4
8
3
4
6
5
2
1
2
1
9
1
1
1
8
3
8
5
8
9
8
8
9
6
4
5
4
5
8
0
3
8
8
2
8
8
5
8
1
6
4
3
5
3
2
0
0
5
3
6
9
6
9
5
4
2
7
1
3
9
8
8
8
8
4
0
9
3
4
6
4
8
8
5
2
5
1
0
1
3
1
8
6
0
7
5
6
7
6
8
5
5
0
5
1
4
2
2
1
3
9
1
3
6
9
8
0
8
1
6
0
1
6
9
1
2
6
4
9
2
9
8
0
9
5
9
3
2
8
1
5
4
1
6
3
0
3
6
1
7
1
2
1
5
8
8
7
2
3
7
8
6
9
9
8
2
4
2
4
2
5
4
8
2
2
7
0
1
3
6
4
7
8
5
1
8
8
3
6
0
5
8
0
2
4
9
5
3
2
8
1
3
0
7
2
9
3
3
6
3
7
2
1
1
8
1
8
8
7
0
0
9
2
1
7
1
0
2
3
9
0
7
1
7
2
4
4
1
4
9
0
0
9
0
1
2
7
1
3
3
4
9
8
8
1
8
3
7
9
2
8
2
0
2
0
6
7
1
2
2
3
5
4
2
8
0
7
1
8
7
0
6
8
2
4
8
8
7
8
6
7
6
0
3
2
2
7
4
0
1
0
7
2
4
2
8
0
1
2
1
9
7
6
2
1
0
8
5
8
4
7
9
8
4
2
1
6
8
8
1
9
5
6
9
8
0
7
2
4
7
1
0
0
7
8
1
4
1
8
3
1
1
7
5
9
6
0
5
8
0
0
0
0
7
4
5
2
7
8
9
3
8
1
0
8
0
3
4
2
2
2
3
4
6
3
3
3
1
4
0
7
8
1
0
8
6
2
0
0
0
4
4
8
3
1
6
4
0
5
1
0
2
2
2
3
8
3
7
5
2
7
8
5
8
7
3
0
0
3
6
7
5
4
2
3
1
7
0
1
6
3
5
1
7
2
5
4
7
2
9
2
2
5
8
0
1
8
5
0
8
1
2
8
2
0
6
8
4
0
5
0
1
8
1
2
8
6
1
8
3
4
0
3
6
9
4
8
8
2
3
1
4
4
9
3
7
8
0


array([[[-0.67791077, -0.66037736],
        [-0.29562433, -0.1       ],
        [-0.28617021, -0.29166667],
        [-0.22643554,  0.06779661],
        [-0.20565832, -0.16666667],
        [-0.14081408, -0.03508772],
        [-0.3257732 , -0.23333333],
        [-0.35058078, -0.25      ],
        [-0.45402299, -0.16666667],
        [-0.50231041, -0.42792793]],

       [[-0.64372919, -0.58333333],
        [-0.26387316, -0.03539823],
        [-0.21881607,  0.03960396],
        [-0.27455357, -0.1092437 ],
        [-0.23024055, -0.03      ],
        [-0.20792079, -0.04672897],
        [-0.31145717, -0.05882353],
        [-0.45039019, -0.144     ],
        [-0.44058296, -0.35416667],
        [-0.52345679, -0.3559322 ]],

       [[-0.6443418 , -0.6015625 ],
        [-0.33293124, -0.16535433],
        [-0.37617925,  0.        ],
        [-0.33135392, -0.15555556],
        [-0.28648069,  0.        ],
        [-0.26729191, -0.08823529],
        [-0.41888889, -0.16153846],
        [-0.49015064, -0