## BlackJack Implementation

In [None]:
import random
import gym
from collections import defaultdict

In [None]:
# Global Blackjack environment shared by the training and evaluation functions
# in this notebook (they call env.reset() and env.step() on it directly).
# NOTE(review): the 'Blackjack-v0' id and the 4-tuple unpacking of env.step()
# used in this notebook presumably target the older Gym API — confirm against
# the installed gym version.
env = gym.make('Blackjack-v0')

# Global helper: arithmetic mean of a non-empty collection of numbers.
def avg(lst):
    """Return sum(lst) / len(lst); an empty collection raises ZeroDivisionError."""
    total = sum(lst)
    return total / len(lst)

In [None]:
def generate_an_episode(q_table, weights, rewards):
    """Play one full game on the global `env` and record every step.

    Args:
        q_table: not used by this function.
        weights: mapping state -> [weight of action 0, weight of action 1];
            the action at each step is sampled according to these weights.
        rewards: not used by this function.

    Returns:
        A list of [state, action, reward] lists, one per step, in play order.
    """
    possible_actions = [0, 1]
    trajectory = []

    current = env.reset()
    while True:
        # Sample one action according to the weights stored for this state.
        chosen = random.choices(population=possible_actions, weights=weights[current], k=1)[0]

        # Let the environment react to the action.
        following, payoff, finished, _ = env.step(chosen)

        # Record the step under the state the action was taken in.
        trajectory.append([current, chosen, payoff])

        if finished:
            break
        current = following

    return trajectory

In [None]:
def reversely_traversal_episode(q_table, weights, rewards, episode, gamma=1.0, epsilon=0.1):
  """Update the Q-table and action weights from one finished episode.

  The episode is walked from its last step back to its first so that the
  return of each step can be accumulated from the rewards that followed it.
  Every visit of a (state, action) pair contributes a return sample.

  Args:
    q_table: mapping state -> {action: mean of observed returns}; updated in place.
    weights: mapping state -> [weight of action 0, weight of action 1]; updated in place.
    rewards: mapping state -> {action: list of observed returns}; appended to in place.
    episode: sequence of [state, action, reward] steps in play order. It is
      only read; the caller's sequence keeps its original order.
    gamma: discount factor applied to the return of the later steps. The
      default 1.0 gives the plain undiscounted sum of rewards.
    epsilon: weight assigned to the action that is currently not the best one;
      the best action receives 1 - epsilon. The default 0.1 yields 0.9 / 0.1.

  Returns:
    None.
  """
  # Return accumulated from the end of the episode.
  G = 0.0

  # reversed() iterates backwards without reordering the caller's list.
  for state, action, reward in reversed(episode):
    # Return of this step: G = reward + γ * G (G being the return of the later steps).
    G = reward + gamma * G

    # Add the return to the samples collected for this (state, action).
    returns = rewards[state][action]
    returns.append(G)

    # The Q-value is the mean of all returns observed so far.
    q_table[state][action] = avg(returns)

    # The currently best action gets the higher chance of being used in the
    # next episode; max() keeps the first action (0) when the values tie.
    action_values = q_table[state]
    good_choice = max(action_values, key=action_values.get)
    weights[state][good_choice] = 1.0 - epsilon
    weights[state][1 - good_choice] = epsilon

In [None]:
def monte_carlo(sample_size = 50000):
  """Train on `sample_size` simulated episodes and return the learned Q-table.

  Args:
    sample_size: number of episodes to generate and learn from.

  Returns:
    A defaultdict mapping state -> {0: value, 1: value}; states that were
    never touched yield {0: 0.0, 1: 0.0} on access.
  """
  # Estimated value of each action per state; every value starts at 0.0.
  action_values = defaultdict(lambda: {0: 0.0, 1: 0.0})

  # Sampling weights per state; both actions start out equally likely.
  action_weights = defaultdict(lambda: [0.5, 0.5])

  # Every return observed so far, per state and action.
  observed_returns = defaultdict(lambda: {0: [], 1: []})

  for _ in range(sample_size):
    # Play one episode, then learn from it.
    played = generate_an_episode(action_values, action_weights, observed_returns)
    reversely_traversal_episode(action_values, action_weights, observed_returns, played)

  return action_values

In [None]:
# Train the agent; the result maps each state to its action values {0: value, 1: value}.
raw_policy = monte_carlo()

# The Blackjack environment only accepts two actions, 0 and 1 (False and True):
# 0 represents stand and 1 represents hit.
# Here we convert {state: {0: value of 0, 1: value of 1}} into {state: 0 or 1}.
def act(dic):
  """Return the key of `dic` holding the largest value (the first such key on ties)."""
  return max(dic, key=dic.get)

policy = dict((state, act(values)) for state, values in raw_policy.items())

## Test Policy

In [None]:
# A single round of the game, played with a fixed policy.
def sample_simulation(policy, default_action=0):
  """Play one game on the global `env` following `policy`.

  Args:
    policy: mapping state -> action (0 or 1).
    default_action: action used when looking up a state in `policy` raises
      KeyError, i.e. the state was never recorded. Without it, a state that
      training never visited would abort the whole evaluation. The default 0
      is the action the greedy conversion in this notebook picks for a state
      whose action values are still tied.

  Returns:
    (init_state, reward): the state returned by env.reset() and the reward
    returned by the last env.step() of the game.
  """
  init_state = env.reset()
  current_state = init_state

  done = False
  while not done:
    # Follow the policy; fall back to default_action for unknown states.
    # Indexing (rather than .get) keeps any __missing__ hook of `policy` working.
    try:
      action = policy[current_state]
    except KeyError:
      action = default_action
    current_state, reward, done, _ = env.step(action)

  # Only the initial state is reported, together with the final reward.
  return init_state, reward

In [None]:
def policy_check(policy, sample_size = 100000):
  """Play `sample_size` games with `policy` and group the rewards by starting state.

  Args:
    policy: passed unchanged to sample_simulation() for every game.
    sample_size: number of games to play.

  Returns:
    A dict mapping each starting state to the list of final rewards of the
    games that began in it, in the order they were played.
  """
  outcomes_by_state = {}
  for _ in range(sample_size):
    # Starting state of one game and the reward it ended with.
    start_state, final_reward = sample_simulation(policy)

    # Create the bucket on first sight of a state, then record the reward.
    outcomes_by_state.setdefault(start_state, []).append(final_reward)

  return outcomes_by_state

In [None]:
# Summarizes the output of policy_check() as win/tie/loss rates per state.
def over_all_rate(game_result):
  """Turn per-state reward lists into per-state [win, tie, loss] rates.

  Args:
    game_result: mapping state -> non-empty list of rewards, where 1.0 counts
      as a win, 0.0 as a tie and -1.0 as a loss.

  Returns:
    A dict mapping each state to [win_rate, tie_rate, loss_rate].
  """
  summary = {}
  for start_state, outcomes in game_result.items():
    games = len(outcomes)
    # Same order as the returned triple: win, tie, loss.
    summary[start_state] = [outcomes.count(value) / games for value in (1.0, 0.0, -1.0)]
  return summary

# Win rate & loss rate

Around 44.5% win rate and 47.5% loss rate, each computed as the unweighted average of the per-starting-state rates.

Note that the exact figures vary slightly from run to run, because both training and evaluation rely on random sampling.

In [None]:
# Prints the average win rate and loss rate.
def win_loss_rate(game_result):
  """Print the mean win rate and the mean loss rate across starting states.

  The per-state rates come from over_all_rate(); every starting state counts
  equally in the mean, no matter how many games began in it.

  Args:
    game_result: mapping state -> list of rewards, as built by policy_check().

  Returns:
    None; the two figures are printed.
  """
  # Each entry is [win_rate, tie_rate, loss_rate] for one starting state.
  per_state = list(over_all_rate(game_result).values())

  mean_win = avg([entry[0] for entry in per_state])
  print(f"Win  Rate: {mean_win}")

  mean_loss = avg([entry[2] for entry in per_state])
  print(f"Loss Rate: {mean_loss}")

In [None]:
# Evaluate the learned policy over 100,000 simulated games (rewards grouped by
# starting state), then print the resulting win and loss rates.
overall_ratio = policy_check(policy=policy, sample_size=100000)
win_loss_rate(overall_ratio)

Win  Rate: 0.44763889242363675
Loss Rate: 0.4742434536263865
