In [1]:
import gym
import numpy as np
import matplotlib
import sys

from collections import defaultdict
if "../" not in sys.path:
  sys.path.append("../") 
from lib.envs.blackjack import BlackjackEnv
from lib import plotting

In [2]:
env = BlackjackEnv()

In [3]:
def create_random_policy(nA):
    """
    Creates a random policy function.
    
    Args:
        nA: Number of actions in the environment.
    
    Returns:
        A function that takes an observation as input and returns a vector
        of action probabilities
    """
    A = np.ones(nA, dtype=float) / nA
    def policy_fn(observation):
        return A
    return policy_fn

In [4]:
def create_greedy_policy(Q):
    """
    Creates a greedy policy based on Q values.
    
    Args:
        Q: A dictionary that maps from state -> action values
        
    Returns:
        A function that takes an observation as input and returns a vector
        of action probabilities.
    """
    
    def policy_fn(state):
        A = np.zeros_like(Q[state], dtype=float)
        best_action = np.argmax(Q[state])
        A[best_action] = 1.0
        return A
    return policy_fn

In [38]:
def mc_control_importance_sampling(env, num_episodes, behavior_policy, discount_factor=1.0):
    """
    Monte Carlo Control Off-Policy Control using Weighted Importance Sampling.
    Finds an optimal greedy policy.
    
    Args:
        env: OpenAI gym environment.
        num_episodes: Nubmer of episodes to sample.
        behavior_policy: The behavior to follow while generating episodes.
            A function that given an observation returns a vector of probabilities for each action.
        discount_factor: Lambda discount factor.
    
    Returns:
        A tuple (Q, policy).
        Q is a dictionary mapping state -> action values.
        policy is a function that takes an observation as an argument and returns
        action probabilities. This is the optimal greedy policy.
    """
    # The final action-value function.
    # A dictionary that maps state -> action values
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    # The cumulative denominator of the weighted importance sampling formula
    # (across all episodes)
    C = defaultdict(lambda: np.zeros(env.action_space.n))
    
    
    # Our greedily policy we want to learn
    target_policy = create_greedy_policy(Q)
    
    for i_episode in range(1, num_episodes + 1):
        episode = []
        state = env.reset()
        print "Episode Number: {}".format(i_episode)
        for t in range(100):
            probs = behavior_policy(state)
            action = np.random.choice(np.arange(len(probs)), p=probs)
            next_state, reward, done, _ = env.step(action)
            episode.append((state, action, reward))
            if done:
                break
            state = next_state
        
        G = 0.0
        W = 1.0
        #episode2 = episode
        #episode += episode2
        for t in range(len(episode))[::-1]:
            print "episode len: {}, step: {}".format(len(episode), t)
            print episode[t]
            print "W is : {}\n".format(W)
            state, action, reward = episode[t]
            G = G*discount_factor + reward
            print "G is : {}\n".format(G)
            C[state][action] += W
            print "C is : {}\n".format(C)
            Q[state][action] += (W/C[state][action]) - (G - Q[state][action])
            print "Q is :{}\n".format(Q)
            if action != np.argmax(target_policy(state)):
                print "action: {} is not equal to argmax: {}".format(action, np.argmax(target_policy(state)))
                break
            W = 1.0/behavior_policy(state)[action]
        print "End Episode {}".format(i_episode)
        print 15*'*'
    return Q, target_policy

In [39]:
random_policy = create_random_policy(env.action_space.n)
Q, policy = mc_control_importance_sampling(env, num_episodes=10, behavior_policy=random_policy)

Episode Number: 1
episode len: 1, step: 0
((12, 10, False), 0, -1)
W is : 1.0

G is : -1.0

C is : defaultdict(<function <lambda> at 0x7fb137f3c938>, {(12, 10, False): array([ 1.,  0.])})

Q is :defaultdict(<function <lambda> at 0x7fb137f09a28>, {(12, 10, False): array([ 2.,  0.])})

End Episode 1
***************
Episode Number: 2
episode len: 1, step: 0
((20, 3, False), 1, -1)
W is : 1.0

G is : -1.0

C is : defaultdict(<function <lambda> at 0x7fb137f3c938>, {(20, 3, False): array([ 0.,  1.]), (12, 10, False): array([ 1.,  0.])})

Q is :defaultdict(<function <lambda> at 0x7fb137f09a28>, {(20, 3, False): array([ 0.,  2.]), (12, 10, False): array([ 2.,  0.])})

End Episode 2
***************
Episode Number: 3
episode len: 3, step: 2
((19, 10, False), 1, -1)
W is : 1.0

G is : -1.0

C is : defaultdict(<function <lambda> at 0x7fb137f3c938>, {(19, 10, False): array([ 0.,  1.]), (20, 3, False): array([ 0.,  1.]), (12, 10, False): array([ 1.,  0.])})

Q is :defaultdict(<function <lambda> at 0