In [40]:
%matplotlib inline

import gym
import matplotlib
import numpy as np
import sys

from collections import defaultdict
if "../" not in sys.path:
  sys.path.append("../") 
from lib.envs.blackjack import BlackjackEnv
from lib import plotting

matplotlib.style.use('ggplot')

In [41]:
env = BlackjackEnv()

In [42]:
def create_random_policy(nA):
    """
    Creates a random policy function.
    
    Args:
        nA: Number of actions in the environment.
    
    Returns:
        A function that takes an observation as input and returns a vector
        of action probabilities
    """
    A = np.ones(nA, dtype=float) / nA
    def policy_fn(observation):
        return A
    return policy_fn

In [43]:
def create_greedy_policy(Q):
    """
    Creates a greedy policy based on Q values.
    
    Args:
        Q: A dictionary that maps from state -> action values
        
    Returns:
        A function that takes an observation as input and returns a vector
        of action probabilities.
    """
    
    def policy_fn(observation):
        # Implement this!
        prob = np.zeros(len(Q[0]))
        best_a = np.argmax(Q[observation])
        prob[best_a] = 1.0
        return prob
        # Implemented by sunsky
    return policy_fn

In [44]:
def mc_control_importance_sampling(env, num_episodes, behavior_policy, discount_factor=1.0):
    """
    Monte Carlo Control Off-Policy Control using Weighted Importance Sampling.
    Finds an optimal greedy policy.
    
    Args:
        env: OpenAI gym environment.
        num_episodes: Nubmer of episodes to sample.
        behavior_policy: The behavior to follow while generating episodes.
            A function that given an observation returns a vector of probabilities for each action.
        discount_factor: Lambda discount factor.
    
    Returns:
        A tuple (Q, policy).
        Q is a dictionary mapping state -> action values.
        policy is a function that takes an observation as an argument and returns
        action probabilities. This is the optimal greedy policy.
    """
    
    # The final action-value function.
    # A dictionary that maps state -> action values
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    N = defaultdict(lambda: np.zeros(env.action_space.n))
    D = defaultdict(lambda: np.zeros(env.action_space.n))
    
    # Our greedily policy we want to learn
    target_policy = create_greedy_policy(Q)
    
    # Implement this!
    for i_episode in range(num_episodes):
        state = env.reset()
        episode = []
        for t in range(100):
            prob = behavior_policy(state)
            prob2 = target_policy(state)
            a = np.random.choice(np.arange(len(prob)), p=prob)
            a2 = np.argmax(prob2)
            next_s, r, done, _ = env.step(a)
            episode.append((state, a, a2, r))
            if done:
                break
            state = next_s

        for tao in range(len(episode))[::-1]:
            if episode[tao][1] != episode[tao][2]:
                break
        sa_in_episodes = [(x[0],x[1],x[2]) for x in episode[tao+1:]]
        for s,a,r in sa_in_episodes:
            first_occur_idx = next(i for i,x in enumerate(sa_in_episodes) \
                                   if x[0]==s and x[1]==a)
            G = sum(x[2]*(discount_factor**i) for i,x in \
                    enumerate(sa_in_episodes[first_occur_idx:]))
            w = 1.0
            for i in range(len(sa_in_episodes[first_occur_idx+1:])):
                state, action, action2, reward = sa_in_episodes[first_occur_idx+i+1]
                w *= 1 / behavior_policy(state)[action]
            N[s][a] += w*G
            D[s][a] += w
            Q[s][a] = N[s][a] / D[s][a]
    # Implemented by sunsky
        
    return Q, target_policy

In [45]:
random_policy = create_random_policy(env.action_space.n)
Q, policy = mc_control_importance_sampling(env, num_episodes=500000, behavior_policy=random_policy)

In [46]:
# For plotting: Create value function from action-value function
# by picking the best action at each state
V = defaultdict(float)
for state, action_values in Q.items():
    action_value = np.max(action_values)
    V[state] = action_value
plotting.plot_value_function(V, title="Optimal Value Function")

TypeError: 'int' object is not subscriptable