In [1]:
import numpy as np

"""Monte Carlo function"""

def monte_carlo(env, V, policy, episodes=5000, max_steps=100,
                alpha=0.1, gamma=0.99):
    """Performs every-visit, constant-alpha Monte Carlo value estimation:
        env is the environment instance. Both gym step APIs are accepted:
            the older one (reset() -> state,
            step() -> (state, reward, done, info)) and the newer one
            (reset() -> (state, info),
            step() -> (state, reward, terminated, truncated, info)).
        V is a numpy.ndarray of shape (s,) containing the value estimate.
            It is updated in place.
        policy is a function that takes in a state and returns the
            next action to take.
        episodes is the total number of episodes to train over.
        max_steps is the maximum number of steps per episode.
        alpha is the learning rate.
        gamma is the discount rate.
        Returns: V, the updated value estimate (the same object passed in).
    """
    for _ in range(episodes):
        reset_out = env.reset()
        # Newer gym / gymnasium return (observation, info) from reset();
        # older releases return the bare observation. The dict check keeps
        # a genuine 2-tuple observation from being mistaken for that pair.
        if (isinstance(reset_out, tuple) and len(reset_out) == 2
                and isinstance(reset_out[1], dict)):
            state = reset_out[0]
        else:
            state = reset_out

        # Roll out one episode, remembering each visited state and the
        # reward received for leaving it.
        states = []
        rewards = []
        for _step in range(max_steps):
            action = policy(state)
            step_out = env.step(action)
            if len(step_out) == 5:
                next_state, reward, terminated, truncated, _info = step_out
                done = terminated or truncated
            else:
                next_state, reward, done, _info = step_out
            states.append(state)
            rewards.append(reward)
            state = next_state
            if done:
                break

        # Walk the episode backwards so the discounted return G can be
        # accumulated incrementally; every visit to a state is updated.
        G = 0
        for visited, reward in zip(reversed(states), reversed(rewards)):
            G = gamma * G + reward
            V[visited] = V[visited] + alpha * (G - V[visited])
    return V


In [4]:
import gym
import numpy as np
# monte_carlo = __import__('0-monte_carlo').monte_carlo

# Fix NumPy's global RNG so the random draws made by the policy repeat.
np.random.seed(0)

# Integer action codes used by the policy.
LEFT, DOWN, RIGHT, UP = range(4)
env = gym.make('FrozenLake8x8-v1')

def policy(s):
    """Pick a move on the 8x8 grid that avoids stepping onto a hole.

    s is the flat state index (row = s // 8, column = s % 8).
    One uniform random draw selects the preference order:
        draw > 0.5: RIGHT, then DOWN, then UP, falling back to LEFT.
        otherwise:  DOWN, then RIGHT, then LEFT, falling back to UP.
    A preferred direction is taken only if it stays on the grid and the
    neighbouring tile in env.desc is not b'H'; the fallback direction is
    returned without any check.
    Returns: the chosen action code.
    """
    row, col = divmod(s, 8)

    def is_open(r, c):
        # True when the map tile at (r, c) is anything other than a hole.
        return env.desc[r, c] != b'H'

    prefer_right = np.random.uniform() > 0.5

    if prefer_right:
        if col != 7 and is_open(row, col + 1):
            return RIGHT
        if row != 7 and is_open(row + 1, col):
            return DOWN
        if row != 0 and is_open(row - 1, col):
            return UP
        return LEFT

    if row != 7 and is_open(row + 1, col):
        return DOWN
    if col != 7 and is_open(row, col + 1):
        return RIGHT
    if col != 0 and is_open(row, col - 1):
        return LEFT
    return UP

# Initial estimate: -1 on hole tiles, +1 everywhere else, flattened to (64,).
hole_mask = env.desc == b'H'
V = np.where(hole_mask, -1, 1).astype('float64').reshape(64)

np.set_printoptions(precision=2)
env.seed(0)

# monte_carlo updates V in place and returns it; show it as the 8x8 grid.
V_estimate = monte_carlo(env, V, policy)
print(V_estimate.reshape(8, 8))


[[ 8.78e-02  6.08e-02  3.38e-02  4.68e-02  8.12e-02  3.29e-05  6.10e-02
   1.43e-02]
 [ 4.69e-02  3.84e-02  4.29e-13  5.64e-03  9.84e-02  3.46e-02  6.94e-02
   4.35e-02]
 [ 4.38e-44  3.62e-44  6.37e-49 -1.00e+00  1.23e-01  1.30e-01  2.42e-01
   1.39e-01]
 [ 2.14e-26  4.49e-35  6.77e-27  4.16e-12  2.98e-07 -1.00e+00  2.35e-01
   9.48e-02]
 [ 3.07e-20  1.75e-94  3.42e-71 -1.00e+00  7.46e-04  4.11e-04  1.61e-01
   1.72e-01]
 [ 5.20e-23 -1.00e+00 -1.00e+00  9.00e-01  2.49e-02  3.94e-02 -1.00e+00
   2.58e-01]
 [ 4.16e-17 -1.00e+00  1.67e-01  6.56e-01 -1.00e+00  3.26e-01 -1.00e+00
   2.83e-01]
 [ 1.88e-07  4.05e-05  7.07e-03 -1.00e+00  1.00e+00  7.45e-01  8.29e-01
   1.00e+00]]
