# Monte Carlo estimation of action values

In [1]:
import numpy as np
import gym

In [2]:
env = gym.make('FrozenLake8x8-v0')

In [3]:
#define random policy
def policy(s):
    return np.random.randint(0,4)

In [7]:
def mc_estimation_action_values(n_episodes,env,gamma):
    
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    
    #initialize action-value function
    Q = np.zeros((n_states,n_actions))+0.5
    #initialize state-action visit counter
    N = np.zeros((n_states,n_actions))
    
    for episode in range(n_episodes):
        observation = env.reset()
        state_action_reward_tuples = []
        #Generate episode
        for t in range(100):
            action = policy(observation)
            old_observation = observation
            observation, reward, done, info = env.step(action)
            state_action_reward_tuples.append((old_observation,action,reward))
            if done:
                break
    
        G = 0
        for state,action,reward in state_action_reward_tuples[::-1]:
            G = reward + gamma * G
            N[state][action] += 1
            Q[state][action] = (Q[state][action] + G) / N[state][action]
            
    return Q

In [8]:
gamma = 0.9
Q = mc_estimation_action_values(20000,env,gamma)

In [10]:
Q

array([[0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000],
       [0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000],
       [0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000],
       [0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        5.64209472e-147],
       [0.00000000e+000, 0.00000000e+000, 4.68263318e-101,
        8.42530285e-093],
       [3.49764133e-029, 9.57093726e-082, 1.04518320e-061,
        2.25833140e-049],
       [2.43488496e-047, 8.00536834e-037, 1.81442833e-039,
        3.93447232e-022],
       [1.21973167e-019, 1.29506362e-026, 4.98408956e-023,
        4.77901842e-026],
       [0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000],
       [0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000],
       [0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000],
       [0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
      