In [1]:
import numpy as np
import gym
env = gym.make('FrozenLake8x8-v0')

In [2]:
#state-value function
Q = {i:{j:0 for j in range(4)} for i in range(64)}
#counter (across episodes)
C = {i:{j:0 for j in range(4)} for i in range(64)}

gamma = 0.9

In [3]:
#behaviour policy mu
def mu(state):
    return np.random.randint(0,4)

#target policy pi
def pi(state):
    return max(Q[state],key=Q[state].get)

In [4]:
for episode in range(20000):
    observation = env.reset()
    state_action_reward_tuples = []
    done = False
    
    #generate episode using mu
    while not done:
        action = mu(observation)
        old_observation = observation
        observation, reward, done, info = env.step(action)
        state_action_reward_tuples.append((observation,action,reward))
        
    G = 0
    W = 1
    
    for state,action,reward in state_action_reward_tuples[::-1]:
        G = reward + gamma * G
        C[state][action] += W
        Q[state][action] += (W/C[state][action])*(G-Q[state][action])
        if action != pi(state):
            break
        W = W * 1/0.25

In [5]:
Q

{0: {0: 0, 1: 0, 2: 0, 3: 0},
 1: {0: 0, 1: 0, 2: 0, 3: 0},
 2: {0: 0, 1: 0, 2: 0, 3: 0},
 3: {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0},
 4: {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0},
 5: {0: 0, 1: 0.0, 2: 0.0, 3: 0.0},
 6: {0: 0, 1: 0, 2: 0, 3: 0.0},
 7: {0: 0, 1: 0, 2: 0, 3: 0},
 8: {0: 0, 1: 0, 2: 0, 3: 0},
 9: {0: 0.0, 1: 0, 2: 0.0, 3: 0},
 10: {0: 0, 1: 0, 2: 0.0, 3: 0},
 11: {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0},
 12: {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0},
 13: {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0},
 14: {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0},
 15: {0: 0, 1: 0, 2: 0, 3: 0},
 16: {0: 0.0, 1: 0.0, 2: 0, 3: 0},
 17: {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0},
 18: {0: 0, 1: 0.0, 2: 0.0, 3: 0.0},
 19: {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0},
 20: {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0},
 21: {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0},
 22: {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0},
 23: {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0},
 24: {0: 0, 1: 0.0, 2: 0, 3: 0},
 25: {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0},
 26: {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0},
 27: {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.