In [1]:
import gym
import numpy as np

In [2]:
# Build the 'FrozenLake8x8-v0' environment; every later cell reuses this `env` object.
env = gym.make('FrozenLake8x8-v0')

In [3]:
# Reset the environment and draw the grid it reports (shown in the cell output below).
env.reset()
env.render()


[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG


In [8]:
# Discount factor handed to both iterate_value(...) and get_policy(...) further down.
# NOTE(review): this cell's execution count is out of sequence with its neighbours;
# make sure it runs before the cells that read `discount` on a fresh kernel.
discount = 0.25

In [4]:
def iterate_value(env, discount=1.0, max_iters=10000, convergence=1e-20):
    """Estimate a value for every state by repeated value-iteration sweeps.

    Each sweep visits every state and overwrites its value with the best
    expected one-step return over all actions. Values are updated in place,
    so states later in a sweep already see the values refreshed earlier in
    that same sweep.

    Parameters
    ----------
    env : object
        Must expose ``observation_space.n``, ``action_space.n`` and a
        transition table ``env.env.P[state][action]`` that iterates over
        4-tuples ``(transition probability, next state, reward, _)``; the
        fourth field is ignored here.
    discount : float, optional
        Weight applied to the value of the next state.
    max_iters : int, optional
        Maximum number of sweeps. Zero or a negative number performs no
        sweep and returns the all-zero table.
    convergence : float, optional
        A sweep counts as converged when the summed absolute change over
        all state values is at most this threshold.

    Returns
    -------
    numpy.ndarray
        One value per state. A message stating whether (and on which sweep
        index) convergence happened is printed before returning.
    """
    # These lookups never change during the iteration, so do them once.
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    transitions = env.env.P

    # initialize reward value for each state
    V = np.zeros(n_states)

    for i in range(max_iters):
        # snapshot for the convergence comparison at the end of the sweep
        old_V = np.copy(V)

        for state in range(n_states):
            state_rewards = []
            for action in range(n_actions):
                # expected return of taking `action`: probability-weighted
                # sum over every possible outcome of that action
                expected_rewards = [
                    transition_p * (reward_p + discount * V[next_state])
                    for transition_p, next_state, reward_p, _ in transitions[state][action]
                ]
                state_rewards.append(np.sum(expected_rewards))

            # keep the value of the best action for this state
            V[state] = max(state_rewards)

        # check convergence
        if np.sum(np.fabs(V - old_V)) <= convergence:
            print(f'Converged on iteration {i}')
            return V

    # Report the number of sweeps actually performed. The loop index would be
    # one too small, and it does not exist at all when no sweep was run.
    print(f'Did not converge in {max(max_iters, 0)} iterations, returning last value table')
    return V

In [5]:
def get_policy(value_array, discount=1.0, env=None):
    """Pick, for every state, the action with the highest expected return.

    Parameters
    ----------
    value_array : sequence of float
        One value per state, indexed by state number.
    discount : float, optional
        Weight applied to the value of the next state.
    env : object, optional
        Environment exposing ``observation_space.n``, ``action_space.n`` and
        a transition table ``env.env.P[state][action]`` that iterates over
        4-tuples ``(transition probability, next state, reward, _)``.
        When omitted, the module-level variable named ``env`` is used, which
        keeps existing two-argument calls working.

    Returns
    -------
    numpy.ndarray
        Float array with one entry per state holding the index of the chosen
        action. Ties go to the lowest action index (``np.argmax``).

    Raises
    ------
    NameError
        If ``env`` is omitted and no module-level ``env`` exists.
    """
    if env is None:
        # Fall back to the notebook-global environment so callers that never
        # passed `env` behave exactly as before.
        try:
            env = globals()['env']
        except KeyError:
            raise NameError("name 'env' is not defined") from None

    n_states = env.observation_space.n
    n_actions = env.action_space.n
    transitions = env.env.P

    # initialize policy
    policy = np.zeros(n_states)

    for state in range(n_states):
        # expected return of each action available in this state
        action_values = np.zeros(n_actions)

        for action in range(n_actions):
            for transition_p, next_state, reward_p, _ in transitions[state][action]:
                action_values[action] += transition_p * (reward_p + discount * value_array[next_state])

        # the best-scoring action becomes the policy entry for this state
        policy[state] = np.argmax(action_values)

    return policy

In [9]:
# Run value iteration on `env` with the configured discount; the call prints
# whether (and on which iteration) it converged.
value_array =  iterate_value(env, discount=discount)

Converged on iteration 28


In [10]:
# Display the per-state value table returned by iterate_value.
value_array

array([7.34100032e-13, 5.98370123e-12, 4.94890146e-11, 4.02206929e-10,
       3.05048627e-09, 1.76142487e-08, 8.82179141e-08, 2.46647468e-07,
       2.09139912e-12, 1.63316990e-11, 1.42172231e-10, 1.37378995e-09,
       1.59411002e-08, 1.05538821e-07, 7.23749587e-07, 2.46647468e-06,
       6.67369135e-12, 4.78244554e-11, 2.82787813e-10, 0.00000000e+00,
       8.27038951e-08, 5.25102021e-07, 6.11298154e-06, 2.64074719e-05,
       2.55861495e-11, 2.74773953e-10, 3.20345707e-09, 3.78839231e-08,
       4.51403620e-07, 0.00000000e+00, 4.62245570e-05, 2.84369210e-04,
       6.50836892e-12, 4.60059086e-11, 2.70788582e-10, 0.00000000e+00,
       5.29625562e-06, 4.24886016e-05, 2.64212492e-04, 3.08183675e-03,
       6.82049295e-13, 0.00000000e+00, 0.00000000e+00, 1.73002055e-06,
       2.06150623e-05, 2.40354471e-04, 0.00000000e+00, 3.36158350e-02,
       9.94173328e-13, 0.00000000e+00, 1.21918363e-08, 1.45184366e-07,
       0.00000000e+00, 2.82114999e-03, 0.00000000e+00, 3.66692349e-01,
      

In [12]:
# Derive the best action per state from the value table, using the same discount.
policy = get_policy(value_array, discount=discount)

In [13]:
# Display the policy array: one action index per state.
policy

array([1., 2., 2., 2., 2., 2., 2., 2., 1., 2., 2., 3., 2., 2., 1., 1., 1.,
       2., 0., 0., 2., 3., 2., 1., 3., 2., 3., 1., 0., 0., 2., 1., 3., 3.,
       0., 0., 2., 1., 3., 2., 0., 0., 0., 1., 3., 0., 0., 2., 0., 0., 1.,
       0., 0., 0., 0., 2., 1., 1., 0., 0., 1., 1., 1., 0.])

In [14]:
def run_policy(env, policy):
    """Play one episode in ``env`` following ``policy``, rendering each step.

    Parameters
    ----------
    env : object
        Must provide ``reset()`` returning the starting state, ``render()``,
        and ``step(action)`` returning a sequence whose first three items are
        ``(next state, reward, done flag)``.
    policy : sequence
        Indexed by state; each entry is converted with ``int`` and used as
        the action for that state.

    Returns
    -------
    None
        The outcome is only printed: ``Gotem`` when the accumulated reward
        equals 1, ``RIP`` otherwise, together with the number of steps taken.
    """
    # Start from the state the environment reports rather than assuming 0.
    state = env.reset()
    env.render()

    reward = 0
    step = 0
    dead = False

    # Stop as soon as the episode is flagged done or any reward was collected.
    while not dead and reward == 0:
        state, step_reward, dead = env.step(int(policy[state]))[:3]
        reward += step_reward
        step += 1
        env.render()

    if reward == 1:
        print(f'Gotem in {step} steps')
    else:
        print(f'RIP in {step} steps')

In [16]:
# Play one episode with the computed policy; each step is rendered in the output below.
run_policy(env, policy)


[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Down)
[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Down)
[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Down)
S[41mF[0mFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SF[41mF[0mFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SFF[41mF[0mFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SFFF[41mF[0mFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SFFFF[41mF[0mFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SFFFFFFF
FFFFF[41mF[0mFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SFFFFFFF
FFFFFF[41mF[0mF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Down)
SFFFFFFF
FFFFFFF[41mF[0m
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Down)
SFF