In [1]:
import gym
import gym_fire
import numpy as np

In [2]:
env = gym.make('FireLake-v0')

In [3]:
env.reset()
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


In [4]:
env.P[5]

{0: [(1.0, 5, -1, True)],
 1: [(1.0, 5, -1, True)],
 2: [(1.0, 5, -1, True)],
 3: [(1.0, 5, -1, True)]}

In [5]:
discount = .25

In [6]:
def iterate_value(env, discount=1.0, max_iters=10000, convergence=1e-20):
    
    # initialize reward value for each state
    V = np.zeros(env.observation_space.n)
    
    for i in range(max_iters):
        # for convergence comparison
        old_V = np.copy(V) 
        
        # calculate reward value for each state
        for state in range(env.observation_space.n):
            state_rewards = []
            for action in range(env.action_space.n):
                expected_rewards = []
                # calculate total expected reward for each action
                for next_state_action in env.P[state][action]: 
                    transition_p, next_state, reward_p, _ = next_state_action 
                    expected_rewards.append((transition_p * (reward_p + discount * V[next_state]))) 
                
                state_rewards.append(np.sum(expected_rewards))
            
            # update value table with reward for action with the best reward
            V[state] = max(state_rewards) 
            
        # check convergence
        if (np.sum(np.fabs(V - old_V)) <= convergence):
            print(f'Converged on iteration {i}')
            return(V)
    
    print(f'Did not converge in {i} iterations, returning last value table')
    return(V)

In [7]:
def get_policy(value_array, discount=1.0):
 
    # initialize policy
    policy = np.zeros(env.observation_space.n) 
    
    for state in range(env.observation_space.n):
        # initialize the rewards for a state
        V = np.zeros(env.action_space.n)
        
        # compute expected reward for all actions of state
        for action in range(env.action_space.n):
            for next_state_action in env.P[state][action]: 
                transition_p, next_state, reward_p, _ = next_state_action 
                V[action] += (transition_p * (reward_p + discount * value_array[next_state]))
        
        # select the action which has max reward as an optimal action of the state
        policy[state] = np.argmax(V)
    
    return policy

In [8]:
value_array =  iterate_value(env, discount=discount)

Converged on iteration 29


In [9]:
value_array

array([-0.00444109, -0.00447778, -0.00481448, -0.00848145, -0.00437416,
       -1.33333333, -0.10972812, -1.33333333, -0.00367469,  0.00395261,
        0.02141034, -1.33333333, -1.33333333,  0.02969566,  0.36269961,
        0.        ])

In [10]:
policy = get_policy(value_array, discount=discount)

In [11]:
policy

array([2., 3., 3., 3., 0., 0., 0., 0., 3., 1., 0., 0., 0., 2., 1., 0.])

In [12]:
def run_policy(env, policy):
    env.reset()
    env.render()
    reward = 0
    state = 0
    step = 0
    dead = False
    while not dead:
        next = env.step(int(policy[state]))
        dead = next[2]
        reward += next[1]
        state = next[0]
        step += 1
        print(f'Step reward: {next[1]}')
        print(f'Cumulative reward: {reward}')
        env.render()

    if reward > 0:
        print(f'Gotem in {step} steps with reward {reward}')
    else:
        print(f'RIP in {step} steps')

In [14]:
run_policy(env, policy)


[41mS[0mFFF
FHFH
FFFH
HFFG
Step reward: -0.01
Cumulative reward: -0.01
  (Right)
[41mS[0mFFF
FHFH
FFFH
HFFG
Step reward: -0.01
Cumulative reward: -0.02
  (Right)
[41mS[0mFFF
FHFH
FFFH
HFFG
Step reward: -0.01
Cumulative reward: -0.03
  (Right)
[41mS[0mFFF
FHFH
FFFH
HFFG
Step reward: 0.0
Cumulative reward: -0.03
  (Right)
S[41mF[0mFF
FHFH
FFFH
HFFG
Step reward: 0.0
Cumulative reward: -0.03
  (Up)
SF[41mF[0mF
FHFH
FFFH
HFFG
Step reward: 0.0
Cumulative reward: -0.03
  (Up)
S[41mF[0mFF
FHFH
FFFH
HFFG
Step reward: -0.01
Cumulative reward: -0.04
  (Up)
S[41mF[0mFF
FHFH
FFFH
HFFG
Step reward: -0.01
Cumulative reward: -0.05
  (Up)
S[41mF[0mFF
FHFH
FFFH
HFFG
Step reward: -0.01
Cumulative reward: -0.060000000000000005
  (Up)
S[41mF[0mFF
FHFH
FFFH
HFFG
Step reward: 0.0
Cumulative reward: -0.060000000000000005
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG
Step reward: 0.0
Cumulative reward: -0.060000000000000005
  (Right)
SFFF
[41mF[0mHFH
FFFH
HFFG
Step reward: -0.01
Cumulative reward: