In [3]:
import gym
import gym_fire
import numpy as np

In [4]:
# Instantiate the 'HellLake-v0' environment by its registered id
# (presumably registered as a side effect of `import gym_fire` — confirm).
env = gym.make('HellLake-v0')

In [5]:
# Put the environment back in its starting configuration, then draw the grid
# so the initial agent position is visible in the cell output.
env.reset()
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


In [6]:
# Show the transition table for state 5: a dict keyed by action whose values are
# lists of (probability, next_state, reward, flag) tuples. The flag is presumably
# the episode-termination marker — confirm against the environment definition.
env.P[5]

{0: [(1.0, 5, -0.01, True)],
 1: [(1.0, 5, -0.01, True)],
 2: [(1.0, 5, -0.01, True)],
 3: [(1.0, 5, -0.01, True)]}

In [7]:
# Discount factor applied to future state values in the cells that follow.
discount = 0.25

In [8]:
def iterate_value(env, discount=1.0, max_iters=10000, convergence=1e-20):
    """Compute a state-value table by repeated in-place Bellman optimality sweeps.

    Parameters
    ----------
    env : object
        Must expose ``observation_space.n`` (number of states),
        ``action_space.n`` (number of actions) and ``P[state][action]``, an
        iterable of 4-tuples ``(probability, next_state, reward, flag)``.
        The fourth element of each tuple is ignored here.
    discount : float, default 1.0
        Multiplier applied to the value of the successor state.
    max_iters : int, default 10000
        Maximum number of full sweeps over all states.
    convergence : float, default 1e-20
        The loop stops as soon as the summed absolute change of the value
        table over one sweep is less than or equal to this threshold.

    Returns
    -------
    numpy.ndarray
        One float per state. If ``max_iters`` is 0 the all-zero initial table
        is returned.

    Notes
    -----
    Values are updated in place during a sweep, so states visited later in the
    same sweep already see the refreshed values of earlier states.
    A one-line status message is printed in both the converged and the
    non-converged case.
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n

    # Value estimate for every state, starting from zero.
    V = np.zeros(n_states)

    for i in range(max_iters):
        # Snapshot taken before the sweep, used only for the convergence test.
        old_V = np.copy(V)

        for state in range(n_states):
            action_values = []
            for action in range(n_actions):
                # Expected one-step return of taking `action` in `state`.
                outcomes = [
                    transition_p * (reward_p + discount * V[next_state])
                    for transition_p, next_state, reward_p, _ in env.P[state][action]
                ]
                action_values.append(np.sum(outcomes))

            # Keep the value of the best action for this state.
            V[state] = max(action_values)

        # Converged once a whole sweep changes the table by at most `convergence`.
        if np.sum(np.fabs(V - old_V)) <= convergence:
            print(f'Converged on iteration {i}')
            return V

    # Report the number of sweeps actually performed; using `max_iters` rather
    # than the loop index avoids an off-by-one and an unbound name when
    # max_iters is 0.
    print(f'Did not converge in {max_iters} iterations, returning last value table')
    return V

In [9]:
def get_policy(value_array, discount=1.0, env=None):
    """Pick, for every state, the action with the highest expected one-step return.

    Parameters
    ----------
    value_array : sequence of float
        Value estimate per state, indexed by state number.
    discount : float, default 1.0
        Multiplier applied to the value of the successor state.
    env : object, optional
        Must expose ``observation_space.n``, ``action_space.n`` and
        ``P[state][action]`` as an iterable of 4-tuples
        ``(probability, next_state, reward, flag)``; the fourth element is
        ignored. When omitted, the notebook-level variable ``env`` is used,
        which keeps existing two-argument calls working.

    Returns
    -------
    numpy.ndarray
        Float array with one action index per state (callers cast to ``int``
        before passing an entry to ``env.step``). Ties go to the lowest action
        index, as ``np.argmax`` returns the first maximum.
    """
    if env is None:
        # Backward-compatible fallback: earlier calls relied on the global `env`.
        env = globals()['env']

    n_states = env.observation_space.n
    n_actions = env.action_space.n

    # One slot per state; filled with the index of the best action.
    policy = np.zeros(n_states)

    for state in range(n_states):
        # Expected return of each action from this state.
        action_values = np.zeros(n_actions)

        for action in range(n_actions):
            for transition_p, next_state, reward_p, _ in env.P[state][action]:
                action_values[action] += transition_p * (
                    reward_p + discount * value_array[next_state]
                )

        # Greedy choice with respect to the supplied value table.
        policy[state] = np.argmax(action_values)

    return policy

In [10]:
# Run value iteration on the environment using the notebook-level discount factor.
value_array =  iterate_value(env, discount=discount)

Converged on iteration 46


In [11]:
# Display the per-state value table produced by value iteration.
value_array

array([-3.90534022e-03, -1.46526362e-03, -3.44489854e-04, -4.87980211e-03,
       -1.49347882e-03, -1.33333333e-02,  2.21118748e-03, -1.33333333e-02,
       -6.83072291e-04,  6.62994466e-03,  4.02120730e-02, -1.33333333e-02,
       -1.33333333e-02,  4.00303353e-02,  4.73703744e-01,  1.21052911e+00])

In [12]:
# Derive the per-state best action from the value table, with the same discount.
policy = get_policy(value_array, discount=discount)

In [13]:
# Display the selected action index for each state.
policy

array([1., 1., 1., 0., 2., 0., 0., 0., 2., 1., 0., 0., 0., 2., 2., 1.])

In [14]:
def run_policy(env, policy, goal_state=15, min_reward=-1.5, max_steps=None):
    """Roll a policy out in the environment, printing and rendering every step.

    Parameters
    ----------
    env : object
        Must provide ``reset()``, ``render()`` and ``step(action)``. The first
        three entries of the value returned by ``step`` are read as
        ``(next_state, reward, done_flag)``.
    policy : indexable
        ``policy[state]`` gives the action for ``state``; each entry is cast to
        ``int`` before being passed to ``env.step``.
    goal_state : int, default 15
        The rollout stops when this state is reached.
    min_reward : float, default -1.5
        The rollout stops once the cumulative reward is no longer strictly
        greater than this floor.
    max_steps : int or None, default None
        Optional cap on the number of steps. ``None`` means no cap, in which
        case a policy that cycles without accumulating reward never returns.

    Returns
    -------
    None
        All results are reported through ``print`` and ``env.render``.
    """
    env.reset()
    env.render()
    reward = 0
    # NOTE(review): the start state is taken to be 0 rather than read from
    # env.reset() — confirm this holds for the environment in use.
    state = 0
    step = 0
    stuck = 0  # number of steps for which the environment reported its done flag
    truncated = False

    while state != goal_state and reward > min_reward:
        if max_steps is not None and step >= max_steps:
            truncated = True
            break

        state, step_reward, done = env.step(int(policy[state]))[:3]
        reward += step_reward
        step += 1
        if done:
            stuck += 1
        print(f'Step reward: {step_reward}')
        print(f'Cumulative reward: {reward}')
        env.render()

    if truncated:
        # Only reachable when the caller supplied max_steps.
        print(f'Stopped after {step} steps without reaching state {goal_state} '
              f'(cumulative reward {reward})')
    elif reward > 0:
        print(f'Gotem in {step} steps with reward {reward}')
    else:
        print(f'RIP in {step - stuck - 1} steps, Purgatorio for {stuck - 1} steps')

In [15]:
# Roll the computed policy out in the environment; each step prints its reward
# and re-renders the grid.
run_policy(env, policy)


[41mS[0mFFF
FHFH
FFFH
HFFG
Step reward: 0.0
Cumulative reward: 0.0
  (Down)
S[41mF[0mFF
FHFH
FFFH
HFFG
Step reward: 0.0
Cumulative reward: 0.0
  (Down)
SF[41mF[0mF
FHFH
FFFH
HFFG
Step reward: 0.0
Cumulative reward: 0.0
  (Down)
SFFF
FH[41mF[0mH
FFFH
HFFG
Step reward: 0.0
Cumulative reward: 0.0
  (Left)
SF[41mF[0mF
FHFH
FFFH
HFFG
Step reward: 0.0
Cumulative reward: 0.0
  (Down)
SFFF
FH[41mF[0mH
FFFH
HFFG
Step reward: 0.0
Cumulative reward: 0.0
  (Left)
SF[41mF[0mF
FHFH
FFFH
HFFG
Step reward: 0.0
Cumulative reward: 0.0
  (Down)
S[41mF[0mFF
FHFH
FFFH
HFFG
Step reward: 0.0
Cumulative reward: 0.0
  (Down)
SF[41mF[0mF
FHFH
FFFH
HFFG
Step reward: 0.0
Cumulative reward: 0.0
  (Down)
S[41mF[0mFF
FHFH
FFFH
HFFG
Step reward: 0.0
Cumulative reward: 0.0
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
Step reward: -0.01
Cumulative reward: -0.01
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
Step reward: -0.01
Cumulative reward: -0.02
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
Step reward: -0.01
Cumulat

In [22]:
# Show the transition table for state 14: a dict keyed by action whose values are
# lists of (probability, next_state, reward, flag) tuples. The flag is presumably
# the episode-termination marker — confirm against the environment definition.
env.P[14]

{0: [(0.3333333333333333, 10, 0.0, False),
  (0.3333333333333333, 13, 0.0, False),
  (0.3333333333333333, 14, -0.001, False)],
 1: [(0.3333333333333333, 13, 0.0, False),
  (0.3333333333333333, 14, -0.001, False),
  (0.3333333333333333, 15, 0.0, True)],
 2: [(0.3333333333333333, 14, -0.001, False),
  (0.3333333333333333, 15, 0.0, True),
  (0.3333333333333333, 10, 0.0, False)],
 3: [(0.3333333333333333, 15, 0.0, True),
  (0.3333333333333333, 10, 0.0, False),
  (0.3333333333333333, 13, 0.0, False)]}