In [89]:
import numpy as np
import gym
from gym.envs.toy_text.frozen_lake import generate_random_map
from gym.envs.registration import register
from gym import wrappers
from time import time

from hiive.mdptoolbox import example, mdp


In [90]:
# Define Problem
num_state = 1000
P, R = example.forest(S=num_state, r1=10, r2=50, p=0.0002886, is_sparse=False)


In [91]:
# Value Iteration
value_iteration = mdp.ValueIteration(P, R, 0.95)
value_iteration.run()
print("Converge at iteration " + str(value_iteration.iter))
print("Running time: ", str(value_iteration.time), 'seconds')

Converge at iteration 188
Running time:  0.09995698928833008 seconds


In [92]:
# Policy Iteration
policy_iteration = mdp.PolicyIteration(P, R, 0.95)
policy_iteration.run()
print("Converge at iteration " + str(policy_iteration.iter))
print("Running time: ", str(policy_iteration.time), 'seconds')

Converge at iteration 58
Running time:  1.1166589260101318 seconds


In [93]:
# Q-Learning
qlearner_iteration = mdp.QLearning(P, R, 0.95, n_iter=1000000)
qlearner_iteration.run()

print("Converge at iteration " + str(qlearner_iteration.max_iter))
print("Running time: ", str(qlearner_iteration.time), 'seconds')


Converge at iteration 1000000
Running time:  51.68143701553345 seconds


In [94]:
#Compare

print("Did Value Iteration and Policy Iteration return the same policy?:")
print(value_iteration.policy == policy_iteration.policy)
print("Did Value Iteration and Q-Learning return the same policy?:")
print(value_iteration.policy == qlearner_iteration.policy)
print("Did Policy Iteration and Q-Learning return the same policy?:")
print(policy_iteration.policy == qlearner_iteration.policy)

# Comparing random iteration, policy iteration, value iteration, and Q-Learner total rewards
print("Policy Iteration Reward: " , policy_iteration.V[len(policy_iteration.V)-1])
print("Value Iteration Total Reward: ", value_iteration.V[len(value_iteration.V)-1])
print("QLearner Total Reward: ", qlearner_iteration.V[len(qlearner_iteration.V)-1])


Did Value Iteration and Policy Iteration return the same policy?:
True
Did Value Iteration and Q-Learning return the same policy?:
False
Did Policy Iteration and Q-Learning return the same policy?:
False
Policy Iteration Reward:  198.9624295055931
Value Iteration Total Reward:  198.95275825170393
QLearner Total Reward:  0.8841299768219463


In [95]:
value_iteration.policy

(0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
