In [None]:
import gym
import hiive.mdptoolbox
import hiive.mdptoolbox.mdp
import hiive.mdptoolbox.example

from gym.envs.toy_text.frozen_lake import generate_random_map
import hiive_openAI_extract

# import hiive.mdptoolbox as mdptoolbox
from hiive.mdptoolbox.mdp import ValueIteration, PolicyIterationModified, QLearning

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time

# set seed
# Fix NumPy's global RNG so a Restart-and-Run-All of this notebook draws the
# same pseudo-random numbers each time.
# NOTE(review): presumably the QLearning runs further down sample from this
# global RNG -- confirm against the hiive.mdptoolbox implementation.
np.random.seed(0)

In [None]:
# Value Iteration: solve the forest-management MDP at several state-space
# sizes and record the solver statistics for each size.
problem_sizes = [10, 50, 100, 200, 400, 500, 750, 1000]

# One entry per problem size, in the order of `problem_sizes`.
time_list, iter_list, reward_list = [], [], []

for n_states in problem_sizes:
    transitions, rewards = hiive.mdptoolbox.example.forest(S=n_states)

    solver = ValueIteration(transitions, rewards, gamma=0.95, epsilon=0.001, max_iter=100000)
    solver.run()

    time_list.append(solver.time)          # run time reported by the solver
    iter_list.append(solver.iter)          # iteration count reported by the solver
    reward_list.append(np.mean(solver.V))  # mean of the solver's V vector

# Dump the three series in the same order as before: time, iterations, mean V.
for collected in (time_list, iter_list, reward_list):
    print(collected)

In [None]:
# Value Iteration: sweep the discount factor on three forest-management
# problems (10, 1000 and 625 states) and plot iterations, run time and mean V.
T_small, R_small = hiive.mdptoolbox.example.forest(10)
T_large, R_large = hiive.mdptoolbox.example.forest(1000)
T_middle, R_middle = hiive.mdptoolbox.example.forest(625)

# Discount factors 0.005, 0.015, ..., 0.995 -- also the x-axis of every plot.
gamma_arr = [(i + 0.5) / 100 for i in range(100)]

# Suffix 1 = 10 states, 2 = 1000 states, 3 = 625 states.
time_list1, time_list2, time_list3 = [], [], []
iter_list1, iter_list2, iter_list3 = [], [], []
reward_list1, reward_list2, reward_list3 = [], [], []

# (transitions, rewards, time sink, iteration sink, mean-V sink); the tuple
# order is the order in which the solvers are run for each discount factor.
vi_problems = (
    (T_small, R_small, time_list1, iter_list1, reward_list1),
    (T_large, R_large, time_list2, iter_list2, reward_list2),
    (T_middle, R_middle, time_list3, iter_list3, reward_list3),
)

for discount in gamma_arr:
    for transitions, rewards, times, iters, mean_values in vi_problems:
        solver = ValueIteration(transitions, rewards, discount, epsilon=0.01)
        solver.run()
        times.append(solver.time)
        iters.append(solver.iter)
        mean_values.append(np.mean(solver.V))

curve_labels = ('10 states', '1000 states', '625 states')

# (curves, y-axis label, title, output file stem) for each figure.
vi_figures = (
    ((iter_list1, iter_list2, iter_list3),
     'Iterations to Converge',
     'Value Iteration - Iterations to Converge',
     'Forest_Manage_vi_iters'),
    ((time_list1, time_list2, time_list3),
     'Clock time to Converge',
     'Value Iteration - Clock time to Converge',
     'Forest_Manage_vi_time'),
    ((reward_list1, reward_list2, reward_list3),
     'mean reward',
     'Value Iteration - mean rewards',
     'Forest_Manage_vi_reward'),
)

for curves, y_label, title, out_name in vi_figures:
    for curve, curve_label in zip(curves, curve_labels):
        plt.plot(gamma_arr, curve, label=curve_label)
    plt.xlabel('gamma (discount factor)')
    plt.ylabel(y_label)
    plt.title(title)
    plt.grid()
    plt.legend()
    plt.savefig(out_name, dpi=400)
    plt.show()

In [None]:
# Policy Iteration (PolicyIterationModified): solve the forest-management MDP
# at several state-space sizes and record the solver statistics for each size.
problem_sizes = [10, 50, 100, 200, 400, 500, 750, 1000]

# One entry per problem size, in the order of `problem_sizes`.
time_list, iter_list, reward_list = [], [], []

for n_states in problem_sizes:
    transitions, rewards = hiive.mdptoolbox.example.forest(S=n_states)

    solver = PolicyIterationModified(transitions, rewards, gamma=0.95, epsilon=0.001, max_iter=100000)
    solver.run()

    time_list.append(solver.time)          # run time reported by the solver
    iter_list.append(solver.iter)          # iteration count reported by the solver
    reward_list.append(np.mean(solver.V))  # mean of the solver's V vector

# Dump the three series in the same order as before: time, iterations, mean V.
for collected in (time_list, iter_list, reward_list):
    print(collected)

In [None]:
# Policy Iteration (PolicyIterationModified): sweep the discount factor on
# three forest-management problems (10, 1000 and 625 states) and plot
# iterations, run time and mean V.
T_small, R_small = hiive.mdptoolbox.example.forest(10)
T_large, R_large = hiive.mdptoolbox.example.forest(1000)
T_middle, R_middle = hiive.mdptoolbox.example.forest(625)

# Discount factors 0.005, 0.015, ..., 0.995 -- also the x-axis of every plot.
gamma_arr = [(i + 0.5) / 100 for i in range(100)]

# Suffix 1 = 10 states, 2 = 1000 states, 3 = 625 states.
time_list1, time_list2, time_list3 = [], [], []
iter_list1, iter_list2, iter_list3 = [], [], []
reward_list1, reward_list2, reward_list3 = [], [], []

# (transitions, rewards, time sink, iteration sink, mean-V sink); the tuple
# order is the order in which the solvers are run for each discount factor.
pi_problems = (
    (T_small, R_small, time_list1, iter_list1, reward_list1),
    (T_large, R_large, time_list2, iter_list2, reward_list2),
    (T_middle, R_middle, time_list3, iter_list3, reward_list3),
)

for discount in gamma_arr:
    for transitions, rewards, times, iters, mean_values in pi_problems:
        solver = PolicyIterationModified(transitions, rewards, discount, epsilon=0.01)
        solver.run()
        times.append(solver.time)
        iters.append(solver.iter)
        mean_values.append(np.mean(solver.V))

curve_labels = ('10 states', '1000 states', '625 states')

# (curves, y-axis label, title, output file stem) for each figure.
pi_figures = (
    ((iter_list1, iter_list2, iter_list3),
     'Iterations to Converge',
     'Policy Iteration - Iterations to Converge',
     'Forest_Manage_pi_iters'),
    ((time_list1, time_list2, time_list3),
     'Clock time to Converge',
     'Policy Iteration - Clock time to Converge',
     'Forest_Manage_pi_time'),
    ((reward_list1, reward_list2, reward_list3),
     'mean reward',
     'Policy Iteration - mean rewards',
     'Forest_Manage_pi_reward'),
)

for curves, y_label, title, out_name in pi_figures:
    for curve, curve_label in zip(curves, curve_labels):
        plt.plot(gamma_arr, curve, label=curve_label)
    plt.xlabel('gamma (discount factor)')
    plt.ylabel(y_label)
    plt.title(title)
    plt.grid()
    plt.legend()
    plt.savefig(out_name, dpi=400)
    plt.show()

In [None]:
# Q-learning: run the learner on the forest-management MDP at several
# state-space sizes and record the statistics of the final logged iteration.
problem_sizes = [10, 50, 100, 200, 400, 500, 750, 1000]

# One entry per problem size, in the order of `problem_sizes`.
time_list, iter_list, reward_list = [], [], []

for n_states in problem_sizes:
    transitions, rewards = hiive.mdptoolbox.example.forest(S=n_states)

    learner = QLearning(transitions, rewards, gamma=0.9, alpha=0.1, n_iter=100000)

    # run() returns a sequence of per-iteration stat dicts; keep only the last.
    final_stats = learner.run()[-1]
    time_list.append(final_stats["Time"])
    iter_list.append(final_stats["Iteration"])
    reward_list.append(final_stats["Mean V"])

# Dump the three series in the same order as before: time, iterations, mean V.
for collected in (time_list, iter_list, reward_list):
    print(collected)

In [None]:
# Q-learning: sweep the discount factor on three forest-management problems
# (10, 1000 and 625 states) and plot iterations, run time and mean V taken
# from the final logged iteration of each run.
T_small, R_small = hiive.mdptoolbox.example.forest(10)
T_large, R_large = hiive.mdptoolbox.example.forest(1000)
T_middle, R_middle = hiive.mdptoolbox.example.forest(625)

# Discount factors 0.005, 0.015, ..., 0.995 -- also the x-axis of every plot.
gamma_arr = [(i + 0.5) / 100 for i in range(100)]

# Suffix 1 = 10 states, 2 = 1000 states, 3 = 625 states.
time_list1, time_list2, time_list3 = [], [], []
iter_list1, iter_list2, iter_list3 = [], [], []
reward_list1, reward_list2, reward_list3 = [], [], []

# (transitions, rewards, time sink, iteration sink, mean-V sink). The learners
# are run in this order (small, large, middle) for every discount factor.
q_problems = (
    (T_small, R_small, time_list1, iter_list1, reward_list1),
    (T_large, R_large, time_list2, iter_list2, reward_list2),
    (T_middle, R_middle, time_list3, iter_list3, reward_list3),
)

for discount in gamma_arr:
    for transitions, rewards, times, iters, mean_values in q_problems:
        learner = QLearning(transitions, rewards, discount, n_iter=10000)
        # run() returns a sequence of per-iteration stat dicts; keep only the last.
        final_stats = learner.run()[-1]
        times.append(final_stats["Time"])
        iters.append(final_stats["Iteration"])
        mean_values.append(np.mean(final_stats["Mean V"]))

curve_labels = ('10 states', '1000 states', '625 states')

# (curves, y-axis label, title, output file stem) for each figure.
q_figures = (
    ((iter_list1, iter_list2, iter_list3),
     'Iterations to Converge',
     'Q Learning - Iterations to Converge',
     'Forest_Manage_q_gamma_iters'),
    ((time_list1, time_list2, time_list3),
     'Clock time to Converge',
     'Q Learning - Clock time to Converge',
     'Forest_Manage_q_gamma_time'),
    ((reward_list1, reward_list2, reward_list3),
     'mean reward',
     'Q Learning - mean rewards',
     'Forest_Manage_q_gamma_reward'),
)

for curves, y_label, title, out_name in q_figures:
    for curve, curve_label in zip(curves, curve_labels):
        plt.plot(gamma_arr, curve, label=curve_label)
    plt.xlabel('gamma (discount factor)')
    plt.ylabel(y_label)
    plt.title(title)
    plt.grid()
    plt.legend()
    plt.savefig(out_name, dpi=400)
    plt.show()

In [None]:
# Q-learning table: print the final "Time" and "Mean V" for every
# (alpha, alpha_decay) combination on each of the three forest problems.
alphas = [0.1, 0.3, 0.5, 0.7, 0.9]
alpha_decays = [0.999, 0.99, 0.95, 0.9]

# Uses the T_*/R_* forest MDPs defined in an earlier cell. The tuple order
# (10, 1000, 625 states) is the order in which the learners run and print.
forest_problems = (
    ("10 states: ", T_small, R_small),
    ("1000 states: ", T_large, R_large),
    ("625 states: ", T_middle, R_middle),
)

for alpha in alphas:
    for alpha_decay in alpha_decays:
        for tag, transitions, rewards in forest_problems:
            learner = QLearning(transitions, rewards, gamma=0.9, alpha=alpha, alpha_decay=alpha_decay, n_iter=10000)
            # run() returns a sequence of per-iteration stat dicts; keep only the last.
            final_stats = learner.run()[-1]
            print(tag, alpha, alpha_decay, final_stats["Time"], final_stats["Mean V"])

In [None]:
# Q-learning: sweep epsilon_decay on the 1000- and 625-state forest problems
# and collect the final "Time" and "Mean V" of each run for plotting.
epsilon_decays = [0.999, 0.99, 0.95, 0.9]

T_large, R_large = hiive.mdptoolbox.example.forest(1000)
T_middle, R_middle = hiive.mdptoolbox.example.forest(625)

# Flat result lists. For every decay value the 1000-state run is appended
# first and the 625-state run second, i.e.
# [d0/1000, d0/625, d1/1000, d1/625, ...] for d in epsilon_decays.
time_list, reward_list = [], []

# Only these two problems are swept here (the 10-state problem is not run).
sweep_problems = ((T_large, R_large), (T_middle, R_middle))

for epsilon_decay in epsilon_decays:
    for transitions, rewards in sweep_problems:
        learner = QLearning(transitions, rewards, gamma=0.9, epsilon_decay=epsilon_decay, n_iter=100000)
        # run() returns a sequence of per-iteration stat dicts; keep only the last.
        final_stats = learner.run()[-1]
        time_list.append(final_stats["Time"])
        reward_list.append(final_stats["Mean V"])



In [None]:
# Bar charts of the Q-learning epsilon-decay sweep: one figure for the final
# run time and one for the final mean V, with one bar per
# "<epsilon decay>/<number of states>" combination.
#
# The labels must follow the order in which the sweep cell appended its
# results to `time_list` / `reward_list`: for each decay value, the 1000-state
# run first and the 625-state run second.
schedules = ["0.999/1000", "0.999/625", "0.99/1000", "0.99/625", "0.95/1000","0.95/625",  "0.9/1000", "0.9/625"]

# Fail early with a readable message if the sweep results are stale or
# incomplete (e.g. the sweep cell was edited or not re-run).
assert len(time_list) == len(schedules) == len(reward_list), (
    "epsilon-decay sweep results do not match the bar labels: "
    f"{len(time_list)} times, {len(reward_list)} rewards, {len(schedules)} labels"
)

fig, ax = plt.subplots(figsize=(6, 4))
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so sns.barplot(schedules, time_list) raises a TypeError there.
sns.barplot(x=schedules, y=time_list, ax=ax)
ax.set_title('Q Learning - Clock time to Converge')
ax.set_xlabel('epsilon decay/# of states')
ax.set_ylabel('clock time')
# Rotate the existing category labels rather than re-specifying them.
ax.tick_params(axis='x', labelrotation=20)

fig.savefig('Forest_Manage_q_epsilon_time.png', dpi=400)

fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(x=schedules, y=reward_list, ax=ax)
ax.set_title('Q Learning - average rewards')
ax.set_xlabel('epsilon decay/# of states')
ax.set_ylabel('reward')
ax.tick_params(axis='x', labelrotation=20)

fig.savefig('Forest_Manage_q_epsilon_reward.png', dpi=400)