In [1]:
from hiive.mdptoolbox.mdp import ValueIteration, PolicyIteration, QLearning
from hiive.mdptoolbox.example import forest
# import hiive_mdptoolbox.example
# import hiive_mdptoolbox
import gym
import numpy as np
import sys
import os
from numpy.random import choice
import pandas as pd
import seaborn as sns
np.random.seed(123)

# 500 states

In [None]:
P, R = forest(S=500, r1=50, r2= 25, p=0.01)

In [34]:
def policy_learner(P, R, policy, test_count=1000, gamma=0.9):
    num_state = P.shape[-1]
    total_episode = num_state * test_count
    # start in each state
    total_reward = 0
    for state in range(num_state):
        state_reward = 0
        for state_episode in range(test_count):
            episode_reward = 0
            disc_rate = 1
            while True:
                # take step
                action = policy[state]
                # get next step using P
                probs = P[action][state]
                candidates = list(range(len(P[action][state])))
                next_state =  choice(candidates, 1, p=probs)[0]
                # get the reward
                reward = R[state][action] * disc_rate
                episode_reward += reward
                # when go back to 0 ended
                disc_rate *= gamma
                if next_state == 0:
                    break
            state_reward += episode_reward
        total_reward += state_reward
    return total_reward / total_episode


In [23]:
def trainVI(P, R, discount=0.9, epsilon=[1e-9]):
    vi_df = pd.DataFrame(columns=["Epsilon", "Policy", "Iteration", 
                                  "Time", "Reward", "Value Function"])
    for eps in epsilon:
        vi = ValueIteration(P, R, gamma=discount, epsilon=eps, max_iter=int(1e15))
        vi.run()
        reward = test_policy(P, R, vi.policy)
        info = [float(eps), vi.policy, vi.iter, vi.time, reward, vi.V]
        df_length = len(vi_df)
        vi_df.loc[df_length] = info
    return vi_df

In [24]:
vi_df = trainVI(P, R, epsilon=[1e-1, 1e-3, 1e-6, 1e-9, 1e-12, 1e-15])
vi_df

Unnamed: 0,Epsilon,Policy,Iteration,Time,Reward,Value Function
0,0.1,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",73,0.036338,1.614272,"(4.709462601222712, 5.238399310397504, 5.23839..."
1,0.001,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",113,0.030641,1.609777,"(4.711758543634446, 5.2405804202105974, 5.2405..."
2,1e-06,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",173,0.049859,1.578855,"(4.711792641354771, 5.240613372254971, 5.24061..."
3,1e-09,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",233,0.064575,1.592677,"(4.711792702164914, 5.240613431938631, 5.24061..."
4,1e-12,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",293,0.082414,1.609879,"(4.711792702273736, 5.240613432046343, 5.24061..."
5,1e-15,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",346,0.098416,1.601837,"(4.7117927022739305, 5.240613432046538, 5.2406..."


In [25]:
pi = PolicyIteration(P, R, gamma=0.9, max_iter=1e6)
pi.run()
pi_pol = pi.policy
pi_reward = policy_learner(P,R, pi_pol)
pi_iter = pi.iter
pi_time = pi.time
pi_iter, pi_time, pi_reward

(40, 0.4065992832183838, 1.6023174626840928)

# Q-Learning

In [37]:
def Q_learner(P, R, discount=0.9, alpha_dec=[0.99], alpha_min=[0.001], 
            epsilon=[1.0], epsilon_decay=[0.99], n_iter=[1000000]):
    q_table = pd.DataFrame(columns=["Iterations", "Alpha Decay", "Alpha Min", 
                                 "Epsilon", "Epsilon Decay", "Reward",
                                 "Time", "Policy", "Value Function",
                                 "Training Rewards"])
    
    count = 0
    for i in n_iter:
        for eps in epsilon:
            for eps_dec in epsilon_decay:
                for a_dec in alpha_dec:
                    for a_min in alpha_min:
                        q = QLearning(P, R, discount, alpha_decay=a_dec, 
                                      alpha_min=a_min, epsilon=eps, 
                                      epsilon_decay=eps_dec, n_iter=i)
                        q.run()
                        reward = test_policy(P, R, q.policy)
                        count += 1
                        print("{}: {}".format(count, reward))
                        st = q.run_stats
                        rews = [s['Reward'] for s in st]
                        info = [i, a_dec, a_min, eps, eps_dec, reward, 
                                q.time, q.policy, q.V, rews]
                        
                        df_length = len(q_table)
                        q_table.loc[df_length] = info
    return q_table

In [9]:
alpha_decs = [0.99, 0.999]
alpha_mins =[0.001, 0.0001]
eps = [10.0, 1.0]
eps_dec = [0.99, 0.999]
iters = [1000000, 10000000]
q_table = Q_learner(P,R, discount=0.9, alpha_dec=alpha_decs, alpha_min=alpha_mins, 
            epsilon=eps, epsilon_decay=eps_dec, n_iter=iters)

1: 2.650183487415402
2: 2.6363744404711285
3: 2.6072561523039504
4: 2.6117818590299353
5: 2.57748663663716
6: 2.67399605909375
7: 2.6278609803234336
8: 2.626661209650027
9: 2.5297012242759016
10: 2.655280375155995
11: 2.6601539567558454
12: 2.608396759527677
13: 2.6790831093841865
14: 2.6850597811143957
15: 0.822
16: 2.676218333466832
17: 2.7135200868055835
18: 2.8117761382427333
19: 2.764109280376738
20: 2.765967761281483
21: 2.7403159646717272
22: 2.861893456347953
23: 2.656581016247709
24: 2.8327739465709403
25: 2.812239418250017
26: 2.7756577123354447
27: 2.7755963806781256
28: 2.7871103415832237
29: 2.753035768577412
30: 2.850375830314547
31: 2.7556116413006757
32: 2.8514372672801582


In [10]:
q_table

Unnamed: 0,Iterations,Alpha Decay,Alpha Min,Epsilon,Epsilon Decay,Reward,Time,Policy,Value Function,Training Rewards
0,1000000,0.99,0.001,10.0,0.99,2.650183,42.294331,"(0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, ...","(4.712210801507589, 5.241277084192105, 5.24121...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1000000,0.99,0.0001,10.0,0.99,2.636374,42.111959,"(0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, ...","(4.672188906786895, 5.200874840421224, 4.37178...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,1000000,0.999,0.001,10.0,0.99,2.607256,42.427873,"(0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, ...","(4.711828804944402, 5.241218072985039, 5.24151...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
3,1000000,0.999,0.0001,10.0,0.99,2.611782,42.333762,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, ...","(4.710461366899375, 5.239581376346029, 5.09524...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
4,1000000,0.99,0.001,10.0,0.999,2.577487,43.569858,"(0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, ...","(4.71057876795351, 5.240113535858624, 5.240027...","[0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, ..."
5,1000000,0.99,0.0001,10.0,0.999,2.673996,41.963869,"(0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, ...","(4.666537393022336, 5.1952190427092955, 4.3410...","[0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
6,1000000,0.999,0.001,10.0,0.999,2.627861,41.814253,"(0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","(4.713478207004994, 5.242419702750562, 5.24130...","[1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, ..."
7,1000000,0.999,0.0001,10.0,0.999,2.626661,41.804873,"(0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...","(4.708203589152052, 5.237402752385588, 5.13867...","[1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
8,1000000,0.99,0.001,1.0,0.99,2.529701,41.911337,"(0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, ...","(4.711667962492642, 5.240921762076372, 5.24098...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
9,1000000,0.99,0.0001,1.0,0.99,2.65528,41.723724,"(0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, ...","(4.67205487152553, 5.200652330776339, 4.374608...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [12]:
q_table.groupby("Iterations").mean()

Unnamed: 0_level_0,Alpha Decay,Alpha Min,Epsilon,Epsilon Decay,Reward,Time
Iterations,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1000000,0.9945,0.00055,5.5,0.9945,2.520468,42.218418
10000000,0.9945,0.00055,5.5,0.9945,2.78175,404.961591


In [13]:
q_table.groupby("Epsilon Decay").mean()

Unnamed: 0_level_0,Alpha Decay,Alpha Min,Epsilon,Reward,Time
Epsilon Decay,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.99,0.9945,0.00055,5.5,2.697819,221.398994
0.999,0.9945,0.00055,5.5,2.604399,225.781016


In [14]:
q_table.groupby("Alpha Decay").mean()

Unnamed: 0_level_0,Alpha Min,Epsilon,Epsilon Decay,Reward,Time
Alpha Decay,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.99,0.00055,5.5,0.9945,2.712874,221.422083
0.999,0.00055,5.5,0.9945,2.589345,225.757927


In [15]:
q_table.groupby("Epsilon Decay").mean()


Unnamed: 0_level_0,Alpha Decay,Alpha Min,Epsilon,Reward,Time
Epsilon Decay,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.99,0.9945,0.00055,5.5,2.697819,221.398994
0.999,0.9945,0.00055,5.5,2.604399,225.781016


In [16]:
q_table.groupby("Alpha Min").mean()

Unnamed: 0_level_0,Alpha Decay,Epsilon,Epsilon Decay,Reward,Time
Alpha Min,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0001,0.9945,5.5,0.9945,2.731923,225.472906
0.001,0.9945,5.5,0.9945,2.570296,221.707104


In [28]:
# 40 states

In [30]:
P,R = forest(S=40, r1=10, r2=6, p=0.1)

In [31]:
vi_df = trainVI(P,R, epsilon=[1e-1, 1e-3, 1e-6, 1e-9, 1e-12, 1e-15])
vi_df

Unnamed: 0,Epsilon,Policy,Iteration,Time,Reward,Value Function
0,0.1,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",33,0.004705,1.920902,"(4.328504830081768, 4.881518644971712, 4.88151..."
1,0.001,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",55,0.002758,1.900663,"(4.460720290173723, 5.013211594807497, 5.01321..."
2,1e-06,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",87,0.00398,1.970462,"(4.474643139169861, 5.027129333047953, 5.02712..."
3,1e-09,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",120,0.006043,1.974922,"(4.475122825121185, 5.027609012960728, 5.02760..."
4,1e-12,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",153,0.006605,2.053608,"(4.475137648839068, 5.027623836684378, 5.02762..."
5,1e-15,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",186,0.00809,1.936243,"(4.4751381069387985, 5.027624294784101, 5.0276..."


In [32]:
# policy

In [35]:
pi = PolicyIteration(P,R, gamma=0.9, max_iter=1e6)
pi.run()
pi_pol = pi.policy
pi_reward = policy_learner(P,R, pi_pol)
pi_iter = pi.iter
pi_time = pi.time
pi_iter, pi_time, pi_reward

(14, 0.01045083999633789, 1.9370201347417595)

In [None]:
# q learning 

In [None]:
alpha_decs = [0.99, 0.999]
alpha_mins =[0.001, 0.0001]
eps = [10.0, 1.0]
eps_dec = [0.99, 0.999]
iters = [1000000, 10000000]
q_table = Q_learner(P,R, discount=0.9, alpha_dec=alpha_decs, alpha_min=alpha_mins, 
            epsilon=eps, epsilon_decay=eps_dec, n_iter=iters)

1: 2.079650937934126
2: 2.0096533197381206
3: 1.8813499986619064
4: 1.8390122917902803
5: 2.123874004115888
6: 2.265791961107357
7: 1.9863075524688611
8: 1.8337099752882664
9: 2.0073867631027236
10: 2.024790047996396
11: 2.0536206928926495
12: 1.6421856567419173
13: 2.204214714996393
14: 2.210414773817981
15: 1.8703992204473283
16: 1.7458958741776633
17: 2.019786559177178
18: 0.95
19: 1.756991658996568
20: 1.8201709790148806
21: 2.0650051657135973
22: 2.2664940347038733
23: 1.9893329651317972


In [None]:
q_table

In [None]:
q_table.groupby("Epsilon Decay").mean()