In [193]:


from hiive.mdptoolbox.mdp import ValueIteration, PolicyIteration, QLearning
from hiive.mdptoolbox.example import forest
# import hiive_mdptoolbox.example
# import hiive_mdptoolbox
import gym
import numpy as np
import sys
import os
from numpy.random import choice
import pandas as pd
import seaborn as sns
np.random.seed(44)

In [None]:
# Build the forest-management example MDP with S=20 states. forest() returns
# the transition array P and the reward array R that the solvers in this
# notebook consume.
# NOTE(review): r1/r2 are presumably the rewards for waiting/cutting in the
# oldest forest state and p the per-step wildfire probability -- confirm
# against the mdptoolbox `forest` documentation.
P, R = forest(S=20, r1=10, r2=6, p=0.1)

In [191]:
def running_mean(x, N):
    """Return the simple moving average of ``x`` over a window of ``N`` samples.

    Parameters
    ----------
    x : array-like
        Sequence of numeric samples.
    N : int
        Window length; must be at least 1.

    Returns
    -------
    numpy.ndarray
        Array of length ``len(x) - N + 1`` holding the mean of every full
        window. The result is empty when ``N`` is larger than ``len(x)``.

    Raises
    ------
    ValueError
        If ``N`` is smaller than 1. Without this check a negative window
        silently slices the prefix sums from the wrong end and returns
        meaningless numbers.
    """
    if N < 1:
        raise ValueError(
            "window size N must be a positive integer, got {}".format(N))
    # Prefixing a zero makes cumsum[k] the sum of the first k samples, so each
    # window sum is the difference of two prefix sums N positions apart.
    cumsum = np.cumsum(np.insert(x, 0, 0))
    return (cumsum[N:] - cumsum[:-N]) / float(N)

In [172]:
def test_policy(P, R, policy, test_count=1000, gamma=0.9):
    num_state = P.shape[-1]
    total_episode = num_state * test_count
    # start in each state
    total_reward = 0
    for state in range(num_state):
        state_reward = 0
        for state_episode in range(test_count):
            episode_reward = 
            
            disc_rate = 1
            while True:
                # take step
                action = policy[state]
                # get next step using P
                probs = P[action][state]
                
                candidates = list(range(len(P[action][state])))
                next_state =  choice(candidates, 1, p=probs)[0]
                # get the reward
                reward = R[state][action] * disc_rate
                episode_reward += reward
                # when go back to 0 ended
                disc_rate *= gamma
                if next_state == 0:
                    break
            state_reward += episode_reward
        total_reward += state_reward
    return total_reward / total_episode


In [214]:
def trainVI(P, R, discount=0.9, epsilon=(1e-9,)):
    """Run value iteration once per stopping tolerance and tabulate the results.

    Parameters
    ----------
    P, R
        Transition and reward arrays, passed unchanged to ``ValueIteration``
        and ``test_policy``.
    discount : float, optional
        Discount factor handed to ``ValueIteration`` as ``gamma``.
    epsilon : iterable of float, optional
        Stopping tolerances to try; one solver run is made per value.

    Returns
    -------
    pandas.DataFrame
        One row per tolerance with the columns "Epsilon", "Policy",
        "Iteration", "Time", "Reward" and "Value Function". "Reward" is the
        simulated score returned by ``test_policy`` for the solved policy.
        The frame is empty (but has these columns) when ``epsilon`` is empty.
    """
    columns = ["Epsilon", "Policy", "Iteration",
               "Time", "Reward", "Value Function"]
    # Collect plain records and build the frame once: this avoids enlarging
    # the DataFrame row by row and lets pandas infer column dtypes from the
    # complete data.
    records = []
    for eps in epsilon:
        vi = ValueIteration(P, R, gamma=discount, epsilon=eps, max_iter=int(1e15))
        vi.run()
        reward = test_policy(P, R, vi.policy)
        records.append({
            "Epsilon": float(eps),
            "Policy": vi.policy,
            "Iteration": vi.iter,
            "Time": vi.time,
            "Reward": reward,
            "Value Function": vi.V,
        })
    return pd.DataFrame(records, columns=columns)

In [182]:
# Sweep the value-iteration stopping tolerance from loose (1e-1) to very
# tight (1e-15) and display one result row per tolerance.
vi_df = trainVI(P, R, epsilon=[1e-1, 1e-3, 1e-6, 1e-9, 1e-12, 1e-15])
vi_df

Unnamed: 0,Epsilon,Policy,Iteration,Time,Reward,Value Function
0,0.1,"(0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...",31,0.002649,2.154275,"(4.294036989827981, 4.84732736460308, 4.847327..."
1,0.001,"(0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...",52,0.002633,2.1569,"(4.455368460528459, 5.007845020233536, 5.00784..."
2,1e-06,"(0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...",85,0.005243,2.142954,"(4.474527031364974, 5.027013228405199, 5.02701..."
3,1e-09,"(0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...",118,0.006456,2.152037,"(4.4751192370714845, 5.027605424908008, 5.0276..."
4,1e-12,"(0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...",151,0.007815,2.18503,"(4.475137537956969, 5.027623725802282, 5.02762..."
5,1e-15,"(0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...",186,0.010476,2.113817,"(4.4751381069387985, 5.027624294784101, 5.0276..."


In [185]:
# Solve the same MDP with policy iteration (discount 0.9) as a baseline.
pi = PolicyIteration(P, R, gamma=0.9, max_iter=1e6)
pi.run()
pi_pol = pi.policy
# Score the resulting policy with the same simulation used for the other solvers.
pi_reward = test_policy(P, R, pi_pol)
pi_iter = pi.iter
pi_time = pi.time
# Display (iterations, solver time, simulated reward).
pi_iter, pi_time, pi_reward

(12, 0.009848356246948242, 2.114163951479959)

In [170]:
# Show the per-state actions chosen by policy iteration.
pi_pol

(0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)

# Q-Learning

In [212]:
def trainQ(P, R, discount=0.9, alpha_dec=(0.99,), alpha_min=(0.001,),
           epsilon=(1.0,), epsilon_decay=(0.99,), n_iter=(1000000,)):
    """Grid-search Q-learning hyperparameters and tabulate every run.

    One ``QLearning`` run is made for each combination of the supplied
    values. Combinations are visited with ``n_iter`` varying slowest, then
    ``epsilon``, ``epsilon_decay``, ``alpha_dec`` and ``alpha_min`` fastest.
    The run number and simulated reward are printed after each run.

    Parameters
    ----------
    P, R
        Transition and reward arrays, passed unchanged to ``QLearning`` and
        ``test_policy``.
    discount : float, optional
        Discount factor, passed as the third positional argument of
        ``QLearning``.
    alpha_dec, alpha_min, epsilon, epsilon_decay, n_iter : iterable, optional
        Candidate values for the ``QLearning`` keyword arguments
        ``alpha_decay``, ``alpha_min``, ``epsilon``, ``epsilon_decay`` and
        ``n_iter`` respectively.

    Returns
    -------
    pandas.DataFrame
        One row per combination with the columns "Iterations", "Alpha Decay",
        "Alpha Min", "Epsilon", "Epsilon Decay", "Reward", "Time", "Policy",
        "Value Function" and "Training Rewards". "Training Rewards" holds the
        ``'Reward'`` entry of every item in the learner's ``run_stats``.
    """
    # Function-scope import keeps this cell self-contained.
    from itertools import product

    columns = ["Iterations", "Alpha Decay", "Alpha Min",
               "Epsilon", "Epsilon Decay", "Reward",
               "Time", "Policy", "Value Function",
               "Training Rewards"]

    # product() varies its last argument fastest, which reproduces the order
    # of the equivalent nested loops.
    grid = product(n_iter, epsilon, epsilon_decay, alpha_dec, alpha_min)

    # Collect plain records and build the frame once instead of enlarging the
    # DataFrame on every run.
    records = []
    for count, (i, eps, eps_dec, a_dec, a_min) in enumerate(grid, start=1):
        q = QLearning(P, R, discount, alpha_decay=a_dec,
                      alpha_min=a_min, epsilon=eps,
                      epsilon_decay=eps_dec, n_iter=i)
        q.run()
        reward = test_policy(P, R, q.policy)
        print("{}: {}".format(count, reward))
        records.append({
            "Iterations": i,
            "Alpha Decay": a_dec,
            "Alpha Min": a_min,
            "Epsilon": eps,
            "Epsilon Decay": eps_dec,
            "Reward": reward,
            "Time": q.time,
            "Policy": q.policy,
            "Value Function": q.V,
            "Training Rewards": [s['Reward'] for s in q.run_stats],
        })
    return pd.DataFrame(records, columns=columns)

In [213]:
# Hyperparameter grid for Q-learning: two candidates per setting, so
# 2 * 2 * 2 * 2 * 2 = 32 training runs in total.
alpha_decs = [0.99, 0.999]
alpha_mins =[0.001, 0.0001]
# NOTE(review): 10.0 is greater than 1; if QLearning treats epsilon as an
# exploration probability this value may be clipped or behave like 1.0 --
# confirm against the library before interpreting the Epsilon comparison.
eps = [10.0, 1.0]
eps_dec = [0.99, 0.999]
iters = [1000000, 10000000]
q_df = trainQ(P, R, discount=0.9, alpha_dec=alpha_decs, alpha_min=alpha_mins, 
            epsilon=eps, epsilon_decay=eps_dec, n_iter=iters)

1: 2.112962646405481
2: 2.114554436224601
3: 2.123520195448921
4: 2.0888112591507824
5: 2.1402737720966294
6: 2.1492408145860464
7: 2.1114409798980227
8: 2.125838636594577
9: 2.1491318650461873
10: 2.152381344969251
11: 2.1358721610948783
12: 2.1704188534760607
13: 2.188514979606224
14: 2.1345532096537214
15: 2.1360032677977454
16: 2.1521126974241773
17: 2.0895614320285256
18: 2.1320880900607415
19: 2.1330403481044544
20: 2.093046013843786
21: 2.154250116248784
22: 2.112107524198963
23: 2.1148869159356267
24: 2.1556510451276263
25: 2.1639811229072214
26: 2.105347086310907
27: 2.1296723046979444
28: 2.134965341850259
29: 2.0952651327605407
30: 2.1219864873474057
31: 2.1567201354383005
32: 2.1517767433819723


In [227]:
# Check, row by row, whether each value-iteration policy equals the
# policy-iteration policy.
# NOTE(review): this relies on pandas comparing the tuple `pi_pol` against
# each cell as a whole rather than element-wise -- confirm on the pandas
# version in use.
vi_df.Policy == pi_pol

0    True
1    True
2    True
3    True
4    True
5    True
Name: Policy, dtype: bool

In [223]:
# Re-simulate the policy stored in row 18 of the Q-learning results. The
# evaluation samples transitions randomly, so the value need not equal the
# "Reward" recorded for that row.
test_policy(P,R,q_df.Policy[18])

2.090846083918334

In [229]:
# Display the Q-learning grid-search results (one row per training run).
q_df

Unnamed: 0,Iterations,Alpha Decay,Alpha Min,Epsilon,Epsilon Decay,Reward,Time,Policy,Value Function,Training Rewards
0,1000000,0.99,0.001,10.0,0.99,2.112963,43.618149,"(0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, ...","(4.470668399870677, 5.027924595576489, 5.03155...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
1,1000000,0.99,0.0001,10.0,0.99,2.114554,43.67741,"(0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, ...","(4.438413582163707, 4.99108021253746, 4.003553...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 6.0, 0.0, ..."
2,1000000,0.999,0.001,10.0,0.99,2.12352,43.546746,"(0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, ...","(4.484660966806103, 5.035224086337198, 5.03276...","[1.0, 0.0, 1.0, 6.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,1000000,0.999,0.0001,10.0,0.99,2.088811,43.096997,"(0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, ...","(4.471314158210771, 5.025134476211464, 4.84949...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
4,1000000,0.99,0.001,10.0,0.999,2.140274,43.282926,"(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...","(4.47818983666401, 5.027582316682896, 5.029782...","[1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, ..."
5,1000000,0.99,0.0001,10.0,0.999,2.149241,43.344919,"(0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, ...","(4.440625624402229, 4.9897241468632005, 4.0076...","[1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, ..."
6,1000000,0.999,0.001,10.0,0.999,2.111441,43.098411,"(0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, ...","(4.4772292147294666, 5.026758552844908, 5.0280...","[0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, ..."
7,1000000,0.999,0.0001,10.0,0.999,2.125839,43.276635,"(0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, ...","(4.47239106431398, 5.025867822420857, 4.887505...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8,1000000,0.99,0.001,1.0,0.99,2.149132,43.35932,"(0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, ...","(4.470355434370849, 5.025243381496025, 5.02333...","[6.0, 0.0, 10.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0,..."
9,1000000,0.99,0.0001,1.0,0.99,2.152381,44.149231,"(0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, ...","(4.437975609124586, 4.990089729344698, 4.04938...","[0.0, 10.0, 0.0, 0.0, 10.0, 0.0, 0.0, 0.0, 0.0..."


In [238]:
# Average every column per training length ("Iterations").
# NOTE(review): q_df also holds non-numeric columns (Policy, Value Function,
# Training Rewards); depending on the pandas version, mean() may drop them or
# raise -- confirm, and consider selecting numeric columns explicitly.
q_df.groupby("Iterations").mean()

Unnamed: 0_level_0,Alpha Decay,Alpha Min,Epsilon,Epsilon Decay,Reward,Time
Iterations,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1000000,0.9945,0.00055,5.5,0.9945,2.136602,43.697768
10000000,0.9945,0.00055,5.5,0.9945,2.127772,508.858674


In [242]:
# Average every column per "Epsilon Decay" setting.
# NOTE(review): q_df also holds non-numeric columns (Policy, Value Function,
# Training Rewards); depending on the pandas version, mean() may drop them or
# raise -- confirm, and consider selecting numeric columns explicitly.
q_df.groupby("Epsilon Decay").mean()

Unnamed: 0_level_0,Alpha Decay,Alpha Min,Epsilon,Reward,Time
Epsilon Decay,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.99,0.9945,0.00055,5.5,2.126835,272.733754
0.999,0.9945,0.00055,5.5,2.137539,279.822688
