In [None]:
import gym
import hiive.mdptoolbox
import hiive.mdptoolbox.mdp
import hiive.mdptoolbox.example

from gym.envs.toy_text.frozen_lake import generate_random_map
import hiive_openAI_extract

# import hiive.mdptoolbox as mdptoolbox
from hiive.mdptoolbox.mdp import ValueIteration, PolicyIterationModified, QLearning

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time

In [None]:
def initialize_problem(size):
    """Create a random FrozenLake environment and its dense MDP matrices.

    A ``size`` x ``size`` map is drawn with ``generate_random_map(size=size,
    p=0.8)``, the unwrapped ``FrozenLake-v1`` environment is built from it and
    the environment's ``P`` table is converted into arrays.

    Args:
        size: Side length of the square map.

    Returns:
        Tuple ``(T, R, env)``:
        ``T[a, s, s2]`` is the total probability of reaching ``s2`` from ``s``
        under action ``a``; ``R[a, s, s2]`` is the probability-weighted mean
        reward of the outcomes that make up that transition; ``env`` is the
        unwrapped environment.
    """
    random_map = generate_random_map(size=size, p=0.8)
    env = gym.make('FrozenLake-v1', desc=random_map).unwrapped

    # NOTE(review): this attribute is set on the unwrapped env; presumably no
    # wrapper reads it, so it may have no effect — confirm before relying on it.
    env.max_episode_steps=250

    # Create transition and reward matrices from OpenAI P matrix
    rows = env.nrow
    cols = env.ncol
    n_states = rows * cols
    T = np.zeros((4, n_states, n_states))
    R = np.zeros((4, n_states, n_states))
    # Accumulates probability * reward so duplicate outcomes can be averaged.
    weighted_reward = np.zeros((4, n_states, n_states))

    for square in env.P:
        for action in env.P[square]:
            for outcome in env.P[square][action]:
                # Each outcome starts with (probability, next state, reward).
                prob, new_state, reward = outcome[0], outcome[1], outcome[2]
                # Several outcomes may land on the same next state (e.g. when
                # moving into a wall); always accumulate, regardless of the
                # order in which the duplicates are listed.
                T[action][square][new_state] += prob
                weighted_reward[action][square][new_state] += prob * reward

    # Turn the accumulated probability * reward into a mean reward per
    # transition; entries with zero probability keep a reward of 0.
    reachable = T > 0
    R[reachable] = weighted_reward[reachable] / T[reachable]

    print(env.nrow, env.ncol)
#     plot_lake(env)

    return T, R, env

In [None]:
# Value Iteration: solve lakes of side 5, 10, ..., 30 with fixed solver settings
# and collect the solver's time, iteration count and mean state value.
time_list, iter_list, reward_list = [], [], []

for size in range(5, 35, 5):
    T, R, _ = initialize_problem(size)

    one_test = ValueIteration(T, R, gamma=0.98, epsilon=0.001, max_iter=100000)
    one_test.run()

    # One entry per problem size in each result list.
    for history, value in (
        (time_list, one_test.time),
        (iter_list, one_test.iter),
        (reward_list, np.mean(one_test.V)),
    ):
        history.append(value)

for summary in (time_list, iter_list, reward_list):
    print(summary)

In [None]:
# Value Iteration: discount-factor sweep on one 5x5 and one 30x30 lake.
# Lists suffixed with 1 belong to the small lake, lists suffixed with 2 to the large one.
T_small, R_small, _ = initialize_problem(5)
T_large, R_large, _ = initialize_problem(30)

time_list1, iter_list1, reward_list1 = [], [], []
time_list2, iter_list2, reward_list2 = [], [], []

for gamma in range(100):
    # `gamma` is an integer index; the discount passed to the solver is (gamma + 0.5) / 100.
    discount = (gamma + 0.5) / 100

    vi_small = ValueIteration(T_small, R_small, discount, epsilon=0.001)
    vi_small.run()
    time_list1.append(vi_small.time)
    iter_list1.append(vi_small.iter)
    reward_list1.append(np.mean(vi_small.V))

    # NOTE(review): the large problem uses epsilon=0.00001 while the small one
    # uses 0.001 — confirm the difference is intended.
    vi_large = ValueIteration(T_large, R_large, discount, epsilon=0.00001)
    vi_large.run()
    time_list2.append(vi_large.time)
    iter_list2.append(vi_large.iter)
    reward_list2.append(np.mean(vi_large.V))

# x axis shared by all three figures: the discount factors used in the loop above.
gamma_arr = [(i + 0.5) / 100 for i in range(100)]

# One figure per metric: (small-lake series, large-lake series, y label, title, output file).
for small_series, large_series, y_label, plot_title, out_file in (
    (iter_list1, iter_list2, 'Iterations to Converge',
     'Value Iteration - Iterations to Converge', 'Frozen_Lake_vi_iters'),
    (time_list1, time_list2, 'Clock time to Converge',
     'Value Iteration - Clock time to Converge', 'Frozen_Lake_vi_time'),
    (reward_list1, reward_list2, 'mean reward',
     'Value Iteration - mean rewards', 'Frozen_Lake_vi_reward'),
):
    plt.plot(gamma_arr, small_series, label='5*5 states')
    plt.plot(gamma_arr, large_series, label='30*30 states')
    plt.xlabel('gamma (discount factor)')
    plt.ylabel(y_label)
    plt.title(plot_title)
    plt.grid()
    plt.legend()
    plt.savefig(out_file, dpi=400)
    plt.show()

In [None]:
# Policy Iteration (modified): solve lakes of side 5, 10, ..., 30 with fixed solver
# settings and collect the solver's time, iteration count and mean state value.
time_list, iter_list, reward_list = [], [], []

for size in range(5, 35, 5):
    T, R, _ = initialize_problem(size)

    one_test = PolicyIterationModified(T, R, gamma=0.98, epsilon=0.001, max_iter=100000)
    one_test.run()

    # One entry per problem size in each result list.
    for history, value in (
        (time_list, one_test.time),
        (iter_list, one_test.iter),
        (reward_list, np.mean(one_test.V)),
    ):
        history.append(value)

for summary in (time_list, iter_list, reward_list):
    print(summary)

In [None]:
# Policy Iteration (modified): discount-factor sweep on one 5x5 and one 30x30 lake.
# Lists suffixed with 1 belong to the small lake, lists suffixed with 2 to the large one.
T_small, R_small, _ = initialize_problem(5)
T_large, R_large, _ = initialize_problem(30)

time_list1, iter_list1, reward_list1 = [], [], []
time_list2, iter_list2, reward_list2 = [], [], []

for gamma in range(100):
    # `gamma` is an integer index; the discount passed to the solver is (gamma + 0.5) / 100.
    discount = (gamma + 0.5) / 100

    # The `vi_` prefix is historical: these are PolicyIterationModified solvers.
    vi_small = PolicyIterationModified(T_small, R_small, discount, epsilon=0.01)
    vi_small.run()
    time_list1.append(vi_small.time)
    iter_list1.append(vi_small.iter)
    reward_list1.append(np.mean(vi_small.V))

    vi_large = PolicyIterationModified(T_large, R_large, discount, epsilon=0.01)
    vi_large.run()
    time_list2.append(vi_large.time)
    iter_list2.append(vi_large.iter)
    reward_list2.append(np.mean(vi_large.V))

# x axis shared by all three figures: the discount factors used in the loop above.
gamma_arr = [(i + 0.5) / 100 for i in range(100)]

# One figure per metric: (small-lake series, large-lake series, y label, title, output file).
for small_series, large_series, y_label, plot_title, out_file in (
    (iter_list1, iter_list2, 'Iterations to Converge',
     'Policy Iteration - Iterations to Converge', 'Frozen_Lake_pi_iters'),
    (time_list1, time_list2, 'Clock time to Converge',
     'Policy Iteration - Clock time to Converge', 'Frozen_Lake_pi_time'),
    (reward_list1, reward_list2, 'mean reward',
     'Policy Iteration - mean rewards', 'Frozen_Lake_pi_reward'),
):
    plt.plot(gamma_arr, small_series, label='5*5 states')
    plt.plot(gamma_arr, large_series, label='30*30 states')
    plt.xlabel('gamma (discount factor)')
    plt.ylabel(y_label)
    plt.title(plot_title)
    plt.grid()
    plt.legend()
    plt.savefig(out_file, dpi=400)
    plt.show()

In [None]:
# Q learning - change problem size


# time_list = []
# iter_list = []
# reward_list = []

# for size in range(5, 35, 5):
#     T, R, _ = initialize_problem(size)
    
#     one_test = QLearning(T, R, gamma=0.9, alpha=0.1, n_iter=100000)
    
#     runs = one_test.run()
#     print(runs[-1]['Time'])
#     time_list.append(runs[-1]['Time'])
#     iter_list.append(runs[-1]['Iteration'])
#     reward_list.append(runs[-1]['Mean V'])

# print(time_list)
# print(iter_list)
# print(reward_list)

# Tabular Q-learning on lakes of side 5, 10, ..., 30 with fixed alpha and gamma.
# Per size we record the mean episode reward, the mean episode length and the
# wall-clock time spent on all episodes.
reward_list, iter_list, time_array = [], [], []
alpha = 0.1
gamma = 0.98
episodes = 100000
# epsilon=1

for size in range(5, 35, 5):
    random_map = generate_random_map(size=size, p=0.8)
    env_large = gym.make('FrozenLake-v1', desc=random_map)
    env = env_large.unwrapped  # work on the environment underneath gym's wrappers

    rewards, iters = [], []

    start = time.time()
    Q = np.zeros((env.observation_space.n, env.action_space.n))

    for episode in range(episodes):
        if not episode % 5000:
            print(episode)  # coarse progress indicator
        state = env.reset()
        complete = False
        total_reward = 0
        max_steps = 1000000

        for i in range(max_steps):
            if complete:
                break
            current = state
            # Take the greedy action half of the time, a random one otherwise.
            greedy = np.random.rand() < (0.5)
            action = np.argmax(Q[current, :]) if greedy else env.action_space.sample()

            state, reward, complete, info = env.step(action)
            total_reward += reward
            # One-step update toward reward + discounted best value of the next state.
            td_target = reward + gamma * np.max(Q[state, :])
            Q[current, action] += alpha * (td_target - Q[current, action])
#         epsilon = max([1 - 0.005*(episode/100), 0.1])
        rewards.append(total_reward)
        # Loop index at exit; equals the step count when the episode terminated.
        iters.append(i)
    reward_list.append(np.mean(rewards))
    iter_list.append(np.mean(iters))
    end = time.time()
    time_array.append(end - start)

for summary in (time_array, iter_list, reward_list):
    print(summary)

In [None]:
# Q-learning: learning-rate (alpha) sweep on a single 5x5 lake.
# Per alpha we record the mean episode reward, the mean episode length and the
# wall-clock time spent on all episodes.
reward_list = []
iter_list = []
time_array = []
alphas = [i / 100 for i in range(5, 55, 5)]
gamma = 0.98
episodes = 20000
max_steps = 1000000  # per-episode step cap (loop-invariant, so set once)

# Draw the map once: every alpha must be evaluated on the same lake, otherwise
# differences between maps are mixed into the comparison between alphas.
random_map = generate_random_map(size=5, p=0.8)

for alpha in alphas:
    # Fresh environment and Q-table for each alpha, always on the same map.
    env_large = gym.make('FrozenLake-v1', desc=random_map)
    env = env_large.unwrapped

    rewards = []
    iters = []

    start = time.time()
    Q = np.zeros((env.observation_space.n, env.action_space.n))

    for episode in range(episodes):
        if episode % 5000 == 0:
            print(episode)  # coarse progress indicator
        state = env.reset()
        complete = False
        total_reward = 0

        for i in range(max_steps):
            if complete:
                break
            current = state
            # Take the greedy action half of the time, a random one otherwise.
            if np.random.rand() < (0.5):
                action = np.argmax(Q[current, :])
            else:
                action = env.action_space.sample()

            state, reward, complete, info = env.step(action)
            total_reward += reward
            # One-step update toward reward + discounted best value of the next state.
            Q[current, action] += alpha * (reward + gamma * np.max(Q[state, :]) - Q[current, action])
        rewards.append(total_reward)
        # Loop index at exit; equals the step count when the episode terminated.
        iters.append(i)
    reward_list.append(np.mean(rewards))
    iter_list.append(np.mean(iters))
    end = time.time()
    time_array.append(end - start)

print(time_array)
print(iter_list)
print(reward_list)

In [None]:
# Figures for the alpha sweep: one per metric, all against the same alpha grid.
alphas = [i / 100 for i in range(5, 55, 5)]

from matplotlib.pyplot import MultipleLocator

# (series, y label, title, output file) for each figure.
for series, y_label, plot_title, out_file in (
    (iter_list, 'Iterations to Converge',
     'Q Learning - Iterations to Converge', 'Frozen_Lake_q_alpha_iter.png'),
    (time_array, 'Clock time to Converge',
     'Q Learning - Clock time to Converge', 'Frozen_Lake_q_alpha_time.png'),
    (reward_list, 'mean reward',
     'Q Learning - average rewards', 'Frozen_Lake_q_alpha_reward.png'),
):
    plt.plot(alphas, series, '-o', color='g', label='5*5 states')
    plt.xlabel('alpha (learning rate)')
    plt.ylabel(y_label)
    # Major x ticks every 0.05, matching the spacing of the alpha grid.
    x_major_locator = MultipleLocator(0.05)
    ax = plt.gca()
    ax.xaxis.set_major_locator(x_major_locator)
    plt.title(plot_title)
    plt.grid()
    plt.legend()
    plt.savefig(out_file, dpi=400)
    plt.show()


In [None]:
# Q-learning: discount-factor (gamma) sweep on a single 5x5 lake.
# Per gamma we record the mean episode reward, the mean episode length and the
# wall-clock time spent on all episodes.
reward_list = []
iter_list = []
time_array = []
alpha = 0.4
gammas = [i / 100 for i in range(2, 100, 2)]
episodes = 20000
max_steps = 1000000  # per-episode step cap (loop-invariant, so set once)

# Draw the map once: every gamma must be evaluated on the same lake, otherwise
# differences between maps are mixed into the comparison between gammas.
random_map = generate_random_map(size=5, p=0.8)

for gamma in gammas:
    # Fresh environment and Q-table for each gamma, always on the same map.
    env_large = gym.make('FrozenLake-v1', desc=random_map)
    env = env_large.unwrapped

    rewards = []
    iters = []

    start = time.time()
    Q = np.zeros((env.observation_space.n, env.action_space.n))

    for episode in range(episodes):
        if episode % 5000 == 0:
            print(episode)  # coarse progress indicator
        state = env.reset()
        complete = False
        total_reward = 0

        for i in range(max_steps):
            if complete:
                break
            current = state
            # Take the greedy action half of the time, a random one otherwise.
            if np.random.rand() < (0.5):
                action = np.argmax(Q[current, :])
            else:
                action = env.action_space.sample()

            state, reward, complete, info = env.step(action)
            total_reward += reward
            # One-step update toward reward + discounted best value of the next state.
            Q[current, action] += alpha * (reward + gamma * np.max(Q[state, :]) - Q[current, action])
        rewards.append(total_reward)
        # Loop index at exit; equals the step count when the episode terminated.
        iters.append(i)
    reward_list.append(np.mean(rewards))
    iter_list.append(np.mean(iters))
    end = time.time()
    time_array.append(end - start)

print(time_array)
print(iter_list)
print(reward_list)



In [None]:
# Figures for the gamma sweep: one per metric, all against the same gamma grid.

# Mean episode length per gamma. The title now names the plotted quantity
# (it previously repeated the clock-time title).
plt.plot(gammas, iter_list,color='sandybrown', label='5*5 states')
plt.xlabel('gamma (discount factor)')
plt.ylabel('Iterations to Converge')
plt.title('Q Learning - Iterations to Converge')
plt.grid()
plt.legend()
plt.savefig('Frozen_Lake_q_gamma_iter.png', dpi=400)
plt.show()

# Wall-clock time per gamma.
plt.plot(gammas, time_array,color='sandybrown',  label='5*5 states')
plt.xlabel('gamma (discount factor)')
plt.ylabel('Clock time to Converge')
plt.title('Q Learning - Clock time to Converge')
plt.grid()
plt.legend()
plt.savefig('Frozen_Lake_q_gamma_time.png', dpi=400)
plt.show()

# Mean episode reward per gamma.
plt.plot(gammas, reward_list,color='sandybrown',  label='5*5 states')
plt.xlabel('gamma (discount factor)')
plt.ylabel('mean reward')
plt.title('Q Learning - average rewards')
plt.grid()
plt.legend()
plt.savefig('Frozen_Lake_q_gamma_reward.png', dpi=400)
plt.show()

In [None]:
# Q-learning: compare four exploration (epsilon) schedules on a single 5x5 lake.
#   strategy 0: geometric decay   0.95 ** (episode / 100), floored at 0.1
#   strategy 1: arithmetic decay  1 - 0.005 * (episode / 100), floored at 0.1
#   strategy 2: exponential decay exp(-0.005 * episode), floored at 0.1
#   strategy 3: constant 0.5
# Per strategy we record the mean episode reward, the mean episode length and
# the wall-clock time spent on all episodes.
reward_list = []
iter_list = []
time_array = []
alpha = 0.25
gamma = 0.8
episodes = 20000
max_steps = 1000000  # per-episode step cap (loop-invariant, so set once)

# Draw the map once: every schedule must be evaluated on the same lake,
# otherwise differences between maps are mixed into the comparison.
random_map = generate_random_map(size=5, p=0.8)

for strategy in range(4):
    # Fresh environment and Q-table for each schedule, always on the same map.
    env_large = gym.make('FrozenLake-v1', desc=random_map)
    env = env_large.unwrapped

    rewards = []
    iters = []

    start = time.time()
    Q = np.zeros((env.observation_space.n, env.action_space.n))

    for episode in range(episodes):
        if episode % 5000 == 0:
            print(episode)  # coarse progress indicator
#             print(Q)

        # Compute this episode's exploration rate up front, so that no value
        # leaks over from the previous episode or the previous strategy.
        if strategy == 0:
            epsilon = max([0.95**(episode/100), 0.1])
        elif strategy == 1:
            epsilon = max([1 - 0.005*(episode/100), 0.1])
        elif strategy == 2:
            epsilon = max([np.exp(-0.005*episode), 0.1])
        else:
            epsilon = 0.5

        state = env.reset()
        complete = False
        total_reward = 0

        for i in range(max_steps):
            if complete:
                break
            current = state
            # Epsilon-greedy: explore with probability epsilon, otherwise take
            # the greedy action. With the decaying schedules this starts out
            # exploratory and becomes mostly greedy.
            if np.random.rand() < (epsilon):
                action = env.action_space.sample()
            else:
                action = np.argmax(Q[current, :])

            state, reward, complete, info = env.step(action)

            total_reward += reward
            # One-step update toward reward + discounted best value of the next state.
            Q[current, action] += alpha * (reward + gamma * np.max(Q[state, :]) - Q[current, action])

        rewards.append(total_reward)
        # Loop index at exit; equals the step count when the episode terminated.
        iters.append(i)
    reward_list.append(np.mean(rewards))
    iter_list.append(np.mean(iters))
    end = time.time()
    time_array.append(end - start)

print(time_array)
print(iter_list)
print(reward_list)

In [None]:
# Bar charts for the epsilon-schedule comparison, one figure per metric.
# Labels are listed in the same order as the strategies 0..3 of the experiment.
schedules = ["GeomDecay", "ArithDecay", "ExpDecay", "constant = 0.5"]

# x and y are passed by keyword: recent seaborn releases accept only `data`
# positionally, and keywords work on older releases too.
fig = plt.figure(figsize=(6,4))
ax  = sns.barplot(x=schedules, y=time_array)
ax.set_title('Q Learning - Clock time to Converge')
ax.set_xlabel('epsilon decay schedule')
ax.set_ylabel('clock time')

plt.savefig('Frozen_Lake_q_epsilon_time.png', dpi=400)

fig = plt.figure(figsize=(6,4))
ax  = sns.barplot(x=schedules, y=iter_list)
ax.set_title('Q Learning - Iterations to Converge')
ax.set_xlabel('epsilon decay schedule')
ax.set_ylabel('iterations')

plt.savefig('Frozen_Lake_q_epsilon_iter.png', dpi=400)

fig = plt.figure(figsize=(6,4))
ax  = sns.barplot(x=schedules, y=reward_list)
ax.set_title('Q Learning - average rewards')
ax.set_xlabel('epsilon decay schedule')
ax.set_ylabel('reward')

plt.savefig('Frozen_Lake_q_epsilon_reward.png', dpi=400)