In [None]:
import time

import gym
import hiive.mdptoolbox
import hiive.mdptoolbox.mdp
import hiive.mdptoolbox.example

from gym.envs.toy_text.frozen_lake import generate_random_map
import hiive_openAI_extract

# import hiive.mdptoolbox as mdptoolbox
from hiive.mdptoolbox.mdp import ValueIteration, PolicyIterationModified, QLearning

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time

# set seed
np.random.seed(0)

In [None]:
# Face colour used by plot_lake for each tile; the keys are the byte
# strings found in env.desc (S, F, H, G).
colors = dict([
    (b'S', 'b'),
    (b'F', 'w'),
    (b'H', 'k'),
    (b'G', 'g'),
])

# Arrow glyph drawn for each action index (0..3) when a policy is plotted.
directions = dict(enumerate(['←', '↓', '→', '↑']))

def plot_lake(env, policy=None, title="Frozen Lake", flag=True):
    """Draw the lake grid of ``env`` and save it as ``<title>.png``.

    Every tile of ``env.desc`` is painted with the face colour looked up in
    the module-level ``colors`` mapping. When ``policy`` is supplied, the
    glyph ``directions[policy[i, j]]`` is written in the centre of tile
    ``(i, j)``.

    Parameters
    ----------
    env
        Object exposing ``nrow``, ``ncol`` and a 2-D ``desc`` indexable as
        ``desc[row, col]``.
    policy
        Optional 2-D array of action indices, indexed ``[row, col]``.
    title
        Figure title; also the stem of the PNG file that is written.
    flag
        Unused. Kept only so existing calls that pass it keep working.
    """
    # Rows and columns are tracked separately so non-square maps are drawn
    # correctly (the previous version used nrow for both axes).
    n_rows, n_cols = env.nrow, env.ncol

    fig = plt.figure(figsize=(6, 6))
    ax = fig.add_subplot(
        111,
        xlim=(-.01, n_cols + 0.01),
        ylim=(-.01, n_rows + 0.01),
    )
    plt.title(title, fontsize=16, weight='bold', y=1.01)

    for i in range(n_rows):
        # Row 0 of the map is drawn at the top of the axes.
        y = n_rows - i - 1
        for j in range(n_cols):
            x = j
            p = plt.Rectangle([x, y], 1, 1, linewidth=1, edgecolor='k')
            p.set_facecolor(colors[env.desc[i, j]])
            ax.add_patch(p)

            if policy is not None:
                ax.text(x + 0.5, y + 0.5, directions[policy[i, j]],
                        horizontalalignment='center', size=25,
                        verticalalignment='center', color='k')

    plt.savefig(title + '.png', dpi=400)

In [None]:
# code based on:
# https://medium.com/analytics-vidhya/solving-the-frozenlake-environment-from-openai-gym-using-value-iteration-5a078dffe438
def get_score(env, policy, printInfo=False, episodes=1000):
    """Roll out ``policy`` in ``env`` and summarise how often it succeeds.

    An episode counts as a success when it terminates with ``reward == 1``;
    any other termination counts as a miss.

    Parameters
    ----------
    env
        Environment whose ``reset()`` returns an observation and whose
        ``step(action)`` returns ``(observation, reward, done, info)``.
    policy
        Sequence indexed by observation that yields the action to take.
    printInfo
        When true, print the average step count and the failure percentage.
    episodes
        Number of episodes to simulate.

    Returns
    -------
    tuple
        ``(ave_steps, std_steps, pct_success)``. The step statistics cover
        successful episodes only and are NaN when no episode succeeded.
    """
    misses = 0
    successes = 0
    steps_list = []
    for episode in range(episodes):
        observation = env.reset()
        steps = 0
        while True:
            action = policy[observation]
            observation, reward, done, _ = env.step(action)
            steps += 1
            if not done:
                continue
            if reward == 1:
                steps_list.append(steps)
                successes += 1
            else:
                # Any terminal step without the success reward is a miss; the
                # old `reward == 0` test could leave this loop spinning forever.
                misses += 1
            break

    if steps_list:
        ave_steps = np.mean(steps_list)
        std_steps = np.std(steps_list)
    else:
        # Same NaN result np.mean([]) would give, without the RuntimeWarning.
        ave_steps = std_steps = np.nan

    # pct_fail was commented out although the report below formats it, which
    # raised NameError whenever printInfo was true.
    pct_fail = (misses / episodes) * 100
    pct_success = (successes / episodes) * 100

    if printInfo:
        print('----------------------------------------------')
        print('You took an average of {:.0f} steps to get the frisbee'.format(ave_steps))
        print('And you fell in the hole {:.2f} % of the times'.format(pct_fail))
        print('----------------------------------------------')

    return ave_steps, std_steps, pct_success

In [None]:
# Initialise the small problem - just one time.
random_map = generate_random_map(size=5, p=0.8)
env = gym.make('FrozenLake-v1', desc=random_map).unwrapped

# NOTE(review): nothing visible in this notebook reads this attribute, so it
# does not appear to cap episode length on the unwrapped env - confirm.
env.max_episode_steps=250

# Create transition and reward matrices from the OpenAI P matrix.
# Both arrays are indexed [action, state, next_state].
rows = env.nrow
cols = env.ncol
T = np.zeros((4, rows*cols, rows*cols))
R = np.zeros((4, rows*cols, rows*cols))

for square in env.P:
    for action in env.P[square]:
        for transition in env.P[square][action]:
            prob, new_state, reward = transition[:3]
            # Several entries may lead to the same next state, so their
            # probabilities are accumulated. The earlier version tracked this
            # with an `old_state` sentinel that leaked across (state, action)
            # pairs and only merged consecutive duplicates.
            T[action, square, new_state] += prob
            # The reward is stored, not summed. Assumes entries sharing a
            # next state carry the same reward - TODO confirm for other envs.
            R[action, square, new_state] = reward

print(env.nrow, env.ncol)
plot_lake(env)

In [None]:
# step_list = []
# stepstd_list = []
# success_list = []
# for gamma in range(50, 100, 5):
#     vi_small = ValueIteration(T, R, (gamma + 0.5) / 100, epsilon=0.001)
#     vi_small.run()
#     print(vi_small.policy)
#     steps, steps_stddev, successs = get_score(env, vi_small.policy)
#     step_list.append(steps)
#     stepstd_list.append(steps_stddev)
#     success_list.append(successs)

# # sns.set(style="whitegrid")
# gamma_arr = [i / 100 for i in range(50, 100, 5)]   

# fig = plt.figure(figsize=(10,4))
# ax  = sns.barplot(gamma_arr,step_list) 
# ax.set_title('Average Steps when selecting different gamma')
# ax.set_xlabel('Gamma')
# ax.set_ylabel('Average Steps')

# title='VI_averageSteps_vs_gamma'
# plt.savefig(title + '.png', dpi=400)

# fig = plt.figure(figsize=(10,4))
# ax  = sns.barplot(gamma_arr,success_list) 
# ax.set_title('Success rate when selecting different gamma (percentage %)')
# ax.set_xlabel('Gamma')
# ax.set_ylabel('Success rate/%')

# title='VI_successRate_vs_gamma'
# plt.savefig(title + '.png', dpi=400)

In [None]:
# Solve the small lake with value iteration, then draw the resulting policy.
vi_small = ValueIteration(T, R, 0.95, epsilon=0.001)
vi_small.run()

rows, cols = env.nrow, env.ncol

# The solver's policy is turned into an array and folded into the
# (rows, cols) layout that plot_lake indexes by [row, col].
best_policy = np.array(list(vi_small.policy)).reshape(rows, cols)

# Plot the policy.
title = 'Frozen Lake VI Optimal Policy'
plot_lake(env, best_policy, title=title)

In [None]:
# Solve the small lake with modified policy iteration, then draw the policy.
pi_small = PolicyIterationModified(T, R, 0.95, epsilon=0.001)
pi_small.run()

flat_policy = np.array(list(pi_small.policy))
best_policy = flat_policy
print(best_policy)

rows, cols = env.nrow, env.ncol

# Fold the flat policy into the (rows, cols) layout plot_lake indexes by
# [row, col].
best_policy = flat_policy.reshape(rows, cols)

# Plot the policy.
title = 'Frozen Lake PI Optimal Policy'
plot_lake(env, best_policy, title=title)


In [None]:
# Tabular Q-learning on `env` with a decaying epsilon-greedy behaviour policy.
alpha = 0.25         # learning rate
gamma = 0.8          # discount factor
episodes = 100000
epsilon = 1          # probability of taking a random (exploratory) action
max_steps = 1000000  # upper bound on steps within one episode

Q = np.zeros((env.observation_space.n, env.action_space.n))

for episode in range(episodes):
    # Periodic progress dump of the current table.
    if episode%5000 == 0:
        print(episode)
        print(Q)
    state = env.reset()
    complete = False
    total_reward = 0

    for i in range(max_steps):
        if complete:
            break
        current = state
        # Explore with probability epsilon, otherwise act greedily. The two
        # branches were previously swapped, so the agent began fully greedy on
        # an all-zero table and ended up acting randomly 90% of the time.
        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(Q[current, :])

        state, reward, complete, info = env.step(action)

        total_reward += reward
        # One-step Q-learning update towards reward + gamma * max_a' Q(s', a').
        Q[current, action] += alpha * (reward + gamma * np.max(Q[state, :]) - Q[current, action])
    # Linear decay from 1 down to a floor of 0.1.
    epsilon = max([1 - 0.005*(episode/100), 0.1])
print(Q)

In [None]:
# Greedy policy read off the learned table: the best action in every state.
rows, cols = env.nrow, env.ncol
greedy_actions = np.argmax(Q, axis=1)

# Fold the per-state actions into the (rows, cols) layout plot_lake indexes
# by [row, col].
best_policy = greedy_actions.reshape(-1).reshape(rows, cols)

# Plot the policy.
title = 'Frozen Lake Q Learning Optimal Policy'
plot_lake(env, best_policy, title=title)

In [None]:
# Comparison - best policy: time, step, success.
# Each flat policy (VI, PI, then greedy Q-learning) is rolled out with
# get_score; the mean step count, success percentage and elapsed wall-clock
# seconds are printed, one line per policy.
for flat_policy in (
    np.array(list(vi_small.policy)),
    np.array(list(pi_small.policy)),
    np.reshape(np.argmax(Q, axis=1), -1),
):
    start = time.time()
    steps, steps_stddev, successs = get_score(env, flat_policy)
    end = time.time()
    print(steps, successs, end - start)

In [None]:
# Test the training time of each solver (wall-clock seconds are printed).

# Value iteration.
start = time.time()
vi_small = ValueIteration(T, R, 0.95, epsilon=0.001)
vi_small.run()
end = time.time()
print(end-start)

# Modified policy iteration.
start = time.time()
pi_small = PolicyIterationModified(T, R, 0.95, epsilon=0.001)
pi_small.run()
end = time.time()
print(end-start)


# Q-learning, stopped early once the table stops changing between episodes.
start = time.time()

alpha = 0.25         # learning rate
gamma = 0.8          # discount factor
episodes = 100000
epsilon = 1          # probability of taking a random (exploratory) action
max_steps = 1000000  # upper bound on steps within one episode

Q = np.zeros((env.observation_space.n, env.action_space.n))

for episode in range(episodes):
    state = env.reset()
    complete = False
    total_reward = 0

    # Snapshot of the table before this episode. It must be a copy:
    # `prev = Q` only aliased the array, so `prev - Q` was always zero and
    # the convergence test below fired as soon as episode > 20000.
    prev = Q.copy()
    for i in range(max_steps):
        if complete:
            break
        current = state
        # Explore with probability epsilon, otherwise act greedily (the two
        # branches were previously swapped).
        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(Q[current, :])

        state, reward, complete, info = env.step(action)

        total_reward += reward
        # One-step Q-learning update towards reward + gamma * max_a' Q(s', a').
        Q[current, action] += alpha * (reward + gamma * np.max(Q[state, :]) - Q[current, action])
    # Linear decay from 1 down to a floor of 0.1.
    epsilon = max([1 - 0.005*(episode/100), 0.1])
    # Mean absolute change, so positive and negative updates cannot cancel.
    if episode > 20000 and np.mean(np.abs(prev - Q)) < 0.001:
        print(prev)
        print(Q)
        break

end = time.time()
print(end-start)

In [None]:
# Initialise the bigger problem.
random_map = generate_random_map(size=30, p=0.8)
env2 = gym.make('FrozenLake-v1', desc=random_map).unwrapped

# NOTE(review): nothing visible in this notebook reads this attribute, so it
# does not appear to cap episode length on the unwrapped env - confirm.
env2.max_episode_steps=250

# Create transition and reward matrices from the OpenAI P matrix.
# Both arrays are indexed [action, state, next_state]. This rebinds T, R,
# rows and cols, which previously described the small lake.
rows = env2.nrow
cols = env2.ncol
T = np.zeros((4, rows*cols, rows*cols))
R = np.zeros((4, rows*cols, rows*cols))

for square in env2.P:
    for action in env2.P[square]:
        for transition in env2.P[square][action]:
            prob, new_state, reward = transition[:3]
            # Several entries may lead to the same next state, so their
            # probabilities are accumulated. The earlier version tracked this
            # with an `old_state` sentinel that leaked across (state, action)
            # pairs and only merged consecutive duplicates.
            T[action, square, new_state] += prob
            # The reward is stored, not summed. Assumes entries sharing a
            # next state carry the same reward - TODO confirm for other envs.
            R[action, square, new_state] = reward

print(env2.nrow, env2.ncol)
plot_lake(env2)