In [None]:
'''
    File name: CIFAR10-WideResNet.py
    Author: Yue Liang
    Date last modified: 4/15/2021
    Python Version: 3.8
    TensorFlow 2.4
'''

<h2>Register non-slippery FrozenLake8x8</h2>

In [1]:
import numpy as np 
import gym
import random
from gym.envs.registration import register

# Discard any spec already stored under this id before registering it again
# (presumably so that re-running this cell does not clash with the earlier
# registration -- confirm against the installed gym version).
gym.envs.registry.env_specs.pop('FrozenLake8x8NotSlippery-v0', None)

# 8x8 FrozenLake with slipping disabled, capped at 100 steps per episode.
register(
    id='FrozenLake8x8NotSlippery-v0',
    entry_point='gym.envs.toy_text:FrozenLakeEnv',
    max_episode_steps=100,
    reward_threshold=0.8196,
    kwargs=dict(map_name='8x8', is_slippery=False),
)

<h2>FrozenLake8x8NotSlippery-v0</h2>

In [2]:
# Instantiate the environment registered earlier under the id
# 'FrozenLake8x8NotSlippery-v0' (8x8 map, is_slippery=False).
env = gym.make("FrozenLake8x8NotSlippery-v0")

In [3]:
# Training configuration for the run on the non-slippery environment.
n_episodes = 500_000          # number of episodes to play

# Exploration schedule: the training loop decays current_epsilon
# exponentially from max_epsilon toward min_epsilon at rate decay_rate.
max_epsilon = 1.0
min_epsilon = 0.001
decay_rate = 1e-4
current_epsilon = 1.0

# Total reward of every finished episode, in play order.
Reward_list = list()

In [4]:
# Tabular value estimation from complete episodes with an epsilon-greedy policy.
#   Q      -- running mean of the episode score observed after taking each
#             (state, action) pair
#   New_Q  -- number of times each (state, action) pair has been updated; its
#             reciprocal is the step size that keeps Q an incremental mean
Q = np.zeros([env.observation_space.n, env.action_space.n])
New_Q = np.zeros([env.observation_space.n, env.action_space.n])

for i_episodes in range(n_episodes):
    state = env.reset()
    y_list = []   # (state, action) pairs taken during this episode, in order
    score = 0.0   # total (undiscounted) reward collected in this episode
    done = False

    # Play one full episode: explore with probability current_epsilon,
    # otherwise act greedily with respect to the current Q table.
    while not done:
        if np.random.rand() < current_epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(Q[state, :])
        new_state, reward, done, _ = env.step(action)
        y_list.append((state, action))
        score += reward
        state = new_state

    Reward_list.append(score)

    # Move every visited pair toward this episode's total score. A pair that
    # occurred several times in the episode is updated once per occurrence.
    for (state, action) in y_list:
        New_Q[state, action] += 1.0
        learning_rate = 1.0 / New_Q[state, action]
        Q[state, action] += learning_rate * (score - Q[state, action])

    if i_episodes % 10000 == 0 and i_episodes != 0:
        print(str(i_episodes) + "/" +str(n_episodes))
        # Reward_list already contains the current episode (i_episodes + 1
        # entries), so divide by its length rather than by i_episodes.
        print("Current score: " + str(sum(Reward_list) / len(Reward_list)))

    # Exponentially anneal exploration from max_epsilon toward min_epsilon.
    current_epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*i_episodes)

print()
# Average over the rewards actually recorded, so the figure stays a true mean
# even if Reward_list was not reset before this cell was run.
print("Final score: " + str(sum(Reward_list) / len(Reward_list)))

print()
print("Done!")

env.close()

10000/500000
Current score: 0.3546
20000/500000
Current score: 0.59045
30000/500000
Current score: 0.6734
40000/500000
Current score: 0.750825
50000/500000
Current score: 0.7994
60000/500000
Current score: 0.8323833333333334
70000/500000
Current score: 0.8561857142857143
80000/500000
Current score: 0.8740625
90000/500000
Current score: 0.8879555555555556
100000/500000
Current score: 0.8991
110000/500000
Current score: 0.9082272727272728
120000/500000
Current score: 0.9158083333333333
130000/500000
Current score: 0.9222538461538462
140000/500000
Current score: 0.9277642857142857
150000/500000
Current score: 0.9325333333333333
160000/500000
Current score: 0.936725
170000/500000
Current score: 0.9404117647058824
180000/500000
Current score: 0.94365
190000/500000
Current score: 0.9466
200000/500000
Current score: 0.94925
210000/500000
Current score: 0.9516476190476191
220000/500000
Current score: 0.9538181818181818
230000/500000
Current score: 0.9558130434782609
240000/500000
Current score

<h2>FrozenLake8x8-v0</h2>

In [5]:
# Rebind `env` to the "FrozenLake8x8-v0" environment for the second experiment
# (the previous environment was closed at the end of the first training cell).
# NOTE(review): presumably this is gym's stock, slippery 8x8 map -- confirm
# against the installed gym version.
env = gym.make("FrozenLake8x8-v0")

In [6]:
# Reset the training configuration for the run on FrozenLake8x8-v0.
n_episodes = 500_000          # number of episodes to play

# Exploration schedule: the training loop decays current_epsilon
# exponentially from max_epsilon toward min_epsilon at rate decay_rate.
max_epsilon = 1.0
min_epsilon = 0.001
decay_rate = 1e-4
current_epsilon = 1.0

# Fresh, empty log of per-episode total rewards for this run.
Reward_list = list()

In [7]:
# Tabular value estimation from complete episodes with an epsilon-greedy policy.
#   Q      -- running mean of the episode score observed after taking each
#             (state, action) pair
#   New_Q  -- number of times each (state, action) pair has been updated; its
#             reciprocal is the step size that keeps Q an incremental mean
Q = np.zeros([env.observation_space.n, env.action_space.n])
New_Q = np.zeros([env.observation_space.n, env.action_space.n])

for i_episodes in range(n_episodes):
    state = env.reset()
    y_list = []   # (state, action) pairs taken during this episode, in order
    score = 0.0   # total (undiscounted) reward collected in this episode
    done = False

    # Play one full episode: explore with probability current_epsilon,
    # otherwise act greedily with respect to the current Q table.
    while not done:
        if np.random.rand() < current_epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(Q[state, :])
        new_state, reward, done, _ = env.step(action)
        y_list.append((state, action))
        score += reward
        state = new_state

    Reward_list.append(score)

    # Move every visited pair toward this episode's total score. A pair that
    # occurred several times in the episode is updated once per occurrence.
    for (state, action) in y_list:
        New_Q[state, action] += 1.0
        learning_rate = 1.0 / New_Q[state, action]
        Q[state, action] += learning_rate * (score - Q[state, action])

    if i_episodes % 10000 == 0 and i_episodes != 0:
        print(str(i_episodes) + "/" +str(n_episodes))
        # Reward_list already contains the current episode (i_episodes + 1
        # entries), so divide by its length rather than by i_episodes.
        print("Current score: " + str(sum(Reward_list) / len(Reward_list)))

    # Exponentially anneal exploration from max_epsilon toward min_epsilon.
    current_epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*i_episodes)

print()
# Average over the rewards actually recorded, so the figure stays a true mean
# even if Reward_list was not reset before this cell was run.
print("Final score: " + str(sum(Reward_list) / len(Reward_list)))

print()
print("Done!")

env.close()

10000/500000
Current score: 0.0113
20000/500000
Current score: 0.05875
30000/500000
Current score: 0.11123333333333334
40000/500000
Current score: 0.15395
50000/500000
Current score: 0.18468
60000/500000
Current score: 0.20781666666666668
70000/500000
Current score: 0.22582857142857143
80000/500000
Current score: 0.23795
90000/500000
Current score: 0.2486
100000/500000
Current score: 0.25654
110000/500000
Current score: 0.26280909090909094
120000/500000
Current score: 0.26798333333333335
130000/500000
Current score: 0.27226153846153844
140000/500000
Current score: 0.27612857142857145
150000/500000
Current score: 0.27894
160000/500000
Current score: 0.28164375
170000/500000
Current score: 0.2841
180000/500000
Current score: 0.28634444444444446
190000/500000
Current score: 0.2886684210526316
200000/500000
Current score: 0.29027
210000/500000
Current score: 0.29222857142857145
220000/500000
Current score: 0.2938181818181818
230000/500000
Current score: 0.2951478260869565
240000/500000
Cur