In [1]:
import numpy as np
import gym
import random
import time
from IPython.display import clear_output

In [2]:
env = gym.make("FrozenLake-v0")
action_space_size = env.action_space.n
state_space_size = env.observation_space.n
q_table = np.zeros((state_space_size, action_space_size))
print(q_table)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [3]:
num_episodes = 10000
max_step_per_episode = 100

learning_rate = 0.1
discount_rate = 0.99

exploration_rate = 1
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.001

In [5]:
rewards_from_all_episodes = []
for episode in range(num_episodes):
    state = env.reset()
    
    done = False
    reward_from_current_episode = 0
    
    for step in range(max_step_per_episode):
        
        exploration_rate_threshold = random.uniform(0,1)
        if exploration_rate_threshold > exploration_rate:
            action = np.argmax(q_table[state,:])
        else:
            action = env.action_space.sample()
        
        new_state, reward, done, info = env.step(action)
        
        # update q table for (s,a)
        q_table[state,action] = ((1 - learning_rate) * (q_table[state,action])) + \
                                (learning_rate * (reward + (discount_rate * np.argmax(q_table[new_state,:]))))
        
        state = new_state
        reward_from_current_episode += reward
        
        if done == True:
            break
        
    exploration_rate = min_exploration_rate + \
        ((max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode))
    rewards_from_all_episodes.append(reward_from_current_episode)

In [9]:
rewards_per_thousand_episodes = np.split(np.array(rewards_from_all_episodes), num_episodes/1000)
count = 1000
print("Average rewards per thousand episodes: ")
for r in rewards_per_thousand_episodes:
    print(count,':',str(sum(r/1000)))
    count +=1000
print("\n\n Q-table")
print(q_table)

Average rewards per thousand episodes: 
1000 : 0.02100000000000001
2000 : 0.024000000000000014
3000 : 0.022000000000000013
4000 : 0.013000000000000005
5000 : 0.008
6000 : 0.002
7000 : 0.0
8000 : 0.001
9000 : 0.001
10000 : 0.001


 Q-table
[[2.70945851 2.70327649 2.61528386 2.8809    ]
 [1.9623458  1.97580197 2.13260037 2.15043288]
 [2.97       2.772      2.97       2.97      ]
 [2.38574602 2.36028646 1.83598884 2.25841436]
 [0.77792605 1.07510345 2.1421188  1.40674774]
 [0.         0.         0.         0.        ]
 [0.26198731 0.16943712 0.27541937 0.8310129 ]
 [0.         0.         0.         0.        ]
 [0.41808866 0.32982702 0.4994441  1.78860975]
 [1.03503057 0.33110325 0.33294147 0.29917828]
 [1.92327027 0.54065354 0.69115055 0.42363592]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.46884541 0.58859655 1.3391358  0.55078488]
 [1.01203609 1.30271828 0.65349584 1.07300662]
 [0.         0.         0.         0.        ]]
