In [1]:
import numpy as np
import gym
import torch
import random
import time

In [2]:
# Create the FrozenLake environment and reset it to obtain the starting state.
env = gym.make("FrozenLake-v0")
state = env.reset()
# Display the state returned by reset().
state

0

In [3]:
# Training schedule.
num_episodes = 100000            # number of episodes the training loop runs
max_steps_per_episode = 200      # upper bound on steps taken inside one episode

# Q-learning update parameters.
learning_rate = 0.1              # weight of the new estimate in each Q-table update
learnring_rate = learning_rate   # misspelled original name, kept because other cells reference it
discount_rate = 0.9              # factor applied to the best next-state Q-value

# Exploration schedule: the rate starts at the maximum and is decayed
# exponentially toward the minimum after every episode by the training loop.
exploration_rate = 1
min_exploration_rate = 0.001
max_exploration_rate = 1
exploration_decay_rate = 0.003

In [4]:
# Size of the environment's discrete observation space (used as the number of Q-table rows).
env.observation_space.n

16

In [5]:
# The Q-table holds one value per (state, action) pair; every entry starts at zero.
state_space_size, action_space_size = env.observation_space.n, env.action_space.n
q_table = np.zeros(shape=(state_space_size, action_space_size))
q_table

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [6]:
# Tabular Q-learning: play every episode, choosing actions epsilon-greedily and
# updating q_table after each step. The summed reward of each episode is logged
# in rewards_all_episodes.
rewards_all_episodes = []
for episode in range(num_episodes):
    state = env.reset()
    done = False
    rewards = 0

    for step in range(max_steps_per_episode):
        # One uniform draw per step decides the mode: at or below the current
        # exploration rate a random action is sampled, otherwise the action
        # with the highest Q-value for this state is taken.
        exploration_random = random.uniform(0, 1)
        if exploration_random <= exploration_rate:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_table[state, :])

        # Apply the action to the environment.
        new_state, reward, done, info = env.step(action)

        # Blend the previous estimate with the one-step target
        # (immediate reward plus discounted best value of the next state).
        best_next_value = np.max(q_table[new_state, :])
        td_target = reward + discount_rate * best_next_value
        q_table[state, action] = (1 - learnring_rate) * q_table[state, action] + \
                                 learnring_rate * td_target

        rewards += reward
        state = new_state

        if done:
            break

    rewards_all_episodes.append(rewards)

    # Decay the exploration rate exponentially toward its minimum,
    # based on how many episodes have been played.
    decay_factor = np.exp(-exploration_decay_rate * episode)
    exploration_rate = min_exploration_rate + \
        (max_exploration_rate - min_exploration_rate) * decay_factor

In [7]:
# Display the Q-table as it stands after the training cell.
q_table

array([[0.0385139 , 0.03763396, 0.0723246 , 0.03272634],
       [0.0272031 , 0.02678068, 0.0320454 , 0.05338485],
       [0.03070431, 0.03088893, 0.06904674, 0.03359133],
       [0.02195296, 0.02766177, 0.02294736, 0.04642389],
       [0.10168758, 0.04761009, 0.03640014, 0.03762939],
       [0.        , 0.        , 0.        , 0.        ],
       [0.1199263 , 0.02599748, 0.03186477, 0.01675023],
       [0.        , 0.        , 0.        , 0.        ],
       [0.07241244, 0.06979899, 0.06458645, 0.16018495],
       [0.13821211, 0.22892139, 0.11356702, 0.1361748 ],
       [0.23063017, 0.14192529, 0.10070992, 0.09547503],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.12835819, 0.20719701, 0.45923121, 0.15362899],
       [0.37311096, 0.74142224, 0.41417691, 0.37851984],
       [0.        , 0.        , 0.        , 0.        ]])

In [8]:
# What-if probe: the value the decay schedule produces at a chosen episode.
# The result is stored under its own name so that this inspection cell does not
# overwrite the live `exploration_rate` that the training loop reads and updates.
probe_episode = 30000
exploration_rate_at_probe = min_exploration_rate + \
    (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * probe_episode)
exploration_rate_at_probe

0.001

In [9]:
# Show the last ten entries of the per-episode reward log.
rewards_all_episodes[-10:]

[0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0]

In [10]:
# Calculate and print the average reward per thousand episodes.
episodes_per_chunk = 1000

# np.split expects an integer number of sections, so use floor division
# instead of true division (which would hand it a float).
rewards_per_thousand_episodes = np.split(np.array(rewards_all_episodes),
                                         num_episodes // episodes_per_chunk)
count = episodes_per_chunk

print("********Average reward per thousand episodes********\n")
for r in rewards_per_thousand_episodes:
    # ndarray.mean divides once by the chunk's real length; summing r/1000
    # element by element accumulated rounding error in the printed value.
    print(count, ": ", str(r.mean()))
    count += episodes_per_chunk

********Average reward per thousand episodes********

1000 :  0.07600000000000005
2000 :  0.33100000000000024
3000 :  0.44300000000000034
4000 :  0.6030000000000004
5000 :  0.5620000000000004
6000 :  0.5440000000000004
7000 :  0.4960000000000004
8000 :  0.43900000000000033
9000 :  0.5080000000000003
10000 :  0.5340000000000004
11000 :  0.5370000000000004
12000 :  0.6200000000000004
13000 :  0.6220000000000004
14000 :  0.5850000000000004
15000 :  0.5290000000000004
16000 :  0.5000000000000003
17000 :  0.47700000000000037
18000 :  0.4130000000000003
19000 :  0.5180000000000003
20000 :  0.5110000000000003
21000 :  0.5430000000000004
22000 :  0.5210000000000004
23000 :  0.5320000000000004
24000 :  0.5820000000000004
25000 :  0.5960000000000004
26000 :  0.5280000000000004
27000 :  0.6710000000000005
28000 :  0.6380000000000005
29000 :  0.6740000000000005
30000 :  0.5630000000000004
31000 :  0.6940000000000005
32000 :  0.6720000000000005
33000 :  0.7320000000000005
34000 :  0.641000000000000

## Visualization

In [11]:
from IPython import display

In [12]:
# Reset the environment, draw the board, and display the state returned by reset().
state = env.reset()
env.render()
state


[41mS[0mFFF
FHFH
FFFH
HFFG


0

In [13]:
# Q-values stored for the current state, paired with the index of the largest one
# (the action a greedy policy would pick).
q_table[state, :], np.argmax(q_table[state, :] )

(array([0.0385139 , 0.03763396, 0.0723246 , 0.03272634]), 2)

In [14]:
# Play one episode using only the learned Q-table (no exploration) and
# redraw the board after every move so the run can be watched in the notebook.
state = env.reset()
for frame_idx in range(500):
    # Always take the highest-valued action for the current state.
    action = np.argmax(q_table[state])
    new_state, reward, done, info = env.step(action)
    state = new_state

    # Replace the previous output with the new board, then pause briefly.
    display.clear_output(wait=True)
    env.render()
    time.sleep(0.2)

    if done:
        break

  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
