In [2]:
import numpy as np
import gym
import random

In [3]:
env = gym.make("FrozenLake-v0")

In [4]:
#Creating our QTable
action_size = env.action_space.n
state_size = env.observation_space.n

qtable = np.zeros((state_size , action_size))
print(qtable)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [7]:
#creating hyperparameters
total_episodes = 15000
learning_rate = 0.8
max_steps = 99 #max steps per episode
gamma = 0.95 #discounting rate

#exploration parameters
epsilon = 1.0 #exploration rate
max_epsilon = 1.0 #exploration rate at start
min_epsilon = 0.01 
decay_rate = 0.005 #exponentil decay rate for exploration probability

In [10]:
#q learning algorithm 
#initialize q values for all state action pairs
# until learning has stopped
# chose an action in the current state based on current q value estimate
#take the action and observe the outcome state and rewards
#update q value for state and action

rewards = [] #list of rewards

#until learning has stopped
for episode in range(total_episodes):
    
    #reset the env
    state = env.reset()
    step = 0
    done = False
    total_rewards = 0
    
    for step in range(max_steps):
        #chose an action a in the current world state 
        #first randomize a number
        exp_exp_tradeoff = random.uniform(0,1)
        
        #if number is greater than epsilon > exploitation
        if exp_exp_tradeoff > epsilon:
            action = np.argmax(qtable[state,:])
            
        #else explore
        else:
            action = env.action_space.sample()
            
        #take the action and observe outcome state and rewards
        new_state , reward , done , info = env.step(action)
        
        #update q table 
        #q = q + lr(r + gamma*max(q) - q)
        qtable[state , action] = qtable[state , action] + learning_rate *(reward + gamma * np.max(qtable[new_state,:]) - qtable[state ,action])
        
        total_rewards += reward
        
        state = new_state
        
        if done == True:
            break
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode) #reduce epsilon
    rewards.append(total_rewards)
    
print("score over time :" + str(sum(rewards)/total_episodes))
print(qtable)

score over time :0.48106666666666664
[[1.89867734e-02 6.50421802e-02 1.94733405e-02 1.86048061e-02]
 [5.81009577e-03 1.70086953e-03 2.74497716e-03 3.09590691e-02]
 [5.29291044e-02 7.63522683e-03 1.00363060e-02 1.05144313e-02]
 [3.10785982e-03 1.74947571e-04 7.85186247e-03 1.00383266e-02]
 [2.23534943e-01 7.26383247e-03 5.33282396e-03 1.32216147e-02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [9.12295436e-02 5.68435451e-04 2.00222224e-06 2.26137095e-11]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [9.59222051e-03 8.48643920e-03 1.62376584e-02 2.43913946e-01]
 [1.44430495e-02 4.98437493e-01 7.19697304e-04 1.08637237e-02]
 [3.20120198e-02 2.06665181e-02 2.33077212e-04 6.28396411e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [9.08001380e-02 7.93434162e-08 4.63822599e-01 2.06157131e-02]
 [1.96023258e-01 3.03014210e-01 1.92243888e-01 2.17540184e-01]
 [0.00000000e+00 0

In [22]:
env.reset()

for episode in range(5):
    state = env.reset()
    step = 0
    done = False
    print("****************************************************")
    print("EPISODE ", episode)

    for step in range(max_steps):
        
        # Take the action (index) that have the maximum expected future reward given that state
        action = np.argmax(qtable[state,:])
        
        new_state, reward, done, info = env.step(action)
        
        if done:
            # Here, we decide to only print the last state (to see if our agent is on the goal or fall into an hole)
            env.render()
            
            # We print the number of step it took.
            print("Number of steps", step)
            break
        state = new_state
env.close()

****************************************************
EPISODE  0
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
Number of steps 29
****************************************************
EPISODE  1
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 34
****************************************************
EPISODE  2
****************************************************
EPISODE  3
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 12
****************************************************
EPISODE  4
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 30
