<a href="https://colab.research.google.com/github/thinkdeepai/reinforcement-learning-training/blob/master/Sarsa_Frozen_Lake.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import gym
import random
import numpy as np
import time, pickle, os

In [0]:
# Hyperparameters for the tabular agent.
total_episodes = 10000  # number of training episodes; evaluate() reuses this count
max_steps = 100         # upper bound on the steps taken inside one training episode
epsilon = 0.9           # probability that choose_action() samples a random action
lr_rate = 0.81          # weight given to the new estimate in every Q update
gamma = 0.96            # discount applied to the next state's Q value

# Create the environment and reset it once.
env = gym.make('FrozenLake-v0')
env.reset()


In [0]:
def choose_action(state):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    One uniform draw from [0, 1) is compared with the global ``epsilon``:
    below it, the action comes from ``env.action_space.sample()``; otherwise
    the index of the largest entry in row ``state`` of the global ``Q`` table
    is returned.
    """
    explore = np.random.uniform(0, 1) < epsilon
    if explore:
        return env.action_space.sample()
    return np.argmax(Q[state, :])

def qlearning_learn(state, state2, reward, action):
    """Apply one Q-learning update to the global ``Q`` table in place.

    The target is ``reward`` plus ``gamma`` times the largest value in row
    ``state2``; ``Q[state, action]`` becomes a blend of its previous value and
    that target, weighted by ``lr_rate``.
    """
    previous = Q[state, action]
    best_next = np.max(Q[state2, :])
    target = reward + gamma * best_next
    Q[state, action] = (1 - lr_rate) * previous + lr_rate * target

def sarsa_learn(state, state2, reward, action, action2):
    """Apply one SARSA update to the global ``Q`` table in place.

    The target is ``reward`` plus ``gamma`` times ``Q[state2, action2]``, i.e.
    the value of the action that was actually selected for the next state.
    ``Q[state, action]`` becomes a blend of its previous value and that
    target, weighted by ``lr_rate``.
    """
    previous = Q[state, action]
    next_value = Q[state2, action2]
    target = reward + gamma * next_value
    Q[state, action] = (1 - lr_rate) * previous + lr_rate * target

In [5]:
# Q table with one row per state and one column per action, initialised to zero.
Q = np.zeros((env.observation_space.n, env.action_space.n))

# SARSA training: the action for the next state is chosen *before* the update,
# and that same action is the one executed on the following step.
for episode in range(total_episodes):
    state = env.reset()
    action = choose_action(state)
    for step in range(max_steps):
        # Call env.render() here to watch the episode unfold.
        state2, reward, done, info = env.step(action)
        action2 = choose_action(state2)
        sarsa_learn(state, state2, reward, action, action2)
        state, action = state2, action2
        if done:
            break

print(Q)




[[2.14396734e-02 7.90446913e-03 8.17581270e-03 4.15357927e-03]
 [6.10721562e-03 1.11365444e-03 4.03015349e-04 3.12293168e-03]
 [8.10069842e-02 2.24281956e-02 2.19682785e-02 1.40468927e-02]
 [2.53869383e-03 2.95102559e-03 1.70069433e-03 6.37731120e-02]
 [8.32873064e-03 5.32504781e-03 5.87363913e-03 4.28842820e-05]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [7.54651080e-04 5.64190874e-03 1.19221210e-02 3.89182040e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [2.36717009e-03 7.67174446e-03 1.69325350e-03 3.70731510e-02]
 [4.35303650e-02 2.60168958e-01 1.93773413e-01 2.23026894e-02]
 [1.59210878e-01 3.96009740e-01 5.43769186e-03 4.62334665e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.16505191e-01 5.80190462e-01 1.43500993e-01 5.84963170e-02]
 [8.23366646e-01 9.80314868e-01 2.81277096e-02 5.27584490e-01]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.000000

In [0]:
"""Evaluate agent's performance after Q-learning"""
def evaluate(rnd=False, episodes=None):
    """Run evaluation episodes on the global ``env`` and print average statistics.

    Each episode starts with ``env.reset()`` and is stepped until ``env.step``
    reports ``done``. No learning update is performed here.

    Args:
        rnd: If True, every action is drawn with ``env.action_space.sample()``.
            Otherwise the action with the largest value in row ``state`` of the
            global ``Q`` table is taken.
        episodes: Number of evaluation episodes to run. When omitted, the
            global ``total_episodes`` is used, which matches the previous
            behaviour of this function.

    Raises:
        ValueError: If the resulting episode count is not positive (the
            averages would otherwise divide by zero).

    Returns:
        None. The episode count, the average number of steps per episode and
        the average number of negative-reward steps per episode are printed.
    """
    n_episodes = total_episodes if episodes is None else episodes
    if n_episodes <= 0:
        raise ValueError(f"episodes must be positive, got {n_episodes}")

    total_epochs, total_penalties = 0, 0

    for _ in range(n_episodes):
        state = env.reset()
        epochs, penalties = 0, 0

        done = False
        while not done:
            if rnd:
                action = env.action_space.sample()
            else:
                action = np.argmax(Q[state, :])

            state, reward, done, _info = env.step(action)
            # A "penalty" is any step whose reward is negative.
            # NOTE(review): the recorded run reports 0.0 penalties; if this
            # environment never emits negative rewards, a success-rate metric
            # would be more informative -- confirm.
            if reward < 0:
                penalties += 1

            epochs += 1

        total_penalties += penalties
        total_epochs += epochs

    print(f"Results after {n_episodes} episodes:")
    print(f"Average timesteps per episode: {total_epochs / n_episodes}")
    print(f"Average penalties per episode: {total_penalties / n_episodes}")

In [7]:
# Evaluate with greedy (argmax over Q) action selection rather than random actions.
evaluate(rnd=False)

Results after 10000 episodes:
Average timesteps per episode: 28.0007
Average penalties per episode: 0.0
