In [8]:
import gym
from gridworld import CliffWalkingWapper
import time
import numpy as np

In [9]:
# Build the CliffWalking environment and wrap it with the project's
# CliffWalkingWapper in a single expression.
env = CliffWalkingWapper(gym.make('CliffWalking-v0'))

In [10]:
class SarsaAgent(object):
    """Tabular SARSA agent backed by a dense Q-table.

    The table ``Q`` has shape ``(obs_n, act_n)`` and starts at zero.
    Observations and actions are used directly as row/column indices.
    """

    def __init__(self, obs_n, act_n, learning_rate=0.01, gamma=0.9, e_greed=0.1):
        """Store the hyper-parameters and allocate the zero-filled Q-table.

        Args:
            obs_n: number of rows (observations) in the Q-table.
            act_n: number of columns (actions) in the Q-table.
            learning_rate: step size applied to each TD update.
            gamma: discount factor applied to the next state-action value.
            e_greed: probability of picking a uniformly random action in
                ``sample``.
        """
        self.Q = np.zeros((obs_n, act_n))
        self.epsilon = e_greed
        self.gamma = gamma
        self.lr = learning_rate
        self.act_n = act_n

    def sample(self, obs):
        """Return an epsilon-greedy action for ``obs``.

        With probability ``epsilon`` a uniformly random action index is
        returned; otherwise the greedy action from ``predict`` is used.
        """
        explore = np.random.uniform(0, 1) >= (1 - self.epsilon)
        if explore:
            return np.random.choice(self.act_n)
        return self.predict(obs)

    def predict(self, obs):
        """Return a greedy action for ``obs``; ties are broken at random."""
        row = self.Q[obs, :]
        # Every action whose value equals the row maximum is a candidate.
        best_actions = np.flatnonzero(row == np.max(row))
        return np.random.choice(best_actions)

    def learn(self, obs, action, reward, next_obs, next_action, done):
        """Apply one SARSA update to ``Q[obs, action]``.

        The target is ``reward`` alone when ``done`` is truthy, otherwise
        ``reward + gamma * Q[next_obs, next_action]``.
        """
        current = self.Q[obs, action]
        target = (
            reward
            if done
            else reward + self.gamma * self.Q[next_obs, next_action]
        )
        self.Q[obs, action] += self.lr * (target - current)

In [11]:
def run_episode(env, agent, is_render=False):
    """Run one training episode, updating the agent after every step.

    The next action is sampled before each update, so ``agent.learn``
    receives the full (obs, action, reward, next_obs, next_action, done)
    tuple.

    Args:
        env: environment whose ``reset()`` returns an observation and whose
            ``step(action)`` returns a 4-tuple ``(obs, reward, done, info)``.
        agent: object providing ``sample(obs)`` and ``learn(...)``.
        is_render: when truthy, ``env.render()`` is called before each step.

    Returns:
        ``(total_reward, total_steps)`` for the episode.
    """
    reward_sum, step_count = 0, 0
    state = env.reset()
    act = agent.sample(state)
    finished = False
    while not finished:
        if is_render:
            env.render()
        new_state, reward, finished, _ = env.step(act)
        new_act = agent.sample(new_state)
        agent.learn(state, act, reward, new_state, new_act, finished)
        # Slide the (state, action) window forward for the next step.
        state, act = new_state, new_act
        reward_sum += reward
        step_count += 1
    return reward_sum, step_count

In [12]:
def test_episode(env, agent):
    total_rewards = 0
    obs = env.reset()
    while True:
        action = agent.predict(obs)
        next_obs, reward, done, _ = env.step(action)
        obs = next_obs
        total_rewards += total_rewards
        time.sleep(0.5)
        if done:break

In [13]:
def main():
    """Train a SARSA agent on CliffWalking-v0, then run one greedy episode.

    Builds the wrapped environment and the agent, runs 500 training
    episodes (printing reward and step count for each), and finishes with
    a single call to ``test_episode``.
    """
    env = gym.make('CliffWalking-v0')
    env = CliffWalkingWapper(env)
    agent = SarsaAgent(
        env.observation_space.n,
        env.action_space.n,
        learning_rate=0.1,
        gamma=0.9,
        e_greed=0.1)

    # Must be bound before the loop: it is read on the first iteration and
    # only reassigned at the end of each iteration. Without this the first
    # run_episode call raises UnboundLocalError.
    is_render = False
    for episode in range(500):
        ep_reward, ep_steps = run_episode(env, agent, is_render)
        print('Episode:', episode, 'Reward:', ep_reward, 'Steps:', ep_steps)
        # Render the episode that follows every 10th one (0, 10, 20, ...).
        is_render = episode % 10 == 0

    test_episode(env, agent)

# Run the 500 training episodes followed by the single evaluation episode.
main()

Episode: 0 Reward: -2260 Steps: 874
Episode: 1 Reward: -151 Steps: 151
Episode: 2 Reward: -307 Steps: 208
Episode: 3 Reward: -398 Steps: 299
Episode: 4 Reward: -479 Steps: 281
Episode: 5 Reward: -120 Steps: 120
Episode: 6 Reward: -252 Steps: 153
Episode: 7 Reward: -100 Steps: 100
Episode: 8 Reward: -694 Steps: 298
Episode: 9 Reward: -91 Steps: 91
Episode: 10 Reward: -46 Steps: 46
Episode: 11 Reward: -675 Steps: 378
Episode: 12 Reward: -319 Steps: 220
Episode: 13 Reward: -45 Steps: 45
Episode: 14 Reward: -112 Steps: 112
Episode: 15 Reward: -337 Steps: 238
Episode: 16 Reward: -19 Steps: 19
Episode: 17 Reward: -77 Steps: 77
Episode: 18 Reward: -345 Steps: 147
Episode: 19 Reward: -75 Steps: 75
Episode: 20 Reward: -120 Steps: 120
Episode: 21 Reward: -289 Steps: 91
Episode: 22 Reward: -79 Steps: 79
Episode: 23 Reward: -352 Steps: 154
Episode: 24 Reward: -56 Steps: 56
Episode: 25 Reward: -84 Steps: 84
Episode: 26 Reward: -42 Steps: 42
Episode: 27 Reward: -323 Steps: 125
Episode: 28 Reward: -8