In [1]:
import gym
from gridworld import CliffWalkingWapper
import time
import numpy as np

In [2]:
# Build the CliffWalking environment and wrap it with the gridworld helper in one step.
env = CliffWalkingWapper(gym.make('CliffWalking-v0'))

In [8]:
class QlearningAgent(object):
    """Tabular Q-learning agent with an epsilon-greedy behaviour policy.

    Attributes:
        act_n: Number of discrete actions.
        lr: Step size applied to each temporal-difference update.
        gamma: Discount factor applied to the bootstrapped next-state value.
        epsilon: Probability that ``sample`` draws a uniformly random action.
        Q: Zero-initialised array of shape ``(obs_n, act_n)`` of action values.
    """

    def __init__(self, obs_n, act_n, learning_rate=0.01, gamma=0.9, e_greed=0.1):
        """Store the hyper-parameters and allocate an all-zero Q-table."""
        self.Q = np.zeros((obs_n, act_n))
        self.act_n = act_n
        self.lr = learning_rate
        self.gamma = gamma
        self.epsilon = e_greed

    def sample(self, obs):
        """Return a training action for ``obs`` (epsilon-greedy).

        With probability ``1 - epsilon`` the action comes from ``predict``;
        otherwise it is drawn uniformly from all ``act_n`` actions.
        """
        # Author's note (translated): without this randomness Q-learning and
        # Sarsa would be identical. With it, Q-learning effectively discards
        # most of the negative feedback from falling off the cliff, so its
        # policy is more aggressive; Sarsa also learns from the reward of the
        # randomly taken step.
        exploit = np.random.uniform(0, 1) < (1 - self.epsilon)
        return self.predict(obs) if exploit else np.random.choice(self.act_n)

    def predict(self, obs):
        """Return a greedy action for ``obs``, breaking ties uniformly at random."""
        action_values = self.Q[obs]
        # Every action that attains the row maximum is a candidate.
        best_actions = np.flatnonzero(action_values == action_values.max())
        return np.random.choice(best_actions)

    def learn(self, obs, action, reward, next_obs, done):
        """Apply one Q-learning update for the transition that was just observed.

        Args:
            obs: State index in which ``action`` was taken.
            action: Index of the action that was taken.
            reward: Reward received for the transition.
            next_obs: State index reached after the transition.
            done: Whether the transition ended the episode.
        """
        current_estimate = self.Q[obs, action]
        # A terminal transition has nothing to bootstrap from, so the target
        # is the reward alone; otherwise add the discounted best next value.
        td_target = reward if done else reward + self.gamma * self.Q[next_obs].max()
        self.Q[obs, action] += self.lr * (td_target - current_estimate)

In [9]:
def run_episode(env, agent, is_render=False):
    """Play one training episode, updating the agent after every step.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` returning a
            ``(next_obs, reward, done, info)`` tuple, and ``render()``.
        agent: Object exposing ``sample(obs)`` and
            ``learn(obs, action, reward, next_obs, done)``.
        is_render: When true, ``env.render()`` is called after each step.

    Returns:
        Tuple ``(total_reward, total_steps)`` for the episode.
    """
    state = env.reset()
    reward_sum, step_count = 0, 0
    finished = False
    while not finished:
        chosen = agent.sample(state)
        successor, reward, finished, _ = env.step(chosen)
        agent.learn(state, chosen, reward, successor, finished)
        state = successor
        reward_sum += reward
        step_count += 1
        if is_render:
            env.render()
    return reward_sum, step_count

In [10]:
def test_episode(env, agent):
    total_rewards = 0
    obs = env.reset()
    while True:
        action = agent.predict(obs)
        next_obs, reward, done, _ = env.step(action)
        obs = next_obs
        total_rewards += total_rewards
        time.sleep(0.5)
        if done:break

In [11]:
def main():
    """Train a Q-learning agent on CliffWalking for 500 episodes, then evaluate once.

    The render flag is refreshed *after* each episode, so it applies to the
    following one: the episodes that come right after a multiple of 20
    (1, 21, 41, ...) are the ones rendered.
    """
    env = CliffWalkingWapper(gym.make('CliffWalking-v0'))
    agent = QlearningAgent(
        obs_n=env.observation_space.n,
        act_n=env.action_space.n,
        learning_rate=0.1,
        gamma=0.9,
        e_greed=0.1,
    )

    render_next = False
    for episode in range(500):
        ep_reward, ep_steps = run_episode(env, agent, render_next)
        print('Episode:', episode, 'Reward:', ep_reward, 'Steps:', ep_steps)
        render_next = episode % 20 == 0

    # Finish with a single greedy rollout of the learned policy.
    test_episode(env, agent)

# Run the training loop followed by the single evaluation episode.
main()

Episode: 0 Reward: -2300 Steps: 815
Episode: 1 Reward: -661 Steps: 364
Episode: 2 Reward: -259 Steps: 160
Episode: 3 Reward: -359 Steps: 260
Episode: 4 Reward: -793 Steps: 397
Episode: 5 Reward: -62 Steps: 62
Episode: 6 Reward: -802 Steps: 406
Episode: 7 Reward: -265 Steps: 166
Episode: 8 Reward: -84 Steps: 84
Episode: 9 Reward: -98 Steps: 98
Episode: 10 Reward: -178 Steps: 79
Episode: 11 Reward: -152 Steps: 152
Episode: 12 Reward: -132 Steps: 132
Episode: 13 Reward: -246 Steps: 147
Episode: 14 Reward: -52 Steps: 52
Episode: 15 Reward: -360 Steps: 162
Episode: 16 Reward: -40 Steps: 40
Episode: 17 Reward: -334 Steps: 136
Episode: 18 Reward: -104 Steps: 104
Episode: 19 Reward: -227 Steps: 128
Episode: 20 Reward: -500 Steps: 203
Episode: 21 Reward: -62 Steps: 62
Episode: 22 Reward: -88 Steps: 88
Episode: 23 Reward: -92 Steps: 92
Episode: 24 Reward: -28 Steps: 28
Episode: 25 Reward: -106 Steps: 106
Episode: 26 Reward: -358 Steps: 160
Episode: 27 Reward: -104 Steps: 104
Episode: 28 Reward: 