## QLearning



吃豆人项目地址: https://inst.eecs.berkeley.edu/~cs188/pacman/project_overview.html

In [1]:
import gym
import argparse
import numpy as np
import random, math
import matplotlib.pyplot as plt
from collections import defaultdict
from IPython.display import clear_output




对于一个`Q-Learning`强化学习智能体来说，需要传入的参数有：

1. 环境
2. 智能体学习所需要的超参数，像学习率，折扣因子

需要的方法有:

1. 依据状态选择动作的策略
2. 更新智能体的方法

```python
class QLearningAgent:
    """Interface sketch of a Q-learning agent; every method is a stub."""

    def __init__(self, env, learning_rate, epsilon, discount):
        """Accept the environment and the hyper-parameters (nothing is stored yet)."""
        super().__init__()

    def choose_action(self, state):
        """Policy hook: map ``state`` to an action (stub, returns ``None``)."""
        return None

    def learn(self, state, action, reward, next_state):
        """Update hook: learn from one transition (stub, returns ``None``)."""
        return None
```

In [2]:
class QLearningAgent(object):
    """Tabular Q-learning agent with an epsilon-greedy behaviour policy.

    Q-values are stored in a nested ``defaultdict`` addressed as
    ``q_table[state][action]``; unseen entries default to 0, so states only
    need to be hashable and are created lazily on first access.

    Args:
        env: Environment exposing ``action_space.n``, the number of discrete
            actions.
        args: Namespace providing ``epsilon`` (probability of a random
            action), ``gamma`` (discount factor) and ``lr`` (learning rate).
    """

    def __init__(self, env, args):
        super(QLearningAgent, self).__init__()
        # The same action set is used for every state; it is kept as an
        # attribute so a more complex environment can restrict it later.
        self.legal_action = range(env.action_space.n)

        self.epsilon = args.epsilon
        self.discount = args.gamma
        self.lr = args.lr

        self.q_table = defaultdict(lambda: defaultdict(lambda: 0))

    def choose_action(self, state):
        """Select an action for ``state`` with the epsilon-greedy rule.

        With probability ``self.epsilon`` a uniformly random legal action is
        returned; otherwise the action with the highest Q-value is returned.
        Ties are resolved in favour of the first action in
        ``self.legal_action`` because ``np.argmax`` returns the first maximum.

        Args:
            state: Hashable key identifying the current state.

        Returns:
            The chosen action, or ``None`` when there are no legal actions.
        """
        possible_actions = self.legal_action

        if len(possible_actions) == 0:
            return None

        if np.random.random() > self.epsilon:
            q_values = [self.q_table[state][action] for action in possible_actions]
            return possible_actions[int(np.argmax(q_values))]

        return random.choice(possible_actions)

    def learn(self, state, action, reward, next_state, done=False):
        """Apply one Q-learning update for the observed transition.

        The update rule is::

            Q(s, a) := (1 - lr) * Q(s, a) + lr * (r + gamma * V(s'))

        where ``V(s')`` is the largest Q-value of ``next_state``, or 0 when
        the transition ended the episode.

        Args:
            state: State in which ``action`` was taken.
            action: Action that was executed.
            reward: Reward received for the transition.
            next_state: State reached after the transition.
            done: Set to ``True`` when ``next_state`` is terminal so that no
                future value is bootstrapped from it. Defaults to ``False``,
                which reproduces the plain update for every transition.
        """
        current_q = self.q_table[state][action]

        if done:
            # Nothing can be collected after a terminal state.
            next_v = 0.0
        else:
            next_v = max(
                self.q_table[next_state][next_action]
                for next_action in self.legal_action
            )

        target = reward + self.discount * next_v
        self.q_table[state][action] = (1 - self.lr) * current_q + self.lr * target

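&emsp;&emsp;除了上述的这种实现方法之外，还有一种是在计算下一个状态值函数的时候，按 `epsilon` 贪婪策略取期望：以 `1-epsilon` 的权重取最大的 `Q(next_state, a)`，以 `epsilon` 的权重取所有动作 `Q(next_state, a)` 的平均值，即 `V(next_state) = (1-epsilon) * max_a Q(next_state, a) + epsilon * mean_a Q(next_state, a)`。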

## Q-Learning用于离散状态空间

In [3]:
def play_and_train(env, agent, t_max=10**4):
    """Run one episode, training ``agent`` after every step.

    Args:
        env: Environment whose ``reset()`` returns the initial state and whose
            ``step(action)`` returns ``(next_state, reward, done, info)``.
        agent: Object providing ``choose_action(state)`` and
            ``learn(state, action, reward, next_state)``.
        t_max: Upper bound on the number of steps in the episode.

    Returns:
        The sum of the rewards collected during the episode, as a float.
    """
    episode_return = 0.0
    current = env.reset()

    for _ in range(t_max):
        chosen = agent.choose_action(current)
        observed, gain, finished, _info = env.step(chosen)

        # Learn from the transition before moving on to the next state.
        agent.learn(current, chosen, gain, observed)

        episode_return += gain
        current = observed

        if finished:
            break

    return episode_return

In [4]:
def main():
    """Train a tabular Q-learning agent on ``Taxi-v3`` and plot its rewards.

    Runs 1000 training episodes. Every 100 episodes the notebook output is
    cleared, the mean reward of the most recent (up to) 100 episodes is
    printed, and the full per-episode reward curve is plotted.
    """
    parser = argparse.ArgumentParser(description="The parameter of Q-Learning")
    parser.add_argument("--gamma", type=float, help="gamma value used for Bellman approximation", default=0.99)
    parser.add_argument("--lr", type=float, help="learning rate of the Q-value update", default=0.5)
    parser.add_argument("--epsilon", type=float, help="epsilon for greedy", default=0.25)
    # An explicit empty list means the defaults above are always used;
    # presumably this keeps the notebook kernel's own argv from being parsed.
    args = parser.parse_args(args=[])

    env = gym.make("Taxi-v3")
    print("observation_space {}".format(env.observation_space))
    print("action_space {}".format(env.action_space))

    agent = QLearningAgent(env, args)

    rewards = []
    for i in range(1000):
        rewards.append(play_and_train(env, agent))
        if i % 100 == 0:
            clear_output(True)
            # Average over the last 100 episodes (fewer at the start).
            print("mean reward", np.mean(rewards[-100:]))
            plt.plot(rewards)
            plt.show()
            
# if __name__ == "__main__":
#     main()

## Q-learning用于连续状态空间

In [5]:
def get_state(observation):
    """Discretize a continuous ``(position, velocity)`` observation.

    The position is binned against 12 evenly spaced edges on [-1.2, 0.6] and
    the velocity against 20 evenly spaced edges on [-0.07, 0.07]. Following
    ``np.digitize``, index 0 means "below the first edge" and an index equal
    to the number of edges means "at or above the last edge".

    Args:
        observation: Two-element sequence ``(position, velocity)``.

    Returns:
        Tuple ``(pos_bin, vel_bin)`` of Python ints, usable as a dict key.
    """
    position, velocity = observation

    position_edges = np.linspace(-1.2, 0.6, 12)
    velocity_edges = np.linspace(-0.07, 0.07, 20)

    return (
        int(np.digitize(position, position_edges)),
        int(np.digitize(velocity, velocity_edges)),
    )

In [6]:
def get_state(observation):
    """Discretize a continuous ``(position, velocity)`` observation.

    The position is binned against 12 evenly spaced edges on [-1.2, 0.6] and
    the velocity against 20 evenly spaced edges on [-0.07, 0.07]. Following
    ``np.digitize``, index 0 means "below the first edge" and an index equal
    to the number of edges means "at or above the last edge".

    Args:
        observation: Two-element sequence ``(position, velocity)``.

    Returns:
        Tuple ``(pos_bin, vel_bin)`` of Python ints, usable as a dict key.
    """
    position, velocity = observation

    position_edges = np.linspace(-1.2, 0.6, 12)
    velocity_edges = np.linspace(-0.07, 0.07, 20)

    return (
        int(np.digitize(position, position_edges)),
        int(np.digitize(velocity, velocity_edges)),
    )

def play_and_train(env, agent, t_max=10**4):
    """Run one episode on discretized observations, training ``agent`` each step.

    Every raw observation coming from ``env`` is passed through ``get_state``
    before it is handed to the agent.

    Args:
        env: Environment whose ``reset()`` returns the initial observation and
            whose ``step(action)`` returns ``(observation, reward, done, info)``.
        agent: Object providing ``choose_action(state)`` and
            ``learn(state, action, reward, next_state)``.
        t_max: Upper bound on the number of steps in the episode.

    Returns:
        The sum of the rewards collected during the episode, as a float.
    """
    episode_return = 0.0
    current = get_state(env.reset())

    for _ in range(t_max):
        chosen = agent.choose_action(current)

        raw_observation, gain, finished, _info = env.step(chosen)
        following = get_state(raw_observation)

        # Learn from the transition before moving on to the next state.
        agent.learn(current, chosen, gain, following)

        episode_return += gain
        current = following

        if finished:
            break

    return episode_return

In [11]:
def main():
    """Train a tabular Q-learning agent on ``MountainCar-v0`` and plot progress.

    Plays 50000 episodes. Epsilon starts at 1.0 and is lowered by
    ``2 / n_games`` after every episode until it reaches the floor of 0.01
    (roughly half-way through training). Every 100 episodes the latest score
    and epsilon are printed; at the end a trailing mean of the episode scores
    (current episode plus up to 50 previous ones) is plotted.
    """
    parser = argparse.ArgumentParser(description="The parameter of Q-Learning")
    parser.add_argument("--gamma", type=float, help="gamma value used for Bellman approximation", default=0.99)
    parser.add_argument("--lr", type=float, help="learning rate of the Q-value update", default=0.1)
    parser.add_argument("--epsilon", type=float, help="epsilon for greedy", default=1.0)
    # An explicit empty list means the defaults above are always used;
    # presumably this keeps the notebook kernel's own argv from being parsed.
    args = parser.parse_args(args=[])

    env = gym.make('MountainCar-v0')
    # NOTE(review): sets a private attribute; presumably this raises the step
    # cap of gym's TimeLimit wrapper to 1000 steps per episode - confirm
    # against the installed gym version.
    env._max_episode_steps = 1000
    print("observation_space {}".format(env.observation_space))
    print("action_space {}".format(env.action_space))

    agent = QLearningAgent(env, args)

    n_games = 50000
    epsilon_step = 2 / n_games
    epsilon_floor = 0.01

    total_rewards = np.zeros(n_games)
    for i in range(n_games):
        score = play_and_train(env, agent)
        total_rewards[i] = score

        # Linear decay, clamped so epsilon never drops below the floor.
        agent.epsilon = max(epsilon_floor, agent.epsilon - epsilon_step)

        if i % 100 == 0:
            print('episode ', i, 'score ', score, 'epsilon %.3f' % agent.epsilon)

    # Smooth the learning curve with a trailing window of up to 51 episodes.
    mean_rewards = np.zeros(n_games)
    for t in range(n_games):
        mean_rewards[t] = np.mean(total_rewards[max(0, t-50):(t+1)])
    plt.plot(mean_rewards)
    plt.show()


if __name__ == "__main__":
    main()

observation_space Box([-1.2  -0.07], [0.6  0.07], (2,), float32)
action_space Discrete(3)
episode  0 score  -1000.0 epsilon 1.000
episode  100 score  -1000.0 epsilon 0.996
episode  200 score  -1000.0 epsilon 0.992
episode  300 score  -1000.0 epsilon 0.988
episode  400 score  -1000.0 epsilon 0.984
episode  500 score  -1000.0 epsilon 0.980
episode  600 score  -1000.0 epsilon 0.976
episode  700 score  -1000.0 epsilon 0.972
episode  800 score  -1000.0 epsilon 0.968
episode  900 score  -1000.0 epsilon 0.964
episode  1000 score  -1000.0 epsilon 0.960
episode  1100 score  -1000.0 epsilon 0.956
episode  1200 score  -1000.0 epsilon 0.952
episode  1300 score  -1000.0 epsilon 0.948
episode  1400 score  -1000.0 epsilon 0.944
episode  1500 score  -1000.0 epsilon 0.940
episode  1600 score  -1000.0 epsilon 0.936
episode  1700 score  -1000.0 epsilon 0.932
episode  1800 score  -969.0 epsilon 0.928
episode  1900 score  -1000.0 epsilon 0.924
episode  2000 score  -1000.0 epsilon 0.920
episode  2100 score 

KeyboardInterrupt: 

In [8]:
# mean_rewards = np.zeros(n_games)
# for t in range(n_games):
#     mean_rewards[t] = np.mean(total_rewards[max(0, t-50):(t+1)])
# plt.plot(mean_rewards)