# Reinforcement Learning 進階篇：Deep Q-Learning

https://medium.com/pyladies-taiwan/reinforcement-learning-進階篇-deep-q-learning-26b10935a745

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import gym

# Hyperparameters
BATCH_SIZE = 32
LR = 0.01                   # learning rate
EPSILON = 0.9               # fraction of steps on which the best-valued action is chosen
GAMMA = 0.9                 # reward discount factor
TARGET_REPLACE_ITER = 100   # how often the target Q-network is refreshed
MEMORY_CAPACITY = 2000      # size of the replay memory

# CartPole (pole-balancing) environment; keep its `.unwrapped` attribute as `env`.
env = gym.make('CartPole-v0').unwrapped
N_ACTIONS = env.action_space.n                # number of available actions
N_STATES = env.observation_space.shape[0]     # number of values in one observation

In [15]:
class Net(nn.Module):
    """Fully connected network with one hidden layer: state in, one score per action out."""

    def __init__(self, n_states, n_actions, n_hidden):
        super().__init__()

        # Input layer (state) -> hidden layer, then hidden layer -> output layer (action).
        self.fc1 = nn.Linear(in_features=n_states, out_features=n_hidden)
        self.out = nn.Linear(in_features=n_hidden, out_features=n_actions)

    def forward(self, x):
        """Apply fc1, a ReLU activation, then the output layer, and return the result."""
        hidden = F.relu(self.fc1(x))
        return self.out(hidden)
    

In [8]:
class DQN(object):
    """Deep Q-Network agent with a replay memory and a periodically synced target network.

    Two ``Net`` instances are kept: ``eval_net`` is optimised on every effective
    ``learn`` call, while ``target_net`` supplies the bootstrap values and is
    overwritten with ``eval_net``'s weights every ``target_replace_iter`` learning steps.
    """

    def __init__(self, n_states, n_actions, n_hidden, batch_size, lr, epsilon, gamma, target_replace_iter, memory_capacity):
        """Build both networks, the optimiser and an empty replay memory.

        Args:
            n_states: number of values in one state vector.
            n_actions: number of discrete actions.
            n_hidden: hidden-layer width passed to ``Net``.
            batch_size: number of transitions sampled per ``learn`` call.
            lr: Adam learning rate for ``eval_net``.
            epsilon: probability of picking a random action in ``choose_action``.
            gamma: discount factor applied to the bootstrap value.
            target_replace_iter: learning steps between target-network syncs.
            memory_capacity: number of transitions the replay memory holds.
        """
        self.eval_net, self.target_net = Net(n_states, n_actions, n_hidden), Net(n_states, n_actions, n_hidden)

        # One row per transition: state | action | reward | done | next_state.
        self.memory = np.zeros((memory_capacity, n_states * 2 + 3))
        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=lr)
        self.loss_func = nn.MSELoss()
        self.memory_counter = 0
        self.learn_step_counter = 0  # tells the target network when it is due for a sync

        self.n_states = n_states
        self.n_actions = n_actions
        self.n_hidden = n_hidden
        self.batch_size = batch_size
        self.lr = lr
        self.epsilon = epsilon
        self.gamma = gamma
        self.target_replace_iter = target_replace_iter
        self.memory_capacity = memory_capacity

    def choose_action(self, state):
        """Pick an action for ``state`` with an epsilon-greedy rule.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest ``eval_net`` score.

        Returns:
            The chosen action index as a plain ``int``.
        """
        # epsilon-greedy: explore
        if np.random.uniform() < self.epsilon:
            return int(np.random.randint(0, self.n_actions))

        # exploit: score every action with the current eval net and take the best one
        x = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
        with torch.no_grad():  # pure inference, no autograd graph needed
            actions_value = self.eval_net(x)
        return int(actions_value.argmax(dim=1).item())

    def store_transition(self, state, action, reward, next_state, done=False):
        """Write one transition into the replay memory.

        Once the memory is full the oldest rows are overwritten in order.

        Args:
            state: state the action was taken in.
            action: index of the action taken.
            reward: reward received for the step.
            next_state: state observed after the step.
            done: True when ``next_state`` ended the episode. Callers should pass
                it so ``learn`` does not bootstrap past the end of an episode;
                the default keeps the previous four-argument call working.
        """
        # pack the experience into a single row
        transition = np.hstack((state, [action, reward, float(done)], next_state))

        # store it; old experience may be overwritten
        index = self.memory_counter % self.memory_capacity
        self.memory[index, :] = transition
        self.memory_counter += 1

    def learn(self):
        """Run one optimisation step of ``eval_net`` on a random batch from memory.

        Does nothing while the memory is empty. Rows are sampled with replacement
        from the part of the memory that has actually been written.
        """
        # Only sample rows that hold real experience; unwritten rows are all zeros.
        filled = min(self.memory_counter, self.memory_capacity)
        if filled == 0:
            return

        # randomly sample batch_size experiences
        n = self.n_states
        sample_index = np.random.choice(filled, self.batch_size)
        b_memory = self.memory[sample_index, :]
        b_state = torch.as_tensor(b_memory[:, :n], dtype=torch.float32)
        b_action = torch.as_tensor(b_memory[:, n:n + 1], dtype=torch.long)
        b_reward = torch.as_tensor(b_memory[:, n + 1:n + 2], dtype=torch.float32)
        b_done = torch.as_tensor(b_memory[:, n + 2:n + 3], dtype=torch.float32)
        b_next_state = torch.as_tensor(b_memory[:, -n:], dtype=torch.float32)

        # Q value the eval net currently assigns to the action that was taken
        q_eval = self.eval_net(b_state).gather(1, b_action)
        # Target-net estimate for the next state; no_grad keeps the target net untrained
        with torch.no_grad():
            q_next = self.target_net(b_next_state).max(1, keepdim=True)[0]
        # Terminal transitions have no future return, so their bootstrap term is masked out
        q_target = b_reward + self.gamma * (1.0 - b_done) * q_next
        loss = self.loss_func(q_eval, q_target)

        # Backpropagation
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Every target_replace_iter steps, copy the eval net into the target net
        self.learn_step_counter += 1
        if self.learn_step_counter % self.target_replace_iter == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())

In [16]:
import gym
env = gym.make('CartPole-v0')

# Environment parameters
n_actions = env.action_space.n
n_states = env.observation_space.shape[0]

# Hyper parameters
n_hidden = 50
batch_size = 32
lr = 0.01                 # learning rate
epsilon = 0.1             # epsilon-greedy: probability of a random action
gamma = 0.9               # reward discount factor
target_replace_iter = 100 # interval between target-network updates
memory_capacity = 2000
n_episodes = 4000

# Build the DQN agent
dqn = DQN(n_states, n_actions, n_hidden, batch_size, lr, epsilon, gamma, target_replace_iter, memory_capacity)

# Training. The try/finally guarantees env.close() still runs when the loop is
# interrupted (e.g. by stopping the cell) or an exception is raised mid-episode.
try:
    for i_episode in range(n_episodes):
        t = 0
        rewards = 0
        state = env.reset()
        while True:
            env.render()

            # choose an action
            action = dqn.choose_action(state)
            next_state, reward, done, info = env.step(action)

            # store the experience
            dqn.store_transition(state, action, reward, next_state)

            # accumulate reward
            rewards += reward

            # start training once enough experience has been collected
            if dqn.memory_counter > memory_capacity:
                dqn.learn()

            # move on to the next state
            state = next_state

            if done:
                print('Episode finished after {} timesteps, total rewards {}'.format(t+1, rewards))
                break

            t += 1
finally:
    env.close()

Episode finished after 9 timesteps, total rewards 9.0
Episode finished after 10 timesteps, total rewards 10.0
Episode finished after 9 timesteps, total rewards 9.0
Episode finished after 10 timesteps, total rewards 10.0
Episode finished after 11 timesteps, total rewards 11.0
Episode finished after 10 timesteps, total rewards 10.0
Episode finished after 11 timesteps, total rewards 11.0
Episode finished after 10 timesteps, total rewards 10.0
Episode finished after 8 timesteps, total rewards 8.0
Episode finished after 9 timesteps, total rewards 9.0
Episode finished after 10 timesteps, total rewards 10.0
Episode finished after 10 timesteps, total rewards 10.0
Episode finished after 10 timesteps, total rewards 10.0
Episode finished after 9 timesteps, total rewards 9.0
Episode finished after 11 timesteps, total rewards 11.0
Episode finished after 10 timesteps, total rewards 10.0
Episode finished after 9 timesteps, total rewards 9.0
Episode finished after 10 timesteps, total rewards 10.0
Epis

Episode finished after 11 timesteps, total rewards 11.0
Episode finished after 10 timesteps, total rewards 10.0
Episode finished after 9 timesteps, total rewards 9.0
Episode finished after 8 timesteps, total rewards 8.0
Episode finished after 9 timesteps, total rewards 9.0
Episode finished after 9 timesteps, total rewards 9.0
Episode finished after 11 timesteps, total rewards 11.0
Episode finished after 9 timesteps, total rewards 9.0
Episode finished after 9 timesteps, total rewards 9.0
Episode finished after 10 timesteps, total rewards 10.0
Episode finished after 9 timesteps, total rewards 9.0
Episode finished after 11 timesteps, total rewards 11.0
Episode finished after 10 timesteps, total rewards 10.0
Episode finished after 9 timesteps, total rewards 9.0
Episode finished after 11 timesteps, total rewards 11.0
Episode finished after 12 timesteps, total rewards 12.0
Episode finished after 10 timesteps, total rewards 10.0
Episode finished after 11 timesteps, total rewards 11.0
Episode 

Episode finished after 11 timesteps, total rewards 11.0
Episode finished after 9 timesteps, total rewards 9.0
Episode finished after 9 timesteps, total rewards 9.0
Episode finished after 9 timesteps, total rewards 9.0
Episode finished after 10 timesteps, total rewards 10.0
Episode finished after 10 timesteps, total rewards 10.0
Episode finished after 12 timesteps, total rewards 12.0
Episode finished after 11 timesteps, total rewards 11.0
Episode finished after 10 timesteps, total rewards 10.0
Episode finished after 14 timesteps, total rewards 14.0
Episode finished after 13 timesteps, total rewards 13.0
Episode finished after 10 timesteps, total rewards 10.0
Episode finished after 10 timesteps, total rewards 10.0
Episode finished after 10 timesteps, total rewards 10.0
Episode finished after 22 timesteps, total rewards 22.0
Episode finished after 10 timesteps, total rewards 10.0
Episode finished after 12 timesteps, total rewards 12.0
Episode finished after 10 timesteps, total rewards 10.

Episode finished after 9 timesteps, total rewards 9.0
Episode finished after 9 timesteps, total rewards 9.0
Episode finished after 10 timesteps, total rewards 10.0
Episode finished after 10 timesteps, total rewards 10.0
Episode finished after 9 timesteps, total rewards 9.0
Episode finished after 9 timesteps, total rewards 9.0
Episode finished after 9 timesteps, total rewards 9.0
Episode finished after 10 timesteps, total rewards 10.0
Episode finished after 10 timesteps, total rewards 10.0
Episode finished after 9 timesteps, total rewards 9.0
Episode finished after 9 timesteps, total rewards 9.0
Episode finished after 11 timesteps, total rewards 11.0
Episode finished after 10 timesteps, total rewards 10.0
Episode finished after 9 timesteps, total rewards 9.0
Episode finished after 10 timesteps, total rewards 10.0
Episode finished after 12 timesteps, total rewards 12.0
Episode finished after 13 timesteps, total rewards 13.0
Episode finished after 12 timesteps, total rewards 12.0
Episode 

Episode finished after 10 timesteps, total rewards 10.0
Episode finished after 10 timesteps, total rewards 10.0
Episode finished after 10 timesteps, total rewards 10.0
Episode finished after 10 timesteps, total rewards 10.0
Episode finished after 9 timesteps, total rewards 9.0
Episode finished after 11 timesteps, total rewards 11.0
Episode finished after 12 timesteps, total rewards 12.0
Episode finished after 9 timesteps, total rewards 9.0
Episode finished after 10 timesteps, total rewards 10.0
Episode finished after 10 timesteps, total rewards 10.0
Episode finished after 11 timesteps, total rewards 11.0
Episode finished after 11 timesteps, total rewards 11.0
Episode finished after 10 timesteps, total rewards 10.0
Episode finished after 10 timesteps, total rewards 10.0
Episode finished after 8 timesteps, total rewards 8.0
Episode finished after 10 timesteps, total rewards 10.0
Episode finished after 9 timesteps, total rewards 9.0
Episode finished after 11 timesteps, total rewards 11.0


KeyboardInterrupt: 