# Deep Q Network

## 1. Setup

In [1]:
import gym
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from itertools import count
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

# Build the CartPole environment and keep the `.unwrapped` object for all later calls.
env = gym.make('CartPole-v0').unwrapped

# Probe the environment once: print the observation returned by reset().
state = env.reset()
print(state)

# Print one randomly sampled action from the action space.
action = env.action_space.sample()
print(action)

# Sizes used to build the Q-network: number of actions (outputs) and
# length of the observation vector (inputs).
n_action = env.action_space.n
n_state = env.observation_space.shape[0]
print(n_state)
print(n_action)

# set up matplotlib
# `display` is only imported when the active backend name contains 'inline'.
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

plt.ion() # turn on matplotlib interactive mode

[ 0.00530503 -0.00892214  0.01711064  0.02285505]
1
4
2


## 2. Replay Memory

In [2]:
# One transition (s, a, r, s', t) as stored in the replay buffer.
Experience = namedtuple('Experience', ('state', 'action', 'reward', 'next_state', 'terminal'))

class ReplayMemory(object):
    """Fixed-capacity circular buffer of ``Experience`` tuples.

    New experiences are appended until ``capacity`` entries are stored; after
    that, each push overwrites the oldest slot in round-robin order.

    Attributes:
        capacity: maximum number of experiences kept.
        memory: list holding the stored ``Experience`` tuples.
        position: index of the most recently written slot (-1 when empty).
    """

    def __init__(self, capacity):
        """Create an empty buffer that can hold up to ``capacity`` experiences."""
        self.capacity = capacity
        self.memory = []
        self.position = -1

    def push(self, *args):
        """Store one experience built from ``args``.

        Args:
            *args: the five ``Experience`` fields, in order
                (state, action, reward, next_state, terminal).

        Raises:
            TypeError: if ``args`` does not match the ``Experience`` fields.
                The buffer is left unchanged in that case.
        """
        # Build the record first: if the arguments are invalid we fail here,
        # before a placeholder slot is allocated or the cursor is advanced.
        experience = Experience(*args)
        if len(self.memory) < self.capacity:
            self.memory.append(None) # allocate space
        self.position = (self.position + 1) % self.capacity
        self.memory[self.position] = experience

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored experiences chosen at random.

        Raises:
            ValueError: (from ``random.sample``) if ``batch_size`` exceeds the
                number of stored experiences.
        """
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Number of experiences currently stored."""
        return len(self.memory)


experience_pool = ReplayMemory(10000) # module-level replay buffer, holds at most 10000 experiences

## 3. Q-Network

In [3]:
class NN(nn.Module):
    """Fully connected Q-network: input -> 100 -> 100 -> output with ReLU.

    The layers live in ``self.model`` (an ``nn.Sequential``), so existing code
    that uses ``NN(...).model`` directly keeps working. Because the class is a
    proper ``nn.Module``, the instance itself can also be called and exposes
    ``parameters()`` / ``state_dict()``.

    Args:
        input_size: number of input features (length of a state vector).
        output_size: number of outputs (one Q-value per action).
    """

    def __init__(self, input_size, output_size):
        super(NN, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_size, 100),
            nn.ReLU(),
            nn.Linear(100, 100),
            nn.ReLU(),
            nn.Linear(100, output_size),
        )

    def forward(self, x):
        """Return the network output for ``x`` by delegating to ``self.model``."""
        return self.model(x)

# Two networks with identical architecture: policy_net is the one handed to the
# optimizer and used to pick actions; target_net is only used to compute the
# bootstrap target and is refreshed by copying weights from policy_net.
policy_net = NN(n_state,n_action).model # network being trained
target_net = NN(n_state,n_action).model # network used for target Q-values
target_net.load_state_dict(policy_net.state_dict()) # start the target as an exact copy of the policy

<All keys matched successfully>

## 4. Hyperparameters


In [4]:
BATCH_SIZE = 128 # experiences sampled per optimization step; also the minimum pool size before training starts
GAMMA = 0.9 # discount factor applied to the next-state Q-value
EPS = 0.1 # epsilon-greedy: probability of taking a random action instead of the greedy one
TARGET_UPDATE = 10 # copy policy_net weights into target_net every this many epochs

# Plain SGD over the policy network's parameters; the target network is never optimized.
optimizer = optim.SGD(policy_net.parameters(), lr=0.1)
# Mean squared error between predicted and target Q-values.
loss_fn = torch.nn.MSELoss()

## 5. Optimize



In [5]:
def update_policy_net():
    """Run one gradient step on ``policy_net`` from a random replay batch.

    Samples ``BATCH_SIZE`` experiences from ``experience_pool``, computes the
    one-step target ``r + GAMMA * max_a' Q_target(s', a') * (1 - terminal)``
    and minimizes ``loss_fn`` between that target and ``Q_policy(s, a)``.
    Does nothing while the pool holds fewer than ``BATCH_SIZE`` experiences.

    Returns:
        None
    """
    if len(experience_pool) < BATCH_SIZE:
        return #not enough experience

    experiences = experience_pool.sample(BATCH_SIZE)
    experiences_batch = Experience(*zip(*experiences)) #experiences of batches

    # Normalize shapes explicitly: actions are stored as 1-element tensors
    # while rewards and terminal flags are stored as 0-d tensors, so after
    # stacking they would otherwise disagree in rank.
    state_batch = torch.stack(experiences_batch.state)
    action_batch = torch.stack(experiences_batch.action).reshape(-1, 1)
    reward_batch = torch.stack(experiences_batch.reward).reshape(-1)
    next_state_batch = torch.stack(experiences_batch.next_state)
    terminal_batch = torch.stack(experiences_batch.terminal).reshape(-1)

    # Q(s, a) for the action actually taken. gather() yields [B, 1]; squeeze it
    # to [B] so it matches the target. Passing [B, 1] against [B] to MSELoss
    # would broadcast to [B, B] and average over every prediction/target pair.
    output_policy = policy_net(state_batch)
    policy_q_value = torch.gather(output_policy, 1, action_batch).squeeze(1)

    # The target is treated as a constant: no gradient flows into target_net.
    with torch.no_grad():
        output_target_next = target_net(next_state_batch)
        target_next_q_value = torch.max(output_target_next, dim=1).values
        # (1 - terminal) zeroes the bootstrap term for terminal transitions.
        target_q_value = reward_batch + GAMMA * target_next_q_value * (1 - terminal_batch)

    # mean square loss
    loss = loss_fn(policy_q_value, target_q_value)

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

## 6. Exploration

In [6]:
def greedy_action(state):
    """Return the index (Python int) of the largest policy-network output for ``state``.

    ``state`` is a tensor fed to ``policy_net``; the forward pass runs with
    gradient tracking disabled.
    """
    with torch.no_grad():
        q_values = policy_net(state)
        best = q_values.argmax()
    return best.item()

def sample_action(state):
    """Pick an action epsilon-greedily and return it as a 1-element int64 tensor.

    A uniform draw below ``1.0 - EPS`` selects the greedy action from
    ``greedy_action(state)``; otherwise an action is sampled from
    ``env.action_space``.
    """
    draw = np.random.uniform(0, 1)
    exploit = draw < (1.0 - EPS)
    chosen = greedy_action(state) if exploit else env.action_space.sample()
    return torch.tensor([chosen], dtype=torch.int64)

def explore_one_step(state, pool):
    """Take one epsilon-greedy step in ``env`` and record the transition.

    Args:
        state: current state tensor, passed to ``sample_action``.
        pool: object with a ``push(state, action, reward, next_state, terminal)``
            method that receives the transition as tensors.

    Returns:
        ``(done, next_state, r)``: the done flag and raw reward as returned by
        ``env.step`` and the next state as a float tensor.
    """
    chosen = sample_action(state)
    # env.step is unpacked as a 4-tuple: observation, reward, done, info.
    observation, r, done, _ = env.step(chosen.item())

    reward_t = torch.tensor(r, dtype=torch.float)
    next_state = torch.tensor(observation, dtype=torch.float)
    terminal_t = torch.tensor(int(done), dtype=torch.int64)

    # Record (s, a, r, s', t) in the experience pool, all as tensors.
    pool.push(state, chosen, reward_t, next_state, terminal_t)

    return done, next_state, r

## 7. Train Loop

In [9]:
EPOCHS = 200  # number of episodes to run
for epoch in range(EPOCHS):
    # Start a fresh episode: reset the return accumulator and the environment.
    reward = 0
    state = torch.tensor(env.reset(), dtype=torch.float) # s
    done = False
    while not done:
        # Act once (the transition is stored in the pool), then advance the state.
        done, next_state, r = explore_one_step(state, experience_pool)
        state = next_state
        reward += r
        # One optimization step per environment step.
        update_policy_net()

    # Episode finished: report its total reward.
    print("epoch: ", epoch, "reward: ", reward)

    # Every TARGET_UPDATE epochs, overwrite the target network with the policy network's weights.
    if epoch % TARGET_UPDATE == 0:
        target_net.load_state_dict(policy_net.state_dict())


epoch:  0 reward:  14.0
epoch:  1 reward:  11.0
epoch:  2 reward:  16.0
epoch:  3 reward:  25.0
epoch:  4 reward:  12.0
epoch:  5 reward:  9.0
epoch:  6 reward:  9.0
epoch:  7 reward:  31.0
epoch:  8 reward:  23.0
epoch:  9 reward:  15.0
epoch:  10 reward:  31.0
epoch:  11 reward:  31.0
epoch:  12 reward:  22.0
epoch:  13 reward:  18.0
epoch:  14 reward:  16.0
epoch:  15 reward:  47.0
epoch:  16 reward:  11.0
epoch:  17 reward:  15.0
epoch:  18 reward:  12.0
epoch:  19 reward:  12.0
epoch:  20 reward:  10.0
epoch:  21 reward:  28.0
epoch:  22 reward:  29.0
epoch:  23 reward:  29.0
epoch:  24 reward:  11.0
epoch:  25 reward:  16.0
epoch:  26 reward:  31.0
epoch:  27 reward:  12.0
epoch:  28 reward:  19.0
epoch:  29 reward:  12.0
epoch:  30 reward:  24.0
epoch:  31 reward:  16.0
epoch:  32 reward:  36.0
epoch:  33 reward:  21.0
epoch:  34 reward:  12.0
epoch:  35 reward:  42.0
epoch:  36 reward:  15.0
epoch:  37 reward:  15.0
epoch:  38 reward:  20.0
epoch:  39 reward:  15.0
epoch:  40 r