# Deep Q Network

## 1. Setup

In [1]:
import gym
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from collections import deque 
from itertools import count
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T


# Work with the underlying CartPole environment object exposed by `.unwrapped`.
env = gym.make('CartPole-v0').unwrapped

# Draw one starting observation and show it.
state = env.reset()
print("initial state: ", state)

# Draw one random action from the action space and show it.
action = env.action_space.sample()
print("sample action: ", action)

# Problem dimensions; the network definitions below are sized from these.
n_state = env.observation_space.shape[0]
n_action = env.action_space.n
print("#state: ", n_state)
print("#action: ", n_action)



initial state:  [ 0.02665237 -0.01264466 -0.03104987  0.01376822]
sample action:  1
#state:  4
#action:  2


## 2. Replay Memory

In [2]:
# One stored transition: (s, a, r, s', terminal flag).
Experience = namedtuple(
    'Experience',
    ['state', 'action', 'reward', 'next_state', 'terminal'],
)


class ReplayMemory:
    """Bounded store of `Experience` records with random sampling."""

    def __init__(self, capacity):
        """Create an empty buffer holding at most `capacity` records."""
        # A deque with `maxlen` discards its oldest entry once it is full.
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        """Pack the positional fields into an `Experience` and store it."""
        record = Experience(*args)
        self.memory.append(record)

    def sample(self, batch_size):
        """Return a list of `batch_size` stored records picked by `random.sample`."""
        return random.sample(self.memory, k=batch_size)

    def __len__(self):
        """Number of records currently stored."""
        return len(self.memory)


# Shared replay buffer used by the exploration and optimisation code.
experience_pool = ReplayMemory(capacity=20000)

## 3. Q-Network

In [3]:
# Online Q-network: takes an n_state-sized input and emits n_action outputs.
policy_net = nn.Sequential(*[
    nn.Linear(n_state, 128),
    nn.ReLU(),
    nn.Linear(128, 128),
    nn.ReLU(),
    nn.Linear(128, n_action),
])

def nn_init(model):
    """Initialise every parameter of `model` in place.

    Tensors with two or more dimensions (weight matrices) receive
    Xavier-uniform values; tensors with fewer dimensions (bias vectors)
    are set to zero.

    The choice is made from each tensor's rank rather than from its position
    in `model.parameters()`. A positional weight/bias alternation breaks as
    soon as a layer has no bias: the following weight matrix would be zeroed,
    or a 1-D tensor would be handed to `xavier_uniform_`, which rejects it.

    Args:
        model: an `nn.Module` whose parameters are overwritten.

    Returns:
        None.
    """
    for p in model.parameters():
        if p.dim() >= 2:
            nn.init.xavier_uniform_(p)
        else:
            nn.init.zeros_(p)

# Give the online network its starting weights.
nn_init(policy_net)

# Target network: same layer layout as the online network.
target_net = nn.Sequential(*[
    nn.Linear(n_state, 128),
    nn.ReLU(),
    nn.Linear(128, 128),
    nn.ReLU(),
    nn.Linear(128, n_action),
])

# Start the target network as a copy of the online network's parameters.
target_net.load_state_dict(policy_net.state_dict())


<All keys matched successfully>

## 4. Hyperparameters


In [4]:
# --- Replay / update schedule ---
BATCH_SIZE = 32      # transitions drawn from the pool per gradient step
WARM_UP_SIZE = 200   # pool size to reach before the training loop starts
LEARN_FRQ = 5        # environment steps between gradient steps
TARGET_UPDATE = 200  # gradient steps between policy -> target copies

# --- Return discounting ---
GAMMA = 0.99

# --- Epsilon-greedy exploration ---
EPS = 0.1            # starting exploration probability
EPS_DEC = 1e-6       # subtracted from EPS each time an action is sampled

# --- Optimisation ---
optimizer = optim.Adam(policy_net.parameters(), lr=1e-3)
#optimizer = optim.SGD(policy_net.parameters(), lr=0.1)
loss_fn = nn.MSELoss()

## 5. Optimize



In [5]:
def update_policy_net():
    """Run one gradient step of Q-learning on a random mini-batch.

    Draws `BATCH_SIZE` transitions from `experience_pool` and regresses
    Q_policy(s, a) towards

        r + GAMMA * max_a' Q_target(s', a') * (1 - terminal)

    using `loss_fn`, then applies one `optimizer` step to `policy_net`.
    `target_net` is only read, never updated, here.

    Returns:
        None. `policy_net` (via `optimizer`) is updated in place.

    Raises:
        ValueError: propagated from `random.sample` when the pool holds fewer
            than `BATCH_SIZE` transitions.
    """
    experiences = experience_pool.sample(BATCH_SIZE)
    batch = Experience(*zip(*experiences))  # tuple-of-records -> record-of-tuples

    state_batch = torch.stack(batch.state)
    next_state_batch = torch.stack(batch.next_state)
    # Pin the shapes explicitly so the loss below compares like with like,
    # whether the stored tensors are 0-d scalars or 1-element vectors.
    action_batch = torch.stack(batch.action).reshape(-1, 1)  # (B, 1) gather index
    reward_batch = torch.stack(batch.reward).reshape(-1)     # (B,)
    terminal_batch = torch.stack(batch.terminal).reshape(-1).to(reward_batch.dtype)  # (B,)

    # Q(s, a) of the action actually taken. `gather` returns (B, 1); squeeze
    # it to (B,). Left as (B, 1), the loss would broadcast it against the
    # (B,) target into a (B, B) matrix and regress every prediction towards
    # every target in the batch.
    policy_q_value = policy_net(state_batch).gather(1, action_batch).squeeze(1)

    with torch.no_grad():
        # Bootstrapped value of the next state from the frozen target network.
        target_next_q_value = target_net(next_state_batch).max(dim=1).values
        # Terminal transitions contribute the immediate reward only.
        target_q_value = reward_batch + GAMMA * target_next_q_value * (1 - terminal_batch)

    # mean square loss between two (B,) vectors
    loss = loss_fn(policy_q_value, target_q_value)

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

## 6. Exploration

In [6]:
def greedy_action(state): # state is tensor
    """Return, as a Python int, the index of the largest `policy_net` output for `state`."""
    with torch.no_grad():
        q_values = policy_net(state)
    return q_values.argmax().item()

def sample_action(state):
    """Pick an action for `state` with an epsilon-greedy rule.

    The exploration probability is the global `EPS`, floored at 0.01; `EPS`
    itself is decreased by `EPS_DEC` on every call.

    Returns:
        A 1-element int64 tensor holding the chosen action index.
    """
    global EPS
    explore_prob = np.maximum(0.01, EPS)
    EPS -= EPS_DEC

    if np.random.uniform(0, 1) >= (1.0 - explore_prob):
        chosen = np.random.randint(n_action)  # explore: uniform random action
    else:
        chosen = greedy_action(state)         # exploit: best action under policy_net

    return torch.tensor([chosen], dtype=torch.int64)

def explore_one_step(state, pool):
    """Take one environment step from `state` and record it in `pool`.

    The action comes from `sample_action`; the resulting transition
    (s, a, r, s', t) is pushed to `pool` as tensors.

    Returns:
        (done, next_state, r): the done flag and raw reward returned by
        `env.step`, and the next observation as a float tensor.
    """
    action = sample_action(state)
    obs, r, done, _ = env.step(action.item())

    next_state = torch.tensor(obs, dtype=torch.float)
    transition = (
        state,
        action,
        torch.tensor(r, dtype=torch.float),
        next_state,
        torch.tensor(int(done), dtype=torch.int64),
    )
    pool.push(*transition)

    return done, next_state, r

## 7. Train Loop

In [7]:
# Fill the pool with whole episodes until it holds at least WARM_UP_SIZE transitions.
while len(experience_pool) < WARM_UP_SIZE:
    state = torch.tensor(env.reset(), dtype=torch.float)  # initialize
    done = False
    while not done:
        done, next_state, r = explore_one_step(state, experience_pool)
        state = next_state

EPOCHS = 2000
update_steps = 0  # gradient steps taken so far, across all episodes
for epoch in range(EPOCHS):
    # Fresh episode: reset the environment and the per-episode counters.
    state = torch.tensor(env.reset(), dtype=torch.float)  # s
    explore_steps = 0
    reward = 0
    done = False
    while not done:
        explore_steps += 1
        # Generate one transition and accumulate the episode return.
        done, next_state, r = explore_one_step(state, experience_pool)
        state = next_state
        reward += r

        # Only learn on every LEARN_FRQ-th environment step.
        if explore_steps % LEARN_FRQ != 0:
            continue
        update_policy_net()
        update_steps += 1
        # Periodically copy all weights and biases from policy to target network.
        if update_steps % TARGET_UPDATE == 0:
            target_net.load_state_dict(policy_net.state_dict())

    print("epoch: ", epoch, "reward: ", reward)




epoch:  0 reward:  22.0
epoch:  1 reward:  9.0
epoch:  2 reward:  10.0
epoch:  3 reward:  9.0
epoch:  4 reward:  9.0
epoch:  5 reward:  10.0
epoch:  6 reward:  9.0
epoch:  7 reward:  10.0
epoch:  8 reward:  9.0
epoch:  9 reward:  9.0
epoch:  10 reward:  9.0
epoch:  11 reward:  12.0
epoch:  12 reward:  10.0
epoch:  13 reward:  10.0
epoch:  14 reward:  9.0
epoch:  15 reward:  12.0
epoch:  16 reward:  8.0
epoch:  17 reward:  10.0
epoch:  18 reward:  9.0
epoch:  19 reward:  9.0
epoch:  20 reward:  10.0
epoch:  21 reward:  8.0
epoch:  22 reward:  12.0
epoch:  23 reward:  9.0
epoch:  24 reward:  11.0
epoch:  25 reward:  11.0
epoch:  26 reward:  9.0
epoch:  27 reward:  10.0
epoch:  28 reward:  11.0
epoch:  29 reward:  11.0
epoch:  30 reward:  9.0
epoch:  31 reward:  10.0
epoch:  32 reward:  10.0
epoch:  33 reward:  9.0
epoch:  34 reward:  9.0
epoch:  35 reward:  12.0
epoch:  36 reward:  10.0
epoch:  37 reward:  11.0
epoch:  38 reward:  11.0
epoch:  39 reward:  11.0
epoch:  40 reward:  9.0
epo