# DDPG: Deep Deterministic Policy Gradient

## 1. Setup environment

In [1]:
import gym
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from collections import deque 
from itertools import count
from PIL import Image
from copy import deepcopy
import time
import cartenv

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

# Build the continuous-action CartPole environment from the local `cartenv`
# module; `env` is the single environment instance shared by every later cell.
env = cartenv.ContinuousCartPoleEnv()

# Reset once just to look at what a raw observation looks like.
state = env.reset()
print("initial state: ", state)

# Draw one random action to see the action format (array, not a scalar).
action = env.action_space.sample()
print("sample action: ", action)

initial state:  [ 0.03352889  0.0183511  -0.04621113 -0.00925396]
sample action:  [0.68621486]


In [2]:
# Show the action and observation space descriptors.
print(env.action_space)
print(env.observation_space)

# Dimensions of the action and state vectors; they size the input/output
# layers of the policy and Q networks built further down.
n_action = env.action_space.shape[0]
n_state = env.observation_space.shape[0]
print("#state: ", n_state)
print("#action: ", n_action)

# Per-dimension bounds of the observations.
print(env.observation_space.high)
print(env.observation_space.low)

# Per-dimension bounds of the actions; high[0] / low[0] are used later to
# scale and clamp the policy output.
print(env.action_space.high)
print(env.action_space.low)

Box(1,)
Box(4,)
#state:  4
#action:  1
[4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]
[-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38]
[1.]
[-1.]


## 2. Experience Pool

In [3]:
# One stored transition: (s, a, r, s', terminal flag).
Experience = namedtuple(
    'Experience',
    ('state', 'action', 'reward', 'next_state', 'terminal'),
)


class ReplayMemory:
    """Bounded FIFO buffer of ``Experience`` transitions.

    Once ``capacity`` entries are stored, every new entry evicts the oldest
    one (behaviour of ``deque(maxlen=...)``).
    """

    def __init__(self, capacity):
        """Create an empty buffer that keeps at most ``capacity`` entries."""
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        """Wrap ``args`` in an ``Experience`` and append it to the buffer.

        ``args`` must supply the five ``Experience`` fields in order.
        """
        transition = Experience(*args)
        self.memory.append(transition)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored entries picked at random.

        Raises ``ValueError`` (from ``random.sample``) when fewer than
        ``batch_size`` entries are stored.
        """
        return random.sample(self.memory, k=batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)


# Shared experience pool used by the exploration and optimisation code.
experience_pool = ReplayMemory(10000000)

## 3. Hyperparameters


In [4]:
# --- episode schedule ---
EPOCHS = 6001              # number of training episodes run by train_loop
EPOCH_STEPS = 200          # maximum environment steps per episode

# --- replay sampling ---
BATCH_SIZE = 128           # transitions drawn per optimisation step
WARM_UP_SIZE = BATCH_SIZE  # updates start once the pool holds more than this

# --- learning ---
GAMMA = 0.99               # discount applied to the bootstrapped next Q-value
LEARN_RATE = 0.001         # step size shared by both SGD optimizers
UPDATE_WEIGHT = 0.999      # share of the old target weights kept by copy_net

# --- exploration ---
EXPLORE_NOISE = 0.05       # std-dev of the Gaussian noise added to actions

## 4. Policy-Network & Q-Network

In [5]:
# Actor: state -> action. The final tanh keeps every action component in
# [-1, 1]; callers rescale it by the environment's action bound.
policy_net = nn.Sequential(
    nn.Linear(n_state, 100),
    nn.ReLU(),
    nn.Linear(100, n_action),
    nn.Tanh(),
)

# Critic: concatenated (state, action) -> one scalar Q-value.
q_net = nn.Sequential(
    nn.Linear(n_state + n_action, 100),
    nn.ReLU(),
    nn.Linear(100, 1),
)

# Target networks start as exact, independent copies of the online networks.
target_p_net, target_q_net = deepcopy(policy_net), deepcopy(q_net)

def enable_gradient(network):
    """Turn gradient tracking on for every parameter of ``network``."""
    for param in network.parameters():
        param.requires_grad_(True)

def disable_gradient(network):
    """Turn gradient tracking off for every parameter of ``network``."""
    for param in network.parameters():
        param.requires_grad_(False)

# The target networks are never handed to an optimizer; they are only changed
# by copy_net's in-place blending, so their parameters do not need gradients.
disable_gradient(target_p_net)
disable_gradient(target_q_net)

def copy_net(source_net, target_net, weight=None):
    """Soft-update ``target_net`` toward ``source_net`` in place.

    Every target parameter is replaced by
    ``weight * target + (1 - weight) * source`` (Polyak averaging).

    Args:
        source_net: network whose parameters are blended in (left unchanged).
        target_net: network updated in place; its ``parameters()`` must line
            up one-to-one (order and shape) with those of ``source_net``.
        weight: share of the old target value that is kept. ``None`` (the
            default) uses the module-level ``UPDATE_WEIGHT``; ``0.0`` turns
            the call into a hard copy, ``1.0`` leaves the target untouched.

    Returns:
        None.
    """
    if weight is None:
        weight = UPDATE_WEIGHT
    # no_grad: the blend is bookkeeping, not part of any autograd graph, and
    # it allows in-place edits of the parameter tensors themselves.
    with torch.no_grad():
        for p, p_targ in zip(source_net.parameters(), target_net.parameters()):
            p_targ.mul_(weight)
            p_targ.add_((1 - weight) * p)

## 5. Exploration

In [6]:
def policy_action(state):
    """Return the deterministic (noise-free) action for ``state``.

    ``state`` is a tensor fed directly to ``policy_net``. The network output
    is multiplied by the first component of the action space's upper bound.
    Computed without gradient tracking.
    """
    with torch.no_grad():
        scale = env.action_space.high[0]
        return scale * policy_net(state)

def explore_action(state):
    """Return a noisy, clamped action for ``state`` used while exploring.

    The scaled policy output serves as the mean of a Gaussian with standard
    deviation ``EXPLORE_NOISE``; the sample is then clamped to the first
    component of the action space's lower/upper bounds. Computed without
    gradient tracking.
    """
    with torch.no_grad():
        mean_action = env.action_space.high[0] * policy_net(state)
        noisy_action = torch.normal(mean_action, EXPLORE_NOISE)
        return torch.clamp(
            noisy_action,
            min=env.action_space.low[0],
            max=env.action_space.high[0],
        )

def target_action(state):
    """Return the target policy's action for ``state``, scaled by the first
    component of the action space's upper bound."""
    bound = env.action_space.high[0]
    return bound * target_p_net(state)

def explore_one_step(state):
    """Take one exploratory environment step and record it in the pool.

    Args:
        state: current state tensor (s).

    Returns:
        ``(done, next_state, r)``: the environment's done flag, the next
        state as a float tensor, and the raw reward as returned by
        ``env.step``.
    """
    action = explore_action(state)  # a
    # .item() hands the environment a plain Python number.
    observation, r, done, _info = env.step(action.item())

    next_state = torch.tensor(observation, dtype=torch.float)  # s'
    reward = torch.tensor(r, dtype=torch.float)  # r
    terminal = torch.tensor(float(done), dtype=torch.float)  # t: 1.0 if done

    # Everything stored is a tensor: (s, a, r, s', t).
    experience_pool.push(state, action, reward, next_state, terminal)

    return done, next_state, r

## 6. Optimize



In [7]:
# Separate plain-SGD optimizers for the actor (policy_net) and the critic
# (q_net), both stepping with LEARN_RATE.
# Alternative kept for reference: optim.Adam(policy_net.parameters(), lr=0.001)
optimizer_p = optim.SGD(policy_net.parameters(), lr=LEARN_RATE)
optimizer_q = optim.SGD(q_net.parameters(), lr=LEARN_RATE)
# Critic regression loss: mean squared error between predicted and target Q.
loss_fn = torch.nn.MSELoss()

def sample_batch():
    """Draw ``BATCH_SIZE`` transitions from the pool and stack them by field.

    Returns:
        ``(state, action, reward, next_state, terminal, state_action)``
        batch tensors, where ``state_action`` is ``state`` and ``action``
        concatenated along dim 1 (the critic's input).
    """
    transitions = experience_pool.sample(BATCH_SIZE)
    # zip(*...) turns a list of Experience rows into one tuple per field.
    columns = Experience(*zip(*transitions))

    (state_batch,
     action_batch,
     reward_batch,
     next_state_batch,
     terminal_batch) = (torch.stack(column) for column in columns)

    state_action_batch = torch.cat((state_batch, action_batch), dim=1)
    return (
        state_batch,
        action_batch,
        reward_batch,
        next_state_batch,
        terminal_batch,
        state_action_batch,
    )

def update_q_net(r, ns, d, sa):
    """Run one gradient step on the critic (``q_net``) toward the TD target.

    The target is ``r + GAMMA * Q_target(ns, target_action(ns)) * (1 - d)``.

    Args:
        r: batch of rewards, one value per transition.
        ns: batch of next states, one row per transition.
        d: batch of terminal flags (1.0 where the episode ended, else 0.0).
        sa: batch of concatenated (state, action) rows fed to the critic.

    Returns:
        None; ``q_net`` is updated in place through ``optimizer_q``.
    """
    # squeeze(-1) drops only the trailing size-1 output dim, so the result
    # stays 1-D (matching r and d) even for a batch of one.
    curr_q_value = q_net(sa).squeeze(-1)

    # The target is a fixed regression label: build it without autograd.
    with torch.no_grad():
        # Use the shared helper so the target action carries the same
        # action-bound scaling as the actions stored in the replay pool.
        next_action = target_action(ns)
        next_sa = torch.cat((ns, next_action), dim=1)
        target_next_q_value = target_q_net(next_sa).squeeze(-1)
        # (1 - d) removes the bootstrap term for terminal transitions.
        target_q_value = r + GAMMA * target_next_q_value * (1 - d)

    # mean square loss
    loss = loss_fn(curr_q_value, target_q_value)

    # Optimize the model
    optimizer_q.zero_grad()
    loss.backward()
    optimizer_q.step()

def update_policy_net(s):
    """Run one gradient-ascent step on the actor (``policy_net``).

    The loss is the negated mean of ``q_net(s, action(s))``, so minimising
    it pushes the policy toward actions the critic scores higher.

    Args:
        s: batch of states, one row per transition.

    Returns:
        None; ``policy_net`` is updated in place through ``optimizer_p``.
    """
    # Scale the tanh output by the action bound so the critic is queried on
    # the same action scale it was trained on (see explore_action). A plain
    # Python float on the right keeps the product an autograd-tracked tensor.
    act_limit = float(env.action_space.high[0])
    curr_action = policy_net(s) * act_limit
    curr_sa = torch.cat((s, curr_action), dim=1)

    # Freeze the critic so backward() only accumulates actor gradients; the
    # finally clause guarantees it is unfrozen even if the step fails.
    disable_gradient(q_net)
    try:
        loss = -torch.mean(q_net(curr_sa))
        # Optimize the model
        optimizer_p.zero_grad()
        loss.backward()
        optimizer_p.step()
    finally:
        enable_gradient(q_net)

## 7. Train Loop

In [8]:
def evaluate():
    """Render one full episode driven by the noise-free policy."""
    state = torch.tensor(env.reset(), dtype=torch.float)
    done = False
    while not done:  # one episode
        env.render()
        chosen = policy_action(state).item()
        observation, _, done, _ = env.step(chosen)
        state = torch.tensor(observation, dtype=torch.float)

def train_loop():
    """Train for ``EPOCHS`` episodes of at most ``EPOCH_STEPS`` steps each.

    Every step adds one exploratory transition to the pool and, once the
    pool holds more than ``WARM_UP_SIZE`` transitions, performs one critic
    update, one actor update and a soft update of both target networks.
    Every 50th episode the current policy is rendered via ``evaluate`` and
    the episode's accumulated training reward is printed.
    """
    for epoch in range(EPOCHS):
        episode_reward = 0
        # Initialize the environment and state
        state = torch.tensor(env.reset(), dtype=torch.float)  # s

        for _ in range(EPOCH_STEPS):
            # generate experience
            done, state, step_reward = explore_one_step(state)
            episode_reward += step_reward

            # Perform one step of the optimization
            if len(experience_pool) > WARM_UP_SIZE:
                s, _, r, ns, d, sa = sample_batch()
                update_q_net(r, ns, d, sa)
                update_policy_net(s)
                copy_net(policy_net, target_p_net)
                copy_net(q_net, target_q_net)

            if done:
                break  # one episode

        if epoch % 50 == 0:
            evaluate()
            print("epoch: ", epoch, "reward: ", episode_reward)

In [9]:
# Start training. With EPOCHS = 6001 this is a long run; the output recorded
# below ends in a KeyboardInterrupt, i.e. it was stopped manually.
train_loop()

epoch:  0 reward:  12.0
epoch:  50 reward:  16.0
epoch:  100 reward:  8.0
epoch:  150 reward:  7.0
epoch:  200 reward:  7.0
epoch:  250 reward:  7.0
epoch:  300 reward:  6.0
epoch:  350 reward:  7.0
epoch:  400 reward:  6.0
epoch:  450 reward:  6.0
epoch:  500 reward:  6.0
epoch:  550 reward:  7.0
epoch:  600 reward:  6.0
epoch:  650 reward:  6.0
epoch:  700 reward:  6.0
epoch:  750 reward:  6.0
epoch:  800 reward:  6.0
epoch:  850 reward:  6.0
epoch:  900 reward:  6.0
epoch:  950 reward:  6.0
epoch:  1000 reward:  6.0
epoch:  1050 reward:  6.0
epoch:  1100 reward:  6.0
epoch:  1150 reward:  18.0
epoch:  1200 reward:  27.0
epoch:  1250 reward:  44.0
epoch:  1300 reward:  65.0
epoch:  1350 reward:  41.0
epoch:  1400 reward:  76.0
epoch:  1450 reward:  74.0
epoch:  1500 reward:  166.0
epoch:  1550 reward:  200.0
epoch:  1600 reward:  200.0
epoch:  1650 reward:  200.0
epoch:  1700 reward:  200.0
epoch:  1750 reward:  200.0
epoch:  1800 reward:  200.0


KeyboardInterrupt: 

## 8. Load Saved Model

In [None]:
#torch.save(policy_net.state_dict(), 'policy.pt')
#policy_net.load_state_dict(torch.load('policy.pt'))

#evaluate()