# DDPG: Deep Deterministic Policy Gradient

## 1. Setup environment

In [6]:
import gym
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from collections import deque 
from itertools import count
from PIL import Image
from copy import deepcopy
import time
import cartenv

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

# Build the continuous-action cart-pole environment from the local cartenv module.
env = cartenv.ContinuousCartPoleEnv()

# Reset once and print the initial observation.
state = env.reset()
print("initial state: ", state)

# Draw one random action from the action space to show what an action looks like.
action = env.action_space.sample()
print("sample action: ", action)

initial state:  [-0.0158464   0.00129966  0.04116796  0.03368109]
sample action:  [-0.90528506]


In [12]:
# Print the action and observation space descriptions.
print(env.action_space)
print(env.observation_space)

# Sizes of the action and state vectors, read from the first axis of each space's shape.
n_action = env.action_space.shape[0]
n_state = env.observation_space.shape[0]
print("#state: ", n_state)
print("#action: ", n_action)

# Upper and lower bounds of the observation space.
print(env.observation_space.high)
print(env.observation_space.low)

# Upper and lower bounds of the action space.
print(env.action_space.high)
print(env.action_space.low)

Box(1,)
Box(4,)
#state:  4
#action:  1
[4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]
[-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38]
[1.]
[-1.]


## 2. Experience Pool

In [13]:
# One stored transition: (s, a, r, s', t).
Experience = namedtuple(
    'Experience',
    ['state', 'action', 'reward', 'next_state', 'terminal'],
)


class ReplayMemory:
    """Bounded FIFO buffer of Experience tuples."""

    def __init__(self, capacity):
        # A deque with maxlen drops its oldest entry once capacity is reached.
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        """Build an Experience from the positional fields and store it."""
        transition = Experience(*args)
        self.memory.append(transition)

    def sample(self, batch_size):
        """Return batch_size stored experiences drawn without replacement."""
        return random.sample(self.memory, k=batch_size)

    def __len__(self):
        """Return the number of experiences currently stored."""
        return len(self.memory)


# Replay buffer shared by exploration and optimization.
experience_pool = ReplayMemory(50000)

## 3. Hyperparameters


In [26]:
EPOCHS = 1001 # number of training epochs; train_loop resets the environment once per epoch
EPOCH_STEPS = 200 # maximum exploration steps taken within one epoch
BATCH_SIZE = 32 # transitions sampled from the experience pool per optimization step
WARM_UP_SIZE = BATCH_SIZE # optimization starts only once the pool holds more than this many transitions
GAMMA = 0.99 # reward discount factor; author's note: 0.99 worked better than 0.90
EPS_GREEDY = 0.1 # probability of taking a random action in sample_action (decays over time)
EPS_DEC = 1e-6 # amount subtracted from EPS_GREEDY on every sample_action call
TARGET_UPDATE = 200 # intended number of optimization steps between target-network refreshes
EXPLORE_NOISE = 0.05 # presumably the std-dev of Gaussian exploration noise -- TODO confirm where it is consumed
UPDATE_WEIGHT = 0.99 # weight kept on the target parameters in copy_net; the source contributes 1 - UPDATE_WEIGHT

## 4. Policy-Network & Q-Network

In [30]:
# Actor: maps an n_state-wide input to n_action outputs through one hidden layer of 20 units.
policy_net = nn.Sequential(
        nn.Linear(n_state, 20),
        nn.ReLU(),
        nn.Linear(20, n_action),
        nn.Tanh()) # Tanh bounds every output to [-1, 1]

# Critic: input width is n_state + n_action (presumably state and action concatenated
# -- confirm against the caller); output is a single scalar.
q_net = nn.Sequential(
        nn.Linear(n_state + n_action, 100),
        nn.ReLU(),
        nn.Linear(100, 1))

# Target networks start out as independent deep copies of the actor and the critic.
target_p_net = deepcopy(policy_net)
target_q_net = deepcopy(q_net)

def disable_gradient(network):
    """Turn off gradient tracking for every parameter of ``network``."""
    for param in network.parameters():
        param.requires_grad_(False)

# Freeze both target networks so autograd does not track their parameters.
disable_gradient(target_p_net)
disable_gradient(target_q_net)

def copy_net(source_net, target_net, update_weight=None):
    """Blend ``source_net``'s parameters into ``target_net`` in place (soft update).

    Every target parameter ``t`` is replaced by ``w * t + (1 - w) * s``, where
    ``s`` is the matching source parameter and ``w`` is the update weight.

    Args:
        source_net: module whose parameters are read (left unchanged).
        target_net: module whose parameters are overwritten. Parameters are
            paired positionally with ``zip``, so both modules are expected to
            have the same architecture; extra parameters on either side are
            silently ignored.
        update_weight: fraction of the old target value to keep. ``None``
            (the default) uses the module-level ``UPDATE_WEIGHT``.
    """
    weight = UPDATE_WEIGHT if update_weight is None else update_weight
    # no_grad makes the in-place writes legal on parameters and keeps them
    # out of any autograd graph.
    with torch.no_grad():
        for p, p_targ in zip(source_net.parameters(), target_net.parameters()):
            p_targ.mul_(weight).add_(p, alpha=1.0 - weight)

## 5. Optimize



In [28]:
# One optimizer per trained network: `optimizer` drives the actor (policy_net),
# `q_optimizer` drives the critic (q_net). optim.Adam with the same learning
# rate is a drop-in alternative for either.
optimizer = optim.SGD(policy_net.parameters(), lr=0.001)
q_optimizer = optim.SGD(q_net.parameters(), lr=0.001)
loss_fn = torch.nn.MSELoss()


def update_policy_net():
    """Run one DDPG optimization step: critic first, then actor.

    Samples BATCH_SIZE transitions from ``experience_pool`` and

    1. fits ``q_net`` so that Q(s, a) matches
       ``r + GAMMA * Q_target(s', policy_target(s')) * (1 - terminal)``;
    2. updates ``policy_net`` to increase Q(s, policy(s)).

    The target networks are only read here; refreshing them is left to the
    caller.

    Raises:
        ValueError: from ``random.sample`` when the pool holds fewer than
            BATCH_SIZE transitions.
    """
    experiences = experience_pool.sample(BATCH_SIZE)
    batch = Experience(*zip(*experiences))  # tuple of transitions -> transition of tuples

    state_batch = torch.stack(batch.state)
    # Stored actions are treated as plain data: detach them from any graph and
    # force a float (batch, n_action) layout so they can be concatenated with
    # the states.
    action_batch = torch.stack(batch.action).detach().float().reshape(BATCH_SIZE, -1)
    reward_batch = torch.stack(batch.reward).reshape(-1)
    next_state_batch = torch.stack(batch.next_state)
    terminal_batch = torch.stack(batch.terminal).reshape(-1)

    # ---- critic update ----
    # The bootstrap target comes from the frozen target networks; terminal
    # transitions contribute only their immediate reward.
    with torch.no_grad():
        next_action = target_p_net(next_state_batch)
        next_q_value = target_q_net(
            torch.cat([next_state_batch, next_action], dim=1)).squeeze(1)
        target_q_value = reward_batch + GAMMA * next_q_value * (1 - terminal_batch)

    q_value = q_net(torch.cat([state_batch, action_batch], dim=1)).squeeze(1)
    q_loss = loss_fn(q_value, target_q_value)

    q_optimizer.zero_grad()
    q_loss.backward()
    q_optimizer.step()

    # ---- actor update ----
    # Gradient ascent on Q(s, policy(s)), written as descent on its negative.
    # This backward pass also leaves gradients on q_net; they are discarded by
    # q_optimizer.zero_grad() at the start of the next critic update.
    policy_loss = -q_net(
        torch.cat([state_batch, policy_net(state_batch)], dim=1)).mean()

    optimizer.zero_grad()
    policy_loss.backward()
    optimizer.step()

## 6. Exploration

In [None]:
def policy_action(state):
    """Return policy_net's output for ``state`` (a tensor)."""
    action = policy_net(state)
    return action

def explore_action(state):
    """Return the policy's action for ``state`` perturbed by Gaussian noise.

    The noise is zero-mean with standard deviation EXPLORE_NOISE. The result
    is clipped to the environment's action bounds, because the noise can push
    the Tanh-bounded policy output outside them.

    Args:
        state: state tensor accepted by ``policy_net``.

    Returns:
        A tensor with the shape of the policy output, detached from autograd.
    """
    # Exploration actions are data for the replay pool, not part of a loss.
    with torch.no_grad():
        noisy = torch.normal(policy_net(state), EXPLORE_NOISE)
    low = torch.as_tensor(env.action_space.low, dtype=torch.float)
    high = torch.as_tensor(env.action_space.high, dtype=torch.float)
    return torch.max(torch.min(noisy, high), low)

def target_action(state):
    """Return the target policy network's output for ``state`` (a tensor).

    ``state`` is an explicit parameter, matching ``policy_action`` and
    ``explore_action``. The previous zero-argument form read the module-level
    ``state``, which holds the raw observation from the initial ``env.reset()``
    rather than a tensor.
    """
    return target_p_net(state)
    
def sample_action(state):
    """Choose a continuous exploration action for ``state``.

    With probability ``1 - eps`` the action is the policy output plus Gaussian
    noise (std EXPLORE_NOISE); otherwise it is drawn at random from the
    environment's action space. ``eps`` is the current EPS_GREEDY floored at
    0.01, and EPS_GREEDY is decreased by EPS_DEC on every call.

    Args:
        state: state tensor accepted by ``policy_net``.

    Returns:
        A float tensor holding the action, clipped to the action-space bounds
        and detached from autograd.
    """
    global EPS_GREEDY
    current_eps = max(0.01, EPS_GREEDY)
    EPS_GREEDY -= EPS_DEC

    if np.random.rand() < (1.0 - current_eps):
        # Policy action with exploration noise; no graph is needed for
        # environment interaction or replay storage.
        with torch.no_grad():
            action = torch.normal(policy_net(state), EXPLORE_NOISE)
    else:
        # Random action from the environment's action space.
        action = torch.as_tensor(env.action_space.sample(), dtype=torch.float)

    # The noise can leave the valid range; clip back to the action bounds.
    low = torch.as_tensor(env.action_space.low, dtype=torch.float)
    high = torch.as_tensor(env.action_space.high, dtype=torch.float)
    return torch.max(torch.min(action.to(torch.float), high), low)

def explore_one_step(state, pool):
    """Take one exploration step from ``state`` and record it in ``pool``.

    Returns:
        (done, next_state, r): the environment's done flag, the next state as
        a float tensor, and the raw reward returned by the environment.
    """
    chosen = sample_action(state)  # a
    observation, raw_reward, finished, _info = env.step(chosen.item())

    # Convert the step results to float tensors, then store (s, a, r, s', t).
    transition = (
        state,
        chosen,
        torch.tensor(raw_reward, dtype=torch.float),  # r
        torch.tensor(observation, dtype=torch.float),  # s'
        torch.tensor(int(finished) * 1.0, dtype=torch.float),  # t
    )
    pool.push(*transition)

    return finished, transition[3], raw_reward

## 7. Train Loop

In [None]:
def evaluate(max_steps=None):
    """Render one episode driven by the noise-free policy.

    Args:
        max_steps: optional cap on the number of environment steps. ``None``
            (the default) runs until the environment reports ``done``.
    """
    state = torch.tensor(env.reset(), dtype=torch.float)
    steps = 0
    while max_steps is None or steps < max_steps:
        env.render()  # the first iteration renders the initial state
        # The policy output is the continuous action itself. argmax would
        # return an index into the action vector (always 0 for one output).
        # .item() assumes a single action dimension, as explore_one_step does.
        with torch.no_grad():
            action = policy_net(state).item()
        next_state, _, done, _ = env.step(action)
        state = torch.tensor(next_state, dtype=torch.float)
        steps += 1
        if done:
            break  # one episode

def train_loop():
    """Train for EPOCHS epochs of at most EPOCH_STEPS exploration steps each.

    Every exploration step is pushed to ``experience_pool``. Once the pool
    holds more than WARM_UP_SIZE transitions, each step is followed by one
    ``update_policy_net()`` call and a soft update of both target networks
    through ``copy_net``. Every 50th epoch prints the accumulated reward and
    runs one rendered ``evaluate()`` episode.
    """
    for epoch in range(EPOCHS):
        reward = 0
        # Initialize the environment and state
        state = torch.tensor(env.reset(), dtype=torch.float)  # s
        for _ in range(EPOCH_STEPS):
            # generate experience
            done, next_state, r = explore_one_step(state, experience_pool)
            state = next_state
            reward += r
            # Perform one step of the optimization
            if len(experience_pool) > WARM_UP_SIZE:
                update_policy_net()
                # Let each target network track its online network. copy_net
                # blends in only a small fraction per call, so it runs after
                # every optimization step rather than every TARGET_UPDATE steps.
                copy_net(policy_net, target_p_net)
                copy_net(q_net, target_q_net)
            if done:
                break  # one episode

        if epoch % 50 == 0:
            print("epoch: ", epoch, "reward: ", reward)
            evaluate()

In [None]:
# Run the full training loop; it prints progress and renders an evaluation episode every 50 epochs.
train_loop()

## 8. Load Saved Model

In [None]:
# Uncomment the next line to write the current policy weights to 'policy-1.pt'.
#torch.save(policy_net.state_dict(), 'policy-1.pt')
# Restore the policy weights from 'policy-1.pt'.
# NOTE(review): torch.load unpickles the file -- only load checkpoints from a trusted source.
policy_net.load_state_dict(torch.load('policy-1.pt'))

# Render one episode with the restored policy.
evaluate()