Modified from: [sweetice/Deep-reinforcement-learning-with-pytorch](https://github.com/sweetice/Deep-reinforcement-learning-with-pytorch/tree/master/Char05%20DDPG)

# Import Modules and Configurations

In [1]:
import os, sys, random
import numpy as np
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from itertools import count
from torch.distributions import Normal
from tensorboardX import SummaryWriter

# Environment Configs & Initializations

In [2]:
# OpenAI Gym environment name, e.g. 'BipedalWalker-v2' or 'Pendulum-v0', or any other continuous environment.
# Note that DDPG is sensitive to hyper-parameters:
# you should fine-tune them if you change to another environment.
ENV_NAME = "Pendulum-v0"

In [3]:
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# `.unwrapped` yields the base environment without gym's wrappers; episode length
# is therefore bounded by MAX_LENGTH_OF_TRAJECTORY in the loops that step `env`.
env = gym.make(ENV_NAME).unwrapped
# Sizes of the observation and action vectors (first axis of each space's shape).
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
# Upper bound of the first action component; the actor scales its tanh output by it.
max_action = float(env.action_space.high[0])
# Small positive constant; NOTE(review): not referenced by any other visible code.
min_Val = torch.tensor(1e-7).float().to(device) # min value

# Helper Classes and Configs

In [4]:
CAPACITY = 50000 # replay buffer size (maximum number of stored transitions)
TAU = 0.005 # target smoothing coefficient used in the soft update of the target networks
LEARNING_RATE = 1e-3 # learning rate of both the actor and the critic Adam optimizers
GAMMA = 0.99 # discount factor
BATCH_SIZE = 64 # mini-batch size sampled from the replay buffer
UPDATE_ITERATION = 10 # number of gradient steps performed by one DDPG.update() call
DIRECTORY = "exp_{}/".format(ENV_NAME) # directory for TensorBoard logs and saved model weights

In [5]:
class Replay_buffer():
    '''
    Fixed-capacity store of transitions; once full, the oldest entries are
    overwritten in insertion order.

    Code based on:
    https://github.com/openai/baselines/blob/master/baselines/deepq/replay_buffer.py
    Expects tuples of (state, next_state, action, reward, done)
    '''
    def __init__(self, max_size=CAPACITY):
        '''
        Args:
            max_size: maximum number of transitions kept in the buffer.
        '''
        self.storage = []
        self.max_size = max_size
        # Index of the next slot to overwrite once the buffer is full.
        self.ptr = 0

    def push(self, data):
        '''
        Store one transition tuple.

        While the buffer is not full the tuple is appended; afterwards it
        replaces the oldest stored tuple (ring-buffer behaviour).
        '''
        if len(self.storage) == self.max_size:
            self.storage[int(self.ptr)] = data
            self.ptr = (self.ptr + 1) % self.max_size
        else:
            self.storage.append(data)

    def sample(self, batch_size):
        '''
        Draw `batch_size` transitions uniformly at random, with replacement.

        Returns:
            Tuple of arrays (states, next_states, actions, rewards, dones);
            rewards and dones are reshaped to column vectors of shape (batch_size, 1).

        Raises:
            ValueError: if the buffer is empty.
        '''
        if not self.storage:
            raise ValueError("Cannot sample from an empty replay buffer")

        ind = np.random.randint(0, len(self.storage), size=batch_size)
        batch = [self.storage[i] for i in ind]

        def column(k):
            # np.asarray avoids a copy when possible and, unlike
            # np.array(..., copy=False), does not raise on NumPy >= 2.0 when a
            # copy is unavoidable (e.g. for Python scalars such as the reward).
            return np.array([np.asarray(transition[k]) for transition in batch])

        return (
            column(0),
            column(1),
            column(2),
            column(3).reshape(-1, 1),
            column(4).reshape(-1, 1),
        )
class Actor(nn.Module):
    """Policy network mapping a batch of states to a batch of actions.

    Three fully connected layers (400 and 300 hidden units, ReLU) followed by
    a tanh scaled by ``max_action``, so every output component lies in
    ``[-max_action, max_action]``.
    """

    def __init__(self, state_dim, action_dim, max_action):
        super().__init__()

        # Attribute names double as state_dict keys, so they must stay l1/l2/l3.
        self.l1 = nn.Linear(state_dim, 400)
        self.l2 = nn.Linear(400, 300)
        self.l3 = nn.Linear(300, action_dim)

        self.max_action = max_action

    def forward(self, x):
        """Return the action for state batch ``x``."""
        hidden = torch.relu(self.l1(x))
        hidden = torch.relu(self.l2(hidden))
        squashed = torch.tanh(self.l3(hidden))
        return self.max_action * squashed
class Critic(nn.Module):
    """Q-network scoring (state, action) pairs.

    The state and action batches are concatenated along dimension 1 and passed
    through three fully connected layers (400 and 300 hidden units, ReLU) to a
    single unbounded output per sample.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()

        # Attribute names double as state_dict keys, so they must stay l1/l2/l3.
        self.l1 = nn.Linear(state_dim + action_dim, 400)
        self.l2 = nn.Linear(400, 300)
        self.l3 = nn.Linear(300, 1)

    def forward(self, x, u):
        """Return the Q-value for state batch ``x`` and action batch ``u``."""
        state_action = torch.cat((x, u), dim=1)
        hidden = torch.relu(self.l1(state_action))
        hidden = torch.relu(self.l2(hidden))
        return self.l3(hidden)
class DDPG(object):
    """Deep Deterministic Policy Gradient agent.

    Holds an actor and a critic, a target copy of each, their Adam optimizers,
    a replay buffer and a TensorBoard writer that logs to ``DIRECTORY``.
    """

    def __init__(self, state_dim, action_dim, max_action):
        """
        Args:
            state_dim: size of the state vector fed to the networks.
            action_dim: size of the action vector produced by the actor.
            max_action: scale applied to the actor's tanh output.
        """
        self.actor = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = optim.Adam(self.actor.parameters(), LEARNING_RATE)

        self.critic = Critic(state_dim, action_dim).to(device)
        self.critic_target = Critic(state_dim, action_dim).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = optim.Adam(self.critic.parameters(), LEARNING_RATE)
        self.replay_buffer = Replay_buffer()
        self.writer = SummaryWriter(DIRECTORY)
        # Global-step counters used as the x-axis of the logged losses.
        self.num_critic_update_iteration = 0
        self.num_actor_update_iteration = 0
        self.num_training = 0

    def select_action(self, state):
        """Return the actor's (noise-free) action for a single state as a flat numpy array."""
        state = torch.FloatTensor(state.reshape(1, -1)).to(device)
        # Pure inference: no autograd graph is needed.
        with torch.no_grad():
            return self.actor(state).cpu().numpy().flatten()

    @staticmethod
    def _soft_update(source, target):
        """Move ``target``'s parameters towards ``source``'s: t <- TAU * s + (1 - TAU) * t."""
        for param, target_param in zip(source.parameters(), target.parameters()):
            target_param.data.copy_(TAU * param.data + (1 - TAU) * target_param.data)

    def update(self):
        """Run ``UPDATE_ITERATION`` gradient steps on mini-batches from the replay buffer.

        Each iteration updates the critic on the one-step TD target, updates
        the actor to maximise the critic's value of its actions, and then
        soft-updates both target networks.
        """
        for it in range(UPDATE_ITERATION):
            # Sample replay buffer
            x, y, u, r, d = self.replay_buffer.sample(BATCH_SIZE)
            state = torch.FloatTensor(x).to(device)
            action = torch.FloatTensor(u).to(device)
            next_state = torch.FloatTensor(y).to(device)
            done = torch.FloatTensor(d).to(device)
            reward = torch.FloatTensor(r).to(device)

            # Compute the target Q value; it is a fixed regression target, so
            # no gradient may flow into the target networks.
            with torch.no_grad():
                target_Q = self.critic_target(next_state, self.actor_target(next_state))
                target_Q = reward + (1 - done) * GAMMA * target_Q

            # Get current Q estimate
            current_Q = self.critic(state, action)

            # Compute critic loss
            critic_loss = F.mse_loss(current_Q, target_Q)
            self.writer.add_scalar('Loss/critic_loss', critic_loss.item(),
                                   global_step=self.num_critic_update_iteration)
            # Optimize the critic
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # Compute actor loss: maximise Q(s, actor(s)) by minimising its negative
            actor_loss = -self.critic(state, self.actor(state)).mean()
            self.writer.add_scalar('Loss/actor_loss', actor_loss.item(),
                                   global_step=self.num_actor_update_iteration)

            # Optimize the actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Update the frozen target models
            self._soft_update(self.critic, self.critic_target)
            self._soft_update(self.actor, self.actor_target)

            self.num_actor_update_iteration += 1
            self.num_critic_update_iteration += 1

    def save(self):
        """Write the actor and critic weights to ``DIRECTORY``."""
        # Make sure the target directory exists even if it was removed after
        # the writer was created.
        os.makedirs(DIRECTORY, exist_ok=True)
        torch.save(self.actor.state_dict(), DIRECTORY + 'actor.pth')
        torch.save(self.critic.state_dict(), DIRECTORY + 'critic.pth')

    def load(self):
        """Load the actor and critic weights from ``DIRECTORY`` and sync the targets."""
        # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only host.
        self.actor.load_state_dict(torch.load(DIRECTORY + 'actor.pth', map_location=device))
        self.critic.load_state_dict(torch.load(DIRECTORY + 'critic.pth', map_location=device))
        # Only the online networks are saved; without this the target networks
        # would keep their random initial weights after loading.
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())
        print("====================================")
        print("model has been loaded...")
        print("====================================")

# Run Training

In [6]:
RENDER = True # show UI or not (calls env.render() after every step)
LOG_INTERVAL = 50 # save the model every LOG_INTERVAL episodes
LOAD = False # load saved model weights before training
EXPLORATION_NOISE = 0.1 # standard deviation of the Gaussian noise added to each action
MAX_EPISODE = 1500 # number of training episodes
MAX_LENGTH_OF_TRAJECTORY = 2000 # step index at which an episode is cut off
PRINT_LOG = 5 # print the episode return every PRINT_LOG episodes

In [7]:
# Train the agent: collect transitions with a noisy policy, then update the
# networks at the end of each episode once the replay buffer is (almost) full.
agent = DDPG(state_dim, action_dim, max_action)
ep_r = 0  # return accumulated over the current episode
print("====================================")
print("Collection Experience...")
print("====================================")
if LOAD:
    agent.load()
for i in range(MAX_EPISODE):
    state = env.reset()
    for t in count():
        action = agent.select_action(state)
        # Gaussian exploration noise, clipped back into the valid action range.
        action = (action + np.random.normal(0, EXPLORATION_NOISE, size=env.action_space.shape[0])).clip(
            env.action_space.low, env.action_space.high)
        next_state, reward, done, info = env.step(action)
        ep_r += reward
        if RENDER:
            env.render()
        # Builtin float: the `np.float` alias was removed in NumPy 1.24.
        agent.replay_buffer.push((state, next_state, action, reward, float(done)))
        state = next_state
        # The episode ends on a terminal state or when the step cap is hit.
        if done or t >= MAX_LENGTH_OF_TRAJECTORY:
            agent.writer.add_scalar('ep_r', ep_r, global_step=i)
            if i % PRINT_LOG == 0:
                print("Ep_i \t{}, the ep_r is \t{:0.2f}, the step is \t{}".format(i, ep_r, t))
            ep_r = 0
            break

    if i % LOG_INTERVAL == 0:
        agent.save()
    # No gradient step is taken until the buffer holds at least CAPACITY - 1 transitions.
    if len(agent.replay_buffer.storage) >= CAPACITY-1:
        agent.update()

Collection Experience...
Ep_i 	0, the ep_r is 	-11462.75, the step is 	2000
Ep_i 	5, the ep_r is 	-12378.22, the step is 	2000
Ep_i 	10, the ep_r is 	-13139.87, the step is 	2000
Ep_i 	15, the ep_r is 	-12236.66, the step is 	2000
Ep_i 	20, the ep_r is 	-11595.00, the step is 	2000
Ep_i 	25, the ep_r is 	-19205.06, the step is 	2000
Ep_i 	30, the ep_r is 	-17465.96, the step is 	2000
Ep_i 	35, the ep_r is 	-15953.73, the step is 	2000
Ep_i 	40, the ep_r is 	-16286.34, the step is 	2000
Ep_i 	45, the ep_r is 	-15965.87, the step is 	2000
Ep_i 	50, the ep_r is 	-13854.60, the step is 	2000
Ep_i 	55, the ep_r is 	-13083.17, the step is 	2000
Ep_i 	60, the ep_r is 	-12491.64, the step is 	2000
Ep_i 	65, the ep_r is 	-12293.70, the step is 	2000
Ep_i 	70, the ep_r is 	-11611.28, the step is 	2000
Ep_i 	75, the ep_r is 	-16273.28, the step is 	2000
Ep_i 	80, the ep_r is 	-13507.06, the step is 	2000
Ep_i 	85, the ep_r is 	-14492.36, the step is 	2000
Ep_i 	90, the ep_r is 	-13787.16, the ste

Ep_i 	785, the ep_r is 	-15685.01, the step is 	2000
Ep_i 	790, the ep_r is 	-15550.21, the step is 	2000
Ep_i 	795, the ep_r is 	-127.95, the step is 	2000
Ep_i 	800, the ep_r is 	-230.00, the step is 	2000
Ep_i 	805, the ep_r is 	-15132.14, the step is 	2000
Ep_i 	810, the ep_r is 	-485.54, the step is 	2000
Ep_i 	815, the ep_r is 	-15080.91, the step is 	2000
Ep_i 	820, the ep_r is 	-126.36, the step is 	2000
Ep_i 	825, the ep_r is 	-120.82, the step is 	2000
Ep_i 	830, the ep_r is 	-15069.48, the step is 	2000
Ep_i 	835, the ep_r is 	-15084.35, the step is 	2000
Ep_i 	840, the ep_r is 	-122.90, the step is 	2000
Ep_i 	845, the ep_r is 	-124.83, the step is 	2000
Ep_i 	850, the ep_r is 	-235.47, the step is 	2000
Ep_i 	855, the ep_r is 	-123.36, the step is 	2000
Ep_i 	860, the ep_r is 	-230.44, the step is 	2000
Ep_i 	865, the ep_r is 	-245.62, the step is 	2000
Ep_i 	870, the ep_r is 	-124.38, the step is 	2000
Ep_i 	875, the ep_r is 	-15119.53, the step is 	2000
Ep_i 	880, the ep

# Test Trained Model

In [8]:
TEST_ITERATION = 10 # number of evaluation episodes
MAX_LENGTH_OF_TRAJECTORY = 2000 # step index at which an episode is cut off

In [9]:
# Evaluate the saved policy: build a fresh agent, load its weights and run
# TEST_ITERATION rendered episodes without exploration noise.
agent = DDPG(state_dim, action_dim, max_action)
ep_r = 0
agent.load()
for i in range(TEST_ITERATION):
    state = env.reset()
    for t in count():
        action = agent.select_action(state)
        next_state, reward, done, info = env.step(np.float32(action))
        ep_r += reward
        env.render()
        # Keep stepping while the episode is neither terminal nor at the step cap.
        if not done and t < MAX_LENGTH_OF_TRAJECTORY:
            state = next_state
            continue
        print("Ep_i \t{}, the ep_r is \t{:0.2f}, the step is \t{}".format(i, ep_r, t))
        ep_r = 0
        break

model has been loaded...
Ep_i 	0, the ep_r is 	-10.99, the step is 	2000
Ep_i 	1, the ep_r is 	-508.22, the step is 	2000
Ep_i 	2, the ep_r is 	-334.91, the step is 	2000
Ep_i 	3, the ep_r is 	-246.38, the step is 	2000
Ep_i 	4, the ep_r is 	-604.13, the step is 	2000
Ep_i 	5, the ep_r is 	-130.22, the step is 	2000
Ep_i 	6, the ep_r is 	-441.71, the step is 	2000
Ep_i 	7, the ep_r is 	-129.36, the step is 	2000
Ep_i 	8, the ep_r is 	-133.75, the step is 	2000
Ep_i 	9, the ep_r is 	-132.05, the step is 	2000
