# Deep Deterministic Policy Gradients (DDPG)
---
In this notebook, we train a DDPG agent on OpenAI Gym's Pendulum-v0 environment.

### 1. Import the Necessary Packages

In [1]:
import gym
import random
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

from ddpg_agent import Agent

In [2]:
# Enable IPython's autoreload extension in mode 2, so edits to the imported
# local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import os
import sys

# Make the parent of the current working directory importable; the later
# `from src.ac_agent import ...` cell relies on it (assumed to be the repo root).
# A notebook has no `__file__`, so the path is derived from the working
# directory explicitly rather than through the quoted "__file__" placeholder.
repo_path = os.path.dirname(os.getcwd())

# Guard the append so re-running this cell does not stack duplicate entries.
if repo_path not in sys.path:
    sys.path.append(repo_path)

In [4]:
# Notebook-wide settings.
# rnd_seed: seed passed to env.seed(), OUNoise and AgentDDPG further down.
rnd_seed = 123
# num_agents: first argument given to OUNoise further down.
num_agents = 1

### 2. Instantiate the Environment and Agent

In [5]:
# Create the Pendulum environment and seed it with the notebook-wide seed.
env = gym.make('Pendulum-v0')
env.seed(rnd_seed)

# Read the problem dimensions from the environment's Box spaces instead of
# hard-coding them, so they stay correct if the environment id is changed.
# For Pendulum-v0 these evaluate to 3 and 1 respectively.
state_size = env.observation_space.shape[0]
action_size = env.action_space.shape[0]

In [6]:
# Show the action space; the repr lists its bounds, shape and dtype.
env.action_space

Box(-2.0, 2.0, (1,), float32)

In [7]:
# Show the observation space; the repr lists its bounds, shape and dtype.
env.observation_space

Box(-8.0, 8.0, (3,), float32)

In [8]:
# Reference agent imported from the ddpg_agent module, sized from the
# variables defined alongside the environment rather than repeated literals.
# NOTE(review): this agent is seeded with 2 while the rest of the notebook
# uses rnd_seed; the value is kept unchanged here -- confirm it is intended.
agent = Agent(state_size=state_size, action_size=action_size, random_seed=2)

In [9]:
# Display the layer structure of the reference agent's local actor network.
agent.actor_local

Actor(
  (fc1): Linear(in_features=3, out_features=400, bias=True)
  (fc2): Linear(in_features=400, out_features=300, bias=True)
  (fc3): Linear(in_features=300, out_features=1, bias=True)
)

In [10]:
# Display the layer structure of the reference agent's local critic network.
# NOTE(review): fc2 takes 401 inputs -- presumably the 400 features of fcs1
# concatenated with the 1-D action; confirm in ddpg_agent.
agent.critic_local

Critic(
  (fcs1): Linear(in_features=3, out_features=400, bias=True)
  (fc2): Linear(in_features=401, out_features=300, bias=True)
  (fc3): Linear(in_features=300, out_features=1, bias=True)
)

In [11]:

# Roll out a single episode (capped at 300 steps) with the reference agent,
# handing every transition to agent.step(). The globals `state`, `action` and
# `next_state` are deliberately left in scope: later cells inspect them.
score = 0
t_step = 0  # number of environment steps taken; stays 0 only if the loop never runs
state = env.reset()
for t_step in range(1, 300 + 1):
    action = agent.act(state)
    # The fourth return value gets a real name: a top-level `_` would shadow
    # IPython's last-output variable and previously doubled as the loop index.
    next_state, reward, done, info = env.step(action)
    agent.step(state, action, reward, next_state, done)
    state = next_state
    score += reward
    if done:
        break

In [12]:
# Last action returned by agent.act() in the rollout above.
action

array([-0.07023147], dtype=float32)

In [13]:
# Last observation returned by env.step() in the rollout above.
next_state

array([-0.30570887,  0.95212504,  4.91803974])

In [14]:
from src.ac_agent import AgentDDPG, GaussianProcess, OUNoise

# Noise process handed to the agent through its `noise=` argument.
ou_noise = OUNoise(num_agents, rnd_seed)

# Agent from src.ac_agent, configured with one keyword per line.
# NOTE(review): the two-element tuples presumably hold (actor, critic)
# settings -- confirm against the AgentDDPG signature.
ddpg_agent = AgentDDPG(
    # problem dimensions
    state_size=state_size,
    action_size=action_size,
    action_dtype='float',
    # network shapes
    actor_hidden_layers=(400, 300),
    critic_hidden_layers=(400, 300),
    # optimisation
    gamma=0.99,
    learning_rates=(1e-4, 1e-3),
    weight_decay=(0, 1e-2),
    grad_clipping=(False, 1.),
    soft_upd_param=1e-3,
    # replay / update schedule
    batch_size=128,
    buffer_size=int(1e5),
    update_every=1,
    # exploration and reproducibility
    noise=ou_noise,
    seed=rnd_seed,
)

In [15]:
# `state` still holds the final observation from the rollout above; bind it
# to a second name (same array object) for the tensor conversions below.
state_numpy = state
state_numpy

array([-0.30570887,  0.95212504,  4.91803974])

In [16]:
# Variant 1: convert the observation to a float tensor without a batch axis
# (size [3]).
state_torch_1d = torch.from_numpy(state_numpy).float()
state_torch_1d.size()

torch.Size([3])

In [17]:
# Variant 2: same conversion, then unsqueeze(0) adds a leading batch axis on
# the torch side (size [1, 3]).
state_torch_2d = torch.from_numpy(state_numpy).float().unsqueeze(0)
state_torch_2d.size()

torch.Size([1, 3])

In [18]:
# Variant 3: add the batch axis on the NumPy side with reshape(1, -1) before
# converting; the resulting size is again [1, 3].
state_torch = torch.from_numpy(state.reshape(1,-1)).float()
state_torch.size()

torch.Size([1, 3])

In [19]:
# Roll out a single episode (capped at 300 steps) with the AgentDDPG instance.
# Only act() is exercised: the observation is passed as-is (no reshape to a
# batch of one) and no transition is handed back to the agent.
score = 0
t_step = 0  # number of environment steps taken; stays 0 only if the loop never runs
state = env.reset()
for t_step in range(1, 300 + 1):
    action = ddpg_agent.act(state)
    # The fourth return value gets a real name: a top-level `_` would shadow
    # IPython's last-output variable and previously doubled as the loop index.
    next_state, reward, done, info = env.step(action)
    # NOTE(review): the step() call is disabled in the original notebook;
    # re-enable once the agent should store/learn from transitions:
    # ddpg_agent.step(state, action, reward, next_state, done)
    state = next_state
    score += reward
    if done:
        break