# Deep Deterministic Policy Gradients (DDPG)
---
In this notebook, we train DDPG with OpenAI Gym's BipedalWalker-v2 environment.

### 1. Import the Necessary Packages

In [1]:
import gym
import math
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

In [2]:
#add parent dir to find package. Only needed for source code build, pip install doesn't need it.
import os, inspect
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(os.path.dirname(currentdir))
os.sys.path.insert(0,parentdir)

from udacityPickAndPlaceEnv import UdacityPickAndPlaceEnv
import time

env = UdacityPickAndPlaceEnv(renders=False,isDiscrete=False, maxSteps = 10000)

done = False


current_dir=/Users/jamesough/misc/model-based-RL




### 2. Instantiate the Environment and Agent

In [3]:
import gym
import random
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

from ddpg_agent import Agent

agent = Agent(state_size=9, action_size=2, random_seed=1)

#agent.actor_local.load_state_dict(torch.load('checkpoint_actor.pth'))
#agent.critic_local.load_state_dict(torch.load('checkpoint_critic.pth'))

### 3. Train the Agent with DDPG

Run the code cell below to train the agent from scratch.  Alternatively, you can skip to the next code cell to load the pre-trained weights from file.

In [5]:
states = []
actions = []
next_states = []
starting_states = []
rewards = []



def ddpg(n_episodes=10000, max_t=20, print_every=100):
    scores_deque = deque(maxlen=print_every)
    scores = []
    states = []
    for i_episode in range(1, n_episodes+1):
        state = env.reset()
        agent.reset()
        score = 0
        for t in range(max_t):
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action/2+0.5)
            states.append(state)
            actions.append(action)
            next_states.append(next_state)
        
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break 
        scores_deque.append(score)
        scores.append(score)
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)), end="")
        torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
        torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
        if i_episode % print_every == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))
            
    return scores

scores = ddpg()

fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(1, len(scores)+1), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()


Episode 17	Average Score: -7.22

KeyboardInterrupt: 

In [6]:
states = []
actions = []
next_states = []
starting_states = []
rewards = []

def ddpg(n_episodes=10000, max_t=20, print_every=100):
    scores_deque = deque(maxlen=print_every)
    scores = []
    for i_episode in range(1, n_episodes+1):
        state = env.reset()
        worldmodelenv.reset(state)
        agent.reset()
        score = 0
        for t in range(max_t):
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action/2+0.5)
            
            states.append(state)
            actions.append(action)
            next_states.append(next_state)
        
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break 
        scores_deque.append(score)
        scores.append(score)
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)), end="")
        torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
        torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
        if i_episode % print_every == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))
            
    return scores

scores = ddpg()

fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(1, len(scores)+1), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()


NameError: name 'worldmodelenv' is not defined

In [7]:
states = []
actions = []
next_states = []
starting_states = []
rewards = []


for i_episode in range(1, 100):
    state = env.reset()
    starting_states.append(state)
    agent.reset()
    score = 0
    action = np.random.uniform(0,1,2)
    states.append([])
    actions.append([])
    next_states.append([])
    starting_states.append([])
    rewards.append([])
    for t in range(20):
        next_state, reward, done, _ = env.step(action)
        states[-1].append(state)
        actions[-1].append(action)
        next_states[-1].append(next_state)
        rewards[-1].append(reward)
        state = next_state
        score += reward
        if done:
            break

In [29]:
                            
class WorldModel(nn.Module):
    "Simple multi-layer perceptron policy, no internal state"
    def __init__(self, observation_space, action_space, starting_states):
        self.starting_states=starting_states
        super(WorldModel, self).__init__()
        self.weights_dense1 = nn.Linear(observation_space.shape[0]*2+2, 256) 
        self.weights_dense2 = nn.Linear(256, 128) 
        self.weights_dense_final = nn.Linear(128, observation_space.shape[0]+1) 
        self.weights_dense_final_h = nn.Linear(128, observation_space.shape[0]) 

        torch.nn.init.xavier_uniform_(self.weights_dense1.weight)
        torch.nn.init.xavier_uniform_(self.weights_dense2.weight)
        torch.nn.init.xavier_uniform_(self.weights_dense_final.weight)
        torch.nn.init.xavier_uniform_(self.weights_dense_final_h.weight)
        
        self.weights_dense1.bias.data.fill_(0.01)
        self.weights_dense2.bias.data.fill_(0.01)
        self.weights_dense_final.bias.data.fill_(0.01)
        self.weights_dense_final_h.bias.data.fill_(0.01)

    def forward(self, x,h):
        if len(x.size())==1:
            dim=0
        else: dim=1
        x = torch.cat([x, h], dim = dim)
        x = f.relu(self.weights_dense1(x))
        x = f.relu(self.weights_dense2(x))
        o = self.weights_dense_final(x)
        h = self.weights_dense_final_h(x)
        return o,h

from torch.utils import data
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as f

class Dataset(data.Dataset):
  'Characterizes a dataset for PyTorch'
  def __init__(self, states, actions, next_states, rewards):
        'Initialization'
        self.states=states
        self.actions=actions
        self.next_states=next_states
        self.rewards=rewards

  def __len__(self):
        'Denotes the total number of samples'
        return len(self.states)

  def __getitem__(self, index):
        'Generates one sample of data'
        # Select sample
        s = np.array(self.states[index])
        a = np.array(self.actions[index])
        n_s = np.array(self.next_states[index])
        r = np.array(self.rewards[index])
        r = r.reshape(r.shape[0],1)
        return np.concatenate([s,a], axis=1),np.concatenate([n_s,r], axis=1)

def train_world_model(world_model, states, actions, next_states, rewards):
    dataset = Dataset(states, actions, next_states, rewards)
    dataloader = data.DataLoader(dataset, batch_size = 128)
    
    optimizer = optim.Adam(world_model.parameters(), lr=1e-3)
    mse = nn.MSELoss()
    for ne in range(100):
        h = None
        for obs, next_state in dataloader:

            obs = Variable(obs).float()[:,0]
            next_state = Variable(next_state).float()[:,0]

            world_model.zero_grad()
            optimizer.zero_grad()
            
            
            h = obs[:,:-2]*0
            for t in range(20):

                next_state_pred,h = world_model(obs,h)
                loss = mse(next_state_pred, next_state.float())

            loss.backward(retain_graph=True)
            optimizer.step()

            print("Epoch: {}, Total loss: {}".format(ne, loss))
    return world_model
                      

In [30]:
world_model = WorldModel(env.observation_space, env.action_space, starting_states)   

train_world_model(world_model, states, actions, next_states, rewards)

Epoch: 0, Total loss: 0.12482006847858429
Epoch: 1, Total loss: 0.10063561797142029
Epoch: 2, Total loss: 0.08086177706718445
Epoch: 3, Total loss: 0.06472554057836533
Epoch: 4, Total loss: 0.05080728232860565
Epoch: 5, Total loss: 0.03898658975958824
Epoch: 6, Total loss: 0.029061276465654373
Epoch: 7, Total loss: 0.02082749828696251
Epoch: 8, Total loss: 0.014637568034231663
Epoch: 9, Total loss: 0.010970236733555794
Epoch: 10, Total loss: 0.009806604124605656
Epoch: 11, Total loss: 0.010003343224525452
Epoch: 12, Total loss: 0.010170461609959602
Epoch: 13, Total loss: 0.009809786453843117
Epoch: 14, Total loss: 0.009026668034493923
Epoch: 15, Total loss: 0.00807274878025055
Epoch: 16, Total loss: 0.00714862858876586
Epoch: 17, Total loss: 0.006381989922374487
Epoch: 18, Total loss: 0.005795128643512726
Epoch: 19, Total loss: 0.005352777894586325
Epoch: 20, Total loss: 0.004995638970285654
Epoch: 21, Total loss: 0.004673440009355545
Epoch: 22, Total loss: 0.004353997763246298
Epoch: 

WorldModel(
  (weights_dense1): Linear(in_features=20, out_features=256, bias=True)
  (weights_dense2): Linear(in_features=256, out_features=128, bias=True)
  (weights_dense_final): Linear(in_features=128, out_features=10, bias=True)
  (weights_dense_final_h): Linear(in_features=128, out_features=9, bias=True)
)

In [144]:
class WorldModelEnv():
    def __init__(self, world_model, starting_states):
        self.world_model = world_model
        self.starting_states=starting_states
        
    def reset(self, state=None):
        if state is not None:
            self.state=state
        else:
            self.state = np.random.choice(self.starting_states)
        self.h = Variable(torch.Tensor(self.state*0))
        return self.state
         
    
    def step(self, action):
        x = Variable(torch.Tensor(np.concatenate([self.state, action])).float())
        print(x.shape, self.h.shape)
        output, h = self.world_model(x, self.h)
        next_state = output[:-1]
        reward = output[-1]
        return next_state, reward, None, None

worldmodelenv = WorldModelEnv(world_model,starting_states)



In [160]:
for i_episode in range(1, 100):
    worldmodelenv = WorldModelEnv(world_model,starting_states)
    state = worldmodelenv.reset()

    agent.reset()
    score = 0
    action = np.random.uniform(0,1,2)


    for t in range(19):
        print(t)
        _, _, _, _ = worldmodelenv.step(action)


        score += reward
        if done:
            break

0
torch.Size([2]) torch.Size([0])


RuntimeError: size mismatch, m1: [1 x 2], m2: [20 x 256] at /Users/soumith/b101_2/2019_02_08/wheel_build_dirs/wheel_3.6/pytorch/aten/src/TH/generic/THTensorMath.cpp:940

### 4. Watch a Smart Agent!

In the next code cell, you will load the trained weights from file to watch a smart agent!

In [None]:
agent.load_state_dict(torch.load('checkpoint.pth'))



for e in range(100):
    state = env.reset()
    print(e)
    for t in range(20):
        action = agent(Variable(torch.Tensor(state)))
        state = torch.from_numpy(state).float().to(device)
        action = agent(state)

        state, reward, done, _ = env.step(action/2+0.5)
        state = next_state

        
env.close()

### 5. Explore

In this exercise, we have provided a sample DDPG agent and demonstrated how to use it to solve an OpenAI Gym environment.  To continue your learning, you are encouraged to complete any (or all!) of the following tasks:
- Amend the various hyperparameters and network architecture to see if you can get your agent to solve the environment faster than this benchmark implementation.  Once you build intuition for the hyperparameters that work well with this environment, try solving a different OpenAI Gym task!
- Write your own DDPG implementation.  Use this code as reference only when needed -- try as much as you can to write your own algorithm from scratch.
- You may also like to implement prioritized experience replay, to see if it speeds learning.  
- The current implementation adds Ornsetein-Uhlenbeck noise to the action space.  However, it has [been shown](https://blog.openai.com/better-exploration-with-parameter-noise/) that adding noise to the parameters of the neural network policy can improve performance.  Make this change to the code, to verify it for yourself!
- Write a blog post explaining the intuition behind the DDPG algorithm and demonstrating how to use it to solve an RL environment of your choosing.  