# Model-Based RL
---
In this notebook, we explore model-based approaches on a robotic arm reaching environment.

We show that training inside a learned model of the world dynamics can dramatically reduce the number of rollouts needed. 

### 1. Import the Necessary Packages

In [1]:
import gym
import math
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

In [2]:
# Make the package importable when running from a source checkout
# (a pip-installed package does not need this).
import os, inspect
import sys

# Inside a notebook kernel the current frame's filename is a synthetic per-cell
# name (on recent ipykernel releases, a path under a temp directory), so deriving
# the project directory from frame inspection is unreliable. Use __file__ when the
# code runs as a script, and fall back to the working directory in a notebook.
if "__file__" in globals():
    currentdir = os.path.dirname(os.path.abspath(__file__))
else:
    currentdir = os.getcwd()
# Two directory levels above `currentdir` is what gets added to the import path.
parentdir = os.path.dirname(os.path.dirname(currentdir))
# `sys.path` is the documented module search path; `os.sys` only works because
# the `os` module happens to import `sys` internally.
sys.path.insert(0, parentdir)

from udacityPickAndPlaceEnv import UdacityPickAndPlaceEnv
import time

# Environment shared by the training cells below.
env = UdacityPickAndPlaceEnv(renders=False, isDiscrete=False, maxSteps=10000)

done = False


current_dir=/Users/jamesough/misc/model-based-RL




### 2. Instantiate the Environment and Agent

In [3]:
import gym
import random
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

from ddpg_agent import Agent

# Build the agent with a fixed seed.
agent = Agent(
    state_size=9,
    action_size=2,
    random_seed=1,
)

# Uncomment to load previously saved actor/critic weights into the agent:
#agent.actor_local.load_state_dict(torch.load('checkpoint_actor.pth'))
#agent.critic_local.load_state_dict(torch.load('checkpoint_critic.pth'))

### 3. Train the Agent with DDPG (no model)

Run the code cell below to train the agent from scratch.  Alternatively, you can skip to the next code cell to load the pre-trained weights from file.

In [5]:
# Module-level rollout buffers; each name is bound to its own fresh, empty list.
states, actions, next_states, starting_states, rewards = ([] for _ in range(5))



def ddpg(n_episodes=10000, max_t=20, print_every=100):
    """Train the module-level ``agent`` on ``env`` and record the transitions seen.

    Each environment step appends the pre-step state, the agent's raw action and
    the resulting next state to the module-level ``states``, ``actions`` and
    ``next_states`` lists, so the three lists stay index-aligned.

    Args:
        n_episodes: Number of episodes to run.
        max_t: Maximum number of environment steps per episode.
        print_every: Length of the rolling score window, and how often (in
            episodes) the rolling average is printed on its own line.

    Returns:
        list: The summed reward of each episode, in episode order.
    """
    scores_deque = deque(maxlen=print_every)
    scores = []
    # `states` is deliberately NOT rebound here: a local `states = []` would
    # shadow the module-level buffer, and every recorded state would be thrown
    # away on return while `actions` / `next_states` kept growing.
    # NOTE(review): the module-level `rewards` and `starting_states` lists are
    # not written by this function - confirm whether they should be.
    for i_episode in range(1, n_episodes+1):
        state = env.reset()
        agent.reset()
        score = 0
        for t in range(max_t):
            action = agent.act(state)
            # The environment receives action/2 + 0.5 while the buffer and the
            # agent keep the raw action. Presumably this maps [-1, 1] onto
            # [0, 1] - TODO confirm against the environment's action range.
            next_state, reward, done, _ = env.step(action/2+0.5)
            states.append(state)
            actions.append(action)
            next_states.append(next_state)

            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores_deque.append(score)
        scores.append(score)
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)), end="")
        # Checkpoints are rewritten after every episode, so the files always
        # hold the weights from the most recently completed episode.
        torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
        torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
        if i_episode % print_every == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))

    return scores

# Run training with the default settings and keep the per-episode scores.
scores = ddpg()

# Plot each episode's score against its 1-based episode number.
fig, ax = plt.subplots()
ax.plot(np.arange(len(scores)) + 1, scores)
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
plt.show()


Episode 17	Average Score: -7.22

KeyboardInterrupt: 

### 3. Train the Agent with DDPG (inside the world model)
