In [1]:
1

1

In [2]:
import numpy as np

import tensorflow as tf

import gym

In [3]:
from dynamics import NNDynamicsModel
from controllers import MPCcontroller, RandomController
from cost_functions import cheetah_cost_fn, trajectory_cost_fn
import time

In [4]:
import logz
import os
import copy
import matplotlib.pyplot as plt

In [None]:
from cheetah_env import HalfCheetahEnvNew

In [1]:
import math
from time import time

In [1]:
def sample(env, 
           controller, 
           num_paths=10, 
           horizon=1000, 
           render=False,
           verbose=False):
    """
    Roll out `controller` in `env` and return every transition, with all
    paths concatenated back-to-back.

    Args:
        env: environment exposing `reset() -> obs` and
            `step(action) -> (obs, reward, done, info)`.
        controller: object exposing `get_action(obs)`; its return value is
            passed to `env.step` unchanged and stored as the action.
        num_paths: number of episodes to collect.
        horizon: maximum number of steps per episode (an episode also ends
            as soon as the environment reports `done`).
        render: if True, call `env.render()` after every step.
        verbose: if True, print the length and summed reward of each path.

    Returns:
        dict with keys "observations", "next_observations", "actions" and
        "rewards"; each value is a numpy array with one entry per transition.
    """
    # Flat per-transition buffers. Appending here avoids the quadratic
    # `sum(list_of_lists, [])` flattening.
    observations = []
    next_observations = []
    actions = []
    rewards = []

    for path_idx in range(num_paths):
        obs = env.reset()
        path_start = len(rewards)

        for _step in range(horizon):
            action = controller.get_action(obs)
            next_obs, rew, done, _info = env.step(action)
            if render:
                env.render()

            observations.append(obs)
            next_observations.append(next_obs)
            actions.append(action)
            rewards.append(rew)

            obs = next_obs
            if done:
                break

        if verbose:
            path_rewards = rewards[path_start:]
            print("path", path_idx,
                  "steps", len(path_rewards),
                  "return", sum(path_rewards))

    return {"observations": np.array(observations),
            "next_observations": np.array(next_observations),
            "actions": np.array(actions),
            "rewards": np.array(rewards)}

In [2]:
def path_cost(cost_fn, path):
    """Return the cost of `path` under `cost_fn`, as computed by `trajectory_cost_fn`.

    `path` must be a mapping providing "observations", "actions" and
    "next_observations" entries.
    """
    states = path['observations']
    controls = path['actions']
    successors = path['next_observations']
    return trajectory_cost_fn(cost_fn, states, controls, successors)

In [3]:
def compute_normalization(data, axis=None):
    """
    Compute the means and stds used to normalize the dynamics-model data.

    Args:
        data: mapping with "observations", "next_observations" and "actions"
            entries (arrays or nested lists) holding one row per transition.
        axis: axis forwarded to `np.mean` / `np.std`. The default `None`
            keeps the historical behaviour of reducing over *all* entries,
            giving one scalar per statistic. Pass `axis=0` to get one value
            per state/action dimension instead.

    Returns:
        dict with keys "mean_obs", "std_obs", "mean_deltas", "std_deltas",
        "mean_actions", "std_actions", where deltas are s_{t+1} - s_t.

    Raises:
        AssertionError: if the dataset holds fewer than two transitions.
    """
    # Convert up front so plain Python lists work too (list - list would fail).
    obs = np.asarray(data["observations"])
    next_obs = np.asarray(data["next_observations"])
    actions = np.asarray(data["actions"])

    assert len(obs) > 1, "need at least two transitions to compute statistics"

    deltas = next_obs - obs

    return {
        "mean_obs": np.mean(obs, axis=axis),
        "std_obs": np.std(obs, axis=axis),
        "mean_deltas": np.mean(deltas, axis=axis),
        "std_deltas": np.std(deltas, axis=axis),
        "mean_actions": np.mean(actions, axis=axis),
        "std_actions": np.std(actions, axis=axis),
    }

In [4]:
def plot_comparison(env, dyn_model):
    """
    Placeholder for a plot comparing, for each element of the state, the
    dynamics model's predictions against the ground truth under randomly
    sampled actions.

    Not implemented yet: both arguments are ignored, nothing is plotted and
    None is returned.
    """
    return None

In [5]:
# Training parameters: module-level globals read by the cells below.
# NOTE(review): the saved output of this cell is a NameError because the cell
# importing HalfCheetahEnvNew has no execution count (In [None]); run that
# import cell before this one.
env = HalfCheetahEnvNew()
cost_fn = cheetah_cost_fn
# logdir and render are not read by any cell visible in this notebook.
logdir=None
render=False
# Forwarded to NNDynamicsModel as `learning_rate`.
learning_rate=1e-3
# Number of outer on-policy aggregation iterations.
onpol_iters=200
# Forwarded to NNDynamicsModel as `iterations`.
dynamics_iters=60
# Forwarded to NNDynamicsModel as `batch_size`.
batch_size=512
# Number of paths collected with the random controller for the initial dataset.
num_paths_random=50 
# Number of paths collected with the MPC controller per aggregation iteration.
num_paths_onpol=10 
# Forwarded to MPCcontroller as `num_simulated_paths`.
num_simulated_paths=1000
# Step cap for each on-policy path.
env_horizon=1000 
# Forwarded to MPCcontroller as `horizon`.
mpc_horizon=5
# Network settings, all forwarded to NNDynamicsModel under the same names.
n_layers=2
size=500
activation=tf.nn.relu
output_activation=None

NameError: name 'HalfCheetahEnvNew' is not defined

In [None]:
#========================================================
# 
# First, we need a lot of data generated by a random
# agent, with which we'll begin to train our dynamics
# model.
#
# `samples` is the aggregated dataset: later cells fit the
# dynamics model on it and append on-policy data to it.

random_controller = RandomController(env)

# Pass the configured path count and per-path step cap explicitly by keyword
# instead of relying on positional order and on sample()'s defaults.
samples = sample(env,
                 random_controller,
                 num_paths=num_paths_random,
                 horizon=env_horizon)

In [25]:
path_cost(cost_fn, samples)

244344.61963309604

In [26]:
#========================================================
# 
# The random data will be used to get statistics (mean
# and std) for the observations, actions, and deltas
# (where deltas are o_{t+1} - o_t). These will be used
# for normalizing inputs and denormalizing outputs
# from the dynamics network. 
# 
# NOTE(review): `fields` is not read by any cell visible in this notebook
# (compute_normalization builds its own local list) — looks like a leftover;
# confirm and remove.
fields = ["mean_obs", ]
# Statistics of the current dataset, returned as a dict of mean_*/std_* entries.
normalization = compute_normalization(samples)
# NOTE(review): the model-building cell passes the compute_normalization
# function itself rather than this dict, so this value is only displayed here.
normalization 

{'mean_actions': -0.00018708621391208557,
 'mean_deltas': -0.00016235200339030122,
 'mean_obs': -0.022776083467363834,
 'std_actions': 0.5772008826606332,
 'std_deltas': 1.3062886948262578,
 'std_obs': 1.8248150821602953}

In [27]:
#========================================================
# 
# Build dynamics model and MPC controllers.
# 
sess = tf.Session()

# Neural-network dynamics model, configured from the training-parameter globals.
# NOTE(review): `normalization` receives the compute_normalization *function*
# (not the dict computed from the random data) — presumably NNDynamicsModel
# calls it on the data it is fit on; confirm in dynamics.py.
dyn_model = NNDynamicsModel(
    env=env,
    n_layers=n_layers,
    size=size,
    activation=activation,
    output_activation=output_activation,
    normalization=compute_normalization,
    batch_size=batch_size,
    iterations=dynamics_iters,
    learning_rate=learning_rate,
    sess=sess,
)

# MPC controller that plans with the dynamics model above.
mpc_controller = MPCcontroller(
    env=env,
    dyn_model=dyn_model,
    horizon=mpc_horizon,
    cost_fn=cost_fn,
    num_simulated_paths=num_simulated_paths,
)

In [28]:
# samples
sess

<tensorflow.python.client.session.Session at 0x7f25d3dcc8d0>

In [29]:
#========================================================
# 
# Tensorflow session building.
# 
# Entering the session installs `sess` as the default session, which bare
# `Operation.run()` calls such as the one below rely on.
# NOTE(review): no visible cell exits or closes the session.
sess.__enter__()
# Initialize every TensorFlow variable created so far (presumably the
# dynamics-model weights built in the previous cell).
tf.global_variables_initializer().run()

In [35]:
samples["rewards"].shape

(2050000,)

In [36]:
# %%heat

#========================================================
# 
# Take multiple iterations of onpolicy aggregation: at each iteration, refit the dynamics model to the current dataset, then collect onpolicy samples with the MPC controller and aggregate them into the dataset.
# Note: You don't need to use a mixing ratio in this assignment for new and old data as described in https://arxiv.org/abs/1708.02596
# 
for itr in range(onpol_iters):
    # Refit the SAME dyn_model instance that mpc_controller was constructed
    # with. Building a fresh NNDynamicsModel here (and re-running the global
    # initializer) would leave the controller planning with the old, never
    # fitted and freshly re-randomized model, and would grow the TF graph on
    # every iteration.
    print(itr, "fitting dyn_model")
    dyn_model.fit(samples, verbose=True)

    # Transitions gathered in this iteration, over all on-policy paths.
    obs_onpol = []
    next_obs_onpol = []
    act_onpol = []
    rew_onpol = []
    for i in range(num_paths_onpol):
        print("on policy, collecting path", i)
        obs = env.reset()
        for step in range(env_horizon):
            # get_action returns (action, cost); only the action is used.
            act, _ = mpc_controller.get_action(obs)
            next_obs, rew, done, _ = env.step(act)

            # Store the transition directly as (obs, act, rew, next_obs) so
            # the four buffers stay aligned by construction.
            obs_onpol.append(obs)
            next_obs_onpol.append(next_obs)
            act_onpol.append(act)
            rew_onpol.append(rew)

            obs = next_obs
            if done:
                break

    print(trajectory_cost_fn(cost_fn, obs_onpol, act_onpol, next_obs_onpol))

    # Aggregate the new on-policy data into the training set.
    samples["observations"] = np.vstack([samples["observations"], obs_onpol])
    samples["next_observations"] = np.vstack([samples["next_observations"], next_obs_onpol])
    samples["actions"] = np.vstack([samples["actions"], act_onpol])
    samples["rewards"] = np.concatenate([samples["rewards"], rew_onpol])
    
    

0 fitting dyn_model
0.48881626
0.29989445
0.2722991
0.23899078
0.2196289
0.20355888
0.19134812
0.17077085
0.14928882
0.121889114
0.11626508
0.12256648
0.11463864
0.119629
0.12405555
0.11004744
0.10093753
0.07925246
0.11187925
0.100053646
0.115978345
0.11314686
0.097912714
0.09261451
0.10466292
0.09439521
0.09271498
0.080723666
0.09729247
0.09483518
0.10732639
0.102229156
0.09401718
0.110747956
0.09906388
0.08919812
0.084319465
0.08986615
0.097662896
0.09845631
0.09308641
0.102363944
0.1063572
0.12728879
0.086184755
0.08953484
0.07603881
0.088394865
0.114995085
0.08993729
0.07932953
0.08649646
0.08884039
0.09984279
0.099409714
0.090295054
0.0737682
0.09614961
0.08124363
0.09515349
0.10355739
0.1041548
0.09843818
0.10214343
0.08359537
0.0852836
0.07670307
0.09366165
0.09693213
0.09618003
0.089744836
0.0858601
0.1017066
0.0850039
0.09168397
0.07791982
0.10960064
0.084722996
0.09797717
0.09584016
0.11265395
0.08696864
0.08776641
0.07112209
0.083016835
0.08433912
0.08627107
0.106657624
0.08

0.023665586
0.020667285
0.026490862
0.015487669
0.02030702
0.02011928
0.022187764
0.02424623
0.015867805
0.02002577
0.020414982
0.017884575
0.017630259
0.020964693
0.019487794
0.017218905
0.025557106
0.020839859
0.022675402
0.018219676
0.022647336
0.017840656
0.022227209
0.019248616
0.015120169
0.019202435
0.01828502
0.014886901
0.01834862
0.023525655
0.020925116
0.020892277
0.017281426
0.020827243
0.018351259
0.0160289
0.017432757
0.025853347
0.014621886
0.019856188
0.022011023
0.019321179
0.01715238
0.023422552
0.018200412
0.025014777
0.022919025
0.024830412
0.013625177
0.020271193
0.018065821
0.018174406
0.01934547
0.022268888
0.019726355
0.018342003
0.016586397
0.015506631
0.02270852
0.020782758
0.0231127
0.022236919
0.015566376
0.021226134
0.018121198
0.020781538
0.018151294
0.014357688
0.016172828
0.017325375
0.021548841
0.023093743
0.01994909
0.018015524
0.016820535
0.020296203
0.020867731
0.018098217
0.017890599
0.02168046
0.020837102
0.01985507
0.015709808
0.019194424
0.024049

0.009233869
0.01147441
0.014493553
0.013605538
0.011243188
0.014065902
0.009080534
0.015413242
0.015201181
0.012952989
0.009206574
0.0108561665
0.011621313
0.010723516
0.012320818
0.01196898
0.011830821
0.012313028
0.018510753
0.008962211
0.014660204
0.012860814
0.010774413
0.010826322
0.015819535
0.0100495275
0.009673411
0.011585419
0.012352543
0.008399884
0.009949096
0.008636142
0.012162431
0.010412063
0.014413011
0.011938819
0.012568434
0.010133833
0.011005497
0.010509156
0.010640004
0.010117011
0.009102198
0.01375524
0.011493358
0.014777517
0.013530147
0.013068946
0.0096928915
0.009616336
0.011969101
0.010692028
0.010484673
0.01020204
0.014190053
0.011873459
0.013693234
0.009102787
0.01730249
0.013251839
0.016516777
0.013547833
0.019957306
0.016122222
0.01179524
0.011400765
0.0129375905
0.011864537
0.014726308
0.0133419335
0.013841595
0.011359925
0.010439869
0.0112652015
0.010958883
0.0137752835
0.013692764
0.011136663
0.011360519
0.010859175
0.011440541
0.011848481
0.012670338
0.0

0.009599699
0.012039529
0.0075297044
0.0069232183
0.00738948
0.009175483
0.0081362
0.007929797
0.007260938
0.006007336
0.01163096
0.013342971
0.009483533
0.0097181825
0.007864145
0.008438826
0.008832345
0.007008837
0.010169601
0.008502736
0.00823809
0.00671531
0.0069508925
0.009495243
0.010544291
0.008893159
0.010193551
0.01146994
0.008781895
0.007894019
0.0074735703
0.005681927
0.0078048618
0.012782523
0.010967443
0.01193255
0.0076398626
0.009340016
0.010807751
0.008006981
0.010384008
0.007998511
0.008049808
0.009692148
0.0066642733
0.009919124
0.01057532
0.008162433
0.009196249
0.0076931207
0.009665938
0.009685274
0.009716688
0.010478134
0.014584312
0.010277269
0.009328084
0.009858594
0.009300147
0.009746758
0.00960567
0.009387737
0.010392811
0.009013963
0.009968123
0.008591275
0.010023415
0.010252143
0.009186727
0.01105774
0.0087788105
0.009371156
0.008988466
0.008938816
0.006764649
0.012567669
0.012789416
0.009475375
0.0102604665
0.007670899
0.008632116
0.013501088
0.01673766
0.009

0.00926744
0.0043859333
0.007130054
0.010212148
0.0068542957
0.00860658
0.00620401
0.0081397025
0.009382829
0.008383592
0.009640005
0.008380155
0.008535916
0.007008764
0.00651429
0.0073432266
0.008802277
0.0071192617
0.008221549
0.010117067
0.011057964
0.0103460895
0.009777791
0.0070104315
0.0076218965
0.007061736
0.006401381
0.008272832
0.008933969
0.010613358
0.010969518
0.008887067
0.0072009144
0.007858405
0.008863464
0.008429712
0.008080034
0.00887158
0.010930516
0.0095798755
0.009976402
0.010481435
0.0083174035
0.00800032
0.0060225027
0.0058136433
0.006841657
0.007354787
0.0076143905
0.007068911
0.0070665367
0.0075957896
0.0116213765
0.0075552226
0.009455083
0.009161742
0.0068781087
0.011588071
0.00820674
0.008858755
0.009859729
0.008466914
0.012060432
0.00969065
0.012056192
0.010618466
0.008938529
0.011402788
0.007753591
0.010430673
0.006319174
0.0090972595
0.009672402
0.011788219
0.0079846755
0.008957448
0.008511858
0.00806722
0.009419155
0.0071983063
0.010131965
0.008923272
0.0

KeyboardInterrupt: 

In [21]:
# NOTE(review): this repeats the last statement of the aggregation loop. Each
# run of this cell appends rew_onpol to samples["rewards"] once more, so if
# the loop iteration already completed, rewards ends up longer than the
# observation/action arrays. Looks like a debugging leftover — confirm and delete.
samples["rewards"] = np.concatenate([samples["rewards"], rew_onpol])

In [14]:
from tensorflow.contrib.layers import fully_connected as fc, xavier_initializer

In [34]:
# dyn_model.fit(samples)
# dyn_model.predict(np.random.random([5, 20]), np.random.random([5, 6]))

In [16]:
# NOTE(review): rebinds the global `env` to a fresh environment. Objects built
# earlier with the previous instance (dyn_model, mpc_controller) were handed
# the old one — presumably they keep using it; confirm this is intended.
env = HalfCheetahEnvNew()

In [17]:
env.observation_space.shape

(20,)

In [18]:
# Small MPC controller (10 simulated paths, constructor-default horizon) for
# quick interactive checks.
mpc = MPCcontroller(env=env,
                    dyn_model=dyn_model,
                    cost_fn=cheetah_cost_fn,
                    num_simulated_paths=10)

In [28]:
# Smoke test: query the controller on a random observation of the right shape.
random_obs = np.random.random(env.observation_space.shape)
next_action, cost = mpc.get_action(random_obs)

In [29]:
next_action

array([ 0.76935887,  0.79222861,  0.22855308, -0.30406221,  0.99219278,
        0.96647708])

In [30]:
cost

65.6946283992138

In [169]:
# NOTE(review): `costs` is not assigned in any cell visible in this notebook;
# this cell fails on a fresh kernel (Restart & Run All). Presumably leaked
# from a deleted debugging cell — confirm and remove or define it.
costs

array([ 63.73937178,  71.72518611,  94.94598173,  87.00776118,
        74.87483092,  84.47668119,  86.00206505,  64.22916289,
       104.13716424,  74.67663242])

In [170]:
# NOTE(review): `state_samples` is not assigned in any cell visible in this
# notebook; this cell fails on a fresh kernel. Presumably a debugging leftover.
state_samples.shape

(1000, 5, 20)

In [97]:
# NOTE(review): `action_samples` is not assigned in any cell visible in this
# notebook; this cell fails on a fresh kernel. Presumably a debugging leftover.
action_samples.shape

(1000, 5, 6)

In [98]:
# NOTE(review): `next_states` is not assigned in any cell visible in this
# notebook; this cell fails on a fresh kernel. Presumably a debugging leftover.
next_states.shape

(1000, 5, 20)