In [23]:
from rllab.algos.cem import CEM
from rllab.algos.ddpg import DDPG
from rllab.algos.tnpg import TNPG
from rllab.algos.trpo import TRPO
from rllab.algos.vpg import VPG

from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.gym_env import GymEnv
from rllab.envs.normalized_env import normalize
from rllab.exploration_strategies.ou_strategy import OUStrategy
from rllab.misc.instrument import run_experiment_lite
from rllab.policies.categorical_mlp_policy import CategoricalMLPPolicy
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
from rllab.policies.deterministic_mlp_policy import DeterministicMLPPolicy
from rllab.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction
from rllab.sampler.utils import collect_data
from rllab.sampler.utils import rollout

import gym
import numpy as np
import GPy
import GPyOpt

import matplotlib.pyplot as plt
%matplotlib inline

import matplotlib
import pandas as pd


import plotly
plotly.offline.init_notebook_mode() 
import plotly.plotly as py
import plotly.graph_objs as go

import numpy as np
import pandas as pd
import copy

In [28]:
# Gym environment id; read by run_task below and embedded in the experiment prefix.
env_name = "Swimmer-v1"

def run_task(*_):
    """Train a TNPG agent on the gym environment named by ``env_name``.

    Intended to be handed to ``run_experiment_lite``; any positional
    arguments supplied by the launcher are ignored.
    """
    # Different action spaces need different policies: a Discrete action space
    # works with a CategoricalMLPPolicy, a Box action space with a
    # GaussianMLPPolicy (see the trpo_gym_pendulum.py example).
    env = normalize(GymEnv(env_name, record_video=False))

    # TNPG optimises a likelihood-ratio objective, so it needs a stochastic
    # policy that exposes an action distribution. The previous
    # DeterministicMLPPolicy has none and only fits the DDPG variant that is
    # commented out further down.
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # Two hidden layers with 32 units each.
        hidden_sizes=(32, 32),
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TNPG(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=10,
        discount=0.99,
        step_size=0.01,
        # Uncomment both lines (this and the plot parameter of
        # run_experiment_lite) to enable plotting
        # plot=True,
    )

    # DDPG alternative; it needs policy = DeterministicMLPPolicy(env_spec=env.spec,
    # hidden_sizes=(32, 32)) instead of the Gaussian policy above.
    # qf = ContinuousMLPQFunction(env.spec)
    # es = OUStrategy(env.spec)
    # algo = DDPG(
    #     env=env, policy=policy, qf=qf, es=es,
    #     n_epochs=1,
    #     epoch_length=100,
    #     batch_size=32,
    #     min_pool_size=50,
    #     replay_pool_size=1000,
    #     eval_samples=100,
    # )

    algo.train()


# Launch run_task through the rllab experiment runner.
run_experiment_lite(
    run_task,
    # Prefix used for the experiment name / log directory.
    exp_prefix="trpo_{}_32_32_4000_40".format(env_name),
    # Fixed seed for the experiment; when omitted a random seed is used.
    seed=1,
    # Keep only the snapshot parameters of the last iteration.
    snapshot_mode="last",
    # Number of parallel workers used for sampling.
    n_parallel=16,
    # plot=True,
)


python /home/zhusj/repos/rllab/scripts/run_experiment_lite.py  --variant_data 'gAN9cQBYCAAAAGV4cF9uYW1lcQFYNgAAAHRycG9fU3dpbW1lci12MV8zMl8zMl80MDAwXzQwXzIwMTdfMDlfMDVfMThfMTlfNDVfMDAxNXECcy4='  --log_dir '/home/zhusj/repos/rllab/data/local/trpo-Swimmer-v1-32-32-4000-40/trpo_Swimmer-v1_32_32_4000_40_2017_09_05_18_19_45_0015'  --n_parallel '16'  --seed '1'  --use_cloudpickle 'True'  --exp_name 'trpo_Swimmer-v1_32_32_4000_40_2017_09_05_18_19_45_0015'  --args_data 'gAJjY2xvdWRwaWNrbGUuY2xvdWRwaWNrbGUKX2ZpbGxfZnVuY3Rpb24KcQAoY2Nsb3VkcGlja2xlLmNsb3VkcGlja2xlCl9tYWtlX3NrZWxfZnVuYwpxAWNjbG91ZHBpY2tsZS5jbG91ZHBpY2tsZQpfYnVpbHRpbl90eXBlCnECWAgAAABDb2RlVHlwZXEDhXEEUnEFKEsASwBLBUsRS0djX2NvZGVjcwplbmNvZGUKcQZYkgAAAHQAAHQBAHQCAGQBAGQCAMKDAQHCgwEAfQEAdAMAZAMAfAEAagQAZAQAZBIAwoMAAn0CAHQFAGQDAHwBAGoEAMKDAAF9AwB0BgBkBgB8AQBkBwB8AgBkCAB8AwBkCQBkCgBkCwB8AQBqBwBkDABkDQBkDgBkDwBkEABkEQDCgwAIfQQAfAQAaggAwoMAAAFkAABTcQdYBgAAAGxhdGluMXEIhnEJUnEKKE5YDAAAAHJlY29yZF92aWRlb3ELiVgIAAAAZW52X3NwZWNxDFgMAAAAaGlkZGVuX3

In [19]:
# progress.csv of Swimmer run ..._0008 inside the local experiment data directory.
datafile = '../../data/local/trpo-Swimmer-v1-32-32-4000-40/trpo_Swimmer-v1_32_32_4000_40_2017_09_05_18_19_45_0008/progress.csv'
df = pd.read_csv(datafile)

# Single line trace: the AverageReturn column against the Iteration column.
trace1 = go.Scatter(
    mode='lines',
    name='logx',
    x=df['Iteration'],
    y=df['AverageReturn'],
)
layout = go.Layout(
    plot_bgcolor='rgb(230, 230,230)',
    title='Simple Plot from csv data',
)
fig = go.Figure(layout=layout, data=[trace1])

# Render the figure inline in the notebook.
plotly.offline.iplot(fig)

In [15]:
# progress.csv of Swimmer run ..._0007 inside the local experiment data directory.
datafile = '../../data/local/trpo-Swimmer-v1-32-32-4000-40/trpo_Swimmer-v1_32_32_4000_40_2017_09_05_18_19_45_0007/progress.csv'
df = pd.read_csv(datafile)

# AverageReturn per logged Iteration, drawn as one line.
trace1 = go.Scatter(x=df['Iteration'], y=df['AverageReturn'],
                    name='logx', mode='lines')
layout = go.Layout(plot_bgcolor='rgb(230, 230,230)',
                   title='Simple Plot from csv data')

fig = go.Figure(data=[trace1], layout=layout)

# Show the figure inline in the notebook.
plotly.offline.iplot(fig)

In [None]:
# NOTE(review): looks like a leftover attribute-completion probe. In the visible
# cells `env` is only assigned inside run_task, so on a fresh kernel this line
# presumably raises NameError — TODO remove it or spell out the intended attribute.
env.w

In [None]:
def run_task(*_):
    """Alternate TRPO training on InvertedPendulum-v1 with single test rollouts.

    Before every training round the current policy is rolled out once on a
    deep copy of the environment whose ``body_mass[2, 0]`` entry is scaled by
    0.6. The summed reward of each test rollout is collected and the whole
    list is printed at the end. Positional arguments are ignored.
    """
    # The policy class depends on the action space: a Discrete action space
    # works with a CategoricalMLPPolicy, while a Box action space may need a
    # GaussianMLPPolicy (see the trpo_gym_pendulum.py example).
    env = normalize(GymEnv("InvertedPendulum-v1", record_video=False))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # Two hidden layers of 32 units each.
        hidden_sizes=(32, 32),
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    test_returns = []
    total_iters = 100
    iters_per_round = 10
    num_rounds = int(total_iters / iters_per_round)

    for round_idx in range(num_rounds):
        print("iter: ", round_idx)

        # ---- Test rollout on a copy with a modified mass ----
        print("mass: ", env.wrapped_env.env.env.env.model.body_mass)

        # Work on a deep copy so the training environment keeps its parameters.
        test_env = copy.deepcopy(env)
        test_model = test_env.wrapped_env.env.env.env.model
        masses = np.array(test_model.body_mass)
        masses[2, 0] *= 0.6
        test_model.body_mass = masses

        print("mass: ", test_env.wrapped_env.env.env.env.model.body_mass)

        path = rollout(test_env, policy, max_path_length=1000,
                       animated=False, speedup=1)
        episode_return = sum(path["rewards"])
        path["returns"] = episode_return
        test_returns.append(path["returns"])

        # ---- One training round on the unmodified environment ----
        trainer = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=4000,
            max_path_length=env.horizon,
            n_itr=iters_per_round,
            discount=0.99,
            step_size=0.01,
            # Uncomment both lines (this and the plot parameter of
            # run_experiment_lite) to enable plotting
            # plot=True,
        )
        trainer.train()

    print("returns: ", test_returns)


# Launch run_task through the rllab experiment runner.
run_experiment_lite(
    run_task,
    # Prefix used for the experiment name / log directory.
    exp_prefix="trpo_ip_32_32_4000_100",
    # Fixed seed for the experiment; when omitted a random seed is used.
    seed=1,
    # Keep only the snapshot parameters of the last iteration.
    snapshot_mode="last",
    # Number of parallel workers used for sampling.
    n_parallel=8,
    # plot=True,
)


python /home/zhusj/repos/rllab/scripts/run_experiment_lite.py  --args_data 'gAJjY2xvdWRwaWNrbGUuY2xvdWRwaWNrbGUKX2ZpbGxfZnVuY3Rpb24KcQAoY2Nsb3VkcGlja2xlLmNsb3VkcGlja2xlCl9tYWtlX3NrZWxfZnVuYwpxAWNjbG91ZHBpY2tsZS5jbG91ZHBpY2tsZQpfYnVpbHRpbl90eXBlCnECWAgAAABDb2RlVHlwZXEDhXEEUnEFKEsASwBLDUsSS0djX2NvZGVjcwplbmNvZGUKcQZY1wEAAHQAAHQBAGQBAGQCAGQDAMKDAQHCgwEAfQEAdAIAZAQAfAEAagMAZAUAZCAAwoMAAn0CAHQEAGQEAHwBAGoDAMKDAAF9AwBnAAB9BABkBwB9BQBkCAB9BgB0BQB8BQB8BgAbwoMBAH0HAHhOAXQGAGQJAHwHAMKDAgBEXT0BfQgAdAcAZAoAfAgAwoMCAAF0BwBkCwB8AQBqCABqCQBqCQBqCQBqCgBqCwDCgwIAAXQMAGoNAHwBAMKDAQB9CQB8CQBqCABqCQBqCQBqCQBqCgBqCwB9CgB0DgBqDwB8CgDCgwEAfQoAfAoAZCEAGWQNABR8CgBkIgA8fAoAfAkAaggAagkAagkAagkAagoAXwsAdAcAZAsAfAkAaggAagkAagkAagkAagoAagsAwoMCAAF0EAB8CQB8AgBkDgBkDwBkEABkAwBkEQBkEgDCgwIDfQsAdBEAfAsAZBMAGcKDAQB8CwBkFAA8fAQAahIAfAsAZBQAGcKDAQABdBMAZBUAfAEAZBYAfAIAZBcAfAMAZBgAZBkAZA4AfAEAahQAZBoAfAYAZBsAZBwAZB0AZB4AwoMACH0MAHwMAGoVAMKDAAABcXQAV3QHAGQfAHwEAMKDAgABZAAAU3EHWAYAAABsYXRpbjFxCIZxCVJxCihOWBMAAABJbnZlcnRlZFBl

In [10]:
# Training log (progress.csv) of InvertedPendulum run ..._0002.
datafile =  '../../data/local/trpo-ip-32-32-4000-100/trpo_ip_32_32_4000_100_2017_09_05_17_48_37_0002/progress.csv'
df = pd.read_csv(datafile)

# Training curve: AverageReturn column plotted against the row index.
trace1 = go.Scatter(
                    y=df['AverageReturn'], # Data
                    mode='lines', name='training' # Additional options
                   )
# Hard-coded test returns, labelled "mass*0.6" in the legend.
# The name is now plain ASCII `trace2`; the former `trace２` used a fullwidth
# digit and only matched the reference in go.Figure via identifier normalisation.
trace2 = go.Scatter(
                    y=np.array([10.0, 4.0, 5.0, 4.0, 3.0, 12.0, 30.0, 66.0, 15.0, 45.0, 78.0, 102.0, 29.0, 55.0, 81.0, 115.0, 133.0, 103.0, 166.0, 170.0, 154.0, 26.0, 486.0, 1000.0, 306.0, 283.0, 819.0, 394.0, 478.0, 524.0, 873.0, 893.0, 1000.0, 1000.0, 802.0, 293.0, 1000.0, 1000.0, 224.0, 604.0]), # Data
                    mode='lines', name='testing (mass*0.6)' # Additional options
                   )
# Title typo fixed ("Perfomance" -> "Performance").
layout = go.Layout(title='Inverted Pendulum TRPO Performance',
                   plot_bgcolor='rgb(230, 230,230)')

fig = go.Figure(data=[trace1, trace2], layout=layout)

# Plot data in the notebook
plotly.offline.iplot(fig)

In [None]:
# Training log (progress.csv) of InvertedPendulum run ..._0002.
datafile =  '../../data/local/trpo-ip-32-32-4000-100/trpo_ip_32_32_4000_100_2017_09_05_17_48_37_0002/progress.csv'
df = pd.read_csv(datafile)

# Training curve: AverageReturn column plotted against the row index.
trace1 = go.Scatter(
                    y=df['AverageReturn'], # Data
                    mode='lines', name='training' # Additional options
                   )

# Hard-coded test returns, labelled "mass*0.6" in the legend.
test_returns = np.array([10.0, 4.0, 5.0, 4.0, 3.0, 12.0, 30.0, 66.0, 15.0, 45.0, 78.0, 102.0, 29.0, 55.0, 81.0, 115.0, 133.0, 103.0, 166.0, 170.0, 154.0, 26.0, 486.0, 1000.0, 306.0, 283.0, 819.0, 394.0, 478.0, 524.0, 873.0, 893.0, 1000.0, 1000.0, 802.0, 293.0, 1000.0, 1000.0, 224.0, 604.0])

# The original `x=np.array([1:10:100])` was a MATLAB-style range and a
# SyntaxError in Python (and the comma after it was missing). The x positions
# are now derived from the number of test points so x and y always have the
# same length.
# NOTE(review): the spacing of 10 comes from the attempted `1:10:100` range and
# from test_iter in run_task; the start of 0 assumes each test rollout happens
# before its training round — confirm both for this particular run.
test_interval = 10
test_iterations = np.arange(len(test_returns)) * test_interval

# Plain ASCII name; the former `trace２` used a fullwidth digit.
trace2 = go.Scatter(
                    x=test_iterations,
                    y=test_returns, # Data
                    mode='lines', name='testing (mass*0.6)' # Additional options
                   )
# Title typo fixed ("Perfomance" -> "Performance").
layout = go.Layout(title='Inverted Pendulum TRPO Performance',
                   plot_bgcolor='rgb(230, 230,230)')

fig = go.Figure(data=[trace1, trace2], layout=layout)

# Plot data in the notebook
plotly.offline.iplot(fig)

In [12]:
# Training log (progress.csv) of InvertedPendulum run ..._0003.
datafile =  '../../data/local/trpo-ip-32-32-4000-100/trpo_ip_32_32_4000_100_2017_09_05_17_48_37_0003/progress.csv'
df = pd.read_csv(datafile)

# Training curve: AverageReturn column plotted against the row index.
trace1 = go.Scatter(
                    y=df['AverageReturn'], # Data
                    mode='lines', name='training' # Additional options
                   )
# Hard-coded test returns, labelled "mass*0.6" in the legend.
# The name is now plain ASCII `trace2`; the former `trace２` used a fullwidth
# digit and only matched the reference in go.Figure via identifier normalisation.
trace2 = go.Scatter(
                    y=np.array([11.0, 4.0, 3.0, 5.0, 5.0, 7.0, 31.0, 35.0, 6.0, 50.0, 79.0, 92.0, 38.0, 62.0, 144.0, 139.0, 163.0, 136.0, 94.0, 15.0, 225.0, 228.0, 443.0, 283.0, 805.0, 68.0, 761.0, 1000.0, 52.0, 1000.0, 1000.0, 694.0, 1000.0, 1000.0, 744.0, 1000.0, 1000.0, 7.0, 72.0, 1000.0, 594.0, 1000.0, 214.0, 110.0, 1000.0, 565.0, 185.0, 557.0, 246.0, 1000.0, 1000.0, 280.0, 545.0, 1000.0, 1000.0, 1000.0, 451.0, 857.0, 255.0, 312.0, 462.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 325.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 462.0, 549.0, 1000.0, 233.0, 1000.0, 1000.0, 1000.0, 733.0]), # Data
                    mode='lines', name='testing (mass*0.6)' # Additional options
                   )
# Title typo fixed ("Perfomance" -> "Performance").
layout = go.Layout(title='Inverted Pendulum TRPO Performance',
                   plot_bgcolor='rgb(230, 230,230)')

fig = go.Figure(data=[trace1, trace2], layout=layout)

# Plot data in the notebook
plotly.offline.iplot(fig)

In [3]:
# progress.csv of InvertedPendulum run ..._0001 inside the local experiment data directory.
datafile = '../../data/local/trpo-ip-32-32-4000-100/trpo_ip_32_32_4000_100_2017_09_05_17_48_37_0001/progress.csv'
df = pd.read_csv(datafile)

# Single line trace of the AverageReturn column (x defaults to the row index).
trace1 = go.Scatter(
    name='logx',
    mode='lines',
    y=df['AverageReturn'],
)
layout = go.Layout(
    plot_bgcolor='rgb(230, 230,230)',
    title='Simple Plot from csv data',
)
fig = go.Figure(layout=layout, data=[trace1])

# Render the figure inline in the notebook.
plotly.offline.iplot(fig)