In [52]:
from rllab.algos.cem import CEM
from rllab.algos.ddpg import DDPG
from rllab.algos.tnpg import TNPG
from rllab.algos.trpo import TRPO
from rllab.algos.vpg import VPG
from rllab.algos.batch_polopt import BatchSampler


from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.gym_env import GymEnv
from rllab.envs.normalized_env import normalize
from rllab.exploration_strategies.ou_strategy import OUStrategy
from rllab.misc.instrument import run_experiment_lite
from rllab.misc.instrument import VariantGenerator, variant
from rllab.policies.categorical_mlp_policy import CategoricalMLPPolicy
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
from rllab.policies.deterministic_mlp_policy import DeterministicMLPPolicy
from rllab.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction
from rllab.sampler.utils import collect_data
from rllab.sampler.utils import rollout

import gym
import numpy as np
import GPy
import GPyOpt

import matplotlib.pyplot as plt
%matplotlib inline

import matplotlib
import pandas as pd


import plotly
plotly.offline.init_notebook_mode() 
import plotly.plotly as py
import plotly.graph_objs as go

import numpy as np
import pandas as pd
import copy
import pickle
from datetime import datetime

# Default settings for batch algorithms; no cell visible in this notebook
# reads this dict.
common_batch_algo_args = {
    "n_itr": 2000,
    "batch_size": 1000,
    "max_path_length": 1000,
}

import random
from operator import add, sub

In [55]:
class VG(VariantGenerator):
    """Declares the experiment variants: environment, iteration budget and seed.

    Each ``@variant`` method returns the list of values that key may take.
    NOTE(review): the variants are presumably the cross product of these
    lists, as produced by ``VariantGenerator.variants()`` -- confirm.
    """

    @variant
    def env(self):
        """Gym environment ids to run."""
        # Earlier sweep: ["HalfCheetah-v1", "Hopper-v1", "Walker2d-v1",
        #                 "Swimmer-v1", "InvertedPendulum-v1"]
        return ["Swimmer-v1"]

    @variant
    def total_iter(self):
        """Total number of training iterations per experiment."""
        # Earlier sweep (matching the environment list above):
        # [1000, 200, 1000, 400, 100]
        return [400]

    @variant
    def seed(self):
        """Seeds 1 through 40 inclusive."""
        first_seed, last_seed = 1, 40
        return list(range(first_seed, last_seed + 1))

In [57]:
### real = default, sim = [0.5,0.9]-[1.1,1.5]* real

############ Train in sim env ###############
def run_task(vv):
    """Train TRPO in a mass-perturbed simulator and score it on the default env.

    All body masses of the training environment are multiplied by a single
    random factor ``1 +/- (0.1 + 0.4 * u)``, where the sign is picked with
    ``random.choice`` and ``u`` comes from ``np.random.random_sample()``.
    Training proceeds in chunks of ``test_iter`` TRPO iterations.  After every
    chunk a fresh, unmodified environment is built, paths are sampled in it
    and the mean path return is recorded; one extra rollout is run and only
    printed.  The recorded list is printed and pickled to
    ``SIM_returns_<env>_iter_<total_iter>_<timestamp>`` relative to the
    current working directory.

    Args:
        vv: variant mapping; ``vv["env"]`` is the Gym environment id and
            ``vv["total_iter"]`` the total number of TRPO iterations.
    """
    print("vv: ", vv)

    env_sim = normalize(GymEnv(vv["env"], record_log=False, record_video=False))

    # Scale every body mass by one factor in (0.5, 0.9] or [1.1, 1.5).
    # The sign is drawn before the magnitude; keep that order so the random
    # streams are consumed exactly as before.
    sign_op = random.choice([add, sub])
    mass_scale = sign_op(1, 0.1 + 0.4 * np.random.random_sample())
    sim_model = env_sim.wrapped_env.env.env.model
    sim_model.body_mass = np.array(sim_model.body_mass) * mass_scale

    # Two hidden layers with 32 units each.
    policy = GaussianMLPPolicy(env_spec=env_sim.spec, hidden_sizes=(32, 32))
    baseline = LinearFeatureBaseline(env_spec=env_sim.spec)

    returns = []
    total_iter = vv["total_iter"]
    test_iter = 10  # TRPO iterations between two evaluations
    num_chunks = int(total_iter / test_iter)

    # Everything except the environment is shared by all TRPO instances.
    trpo_settings = dict(
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=1000,
        n_itr=test_iter,
        discount=0.99,
        step_size=0.01,
    )

    for chunk_idx in range(num_chunks):
        print("iter: ", chunk_idx)

        # A new TRPO object per chunk; policy and baseline carry over.
        algo = TRPO(env=env_sim, **trpo_settings)
        algo.train()

        # ---- Evaluate in the environment with default ("real") masses ----
        print("sim mass: ", algo.env.wrapped_env.env.env.model.body_mass)

        env_real = normalize(GymEnv(vv["env"], record_log=False, record_video=False))
        print("real mass: ", env_real.wrapped_env.env.env.model.body_mass)

        # Sample through a deep copy of the algorithm whose env is swapped,
        # leaving the training algorithm untouched.
        algo_real = copy.deepcopy(algo)
        algo_real.env = env_real
        sampler = BatchSampler(algo_real)
        sampler.start_worker()
        paths = sampler.obtain_samples(1)
        paths_returns = [sum(sampled["rewards"]) for sampled in paths]
        sampler.shutdown_worker()

        mean_return = np.mean(paths_returns)
        print("Real sample ave returns: \n\n\n\n\n\n !!!!!!!!!!!!!!!!!!!", mean_return)
        returns.append(mean_return)

        # A single rollout, reported for reference only (not recorded).
        path = rollout(env_real, algo.policy, max_path_length=1000,
                       animated=False, speedup=1)
        rollout_return = sum(path["rewards"])
        print("rollout return: \n\n\n\n\n\n\n\n !!!!!!!!!!!!!!!!!!!!!", rollout_return)

    print("returns: ", returns)

    # The timestamp is str(datetime.now()), so the name contains a space and colons.
    save_data_filename = "SIM_returns_{}_iter_{}_{}".format(
        vv["env"], total_iter, datetime.now())
    with open(save_data_filename, 'wb') as out_file:
        pickle.dump(returns, out_file)

# Launch one local experiment per variant produced by VG.
variants = VG().variants()

for v in variants:
    run_experiment_lite(
        run_task,
        n_parallel=1,           # number of parallel workers for sampling
        snapshot_mode="last",   # keep snapshot parameters of the last iteration only
        seed=v["seed"],         # experiment seed taken from the variant
        mode="local",
        exp_prefix="SIM_trpo_{}_32_32_4000_iter_{}".format(v["env"], v["total_iter"]),
        variant=v,
    )


python /home/zhusj/repos/rllab/scripts/run_experiment_lite.py  --args_data 'gAJjY2xvdWRwaWNrbGUuY2xvdWRwaWNrbGUKX2ZpbGxfZnVuY3Rpb24KcQAoY2Nsb3VkcGlja2xlLmNsb3VkcGlja2xlCl9tYWtlX3NrZWxfZnVuYwpxAWNjbG91ZHBpY2tsZS5jbG91ZHBpY2tsZQpfYnVpbHRpbl90eXBlCnECWAgAAABDb2RlVHlwZXEDhXEEUnEFKEsBSwBLF0sSS0NjX2NvZGVjcwplbmNvZGUKcQZYLAMAAHQAAGQBAHwAAMKDAgABdAEAdAIAfAAAZAIAGWQDAGQEAGQFAGQEAMKDAQLCgwEAfQEAdAMAagQAdAUAdAYAZwIAwoMBAH0CAHwCAGQGAGQHAGQIAHQHAGoDAGoIAMKDAAAUF8KDAgB9AwB8AQBqCQBqCgBqCgBqCwBqDAB9BAB0BwBqDQB8BADCgwEAfQQAfAQAfAMAFH0EAHwEAHwBAGoJAGoKAGoKAGoLAF8MAHQOAGQJAHwBAGoPAGQKAGQqAMKDAAJ9BQB0EABkCQB8AQBqDwDCgwABfQYAZwAAfQcAfAAAZAwAGX0IAGQNAH0JAHQRAHwIAHwJABvCgwEAfQoAeMKZAXQSAGQOAHwKAMKDAgBEXcKIAX0LAHQAAGQPAHwLAMKDAgABdBMAZAIAfAEAZBAAfAUAZBEAfAYAZBIAZBMAZBQAZBUAZBYAfAkAZBcAZBgAZBkAZBoAwoMACH0MAHwMAGoUAMKDAAABdAAAZBsAfAwAagoAagkAagoAagoAagsAagwAwoMCAAF0AQB0AgB8AABkAgAZZAMAZAQAZAUAZAQAwoMBAsKDAQB9DQB0AABkHAB8DQBqCQBqCgBqCgBqCwBqDADCgwIAAXQVAGoWAHwMAMKDAQB9DgB8DQB8DgBfCgB0FwB9DwB8DwB8DgDCgwEAfRAAfBAA

In [54]:
############### real = default

############## Train in real env ########################

def run_task(vv):
    """Train TRPO in the default ("real") environment and log rollout returns.

    The body masses are copied and written back without scaling.  Training
    proceeds in chunks of ``test_iter`` TRPO iterations; after every chunk a
    single rollout of at most 1000 steps is run in the same environment and
    its summed reward is recorded.  The recorded list is printed and pickled
    to ``REAL_returns_<env>_iter_<total_iter>_<timestamp>`` relative to the
    current working directory.

    Args:
        vv: variant mapping; ``vv["env"]`` is the Gym environment id and
            ``vv["total_iter"]`` the total number of TRPO iterations.
    """
    print("vv: ", vv)

    env_real = normalize(GymEnv(vv["env"], record_log=False, record_video=False))
    real_model = env_real.wrapped_env.env.env.model
    print("real mass: ", real_model.body_mass)

    # Write the masses back as a plain array, with no scale factor applied.
    real_model.body_mass = np.array(real_model.body_mass)
    print("real mass: ", real_model.body_mass)

    # Two hidden layers with 32 units each.
    policy = GaussianMLPPolicy(env_spec=env_real.spec, hidden_sizes=(32, 32))
    baseline = LinearFeatureBaseline(env_spec=env_real.spec)

    returns = []
    total_iter = vv["total_iter"]
    test_iter = 10  # TRPO iterations between two evaluations
    num_chunks = int(total_iter / test_iter)

    # Everything except the environment is shared by all TRPO instances.
    trpo_settings = dict(
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=1000,
        n_itr=test_iter,
        discount=0.99,
        step_size=0.01,
    )

    for chunk_idx in range(num_chunks):
        print("iter: ", chunk_idx)

        # A new TRPO object per chunk; policy and baseline carry over.
        algo = TRPO(env=env_real, **trpo_settings)
        algo.train()

        # ---- Evaluate in the training environment itself ----
        print("real mass: ", env_real.wrapped_env.env.env.model.body_mass)

        # One rollout per evaluation point; its summed reward is the score.
        path = rollout(env_real, policy, max_path_length=1000,
                       animated=False, speedup=1)
        episode_return = sum(path["rewards"])
        path["returns"] = episode_return
        print("returns: ", episode_return)
        returns.append(episode_return)

    print("returns: ", returns)

    # The timestamp is str(datetime.now()), so the name contains a space and colons.
    save_data_filename = "REAL_returns_{}_iter_{}_{}".format(
        vv["env"], total_iter, datetime.now())
    with open(save_data_filename, 'wb') as out_file:
        pickle.dump(returns, out_file)
        
    
# Launch one experiment per variant produced by VG, announcing each seed first.
variants = VG().variants()

for v in variants:
    print("seed: ", v["seed"])
    run_experiment_lite(
        run_task,
        n_parallel=1,           # number of parallel workers for sampling
        snapshot_mode="last",   # keep snapshot parameters of the last iteration only
        seed=v["seed"],         # experiment seed taken from the variant
        exp_prefix="REAL_trpo_{}_32_32_4000_iter_{}".format(v["env"], v["total_iter"]),
        variant=v,
    )


seed:  1
python /home/zhusj/repos/rllab/scripts/run_experiment_lite.py  --args_data 'gAJjY2xvdWRwaWNrbGUuY2xvdWRwaWNrbGUKX2ZpbGxfZnVuY3Rpb24KcQAoY2Nsb3VkcGlja2xlLmNsb3VkcGlja2xlCl9tYWtlX3NrZWxfZnVuYwpxAWNjbG91ZHBpY2tsZS5jbG91ZHBpY2tsZQpfYnVpbHRpbl90eXBlCnECWAgAAABDb2RlVHlwZXEDhXEEUnEFKEsBSwBLD0sSS0NjX2NvZGVjcwplbmNvZGUKcQZYWAIAAHQAAGQBAHwAAMKDAgABdAEAdAIAfAAAZAIAGWQDAGQEAGQFAGQEAMKDAQLCgwEAfQEAdAAAZAYAfAEAagMAagQAagQAagUAagYAwoMCAAF8AQBqAwBqBABqBABqBQBqBgB9AgB0BwBqCAB8AgDCgwEAfQIAfAIAfAEAagMAagQAagQAagUAXwYAdAAAZAYAfAEAagMAagQAagQAagUAagYAwoMCAAF0CQBkBwB8AQBqCgBkCABkIwDCgwACfQMAdAsAZAcAfAEAagoAwoMAAX0EAGcAAH0FAHwAAGQKABl9BgBkCwB9BwB0DAB8BgB8BwAbwoMBAH0IAHjDmgB0DQBkDAB8CADCgwIARF3DiQB9CQB0AABkDQB8CQDCgwIAAXQOAGQCAHwBAGQOAHwDAGQPAHwEAGQQAGQRAGQSAGQTAGQUAHwHAGQVAGQWAGQXAGQYAMKDAAh9CgB8CgBqDwDCgwAAAXQAAGQGAHwBAGoDAGoEAGoEAGoFAGoGAMKDAgABdBAAfAEAfAMAZBIAZBMAZBkAZAQAZBoAZBsAwoMCA30LAHQRAHwLAGQcABnCgwEAfAsAZB0APHQAAGQeAHwLAGQdABnCgwIAAXwFAGoSAHwLAGQdABnCgwEAAXEAAVd0AABkHgB8BQDCgwIAAXQTAGoUAMK

In [48]:
### real = 0.6 * default, sim = default

############ Train in sim env ###############
def run_task(vv):
    """Train TRPO in the default env and score it with body masses scaled by 0.6.

    Training proceeds in chunks of ``test_iter`` TRPO iterations.  After every
    chunk the training environment is deep-copied, all body masses of the
    copy are multiplied by 0.6, paths are sampled in the copy and the mean
    path return is recorded; one extra rollout is run and only printed.  The
    recorded list is printed and pickled to
    ``SIM_returns_<env>_iter_<total_iter>_<timestamp>`` relative to the
    current working directory.

    Args:
        vv: variant mapping; ``vv["env"]`` is the Gym environment id and
            ``vv["total_iter"]`` the total number of TRPO iterations.
    """
    print("vv: ", vv)

    env = normalize(GymEnv(vv["env"], record_log=False, record_video=False))

    # Two hidden layers with 32 units each.
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    returns = []
    total_iter = vv["total_iter"]
    test_iter = 10  # TRPO iterations between two evaluations
    num_chunks = int(total_iter / test_iter)

    # Everything except the environment is shared by all TRPO instances.
    trpo_settings = dict(
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=1000,
        n_itr=test_iter,
        discount=0.99,
        step_size=0.01,
    )

    for chunk_idx in range(num_chunks):
        print("iter: ", chunk_idx)

        # A new TRPO object per chunk; policy and baseline carry over.
        algo = TRPO(env=env, **trpo_settings)
        algo.train()

        # ---- Evaluate in a copy of the env with masses scaled by 0.6 ----
        print("mass: ", env.wrapped_env.env.env.model.body_mass)

        # The copy is always made from the unscaled training env, so the
        # 0.6 factor never compounds across evaluations.
        env2 = copy.deepcopy(env)
        test_model = env2.wrapped_env.env.env.model
        test_model.body_mass = np.array(test_model.body_mass) * 0.6
        print("mass: ", env2.wrapped_env.env.env.model.body_mass)

        # Sample through a deep copy of the algorithm whose env is swapped,
        # leaving the training algorithm untouched.
        algo_real = copy.deepcopy(algo)
        algo_real.env = env2
        sampler = BatchSampler(algo_real)
        sampler.start_worker()
        paths = sampler.obtain_samples(1)
        paths_returns = [sum(sampled["rewards"]) for sampled in paths]
        sampler.shutdown_worker()

        mean_return = np.mean(paths_returns)
        print("Real sample ave returns: \n\n\n\n\n\n !!!!!!!!!!!!!!!!!!!", mean_return)
        returns.append(mean_return)

        # A single rollout, reported for reference only (not recorded).
        path = rollout(env2, algo.policy, max_path_length=1000,
                       animated=False, speedup=1)
        rollout_return = sum(path["rewards"])
        print("rollout return: \n\n\n\n\n\n\n\n !!!!!!!!!!!!!!!!!!!!!", rollout_return)

    print("returns: ", returns)

    # The timestamp is str(datetime.now()), so the name contains a space and colons.
    save_data_filename = "SIM_returns_{}_iter_{}_{}".format(
        vv["env"], total_iter, datetime.now())
    with open(save_data_filename, 'wb') as out_file:
        pickle.dump(returns, out_file)

# Launch one local experiment per variant produced by VG.
variants = VG().variants()

for v in variants:
    run_experiment_lite(
        run_task,
        n_parallel=1,           # number of parallel workers for sampling
        snapshot_mode="last",   # keep snapshot parameters of the last iteration only
        seed=v["seed"],         # experiment seed taken from the variant
        mode="local",
        exp_prefix="SIM_trpo_{}_32_32_4000_iter_{}".format(v["env"], v["total_iter"]),
        variant=v,
    )


python /home/zhusj/repos/rllab/scripts/run_experiment_lite.py  --args_data 'gAJjY2xvdWRwaWNrbGUuY2xvdWRwaWNrbGUKX2ZpbGxfZnVuY3Rpb24KcQAoY2Nsb3VkcGlja2xlLmNsb3VkcGlja2xlCl9tYWtlX3NrZWxfZnVuYwpxAWNjbG91ZHBpY2tsZS5jbG91ZHBpY2tsZQpfYnVpbHRpbl90eXBlCnECWAgAAABDb2RlVHlwZXEDhXEEUnEFKEsBSwBLFUsSS0NjX2NvZGVjcwplbmNvZGUKcQZY3gIAAHQAAGQBAHwAAMKDAgABdAEAdAIAfAAAZAIAGWQDAGQEAGQFAGQEAMKDAQLCgwEAfQEAdAMAZAYAfAEAagQAZAcAZCgAwoMAAn0CAHQFAGQGAHwBAGoEAMKDAAF9AwBnAAB9BAB8AABkCQAZfQUAZAoAfQYAdAYAfAUAfAYAG8KDAQB9BwB4w4YBdAcAZAsAfAcAwoMCAERdwrUBfQgAdAAAZAwAfAgAwoMCAAF0CABkAgB8AQBkDQB8AgBkDgB8AwBkDwBkEABkEQBkEgBkEwB8BgBkFABkFQBkFgBkFwDCgwAIfQkAfAkAagkAwoMAAAF0AABkGAB8AQBqCgBqCwBqCwBqDABqDQDCgwIAAXQOAGoPAHwBAMKDAQB9CgB8CgBqCgBqCwBqCwBqDABqDQB9CwB0EABqEQB8CwDCgwEAfQsAfAsAZBkAFH0LAHwLAHwKAGoKAGoLAGoLAGoMAF8NAHQAAGQYAHwKAGoKAGoLAGoLAGoMAGoNAMKDAgABdA4Aag8AfAkAwoMBAH0MAHwKAHwMAF8LAHQSAH0NAHwNAHwMAMKDAQB9DgB8DgBqEwDCgwAAAXwOAGoUAGQaAMKDAQB9DwBkGwBkHADChAAAfA8ARMKDAQB9EAB8DgBqFQDCgwAAAXQAAGQdAHQQAGoWAHwQAMKDAQDCgwIA

KeyboardInterrupt: 

In [50]:
############### real = 0.6 * default

############## Train in real env ########################

def run_task(vv):
    """Train TRPO in an environment whose body masses are scaled by 0.6.

    Training proceeds in chunks of ``test_iter`` TRPO iterations; after every
    chunk a single rollout of at most 1000 steps is run in the same
    environment and its summed reward is recorded.  The recorded list is
    printed and pickled to
    ``REAL_returns_<env>_iter_<total_iter>_<timestamp>`` relative to the
    current working directory.

    Args:
        vv: variant mapping; ``vv["env"]`` is the Gym environment id and
            ``vv["total_iter"]`` the total number of TRPO iterations.
    """
    print("vv: ", vv)

    env_real = normalize(GymEnv(vv["env"], record_log=False, record_video=False))
    real_model = env_real.wrapped_env.env.env.model
    print("real mass: ", real_model.body_mass)

    # The "real" system in this experiment has 0.6 times the default masses.
    real_model.body_mass = np.array(real_model.body_mass) * 0.6
    print("real mass: ", real_model.body_mass)

    # Two hidden layers with 32 units each.
    policy = GaussianMLPPolicy(env_spec=env_real.spec, hidden_sizes=(32, 32))
    baseline = LinearFeatureBaseline(env_spec=env_real.spec)

    returns = []
    total_iter = vv["total_iter"]
    test_iter = 10  # TRPO iterations between two evaluations
    num_chunks = int(total_iter / test_iter)

    # Everything except the environment is shared by all TRPO instances.
    trpo_settings = dict(
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=1000,
        n_itr=test_iter,
        discount=0.99,
        step_size=0.01,
    )

    for chunk_idx in range(num_chunks):
        print("iter: ", chunk_idx)

        # A new TRPO object per chunk; policy and baseline carry over.
        algo = TRPO(env=env_real, **trpo_settings)
        algo.train()

        # ---- Evaluate in the training environment itself ----
        print("real mass: ", env_real.wrapped_env.env.env.model.body_mass)

        # One rollout per evaluation point; its summed reward is the score.
        path = rollout(env_real, policy, max_path_length=1000,
                       animated=False, speedup=1)
        episode_return = sum(path["rewards"])
        path["returns"] = episode_return
        print("returns: ", episode_return)
        returns.append(episode_return)

    print("returns: ", returns)

    # The timestamp is str(datetime.now()), so the name contains a space and colons.
    save_data_filename = "REAL_returns_{}_iter_{}_{}".format(
        vv["env"], total_iter, datetime.now())
    with open(save_data_filename, 'wb') as out_file:
        pickle.dump(returns, out_file)
        
    
# Launch one experiment per variant produced by VG.
variants = VG().variants()

for v in variants:
    run_experiment_lite(
        run_task,
        n_parallel=1,           # number of parallel workers for sampling
        snapshot_mode="last",   # keep snapshot parameters of the last iteration only
        seed=v["seed"],         # experiment seed taken from the variant
        exp_prefix="REAL_trpo_{}_32_32_4000_iter_{}".format(v["env"], v["total_iter"]),
        variant=v,
    )


python /home/zhusj/repos/rllab/scripts/run_experiment_lite.py  --args_data 'gAJjY2xvdWRwaWNrbGUuY2xvdWRwaWNrbGUKX2ZpbGxfZnVuY3Rpb24KcQAoY2Nsb3VkcGlja2xlLmNsb3VkcGlja2xlCl9tYWtlX3NrZWxfZnVuYwpxAWNjbG91ZHBpY2tsZS5jbG91ZHBpY2tsZQpfYnVpbHRpbl90eXBlCnECWAgAAABDb2RlVHlwZXEDhXEEUnEFKEsBSwBLD0sSS0NjX2NvZGVjcwplbmNvZGUKcQZYYgIAAHQAAGQBAHwAAMKDAgABdAEAdAIAfAAAZAIAGWQDAGQEAGQFAGQEAMKDAQLCgwEAfQEAdAAAZAYAfAEAagMAagQAagQAagUAagYAwoMCAAF8AQBqAwBqBABqBABqBQBqBgB9AgB0BwBqCAB8AgDCgwEAfQIAfAIAZAcAFH0CAHwCAHwBAGoDAGoEAGoEAGoFAF8GAHQAAGQGAHwBAGoDAGoEAGoEAGoFAGoGAMKDAgABdAkAZAgAfAEAagoAZAkAZCQAwoMAAn0DAHQLAGQIAHwBAGoKAMKDAAF9BABnAAB9BQB8AABkCwAZfQYAZAwAfQcAdAwAfAYAfAcAG8KDAQB9CAB4w5oAdA0AZA0AfAgAwoMCAERdw4kAfQkAdAAAZA4AfAkAwoMCAAF0DgBkAgB8AQBkDwB8AwBkEAB8BABkEQBkEgBkEwBkFABkFQB8BwBkFgBkFwBkGABkGQDCgwAIfQoAfAoAag8AwoMAAAF0AABkBgB8AQBqAwBqBABqBABqBQBqBgDCgwIAAXQQAHwBAHwDAGQTAGQUAGQaAGQEAGQbAGQcAMKDAgN9CwB0EQB8CwBkHQAZwoMBAHwLAGQeADx0AABkHwB8CwBkHgAZwoMCAAF8BQBqEgB8CwBkHgAZwoMBAAFxCgFXdAAAZB8AfAUAwoMCAAF0EwBq

KeyboardInterrupt: 

In [18]:
# Overlay the logged training curve of a Swimmer-v1 TRPO run with a set of
# hand-pasted evaluation returns (trace labelled 'testing (mass*0.6)').
datafile =  '../../data/local/trpo-Swimmer-v1-32-32-4000-40/trpo_Swimmer-v1_32_32_4000_40_2017_09_05_18_37_21_0015/progress.csv'
df = pd.read_csv(datafile)

# Training curve: one point per row of progress.csv.
trace1 = go.Scatter(
    y=df['AverageReturn'],
    mode='lines',
    name='training',
)

# The name was previously spelled with a full-width digit (U+FF12); it is now
# plain ASCII so it visibly matches the `trace2` reference below.
# NOTE(review): no x values are given, so these points are plotted against
# their index rather than the training iteration they were measured at.
trace2 = go.Scatter(
    y=np.array([20.202899152265687, 11.770700709886349, 21.400652194445151, 25.354204357634103, 20.553359266759884, 22.836364003706514, 22.404311541517821, 22.305505271388089, 23.721623681697373, 20.799683421540408]),
    mode='lines',
    name='testing (mass*0.6)',
)

# The title used to read 'Inverted Pendulum', but the data file above is the
# Swimmer-v1 run; the misspelling 'Perfomance' is corrected as well.
layout = go.Layout(title='Swimmer-v1 TRPO Performance',
                   plot_bgcolor='rgb(230, 230,230)')

fig = go.Figure(data=[trace1, trace2], layout=layout)

# Plot data in the notebook
plotly.offline.iplot(fig)

In [None]:
def run_task(*_):
    """Train TRPO on InvertedPendulum-v1, testing with a lighter body before each chunk.

    Before every chunk of ``test_iter`` TRPO iterations the environment is
    deep-copied, entry ``[2, 0]`` of the copy's ``body_mass`` array is
    multiplied by 0.6, and one rollout of at most 1000 steps is run in the
    copy with the current policy; its summed reward is recorded.  The first
    recorded value therefore belongs to the untrained policy.  The recorded
    list is printed at the end; nothing is written to disk by this function.

    Args:
        *_: ignored positional arguments.
    """
    env = normalize(GymEnv("InvertedPendulum-v1", record_video=False))

    # Two hidden layers with 32 units each.
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    returns = []
    total_iter = 100
    test_iter = 10  # TRPO iterations between two evaluations
    num_chunks = int(total_iter / test_iter)

    for chunk_idx in range(num_chunks):
        print("iter: ", chunk_idx)

        # ---- Evaluate in a copy of the env with one mass entry scaled ----
        print("mass: ", env.wrapped_env.env.env.env.model.body_mass)

        env2 = copy.deepcopy(env)
        test_model = env2.wrapped_env.env.env.env.model
        scaled_mass = np.array(test_model.body_mass)
        scaled_mass[2, 0] = scaled_mass[2, 0] * 0.6
        test_model.body_mass = scaled_mass

        print("mass: ", env2.wrapped_env.env.env.env.model.body_mass)

        path = rollout(env2, policy, max_path_length=1000,
                       animated=False, speedup=1)
        episode_return = sum(path["rewards"])
        path["returns"] = episode_return
        returns.append(episode_return)

        # ---- Train for the next chunk in the unmodified environment ----
        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=4000,
            max_path_length=env.horizon,
            n_itr=test_iter,
            discount=0.99,
            step_size=0.01,
        )
        algo.train()

    print("returns: ", returns)


# Launch the InvertedPendulum experiment once with a fixed seed.
run_experiment_lite(
    run_task,
    n_parallel=8,           # number of parallel workers for sampling
    snapshot_mode="last",   # keep snapshot parameters of the last iteration only
    seed=1,                 # fixed experiment seed
    exp_prefix="trpo_ip_32_32_4000_100",
)


python /home/zhusj/repos/rllab/scripts/run_experiment_lite.py  --args_data 'gAJjY2xvdWRwaWNrbGUuY2xvdWRwaWNrbGUKX2ZpbGxfZnVuY3Rpb24KcQAoY2Nsb3VkcGlja2xlLmNsb3VkcGlja2xlCl9tYWtlX3NrZWxfZnVuYwpxAWNjbG91ZHBpY2tsZS5jbG91ZHBpY2tsZQpfYnVpbHRpbl90eXBlCnECWAgAAABDb2RlVHlwZXEDhXEEUnEFKEsASwBLDUsSS0djX2NvZGVjcwplbmNvZGUKcQZY1wEAAHQAAHQBAGQBAGQCAGQDAMKDAQHCgwEAfQEAdAIAZAQAfAEAagMAZAUAZCAAwoMAAn0CAHQEAGQEAHwBAGoDAMKDAAF9AwBnAAB9BABkBwB9BQBkCAB9BgB0BQB8BQB8BgAbwoMBAH0HAHhOAXQGAGQJAHwHAMKDAgBEXT0BfQgAdAcAZAoAfAgAwoMCAAF0BwBkCwB8AQBqCABqCQBqCQBqCQBqCgBqCwDCgwIAAXQMAGoNAHwBAMKDAQB9CQB8CQBqCABqCQBqCQBqCQBqCgBqCwB9CgB0DgBqDwB8CgDCgwEAfQoAfAoAZCEAGWQNABR8CgBkIgA8fAoAfAkAaggAagkAagkAagkAagoAXwsAdAcAZAsAfAkAaggAagkAagkAagkAagoAagsAwoMCAAF0EAB8CQB8AgBkDgBkDwBkEABkAwBkEQBkEgDCgwIDfQsAdBEAfAsAZBMAGcKDAQB8CwBkFAA8fAQAahIAfAsAZBQAGcKDAQABdBMAZBUAfAEAZBYAfAIAZBcAfAMAZBgAZBkAZA4AfAEAahQAZBoAfAYAZBsAZBwAZB0AZB4AwoMACH0MAHwMAGoVAMKDAAABcXQAV3QHAGQfAHwEAMKDAgABZAAAU3EHWAYAAABsYXRpbjFxCIZxCVJxCihOWBMAAABJbnZlcnRlZFBl

In [10]:
# Overlay the logged training curve of InvertedPendulum TRPO run 0002 with a
# set of hand-pasted evaluation returns (trace labelled 'testing (mass*0.6)').
datafile =  '../../data/local/trpo-ip-32-32-4000-100/trpo_ip_32_32_4000_100_2017_09_05_17_48_37_0002/progress.csv'
df = pd.read_csv(datafile)

# Training curve: one point per row of progress.csv.
trace1 = go.Scatter(
    y=df['AverageReturn'],
    mode='lines',
    name='training',
)

# The name was previously spelled with a full-width digit (U+FF12); it is now
# plain ASCII so it visibly matches the `trace2` reference below.
# NOTE(review): no x values are given, so these points are plotted against
# their index rather than the training iteration they were measured at.
trace2 = go.Scatter(
    y=np.array([10.0, 4.0, 5.0, 4.0, 3.0, 12.0, 30.0, 66.0, 15.0, 45.0, 78.0, 102.0, 29.0, 55.0, 81.0, 115.0, 133.0, 103.0, 166.0, 170.0, 154.0, 26.0, 486.0, 1000.0, 306.0, 283.0, 819.0, 394.0, 478.0, 524.0, 873.0, 893.0, 1000.0, 1000.0, 802.0, 293.0, 1000.0, 1000.0, 224.0, 604.0]),
    mode='lines',
    name='testing (mass*0.6)',
)

# Title spelling corrected ('Perfomance' -> 'Performance').
layout = go.Layout(title='Inverted Pendulum TRPO Performance',
                   plot_bgcolor='rgb(230, 230,230)')

fig = go.Figure(data=[trace1, trace2], layout=layout)

# Plot data in the notebook
plotly.offline.iplot(fig)

In [None]:
# Overlay the logged training curve of InvertedPendulum TRPO run 0002 with a
# set of hand-pasted evaluation returns, placing the evaluation points on the
# training-iteration axis.
datafile =  '../../data/local/trpo-ip-32-32-4000-100/trpo_ip_32_32_4000_100_2017_09_05_17_48_37_0002/progress.csv'
df = pd.read_csv(datafile)

# Training curve: one point per row of progress.csv.
trace1 = go.Scatter(
    y=df['AverageReturn'],
    mode='lines',
    name='training',
)

test_returns = np.array([10.0, 4.0, 5.0, 4.0, 3.0, 12.0, 30.0, 66.0, 15.0, 45.0, 78.0, 102.0, 29.0, 55.0, 81.0, 115.0, 133.0, 103.0, 166.0, 170.0, 154.0, 26.0, 486.0, 1000.0, 306.0, 283.0, 819.0, 394.0, 478.0, 524.0, 873.0, 893.0, 1000.0, 1000.0, 802.0, 293.0, 1000.0, 1000.0, 224.0, 604.0])

# The cell previously could not run: `np.array([1:10:100])` is MATLAB-style
# range syntax, not Python, and the comma after the `x=` argument was missing.
# NOTE(review): the evaluation interval is not recorded with the data, so the
# points are spread evenly over the logged training iterations -- this assumes
# evaluations were equally spaced across the whole run; TODO confirm.
trace2 = go.Scatter(
    x=np.linspace(0, len(df) - 1, num=len(test_returns)),
    y=test_returns,
    mode='lines',
    name='testing (mass*0.6)',
)

# Title spelling corrected ('Perfomance' -> 'Performance').
layout = go.Layout(title='Inverted Pendulum TRPO Performance',
                   plot_bgcolor='rgb(230, 230,230)')

fig = go.Figure(data=[trace1, trace2], layout=layout)

# Plot data in the notebook
plotly.offline.iplot(fig)

In [12]:
# Overlay the logged training curve of InvertedPendulum TRPO run 0003 with a
# set of hand-pasted evaluation returns (trace labelled 'testing (mass*0.6)').
datafile =  '../../data/local/trpo-ip-32-32-4000-100/trpo_ip_32_32_4000_100_2017_09_05_17_48_37_0003/progress.csv'
df = pd.read_csv(datafile)

# Training curve: one point per row of progress.csv.
trace1 = go.Scatter(
    y=df['AverageReturn'],
    mode='lines',
    name='training',
)

# The name was previously spelled with a full-width digit (U+FF12); it is now
# plain ASCII so it visibly matches the `trace2` reference below.
# NOTE(review): no x values are given, so these points are plotted against
# their index rather than the training iteration they were measured at.
trace2 = go.Scatter(
    y=np.array([11.0, 4.0, 3.0, 5.0, 5.0, 7.0, 31.0, 35.0, 6.0, 50.0, 79.0, 92.0, 38.0, 62.0, 144.0, 139.0, 163.0, 136.0, 94.0, 15.0, 225.0, 228.0, 443.0, 283.0, 805.0, 68.0, 761.0, 1000.0, 52.0, 1000.0, 1000.0, 694.0, 1000.0, 1000.0, 744.0, 1000.0, 1000.0, 7.0, 72.0, 1000.0, 594.0, 1000.0, 214.0, 110.0, 1000.0, 565.0, 185.0, 557.0, 246.0, 1000.0, 1000.0, 280.0, 545.0, 1000.0, 1000.0, 1000.0, 451.0, 857.0, 255.0, 312.0, 462.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 325.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 462.0, 549.0, 1000.0, 233.0, 1000.0, 1000.0, 1000.0, 733.0]),
    mode='lines',
    name='testing (mass*0.6)',
)

# Title spelling corrected ('Perfomance' -> 'Performance').
layout = go.Layout(title='Inverted Pendulum TRPO Performance',
                   plot_bgcolor='rgb(230, 230,230)')

fig = go.Figure(data=[trace1, trace2], layout=layout)

# Plot data in the notebook
plotly.offline.iplot(fig)

In [3]:
# Plot the 'AverageReturn' column logged in progress.csv for run 0001.
datafile = '../../data/local/trpo-ip-32-32-4000-100/trpo_ip_32_32_4000_100_2017_09_05_17_48_37_0001/progress.csv'
df = pd.read_csv(datafile)

# Single line trace, plotted against the row index.
trace1 = go.Scatter(y=df['AverageReturn'], mode='lines', name='logx')

layout = go.Layout(
    title='Simple Plot from csv data',
    plot_bgcolor='rgb(230, 230,230)',
)
fig = go.Figure(data=[trace1], layout=layout)

# Render the figure inline in the notebook.
plotly.offline.iplot(fig)