In [7]:
from rllab.algos.trpo import TRPO

from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.gym_env import GymEnv
from rllab.envs.normalized_env import normalize
from rllab.misc.instrument import run_experiment_lite
from rllab.misc.instrument import VariantGenerator, variant
from rllab.policies.categorical_mlp_policy import CategoricalMLPPolicy
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
from rllab.policies.deterministic_mlp_policy import DeterministicMLPPolicy
from rllab.sampler.utils import collect_data, rollout
from rllab.sampler import parallel_sampler


import gym
import numpy as np
import GPy
import GPyOpt

import matplotlib.pyplot as plt
%matplotlib inline

import matplotlib
import pandas as pd


import plotly
plotly.offline.init_notebook_mode() 
import plotly.plotly as py
import plotly.graph_objs as go

import numpy as np
import pandas as pd
import copy
import pickle
from datetime import datetime

import random, os
from operator import add, sub

In [27]:
def myf(X):
    """Bayesian-optimisation objective: replay the recorded actions in the
    simulator with candidate body masses and score the observation mismatch.

    Reads the module-level globals ``env_sim_BO`` (the simulator environment
    created by ``BO``) and ``states``, ``observations``, ``actions`` (arrays
    that ``run_task`` records from the real environment before calling ``BO``).

    Args:
        X: array of candidate body masses; it is transposed before being
            written to the simulator model (presumably shape (1, n_bodies) as
            supplied by GPyOpt -- TODO confirm).

    Returns:
        Sum of absolute differences between ``observations[1:]`` and the
        simulated observations, divided by ``observations.shape[0]``.
    """
    sim = env_sim_BO.wrapped_env.env.env
    sim.model.body_mass = X.T

    # Reset first, THEN restore the recorded start state.  The original order
    # (set_state followed by reset) let reset() overwrite the state that had
    # just been set, so the replay did not start from states[0].
    # NOTE(review): assumes reset() re-initialises the simulator state, as the
    # Gym MuJoCo environments do -- confirm for the wrapped env.
    env_sim_BO.reset()
    dim = sim.model.data.qpos.shape[0]
    # The first `dim` entries of a state row are passed as qpos, the rest as qvel.
    sim.set_state(states[0, :dim], states[0, dim:])

    sim_states = []
    for action in actions:
        next_o, _reward, _done, _env_info = env_sim_BO.step(action)
        sim_states.append(next_o)

    mismatch = np.abs(observations[1:, :] - np.array(sim_states)).sum()
    return mismatch / observations.shape[0]

def BO(bounds, X_init, Y_init, time_budget=30, eps=0.01):
    """Run GPyOpt Bayesian optimisation of ``myf`` over ``bounds``.

    Creates a fresh simulator environment from the global ``env_name`` and
    stores it in the global ``env_sim_BO``, which ``myf`` reads.

    Args:
        bounds: GPyOpt domain description (list of dicts), passed as ``domain``.
        X_init: previously evaluated inputs; when its ``size`` is 0 the
            optimiser is started without initial data.
        Y_init: objective values matching ``X_init``; only used when
            ``X_init`` is non-empty.
        time_budget: value passed as ``max_time`` to ``run_optimization``
            (default 30, the value that used to be hard-coded).
        eps: value passed as ``eps`` to ``run_optimization`` (default 0.01,
            the value that used to be hard-coded).

    Returns:
        The ``BayesianOptimization`` object after ``run_optimization``.
    """
    global env_sim_BO

    print("bounds: ", bounds)
    print("X_init size: ", X_init.size)

    env_sim_BO = normalize(GymEnv(env_name, record_video=False, record_log=False))

    # Only hand over initial data when there is some.
    bo_kwargs = {"domain": bounds}
    if X_init.size != 0:
        bo_kwargs.update(X=X_init, Y=Y_init)
    myBopt = GPyOpt.methods.BayesianOptimization(myf, **bo_kwargs)

    print("running BO with time budget: ", time_budget)
    myBopt.run_optimization(max_time=time_budget, eps=eps)
    print("myBopt.x_opt: ", myBopt.x_opt, "\n\n\n")
    return myBopt

class VG(VariantGenerator):
    """Experiment variants: environment id, total iteration count and seed."""

    @variant
    def env(self):
        """Environment ids to run."""
        # Other ids tried earlier:
        # "HalfCheetah-v1", "Hopper-v1", "Walker2d-v1", "Swimmer-v1", "InvertedPendulum-v1"
        environments = ["Swimmer-v1"]
        return environments

    @variant
    def total_iter(self):
        """Total number of iterations per run."""
        # Values tried earlier alongside the ids above: 1000, 200, 1000, 400, 100
        iteration_counts = [1000]
        return iteration_counts

    @variant
    def seed(self):
        """Seeds 1 through 100 inclusive."""
        return [*range(1, 101)]

In [30]:
# Simulator body masses start at the real masses scaled by one random factor in [0.5, 0.9] or [1.1, 1.5].

def run_task(vv):
    """Alternate TRPO training in a mis-scaled simulator with Bayesian
    optimisation of the simulator's body masses, and record real-env returns.

    Each outer iteration:
      1. trains the policy with TRPO for ``test_iter`` iterations in the
         current simulator,
      2. samples paths in a fresh, unmodified ("real") environment and appends
         the mean return to ``returns``,
      3. records one real trajectory into the globals ``states``,
         ``observations`` and ``actions`` (read by ``myf``),
      4. runs ``BO`` and builds a new simulator whose body masses are the
         optimiser's ``x_opt``, with a new TRPO instance that reuses the same
         policy and baseline.

    Finally the list of mean returns is pickled into the directory named by
    the module-level global ``mydir``.

    Args:
        vv: variant dict; the keys read here are "env", "total_iter" and "seed".
    """
    # Please note that different environments with different action spaces may
    # require different policies. For example with a Discrete action space, a
    # CategoricalMLPPolicy works, but for a Box action space may need to use
    # a GaussianMLPPolicy (see the trpo_gym_pendulum.py example)
    global env_name, states, observations, actions
    env_name = vv["env"]

    max_path_length = 1000
    # TRPO iterations between two evaluation / BO rounds.
    test_iter = 10

    def make_env():
        # Fresh normalized Gym environment with video and log recording off.
        return normalize(GymEnv(vv["env"], record_video=False, record_log=False))

    def make_trpo(env):
        # Single source of truth for the TRPO settings; n_itr is tied to
        # test_iter so the outer-loop count below stays consistent with it.
        # Add plot=True here (and to run_experiment_lite) to enable plotting.
        return TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=4000,
            max_path_length=max_path_length,
            n_itr=test_iter,
            discount=0.99,
            step_size=0.01,
        )

    env_real = make_env()
    model_body_mass = env_real.wrapped_env.env.env.model.body_mass
    print("real mass: ", model_body_mass)
    # One continuous search dimension per body: 0.5x to 1.5x of the real mass.
    bounds = [
        {'name': 'mass', 'type': 'continuous',
         'domain': (model_body_mass[i, 0] * 0.5, model_body_mass[i, 0] * 1.5)}
        for i in range(0, model_body_mass.shape[0])
    ]

    env_sim = make_env()

    # Scale every simulator mass by 1 +/- (0.1 + 0.4*u) with u drawn from
    # np.random.random_sample(), i.e. a factor in (0.5, 0.9] or [1.1, 1.5).
    op = random.choice([add, sub])
    factor = op(1, 0.1 + 0.4 * np.random.random_sample())
    mb = np.array(env_sim.wrapped_env.env.env.model.body_mass) * factor
    env_sim.wrapped_env.env.env.model.body_mass = mb

    policy = GaussianMLPPolicy(
        env_spec=env_sim.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32)
    )

    # BO evaluations carried over from one outer iteration to the next.
    X_init = np.empty(shape=(0, 0))
    Y_init = np.empty(shape=(0, 0))

    baseline = LinearFeatureBaseline(env_spec=env_sim.spec)
    returns = []
    total_iter = vv["total_iter"]
    num_iters = int(total_iter / test_iter)

    algo = make_trpo(env_sim)
    for i in range(0, num_iters):
        print("iter: ", i)
        current_mass = algo.env.wrapped_env.env.env.model.body_mass
        print("algo env mass: ", current_mass)
        algo.train()

        ################### Testing in GT parameter #################

        env_real = make_env()
        print("real mass: ", env_real.wrapped_env.env.env.model.body_mass)
        parallel_sampler.populate_task(env_real, algo.policy, scope=algo.scope)
        paths_real = parallel_sampler.sample_paths(
            policy_params=algo.policy.get_param_values(),
            max_samples=algo.batch_size,
            max_path_length=algo.max_path_length,
            env_params=env_real.get_param_values(),
            scope=algo.scope,
        )
        paths_real_returns = [sum(path["rewards"]) for path in paths_real]
        parallel_sampler.terminate_task(scope=algo.scope)
        ave_returns = np.mean(paths_real_returns)
        print("Real sample ave returns: ", ave_returns, "\n\n\n\n")
        returns.append(ave_returns)

        # One real trajectory whose states/observations/actions feed myf.
        env_real = make_env()
        print("real mass: ", env_real.wrapped_env.env.env.model.body_mass)
        path = collect_data(env_real, algo.policy, max_path_length=max_path_length,
                            animated=False, speedup=1)
        path["returns"] = sum(path["rewards"])
        print("real returns:  ", path['returns'], path['rewards'].shape, "\n\n\n\n")

        states = path['states']
        observations = path['observations']
        actions = path['actions']

        # Simulator rollout, printed for comparison only.
        print("sim mass: ", algo.env.wrapped_env.env.env.model.body_mass)
        path_2 = rollout(algo.env, algo.policy, max_path_length=max_path_length,
                         animated=False, speedup=1)
        path_2["returns"] = sum(path_2["rewards"])
        print("sim returns: ", path_2['returns'], path_2["rewards"].shape, "\n\n\n\n")

        myBopt = BO(bounds, X_init, Y_init)
        X_init = myBopt.X
        Y_init = myBopt.Y

        # Next round trains in a fresh simulator using the masses BO found.
        # The TRPO constructor already receives the new env, so the former
        # extra `algo.env = env_sim_new` assignment is not needed.
        env_sim_new = make_env()
        env_sim_new.wrapped_env.env.env.model.body_mass = np.array(myBopt.x_opt)
        algo = make_trpo(env_sim_new)
        ##############################################################

    print("returns: ", returns)
    # `mydir` is a module-level global set by the launching cell.  The file
    # name keeps its historical "_iter_" label although the value is the seed.
    save_data_filename = "./"+mydir+"/BO_returns_"+vv["env"]+"_iter_"+str(vv["seed"])+"_"+datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

    with open(save_data_filename, 'wb') as f:
        pickle.dump(returns, f)


In [31]:
variants = VG().variants()

# Results directory per environment, created the first time that environment
# appears.  Previously `mydir` was only assigned when a variant with seed == 1
# was seen, so any other first variant would fail with a NameError.
output_dirs = {}
for v in variants:
    print("seed: ", v["seed"])
    if v["env"] not in output_dirs:
        output_dirs[v["env"]] = v["env"]+"_"+datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
        os.makedirs(output_dirs[v["env"]], exist_ok=True)
    # run_task reads the module-level name `mydir`, so set it before launching.
    mydir = output_dirs[v["env"]]
    run_experiment_lite(
        run_task,
        # Number of parallel workers for sampling
        n_parallel=1,
        # Only keep the snapshot parameters for the last iteration
        snapshot_mode="last",
        # Specifies the seed for the experiment. If this is not provided, a random seed
        # will be used
        seed=v["seed"],
        exp_prefix = "BO_trpo_"+"_32_32_4000_iter_"+str(v["total_iter"])+"_"+mydir,
        variant=v,
    #     plot=True,
    )

seed:  1
python /home/zhusj/repos/rllab/scripts/run_experiment_lite.py  --use_cloudpickle 'True'  --snapshot_mode 'last'  --exp_name 'BO_trpo__32_32_4000_iter_1000_Swimmer-v1_2017-09-10_20-20-49_2017_09_10_00_15_27_0085'  --variant_data 'gANjcmxsYWIubWlzYy5pbnN0cnVtZW50ClZhcmlhbnREaWN0CnEAKYFxAShYAwAAAGVudnECWAoAAABTd2ltbWVyLXYxcQNYCAAAAGV4cF9uYW1lcQRYVQAAAEJPX3RycG9fXzMyXzMyXzQwMDBfaXRlcl8xMDAwX1N3aW1tZXItdjFfMjAxNy0wOS0xMF8yMC0yMC00OV8yMDE3XzA5XzEwXzAwXzE1XzI3XzAwODVxBVgEAAAAc2VlZHEGSwFYCgAAAHRvdGFsX2l0ZXJxB03oA1gMAAAAX2hpZGRlbl9rZXlzcQhdcQl1aAFiLg=='  --log_dir '/home/zhusj/repos/rllab/data/local/BO-trpo--32-32-4000-iter-1000-Swimmer-v1-2017-09-10-20-20-49/BO_trpo__32_32_4000_iter_1000_Swimmer-v1_2017-09-10_20-20-49_2017_09_10_00_15_27_0085'  --seed '1'  --n_parallel '1'  --args_data 'gAJjY2xvdWRwaWNrbGUuY2xvdWRwaWNrbGUKX2ZpbGxfZnVuY3Rpb24KcQAoY2Nsb3VkcGlja2xlLmNsb3VkcGlja2xlCl9tYWtlX3NrZWxfZnVuYwpxAWNjbG91ZHBpY2tsZS5jbG91ZHBpY2tsZQpfYnVpbHRpbl90eXBlCnECWAgAAABDb2RlVHlwZXEDhXEEUnEFK

KeyboardInterrupt: 