In [1]:
import sys
sys.path.insert(0, './py_torch_trpo')
from baselines.common import set_global_seeds, tf_util as U
import gym
import roboschool
import numpy as np
import random
from expert import *
import matplotlib.pyplot as plt
import time
import pandas as pd
import seaborn as sns
from gym import spaces
from base_line_model.TRPO_agent import TRPO_agent_new
from base_line_model.mlp import MlpPolicy_new
from baselines import logger

# Plot styling applied to every figure drawn later in the notebook.
plt.style.use('seaborn-white')
sns.set(context = "paper", font = "monospace", font_scale=2)
# Seed passed to baselines' set_global_seeds below.
seed = 1
# Configure the baselines logger (the cell output shows the log directory it chose).
logger.configure()
# Create a session through baselines' tf_util and enter it manually instead of
# using a `with` block, so it is never explicitly exited here -- presumably so it
# stays active for the following cells (TODO confirm).
U.make_session(num_cpu=16).__enter__()
set_global_seeds(seed)

Logging to /tmp/openai-2018-05-05-06-59-36-611306


# Dummy observer and adversary agents

In [2]:
# Dummy classes: placeholders used only to construct the adversary and observer agents before the real environments are wired in
class dummy_adversary_env(object):
    """Placeholder adversary: exposes the spaces of the adversary problem and acts randomly.

    Attributes:
        env: the wrapped "RoboschoolInvertedPendulum-v1" gym environment.
        action_space: Box in [-1, 1] with one component per observation dimension.
        observation_space: the wrapped environment's observation space.
    """
    def __init__(self):
        self.env = gym.make("RoboschoolInvertedPendulum-v1")
        self.action_space = spaces.Box(low=-1, high=1, shape=(self.env.observation_space.shape[0],))
        self.observation_space = self.env.observation_space

    def action_ev(self, s):
        """Return a random sample from ``self.action_space``; the state ``s`` is ignored.

        The original code referenced a bare ``action_space`` name, which raised
        ``NameError`` on the first call; the instance attribute is used instead.
        """
        return self.action_space.sample()
    
class dummy_observer_env(object):
    """Placeholder observer: same spaces as the real observer, identity behaviour.

    Attributes:
        env: the wrapped "RoboschoolInvertedPendulum-v1" gym environment.
        action_space: Box in [-1, 1] with one component per observation dimension.
        observation_space: the wrapped environment's observation space.
    """
    def __init__(self):
        pendulum = gym.make("RoboschoolInvertedPendulum-v1")
        obs_dim = pendulum.observation_space.shape[0]
        self.env = pendulum
        self.action_space = spaces.Box(low=-1, high=1, shape=(obs_dim,))
        self.observation_space = pendulum.observation_space

    def action_ev(self, s):
        """Pass the given state through unchanged."""
        passthrough = s
        return passthrough

# Adversary environment

In [3]:
class adversary_env(object):
    """Gym-style environment used to train the adversary agent.

    On every step the adversary's action is clipped to [-1, 1], scaled by
    ``threshold * ratio`` and added to the last true observation.  The
    perturbed observation is passed through ``self.observer`` and then to the
    pre-trained controller ``self.agent``, whose action drives the wrapped
    pendulum environment.

    Episode bookkeeping:
        score_list / epi_list hold the cumulative reward and the number of
        steps of every recorded episode.  An episode is recorded exactly once,
        by ``reset()``, after at least one episode has terminated.  (The
        original code additionally recorded the first episode inside
        ``step()`` before its final reward was added, which duplicated it and
        forced ``result_plot`` to drop the first score.)
    """
    def __init__(self):
        self.env = gym.make("RoboschoolInvertedPendulum-v1")
        # Per-dimension scale of the injected noise (labelled "standard deviation" by the author).
        self.threshold = np.array([ 0.14244403,  0.07706523,  0.00016789,  0.00789366,  0.02395424])
        # Pre-trained controller that acts on the (perturbed) observation.
        self.agent = SmallReactivePolicy(self.env.observation_space, self.env.action_space)
        self.env.seed(0)

        # Observer placeholder; the notebook later assigns the trained observer agent here.
        self.observer = dummy_observer_env()

        self.ratio = 0.7      # multiplier applied to `threshold` when scaling the noise
        self.max_turn = 1000  # step limit after which an episode is forced to end
        # One action component per observation dimension.
        self.action_space = spaces.Box(low=-1, high=1, shape=(self.env.observation_space.shape[0],))
        self.observation_space = self.env.observation_space

        # Initialise obsr / epi_num / total_score / first / score_list / epi_list.
        self.env_reset()

    # -----------------------   define reward for adversary agent ---------------------------------
    def reward(self, st):
        """Return ``|st[3]| - 0.08`` for the true state ``st``.

        The original comment identifies ``st[3]`` as sin(theta), so the reward
        turns positive once the magnitude of that component exceeds 0.08.
        """
        return np.abs(st[3]) - 0.08

    def step(self, a):
        """Apply the adversary's perturbation ``a`` and advance the wrapped env one step.

        Args:
            a: adversary action, one component per observation dimension;
               values are clipped to [-1, 1] before scaling.

        Returns:
            (true observation, adversary reward, done flag, 0).
            NOTE(review): gym's convention is a dict as fourth item; the
            original ``0`` is kept so existing callers see the same value.
        """
        self.epi_num += 1
        noisy_obs = np.clip(a, -1, 1) * self.threshold * self.ratio + self.obsr

        # The observer receives the perturbed state and hands its output to the controller.
        filtered_obs = self.observer.action_ev(noisy_obs)
        ac = self.agent.act(filtered_obs)
        self.obsr, _, done, _ = self.env.step(ac)

        timed_out = self.epi_num >= self.max_turn
        final_r = self.reward(self.obsr)
        if done and not timed_out:
            # Bonus when the wrapped env terminates before the step limit.
            final_r = 15
        done = done or timed_out

        if done:
            # From now on reset() is allowed to record episodes.
            self.first = False

        self.total_score += final_r
        return self.obsr, final_r, done, 0

    def seed(self, a):
        """Accept and ignore a seed; the wrapped env is seeded with 0 in ``__init__``."""
        pass

    def reset(self):
        """Record the episode that just ended (if any), reset the wrapped env, return its observation."""
        self.obsr = self.env.reset()
        # Skip recording before the first termination and for zero-step episodes
        # (e.g. two resets in a row), which would add phantom entries.
        if not self.first and self.epi_num > 0:
            self.score_list.append(self.total_score)
            self.epi_list.append(self.epi_num)

        self.epi_num = 0
        self.total_score = 0
        return self.obsr

    def env_reset(self):
        """Clear all episode bookkeeping (does not touch the wrapped env)."""
        self.obsr = 0
        self.epi_num = 0
        self.total_score = 0
        self.first = True
        self.score_list = []
        self.epi_list = []

    def result_plot(self):
        """Scatter-plot cumulative reward and episode length for every recorded episode."""
        fon_size = 19
        fig, (ax_score, ax_len) = plt.subplots(
            1, 2, figsize=(18, 4), dpi=80, facecolor='w', edgecolor='k')

        # Both panels use the full lists so episode indices line up.
        ax_score.scatter(np.arange(len(self.score_list)), self.score_list, s=5)
        ax_score.set_xlabel('episodes', fontsize=fon_size)
        ax_score.set_ylabel('cumulative reward', fontsize=fon_size)

        ax_len.scatter(np.arange(len(self.epi_list)), self.epi_list, s=5)
        ax_len.set_xlabel('episodes', fontsize=fon_size)
        ax_len.set_ylabel('time steps', fontsize=fon_size)
        # 20% headroom above the step limit (1200 for the default max_turn of 1000).
        ax_len.set_ylim([0, 1.2 * self.max_turn])
        plt.show()

# Observer environment

In [4]:
class observer_env(object):
    """Gym-style environment used to train the observer agent.

    The observer's action ``a`` is handed directly to the pre-trained
    controller ``self.agent`` as its observation.  After the wrapped pendulum
    environment advances, ``self.adv_agn`` produces a perturbation for the new
    true observation and the perturbed observation is what ``step()`` returns.

    Episode bookkeeping:
        score_list / epi_list hold the cumulative reward and the number of
        steps of every recorded episode.  An episode is recorded exactly once,
        by ``reset()``, after at least one episode has terminated.  (The
        original code additionally recorded the first episode inside
        ``step()`` before its final reward was added, which duplicated it.)
    """
    agn_index = 1

    def __init__(self):
        self.env = gym.make("RoboschoolInvertedPendulum-v1")
        self.env.seed(0)
        # ------------------ pre-trained agent -----------------------------
        self.agent = SmallReactivePolicy(self.env.observation_space, self.env.action_space)
        self.ratio = 0.7  # multiplier applied to `threshold` when scaling the noise
        # Per-dimension scale of the adversarial noise.
        self.threshold = np.array([ 0.14244403,  0.07706523,  0.00016789,  0.00789366,  0.02395424])

        # ------ dummy adversarial agent -------------------
        # Placeholder; the notebook later assigns the trained adversary agent here.
        self.adv_agn = dummy_adversary_env()

        self.max_turn = 1000  # step limit after which an episode is forced to end
        # One action component per observation dimension.
        self.action_space = spaces.Box(low=-1, high=1, shape=(self.env.observation_space.shape[0],))
        self.observation_space = self.env.observation_space

        # Initialise obsr / epi_num / total_score / first / score_list / epi_list.
        self.env_reset()

    # define reward function
    def reward(self, st):
        """Return a constant reward of 1 per step; the state ``st`` is not used."""
        return 1

    def step(self, a):
        """Feed the observer's output ``a`` to the controller and advance the wrapped env.

        Args:
            a: the observer's action, passed to the controller as its observation.

        Returns:
            (adversarially perturbed observation, reward, done flag, 0).
            NOTE(review): gym's convention is a dict as fourth item; the
            original ``0`` is kept so existing callers see the same value.
        """
        self.epi_num += 1

        ac = self.agent.act(a)
        self.obsr, _, done, _ = self.env.step(ac)

        if self.epi_num >= self.max_turn:
            done = True

        if done:
            # From now on reset() is allowed to record episodes.
            self.first = False

        final_r = self.reward(self.obsr)
        self.total_score += final_r

        # Return the observation with adversarial noise added.
        action = self.adv_agn.action_ev(self.obsr)
        obs = np.clip(action, -1, 1) * self.threshold * self.ratio + self.obsr
        return obs, final_r, done, 0

    def seed(self, a):
        """Accept and ignore a seed; the wrapped env is seeded with 0 in ``__init__``."""
        pass

    def reset(self):
        """Record the episode that just ended (if any), reset the wrapped env, return its observation.

        NOTE(review): unlike ``step()``, the observation returned here carries
        no adversarial noise -- confirm this is intended.
        """
        self.obsr = self.env.reset()
        # Skip recording before the first termination and for zero-step episodes
        # (e.g. two resets in a row), which would add phantom entries.
        if not self.first and self.epi_num > 0:
            self.score_list.append(self.total_score)
            self.epi_list.append(self.epi_num)

        self.epi_num = 0
        self.total_score = 0
        return self.obsr

    def env_reset(self):
        """Clear all episode bookkeeping (does not touch the wrapped env)."""
        self.obsr = 0
        self.epi_num = 0
        self.total_score = 0
        self.first = True
        self.score_list = []
        self.epi_list = []

    def result_plot(self):
        """Scatter-plot cumulative reward and episode length for every recorded episode."""
        fon_size = 19
        fig, (ax_score, ax_len) = plt.subplots(
            1, 2, figsize=(18, 4), dpi=80, facecolor='w', edgecolor='k')

        ax_score.scatter(np.arange(len(self.score_list)), self.score_list, s=5)
        ax_score.set_xlabel('episodes', fontsize=fon_size)
        ax_score.set_ylabel('cumulative reward', fontsize=fon_size)

        ax_len.scatter(np.arange(len(self.epi_list)), self.epi_list, s=5)
        ax_len.set_xlabel('episodes', fontsize=fon_size)
        ax_len.set_ylabel('time steps', fontsize=fon_size)
        # 20% headroom above the step limit (1200 for the default max_turn of 1000).
        ax_len.set_ylim([0, 1.2 * self.max_turn])
        plt.show()

# Adversary agent

In [5]:
# Placeholder environment passed to TRPO_agent_new when the adversary agent is
# built at the end of this cell; a later cell assigns the real adversary_env to
# the agent's `env` attribute.
dummy_env1 = dummy_adversary_env()
class pargm1(object):
    """Plain attribute bag holding the settings passed to the adversary's TRPO agent."""
    def __init__(self):
        settings = (
            ("timesteps_per_batch", 5000),  # what to train on
            ("max_kl", 0.01),
            ("cg_iters", 10),
            ("gamma", 0.995),
            ("lam", 0.97),                  # advantage estimation
            ("entcoeff", 0.0),
            ("cg_damping", 0.1),
            ("vf_stepsize", 1e-3),
            ("vf_iters", 5),
            ("max_timesteps", 5000),
            ("max_episodes", 0),
            ("max_iters", 0),               # time constraint
            ("max_epi_avg", 1001),
            ("callback", None),
        )
        # Assign in declaration order so the instance dict matches the listing above.
        for attr_name, attr_value in settings:
            setattr(self, attr_name, attr_value)

def policy_fn1(name, ob_space, ac_space):
    """Policy factory for the adversary agent.

    Forwards ``name`` and the two spaces to ``MlpPolicy_new`` together with
    ``hid_size=128`` and ``num_hid_layers=2`` and returns whatever it builds.
    """
    network_kwargs = dict(hid_size=128, num_hid_layers=2)
    return MlpPolicy_new(name=name, ob_space=ob_space, ac_space=ac_space, **network_kwargs)
    
# Build the adversary's TRPO agent under the name 'pi1', using the placeholder
# environment, the policy factory and the settings defined above.
# NOTE(review): the name `parg` is rebound by the observer cell further down.
parg = pargm1()
adversary = TRPO_agent_new('pi1', dummy_env1, policy_fn1, parg)

[2018-05-05 06:59:36,863] Making new env: RoboschoolInvertedPendulum-v1


Init param sum 1.91204


In [6]:
#adversary.restore("adversary")

# Observer agent

In [7]:
# Placeholder environment passed to TRPO_agent_new when the observer agent is
# built at the end of this cell; a later cell assigns the real observer_env to
# the agent's `env` attribute.
dummy_env2 = dummy_observer_env()
class pargm2(object):
    """Plain attribute bag holding the settings passed to the observer's TRPO agent."""
    def __init__(self):
        # Populate every attribute in one go; keyword order is the attribute order.
        self.__dict__.update(
            timesteps_per_batch=5000,  # what to train on
            max_kl=0.01,
            cg_iters=10,
            gamma=0.995,
            lam=0.97,                  # advantage estimation
            entcoeff=0.0,
            cg_damping=0.1,
            vf_stepsize=1e-3,
            vf_iters=5,
            max_timesteps=5000,
            max_episodes=0,
            max_iters=0,               # time constraint
            max_epi_avg=1001,
            callback=None,
        )

def policy_fn2(name, ob_space, ac_space):
    """Policy factory for the observer agent.

    Forwards ``name`` and the two spaces to ``MlpPolicy_new`` together with
    ``hid_size=128`` and ``num_hid_layers=2`` and returns whatever it builds.
    """
    policy_kwargs = {
        "name": name,
        "ob_space": ob_space,
        "ac_space": ac_space,
        "hid_size": 128,
        "num_hid_layers": 2,
    }
    return MlpPolicy_new(**policy_kwargs)

    
# Build the observer's TRPO agent under the name 'pi2', using the placeholder
# environment, the policy factory and the settings defined above.
# NOTE(review): this rebinds `parg`, which the adversary cell also assigned.
parg = pargm2()
observer = TRPO_agent_new('pi2', dummy_env2, policy_fn2, parg)

[2018-05-05 06:59:38,143] Making new env: RoboschoolInvertedPendulum-v1


Init param sum 22.8453


In [8]:
#observer.restore("observer")

In [9]:
# Create the two real training environments.
obs_env = observer_env()
adv_env = adversary_env()
# Replace every dummy placeholder with its real counterpart: each environment
# receives the opposing agent, and each agent receives its real environment.
obs_env.adv_agn = adversary
adv_env.observer = observer
adversary.env = adv_env
observer.env = obs_env

[2018-05-05 06:59:39,404] Making new env: RoboschoolInvertedPendulum-v1
[2018-05-05 06:59:39,409] Making new env: RoboschoolInvertedPendulum-v1
[2018-05-05 06:59:39,412] Making new env: RoboschoolInvertedPendulum-v1
[2018-05-05 06:59:39,415] Making new env: RoboschoolInvertedPendulum-v1


In [10]:
# Alternating training: every round calls adversary.learn() once and then
# observer.learn() once, for up to 1000 rounds. The recorded output below ends
# with a KeyboardInterrupt, i.e. this run was stopped before the loop finished.
for i in range(1000):
    print("\n -------------- adversary update -------------- \n")
    adversary.learn()
    print("\n -------------- observer update -------------- \n")
    observer.learn()


 -------------- adversary update -------------- 

********** Iteration 0 ************
[35msampling[0m
[35mdone in 8.369 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.052 seconds[0m
[35mcg[0m
      iter residual norm  soln norm
         0     0.0166          0
         1    0.00267     0.0111
         2    0.00484     0.0332
         3   0.000833     0.0738
         4    8.4e-05      0.144
         5    0.00064      0.149
         6   2.43e-05      0.153
         7   0.000141      0.158
         8   2.67e-06       0.16
         9   3.56e-06      0.162
        10      1e-07      0.162
[35mdone in 0.163 seconds[0m
Expected: 0.012 Actual: 0.013
Stepsize OK!
[35mvf[0m
[35mdone in 1.176 seconds[0m
--------------------------------
| EpLenMean       | 27.7       |
| EpRewMean       | 15         |
| EpThisIter      | 180        |
| EpisodesSoFar   | 180        |
| TimeElapsed     | 9.87       |
| TimestepsSoFar  | 4971       |
| entloss         | 0.0        |
| entropy         | 

Stepsize OK!
[35mvf[0m
[35mdone in 1.119 seconds[0m
-------------------------------
| EpLenMean       | 70.4      |
| EpRewMean       | 70.4      |
| EpThisIter      | 74        |
| EpisodesSoFar   | 182       |
| TimeElapsed     | 18.2      |
| TimestepsSoFar  | 9920      |
| entloss         | 0.0       |
| entropy         | 7.09472   |
| ev_tdlam_before | 0.393     |
| meankl          | 0.0091759 |
| optimgain       | 0.0294718 |
| surrgain        | 0.0294718 |
-------------------------------

 -------------- adversary update -------------- 

********** Iteration 0 ************
[35msampling[0m
[35mdone in 8.928 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.029 seconds[0m
[35mcg[0m
      iter residual norm  soln norm
         0     0.0289          0
         1     0.0125      0.018
         2    0.00263     0.0434
         3     0.0153      0.097
         4    0.00116      0.131
         5    0.00239      0.144
         6    0.00145      0.153
         7    0.00051      0.


 -------------- adversary update -------------- 

********** Iteration 0 ************
[35msampling[0m
[35mdone in 7.628 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.026 seconds[0m
[35mcg[0m
      iter residual norm  soln norm
         0     0.0131          0
         1    0.00222     0.0159
         2    0.00052     0.0347
         3   0.000333     0.0434
         4   0.000138     0.0456
         5   0.000278     0.0577
         6   3.78e-05      0.061
         7   8.54e-05     0.0671
         8   0.000135     0.0726
         9   1.98e-05     0.0773
        10   1.43e-06     0.0801
[35mdone in 0.104 seconds[0m
Expected: 0.009 Actual: 0.009
Stepsize OK!
[35mvf[0m
[35mdone in 1.036 seconds[0m
--------------------------------
| EpLenMean       | 1e+03      |
| EpRewMean       | -75.4      |
| EpThisIter      | 5          |
| EpisodesSoFar   | 5          |
| TimeElapsed     | 8.86       |
| TimestepsSoFar  | 5000       |
| entloss         | 0.0        |
| entropy         | 

[35msampling[0m
[35mdone in 7.820 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.026 seconds[0m
[35mcg[0m
      iter residual norm  soln norm
         0     0.0172          0
         1    0.00123      0.018
         2   0.000741     0.0233
         3   0.000835     0.0448
         4   0.000224     0.0517
         5   0.000293     0.0667
         6   8.45e-05     0.0718
         7   0.000152     0.0748
         8   3.44e-05     0.0912
         9   9.56e-06     0.0921
        10   9.75e-06     0.0926
[35mdone in 0.104 seconds[0m
Expected: 0.009 Actual: 0.009
Stepsize OK!
[35mvf[0m
[35mdone in 1.056 seconds[0m
--------------------------------
| EpLenMean       | 1e+03      |
| EpRewMean       | -74.6      |
| EpThisIter      | 5          |
| EpisodesSoFar   | 5          |
| TimeElapsed     | 9.07       |
| TimestepsSoFar  | 5000       |
| entloss         | 0.0        |
| entropy         | 7.09472    |
| ev_tdlam_before | 0.113      |
| meankl          | 0.0076707  |
| optimg

[35mdone in 8.199 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.023 seconds[0m
[35mcg[0m
      iter residual norm  soln norm
         0    0.00975          0
         1     0.0006     0.0128
         2   0.000716     0.0176
         3   0.000357      0.025
         4   0.000115     0.0287
         5   0.000346     0.0372
         6   5.39e-05     0.0469
         7   6.89e-05     0.0489
         8   3.15e-05     0.0536
         9    4.7e-05     0.0557
        10   3.88e-05     0.0606
[35mdone in 0.109 seconds[0m
Expected: 0.006 Actual: 0.007
Stepsize OK!
[35mvf[0m
[35mdone in 1.198 seconds[0m
--------------------------------
| EpLenMean       | 1e+03      |
| EpRewMean       | -74.6      |
| EpThisIter      | 5          |
| EpisodesSoFar   | 5          |
| TimeElapsed     | 9.59       |
| TimestepsSoFar  | 5000       |
| entloss         | 0.0        |
| entropy         | 7.09472    |
| ev_tdlam_before | 0.233      |
| meankl          | 0.00832593 |
| optimgain       | 0.0065

[35msampling[0m
[35mdone in 9.493 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.022 seconds[0m
[35mcg[0m
      iter residual norm  soln norm
         0     0.0199          0
         1    0.00317     0.0157
         2    0.00286     0.0249
         3    0.00103     0.0402
         4    0.00427     0.0706
         5    0.00113      0.113
         6    0.00108      0.126
         7   0.000417      0.138
         8   0.000337      0.142
         9   0.000167      0.152
        10   0.000134      0.154
[35mdone in 0.113 seconds[0m
Expected: 0.012 Actual: 0.010
Stepsize OK!
[35mvf[0m
[35mdone in 1.147 seconds[0m
--------------------------------
| EpLenMean       | 986        |
| EpRewMean       | 986        |
| EpThisIter      | 5          |
| EpisodesSoFar   | 5          |
| TimeElapsed     | 10.8       |
| TimestepsSoFar  | 4930       |
| entloss         | 0.0        |
| entropy         | 7.09472    |
| ev_tdlam_before | -0.0182    |
| meankl          | 0.00652637 |
| optimg

[35mdone in 7.730 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.027 seconds[0m
[35mcg[0m
      iter residual norm  soln norm
         0     0.0281          0
         1    0.00292     0.0207
         2    0.00508      0.044
         3    0.00175     0.0632
         4     0.0037     0.0766
         5    0.00258      0.133
         6    0.00114      0.151
         7    0.00144      0.167
         8    0.00033      0.177
         9   0.000377      0.181
        10   9.32e-05      0.187
[35mdone in 0.115 seconds[0m
Expected: 0.014 Actual: 0.012
Stepsize OK!
[35mvf[0m
[35mdone in 1.032 seconds[0m
-------------------------------
| EpLenMean       | 1e+03     |
| EpRewMean       | 1e+03     |
| EpThisIter      | 5         |
| EpisodesSoFar   | 5         |
| TimeElapsed     | 8.97      |
| TimestepsSoFar  | 5000      |
| entloss         | 0.0       |
| entropy         | 7.09472   |
| ev_tdlam_before | 0.0195    |
| meankl          | 0.0066379 |
| optimgain       | 0.0118563 |
| sur

[35msampling[0m
[35mdone in 8.387 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.025 seconds[0m
[35mcg[0m
      iter residual norm  soln norm
         0     0.0158          0
         1     0.0081     0.0236
         2    0.00131     0.0374
         3    0.00117     0.0433
         4   0.000253     0.0489
         5   0.000329     0.0517
         6   0.000261     0.0568
         7    0.00026     0.0705
         8   0.000222     0.0766
         9   3.62e-05     0.0843
        10   1.31e-05     0.0863
[35mdone in 0.110 seconds[0m
Expected: 0.011 Actual: 0.011
Stepsize OK!
[35mvf[0m
[35mdone in 1.024 seconds[0m
--------------------------------
| EpLenMean       | 1e+03      |
| EpRewMean       | -73.3      |
| EpThisIter      | 5          |
| EpisodesSoFar   | 5          |
| TimeElapsed     | 9.61       |
| TimestepsSoFar  | 5000       |
| entloss         | 0.0        |
| entropy         | 7.09472    |
| ev_tdlam_before | 0.685      |
| meankl          | 0.00855586 |
| optimg

[35msampling[0m
[35mdone in 7.281 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.024 seconds[0m
[35mcg[0m
      iter residual norm  soln norm
         0     0.0142          0
         1    0.00514     0.0146
         2    0.00499     0.0372
         3    0.00206     0.0499
         4    0.00127     0.0603
         5   0.000661     0.0651
         6   0.000376     0.0728
         7   0.000223     0.0759
         8   0.000405      0.081
         9   0.000216     0.0951
        10   0.000249      0.103
[35mdone in 0.110 seconds[0m
Expected: 0.011 Actual: 0.010
Stepsize OK!
[35mvf[0m
[35mdone in 0.990 seconds[0m
--------------------------------
| EpLenMean       | 1e+03      |
| EpRewMean       | 1e+03      |
| EpThisIter      | 5          |
| EpisodesSoFar   | 5          |
| TimeElapsed     | 8.47       |
| TimestepsSoFar  | 5000       |
| entloss         | 0.0        |
| entropy         | 7.09472    |
| ev_tdlam_before | -2.03e-06  |
| meankl          | 0.00790512 |
| optimg

[35mdone in 7.325 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.022 seconds[0m
[35mcg[0m
      iter residual norm  soln norm
         0      0.035          0
         1     0.0133     0.0227
         2     0.0102     0.0515
         3    0.00219     0.0687
         4    0.00266     0.0751
         5    0.00178     0.0902
         6   0.000868     0.0999
         7    0.00239       0.12
         8   0.000717      0.141
         9   0.000818      0.154
        10   0.000552      0.171
[35mdone in 0.093 seconds[0m
Expected: 0.016 Actual: 0.014
Stepsize OK!
[35mvf[0m
[35mdone in 0.971 seconds[0m
--------------------------------
| EpLenMean       | 1e+03      |
| EpRewMean       | 1e+03      |
| EpThisIter      | 5          |
| EpisodesSoFar   | 5          |
| TimeElapsed     | 8.48       |
| TimestepsSoFar  | 5000       |
| entloss         | 0.0        |
| entropy         | 7.09472    |
| ev_tdlam_before | 2.06e-05   |
| meankl          | 0.00782845 |
| optimgain       | 0.0142

[35msampling[0m
[35mdone in 8.092 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.030 seconds[0m
[35mcg[0m
      iter residual norm  soln norm
         0     0.0309          0
         1    0.00551     0.0139
         2    0.00594     0.0217
         3    0.00199     0.0324
         4    0.00132     0.0383
         5    0.00223     0.0567
         6    0.00115     0.0669
         7    0.00132      0.084
         8    0.00141     0.0974
         9   0.000548      0.115
        10   0.000143      0.118
[35mdone in 0.113 seconds[0m
Expected: 0.011 Actual: 0.011
Stepsize OK!
[35mvf[0m
[35mdone in 1.236 seconds[0m
--------------------------------
| EpLenMean       | 1e+03      |
| EpRewMean       | -63.2      |
| EpThisIter      | 5          |
| EpisodesSoFar   | 5          |
| TimeElapsed     | 9.53       |
| TimestepsSoFar  | 5000       |
| entloss         | 0.0        |
| entropy         | 7.09472    |
| ev_tdlam_before | 0.536      |
| meankl          | 0.00777012 |
| optimg

[35mdone in 7.837 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.025 seconds[0m
[35mcg[0m
      iter residual norm  soln norm
         0     0.0188          0
         1    0.00469      0.013
         2    0.00531     0.0243
         3   0.000752     0.0402
         4    0.00101     0.0442
         5   0.000319      0.055
         6    0.00084      0.065
         7   0.000584     0.0754
         8   0.000195      0.099
         9   0.000173      0.103
        10   1.46e-05      0.107
[35mdone in 0.112 seconds[0m
Expected: 0.010 Actual: 0.010
Stepsize OK!
[35mvf[0m
[35mdone in 1.072 seconds[0m
--------------------------------
| EpLenMean       | 1e+03      |
| EpRewMean       | -73.4      |
| EpThisIter      | 5          |
| EpisodesSoFar   | 5          |
| TimeElapsed     | 9.11       |
| TimestepsSoFar  | 5000       |
| entloss         | 0.0        |
| entropy         | 7.09472    |
| ev_tdlam_before | 0.0587     |
| meankl          | 0.00777638 |
| optimgain       | 0.0102

[35msampling[0m
[35mdone in 7.567 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.025 seconds[0m
[35mcg[0m
      iter residual norm  soln norm
         0     0.0105          0
         1    0.00382     0.0148
         2    0.00236     0.0334
         3   0.000827     0.0387
         4   0.000589     0.0424
         5    0.00125     0.0504
         6   0.000451     0.0617
         7   0.000654     0.0694
         8   0.000287     0.0793
         9   0.000256     0.0853
        10   0.000293     0.0937
[35mdone in 0.108 seconds[0m
Expected: 0.009 Actual: 0.009
Stepsize OK!
[35mvf[0m
[35mdone in 0.963 seconds[0m
--------------------------------
| EpLenMean       | 1e+03      |
| EpRewMean       | 1e+03      |
| EpThisIter      | 5          |
| EpisodesSoFar   | 5          |
| TimeElapsed     | 8.72       |
| TimestepsSoFar  | 5000       |
| entloss         | 0.0        |
| entropy         | 7.09472    |
| ev_tdlam_before | -0.000306  |
| meankl          | 0.00795295 |
| optimg

[35msampling[0m
[35mdone in 8.307 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.026 seconds[0m
[35mcg[0m
      iter residual norm  soln norm
         0     0.0198          0
         1    0.00377    0.00975
         2     0.0019     0.0137
         3   0.000612     0.0193
         4   0.000934     0.0256
         5   0.000716     0.0342
         6   0.000493     0.0455
         7   0.000772     0.0527
         8   0.000122     0.0672
         9   0.000147     0.0711
        10    5.6e-05      0.073
[35mdone in 0.129 seconds[0m
Expected: 0.008 Actual: 0.008
Stepsize OK!
[35mvf[0m
[35mdone in 1.277 seconds[0m
--------------------------------
| EpLenMean       | 1e+03      |
| EpRewMean       | -72.8      |
| EpThisIter      | 5          |
| EpisodesSoFar   | 5          |
| TimeElapsed     | 9.83       |
| TimestepsSoFar  | 5000       |
| entloss         | 0.0        |
| entropy         | 7.09472    |
| ev_tdlam_before | 0.0917     |
| meankl          | 0.00856412 |
| optimg

[35msampling[0m
[35mdone in 7.886 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.026 seconds[0m
[35mcg[0m
      iter residual norm  soln norm
         0      0.033          0
         1    0.00761     0.0132
         2    0.00915     0.0284
         3    0.00561     0.0645
         4    0.00736     0.0934
         5    0.00252      0.146
         6     0.0024      0.155
         7    0.00141      0.174
         8    0.00171      0.184
         9   0.000852      0.197
        10   0.000675      0.201
[35mdone in 0.120 seconds[0m
Expected: 0.016 Actual: 0.015
Stepsize OK!
[35mvf[0m
[35mdone in 1.014 seconds[0m
--------------------------------
| EpLenMean       | 1e+03      |
| EpRewMean       | 1e+03      |
| EpThisIter      | 5          |
| EpisodesSoFar   | 5          |
| TimeElapsed     | 9.12       |
| TimestepsSoFar  | 5000       |
| entloss         | 0.0        |
| entropy         | 7.09472    |
| ev_tdlam_before | 0.00898    |
| meankl          | 0.00659314 |
| optimg

[35msampling[0m


KeyboardInterrupt: 