In [1]:
import env
import gym
from model_free import TD3

class ActionRepeat(object):
    def __init__(self, env, amount):
        self._env = env
        self._amount = amount
        self._env._max_episode_steps = self._env._max_episode_steps // amount

    def __getattr__(self, name):
        return getattr(self._env, name)

    def step(self, action):
        total_reward = 0

        for _ in range(self._amount):
            obs, reward, _, _ = self._env.step(action)
            total_reward += reward

        return obs, total_reward, False, {}

    def reset(self, *args, **kwargs):
        return self._env.reset(*args, **kwargs)

env_name, env_str = 'MyHalfCheetah-v2', 'half_cheetah'
env = gym.make(env_name)
env = ActionRepeat(env, 4)

In [2]:
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])
                   
policy = TD3(state_dim, action_dim, max_action)
policy.load(env_name, 'save/TD3')

In [3]:
import numpy as np

def print_rollout_stats(obs, acts, reward_sum):
    print("Cumulative reward ", reward_sum)
    print("Action min {}, max {}, mean {}, std {}".format(
        acts.min(), acts.max(), acts.mean(), acts.std()))
    print("Obs min {}, max {}, mean {}, std {}".format(
        obs.min(), obs.max(), obs.mean(), obs.std()))

def sample_rollout(env, policy):
    observations, actions, reward_sum = [env.reset()], [], 0

    for t in range(env._max_episode_steps):
        actions.append(policy.act(observations[t]))
        obs, reward, _, _ = env.step(actions[t])
        observations.append(obs)
        reward_sum += reward

    return np.array(observations), np.array(actions), reward_sum
    
    
O, A = [], []
for _ in range(20):
    obs, acts, reward_sum = sample_rollout(env, policy)
    O.append(obs)
    A.append(acts)
                   
O, A = np.array(O), np.array(A)
np.save('TD3_obs.npy', O)
np.save('TD3_act.npy', A)

In [4]:
O, A = np.load('TD3_obs.npy'), np.load('TD3_act.npy')
print(O.shape, A.shape)
print(O.min(), O.max(), O.mean(), O.std())
print(A.min(), A.max(), A.mean(), A.std())

(20, 251, 18) (20, 250, 6)
-12.336326217142314 14.718697760264245 0.7352599223611681 2.1942347662318755
-1.0 1.0 -0.44623846 0.7994219


In [5]:
O = np.load('expert_demonstrations/%s/expert_obs.npy' % env_str)
A = np.load('expert_demonstrations/%s/expert_act.npy' % env_str)
action_bound = env.action_space.high[0]
print(O.shape, A.shape)
print(O.min(), O.max(), O.mean(), O.std())
print(A.min(), A.max(), A.mean(), A.std())
print((A > action_bound).mean() + (A < -action_bound).mean())
A = A.clip(-action_bound, action_bound)
print(A.min(), A.max(), A.mean(), A.std())
print((A > action_bound).mean() + (A < -action_bound).mean())
np.save('expert_demonstrations/%s/expert_act_clipped.npy' % env_str, A)

(20, 251, 18) (20, 250, 6)
-17.2598140331567 466.09138757351496 11.581367552568011 51.45506850905242
-2.1904756893737196 2.157680768107007 0.0727562300912073 0.8969060018290144
0.36443333333333333
-1.0 1.0 0.04962332196652116 0.7189090617904839
0.0


In [17]:
env._get_obs().shape()

AttributeError: attempted to get missing private attribute '_get_obs'

In [6]:
O = np.load('TD3_obs.npy')
for obs in O[1]:
    print(obs.mean())

0.003153189712112166
0.2465493737723724
0.05500830218536222
0.31009675441398626
0.6198053941473844
0.4869296304847638
0.405669161688485
0.715645389195832
0.3802738850495617
0.38182478438605716
0.8514349179731628
0.726162175551539
0.6586207477361681
0.9254067439254753
0.7512954428298099
0.7109934457430346
0.7094032323119674
0.8019485187372498
0.747324673914257
0.6610437979484807
0.7178413057854184
0.8721927936121902
0.6626242043128375
0.8584471378281374
0.5973081969009659
0.8463045902657306
0.48007948055776034
0.8718310290200436
0.6007714856386062
0.8598820493177165
0.7337096365659758
0.34883407131686006
0.8223317786524986
0.9165135973816783
0.7629992080213696
0.9498284725116366
0.3882212258165559
0.9308344374136747
0.5779225801511952
0.6669762165175268
0.7418723876614893
0.7423628096278309
1.0110690036333128
0.9148003317349508
0.9781055237551455
0.7976149772896914
0.9504150218596088
0.5512632165863709
1.0627403977660523
0.9608293544038058
1.0446683291053893
0.619854235039909
1.05170063

In [7]:
O = np.load('expert_demonstrations/%s/expert_obs.npy' % env_str)
for obs in O[0]:
    print(obs[2])

-0.024809023797316282
0.09019494544689409
-0.008918224802306326
-0.15917843048015792
-0.4010779804783073
-0.496642129963342
-0.4381436851605807
-0.3015310394411798
-0.1490267673900459
0.33735351684413506
0.9106375019076508
2.083977957819713
3.2062940693942648
4.018262136625541
5.253952793027937
5.683547093521751
6.088027623894855
6.4000531520902255
5.73624358005557
5.779002444995669
5.8460378813556835
5.821827062537631
6.149583365213306
5.8869629558060135
5.832016440634667
5.825258724235282
5.9341983621811405
7.216045098256817
8.357130899688233
9.783852014800908
11.695839473101836
13.190416175772677
15.248469564390811
17.07484421636302
18.442464687451594
20.06842725252947
21.83365201000165
23.827416264574726
25.38865489469344
27.340536648422233
29.188493613502473
31.068180221677316
33.454426542750504
34.53853460719114
36.36236666009228
37.59343290224041
38.634497483481056
39.58550622339121
41.15892480119729
43.3048040352506
45.60752512594769
47.18192963899897
49.12399290839175
50.55352

In [14]:
O1 = np.load('TD3_obs.npy')
O2 = np.load('expert_demonstrations/%s/expert_obs.npy' % env_str)
for obs1, obs2 in zip(O1[0], O2[0]):
    print(obs1 - obs2)

[ 0.         -0.09686022 -0.03026892  0.09204895 -0.01290204  0.01938083
 -0.07372141  0.0391     -0.08631285  0.1571901  -0.10830046 -0.21470587
 -0.18811098  0.1940403  -0.00554363 -0.23583042  0.16517835 -0.07463191]
[ 4.49139610e-01 -1.06569473e-01 -1.08082080e-03  2.75109273e-01
  9.07266960e-01  4.66428921e-01 -7.50727690e-01 -4.74136901e-01
 -2.53709741e-01  4.83576048e-01  8.78551077e-01  1.04991252e+00
  8.20440043e-01 -3.66409120e+00 -1.21771983e+00  2.61553655e+00
  1.13487308e+00 -7.54354130e-01]
[-1.0677207  -0.02347708  0.11332753 -1.26923954 -0.91922519 -0.78716677
  0.1641876  -0.22806108 -0.36977885 -0.44170553  0.71464264 -3.11507217
 -3.01376654 -4.04086126 -1.05801167 -0.2891185   8.04766592  7.58167762]
[-0.32154086  0.06038345 -0.028182    1.33601511  1.10182582 -0.22519454
 -1.09320558 -1.41489474 -1.00673199 -0.55822966  1.54717711  2.60094562
 -6.29125649 -5.52958474 -3.01727861  3.65924273 -1.18732857  1.04662388]
[-1.70405037  0.01933206  0.37550912 -0.387432

In [8]:
env.reset()
state = env.sim.get_state()
env.sim.set_state(state)
print(env.step(np.ones(env.action_space.shape)))
env.sim.set_state(state)
print(env.step(np.ones(env.action_space.shape) * 2))
env.sim.set_state(state)
print(env.step(np.ones(env.action_space.shape) * -1))
env.sim.set_state(state)
print(env.step(np.ones(env.action_space.shape) * -2))
env.sim.set_state(state)
print(env.step(np.ones(env.action_space.shape) * 0.5))

(array([-0.0931318 , -0.1893496 , -0.10108624,  0.47824186,  0.53145603,
        0.49524015,  0.71350048,  0.6513962 ,  0.51187408, -0.09638678,
       -1.55077777,  0.75399848, -1.91274486, -2.94163854, -1.96745641,
       -0.13528475, -5.41078966, -0.05418581]), -0.7334638576027419, False, {})
(array([-0.0931318 , -0.1893496 , -0.10108624,  0.47824186,  0.53145603,
        0.49524015,  0.71350048,  0.6513962 ,  0.51187408, -0.09638678,
       -1.55077777,  0.75399848, -1.91274486, -2.94163854, -1.96745641,
       -0.13528475, -5.41078966, -0.05418581]), -7.933463857602743, False, {})
(array([ 0.42250957, -0.13857564,  0.22261785, -0.51694316, -0.52034723,
       -0.41995824, -0.66445702, -0.5947448 , -0.51148235,  0.40282455,
       -1.74792844, -0.98471174,  0.82150578,  4.74352975,  0.01052509,
        3.45232942,  3.14208079,  0.10856296]), -4.0198240988165255, False, {})
(array([ 0.42250957, -0.13857564,  0.22261785, -0.51694316, -0.52034723,
       -0.41995824, -0.66445702, -0.5