In [1]:
import env
import gym
from model_free import TD3

class ActionRepeat(object):
    """Environment wrapper that applies every action `amount` times per step.

    Attributes the wrapper does not define itself are looked up on the wrapped
    environment. Constructing the wrapper divides the wrapped environment's
    `_max_episode_steps` by `amount` in place.
    """

    def __init__(self, env, amount):
        """Wrap `env` so that each `step` call repeats the action `amount` times.

        Args:
            env: environment exposing `step`, `reset` and `_max_episode_steps`.
            amount: number of inner steps per outer step; must be at least 1.

        Raises:
            ValueError: if `amount` is smaller than 1.
        """
        # Zero would divide by zero below; a negative value would leave `step`
        # with an empty loop and no observation to return.
        if amount < 1:
            raise ValueError("amount must be >= 1, got {!r}".format(amount))

        self._env = env
        self._amount = amount
        # Mutates the wrapped environment: its step limit is now counted in
        # repeated (outer) steps.
        self._env._max_episode_steps = self._env._max_episode_steps // amount

    def __getattr__(self, name):
        """Delegate lookups of attributes missing on the wrapper to the wrapped env.

        Raises:
            AttributeError: if `_env` itself has not been set yet, or the
                wrapped environment does not have the attribute.
        """
        # __getattr__ only runs when normal lookup fails. If `_env` is the
        # missing attribute (e.g. on the bare instance that copy/pickle create
        # before restoring state), reading `self._env` below would re-enter
        # this method forever.
        if name == '_env':
            raise AttributeError(name)
        return getattr(self._env, name)

    def step(self, action):
        """Apply `action` for `amount` consecutive inner steps.

        Returns:
            Tuple `(obs, total_reward, False, {})`: the observation of the last
            inner step and the sum of the inner rewards.
        """
        total_reward = 0

        for _ in range(self._amount):
            # The wrapped env's done flag and info dict are discarded.
            obs, reward, _done, _info = self._env.step(action)
            total_reward += reward

        # NOTE(review): done is always reported as False, so termination
        # signalled by the wrapped environment is never visible to callers.
        return obs, total_reward, False, {}

    def reset(self, *args, **kwargs):
        """Reset the wrapped environment, forwarding all arguments, and return its result."""
        return self._env.reset(*args, **kwargs)

# Build the environment and repeat every policy action 4 times per step.
# NOTE(review): 'MyHalfCheetah-v2' is presumably registered by `import env` above — confirm.
env = ActionRepeat(gym.make('MyHalfCheetah-v2'), 4)

# Sizes and action bound passed to the TD3 constructor.
state_dim, action_dim = env.observation_space.shape[0], env.action_space.shape[0]
max_action = float(env.action_space.high[0])

# Create the policy and call its `load` with the environment name and 'save/TD3'.
policy = TD3(state_dim, action_dim, max_action)
policy.load('MyHalfCheetah-v2', 'save/TD3')

In [2]:
import numpy as np

def print_rollout_stats(obs, acts, reward_sum):
    """Print the cumulative reward, then min/max/mean/std of actions and observations.

    Args:
        obs: object providing `min()`, `max()`, `mean()` and `std()` (observations).
        acts: object providing the same four methods (actions).
        reward_sum: value printed as the cumulative reward.
    """
    print("Cumulative reward ", reward_sum)

    action_line = "Action min {}, max {}, mean {}, std {}"
    obs_line = "Obs min {}, max {}, mean {}, std {}"

    # Actions are reported before observations.
    for template, values in ((action_line, acts), (obs_line, obs)):
        summary = (values.min(), values.max(), values.mean(), values.std())
        print(template.format(*summary))

def sample_rollout(env, policy):
    """Run `policy` in `env` for `env._max_episode_steps` steps from a fresh reset.

    The done flag and info returned by `env.step` are ignored, so the rollout
    always runs for the full number of steps.

    Returns:
        Tuple `(observations, actions, reward_sum)`: an array holding the reset
        observation followed by one observation per step, an array with one
        action per step, and the sum of all step rewards.
    """
    current = env.reset()
    observations, actions, reward_sum = [current], [], 0

    for _ in range(env._max_episode_steps):
        # Act on the most recent observation.
        action = policy.act(current)
        actions.append(action)

        current, reward, _done, _info = env.step(action)
        observations.append(current)
        reward_sum += reward

    return np.array(observations), np.array(actions), reward_sum
    
    
# Sample 20 rollouts and keep the observations and actions of each.
O, A = [], []
for _ in range(20):
    obs, acts, reward_sum = sample_rollout(env, policy)
    O += [obs]
    A += [acts]

# Stack the per-rollout arrays and write them to disk.
O = np.array(O)
A = np.array(A)
np.save('TD3_obs.npy', O)
np.save('TD3_act.npy', A)

In [18]:
# Reload the saved TD3 rollouts and print their shapes and min/max/mean/std.
O = np.load('TD3_obs.npy')
A = np.load('TD3_act.npy')
print(O.shape, A.shape)
print(O.min(), O.max(), O.mean(), O.std())
print(A.min(), A.max(), A.mean(), A.std())

(20, 251, 18) (20, 250, 6)
-14.181554903081388 14.709950892518957 0.7288564677261283 2.1870828065655603
-1.0 1.0 -0.44035676 0.8007284


In [17]:
# Step from the same simulator state with actions of all 1, all 2 and all 0.5
# and print each step result.
env.reset()
state = env.sim.get_state()
unit_action = np.ones(env.action_space.shape)
for action in (unit_action, unit_action * 2, unit_action * 0.5):
    # Restore the saved simulator state before every probe.
    env.sim.set_state(state)
    print(env.step(action))

(array([-0.03721834, -0.1751069 , -0.12655779,  0.48382039,  0.55996988,
        0.49546049,  0.71287399,  0.62377733,  0.51271492, -0.02325877,
       -1.46481013,  0.91993719, -2.76050593, -3.01385295, -2.40930781,
       -0.207205  , -5.03486061, -0.07091486]), 0.06025153047648446, False, {})
(array([-0.03721834, -0.1751069 , -0.12655779,  0.48382039,  0.55996988,
        0.49546049,  0.71287399,  0.62377733,  0.51271492, -0.02325877,
       -1.46481013,  0.91993719, -2.76050593, -3.01385295, -2.40930781,
       -0.207205  , -5.03486061, -0.07091486]), -7.139748469523517, False, {})
(array([-0.00453046, -0.14839396, -0.06351854,  0.26520561,  0.27608194,
        0.25260678,  0.44108708,  0.33240966,  0.33058094, -0.06338864,
       -1.61868918,  0.54272409, -2.30551309, -1.5785326 , -1.35781413,
       -1.29138198, -0.95469576, -0.8698062 ]), 1.3223125350474048, False, {})


In [19]:
# Load the expert demonstrations and print their shapes and min/max/mean/std.
O, A = (
    np.load('expert_demonstrations/half_cheetah/expert_obs.npy'),
    np.load('expert_demonstrations/half_cheetah/expert_act.npy'),
)
print(O.shape, A.shape)
print(O.min(), O.max(), O.mean(), O.std())
print(A.min(), A.max(), A.mean(), A.std())