In [1]:
# Turn on IPython's autoreload extension so that edits to imported project
# modules take effect without restarting the kernel ("%autoreload 2" is the
# mode that re-imports modules before each cell is executed).
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

## Todo list

- [x] Refactor imitation train/eval loop
- [ ] Invertible networks
- [ ] LQR loss
- [ ] Compare to the paper (Extracting Latent State Representations with Linear Dynamics from Rich Observations)
    * That paper has problems with Hopper (PyBullet)

In [2]:
from pathlib import Path
import random
import math
import torch
import torch.nn as nn
import mujoco_py
import gym
import numpy as np
import tqdm

import matplotlib.pyplot as plt
import pandas as pd
from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler

In [3]:
import rl_research.algo.ppo as ppo
import rl_research.algo.ppo.ppo
import rl_research.algo.ppo.model
import rl_research.algo.trivial_ddl.ddl as ddl
import rl_research.recording as recording

In [4]:
# Directory of the expert PPO run and the two checkpoint files stored under it.
run_path = Path("runs/ppo_mj_hopper_prod9")
pth_path = run_path.joinpath("ckpt/best.pth")
pth_preproc_path = run_path.joinpath("ckpt/best.preproc.pth")

In [5]:
# Rebuild the expert agent from the two checkpoint files: the object stored at
# pth_path is used directly as the actor-critic, while the preprocessor is
# reconstructed from its saved state via Preprocessor.from_state.
# NOTE(review): torch.load unpickles the file, and unpickling can execute
# arbitrary code -- only load checkpoints that come from a trusted source.
actor_critic = torch.load(pth_path)
preprocessor = ppo.ppo.Preprocessor.from_state(torch.load(pth_preproc_path))
agent = ppo.ppo.PPO_Agent(preprocessor, actor_critic)

In [6]:
# Evaluate the expert on Hopper-v3 with evaluate_n_episodes (n = 300).
# agg_trajectory is kept because the cells below build the imitation dataset
# from its obs / obs_next / act / done fields.
mean_rew, rews, agg_trajectory = recording.evaluate_n_episodes("Hopper-v3", agent, 300)
print(f"Mean reward (Expert PPO): {mean_rew}")

100%|█████████████████████████████████████████| 300/300 [01:26<00:00,  3.47it/s]

Mean reward (Expert PPO): 3350.4009052207903





In [34]:
# One DataFrame per recorded quantity. Columns are named after the component
# index (obs_i, obs_next_i, act_i); the width is read off the first sample.
df_obs = pd.DataFrame(
    agg_trajectory.obs,
    columns=[f"obs_{i}" for i in range(agg_trajectory.obs[0].shape[0])],
)
df_obs_next = pd.DataFrame(
    agg_trajectory.obs_next,
    columns=[f"obs_next_{i}" for i in range(agg_trajectory.obs_next[0].shape[0])],
)
df_act = pd.DataFrame(
    agg_trajectory.act,
    columns=[f"act_{i}" for i in range(agg_trajectory.act[0].shape[0])],
)
# Episode-termination flags, cast to integers in a single "done" column.
df_done = pd.DataFrame(agg_trajectory.done, columns=["done"]).astype(int)

In [59]:
# Row index of the final step of every episode (rows where done == 1).
traj_end_ids = df_done.index.to_numpy()[df_done.done.to_numpy() == 1]
# Every episode begins on the row after the previous episode's last row; the
# first one begins at row 0. Allocate the array as integers up front:
# np.zeros defaults to float64, and float values cannot be used as row indices.
traj_start_ids = np.zeros(traj_end_ids.shape[0], dtype=int)
traj_start_ids[1:] = traj_end_ids[:-1] + 1

In [62]:
# ndarray.astype returns a new array instead of converting in place, so the
# result has to be bound back to the name for the integer conversion to stick.
traj_start_ids = traj_start_ids.astype(int, copy=False)
traj_end_ids = traj_end_ids.astype(int, copy=False)
# Last expression of the cell: display the episode end indices.
traj_end_ids

array([   999,   1999,   2999,   3666,   4666,   5666,   6666,   7666,
         8666,   9666,  10666,  11215,  12215,  13215,  14215,  15215,
        16215,  17215,  18215,  19215,  20215,  20970,  21970,  22970,
        23970,  24970,  25970,  26970,  27970,  28533,  29533,  30082,
        31082,  32082,  33082,  34082,  35082,  36082,  36687,  37687,
        38687,  39249,  40249,  41142,  42142,  42695,  43695,  44253,
        45253,  46253,  47253,  48253,  49253,  50187,  51187,  52187,
        53187,  54187,  55187,  56187,  57145,  58145,  59145,  60145,
        61145,  62145,  63061,  64061,  65061,  66061,  67061,  68061,
        69061,  70061,  71061,  72061,  73061,  73681,  74681,  75681,
        76681,  77563,  78563,  79451,  80451,  81451,  82451,  83451,
        84451,  85451,  86451,  87451,  88451,  89451,  90451,  91451,
        92451,  93451,  94451,  95451,  96401,  97401,  98401,  99401,
       100401, 101281, 102281, 103281, 103917, 104917, 105917, 106917,
      

In [73]:
# Observation / action widths, read off the first recorded sample.
obs_size = agg_trajectory.obs[0].shape[0]
act_size = agg_trajectory.act[0].shape[0]
# The model and the policy are created with the same hidden_size / latent_size.
ddl_model = ddl.Model(obs_size, act_size, hidden_size=256, latent_size=32)
ddl_policy = ddl.PolicyModel(act_size, hidden_size=256, latent_size=32)
# The agent reuses the preprocessor that was loaded with the expert checkpoint.
ddl_agent = ddl.Agent(ddl_model, ddl_policy, preprocessor)

# NOTES
* to numpy

In [65]:
def split_data(df_act, df_obs, df_obs_next, val_frac=0.20, seed=None):
    """Randomly split three row-aligned frames into train and validation parts.

    Row ``i`` of each frame is treated as one sample. A single shuffled list of
    row positions is used to slice all three frames, so the returned train
    (and validation) frames stay aligned with each other.

    Args:
        df_act: Frame of actions, one row per sample.
        df_obs: Frame of observations, one row per sample.
        df_obs_next: Frame of next observations, one row per sample.
        val_frac: Fraction of rows, in ``[0, 1]``, held out for validation.
            The train part gets ``ceil((1 - val_frac) * n_rows)`` rows.
        seed: Optional seed for a private shuffle RNG, making the split
            reproducible. ``None`` (the default) shuffles with the global
            ``random`` state, exactly as before this parameter existed.

    Returns:
        A 6-tuple ``(act_train, obs_train, obs_next_train,
        act_val, obs_val, obs_next_val)``.

    Raises:
        AssertionError: If the frames differ in row count or ``val_frac`` is
            outside ``[0, 1]``.
    """
    assert df_obs.shape[0] == df_obs_next.shape[0] == df_act.shape[0], \
        "df_act, df_obs and df_obs_next must have the same number of rows"
    assert 0.0 <= val_frac <= 1.0, "val_frac must lie in [0, 1]"

    num_data_points = df_act.shape[0]
    num_train_data_ids = math.ceil((1 - val_frac) * num_data_points)

    data_ids = list(range(num_data_points))
    if seed is None:
        # Global RNG: honours any random.seed(...) set by the caller.
        random.shuffle(data_ids)
    else:
        # Private RNG: reproducible without touching the global random state.
        random.Random(seed).shuffle(data_ids)

    train_data_ids = data_ids[:num_train_data_ids]
    val_data_ids = data_ids[num_train_data_ids:]

    # The same position lists index every frame, keeping samples aligned.
    frames = (df_act, df_obs, df_obs_next)
    train_parts = tuple(frame.iloc[train_data_ids] for frame in frames)
    val_parts = tuple(frame.iloc[val_data_ids] for frame in frames)
    return train_parts + val_parts

In [66]:
# Split the transitions into train / validation parts using split_data's
# default val_frac.
(
    df_act_train, df_obs_train, df_obs_next_train,
    df_act_val, df_obs_val, df_obs_next_val
) = split_data(
    df_act, df_obs, df_obs_next
)

In [74]:
# Call ddl.train_latent_one_epoch 15 times on ddl_model with the train and
# validation frames, printing the train / eval statistics returned by each call.
for i_epoch in range(15):
    train_stats, eval_stats = ddl.train_latent_one_epoch(
        ddl_model,
        preprocessor,
        df_act_train, df_obs_train, df_obs_next_train,
        df_act_val, df_obs_val, df_obs_next_val,
        lr=1e-4, batch_size=32, regression_weight=1.0
    )
    print(f"Train Epoch #{i_epoch}: {train_stats}")
    print(f"Eval  Epoch #{i_epoch}: {eval_stats}")

Train Epoch #0: (det(A)=48263072.000, det(B)=-612080320.000, loss=983.7062966, regression_loss=0.7242612, gersh(A)=495.98108, gersh(B)=487.00096, is_stable(A)=False, is_stable(B)=False, time=10.6s)
Eval  Epoch #0: (det(A)=48263072.000, det(B)=-612080320.000, loss=506.5692243, regression_loss=0.1490889, gersh(A)=258.42993, gersh(B)=247.99020, is_stable(A)=False, is_stable(B)=False, time=0.9s)
Train Epoch #1: (det(A)=243.223, det(B)=190768.109, loss=241.6274936, regression_loss=0.1184914, gersh(A)=126.01329, gersh(B)=115.49571, is_stable(A)=False, is_stable(B)=False, time=11.5s)
Eval  Epoch #1: (det(A)=243.223, det(B)=190768.109, loss=60.3686060, regression_loss=0.3483042, gersh(A)=34.83260, gersh(B)=25.18771, is_stable(A)=False, is_stable(B)=False, time=0.9s)
Train Epoch #2: (det(A)=995.262, det(B)=9256607.000, loss=18.3141022, regression_loss=0.2146491, gersh(A)=9.61369, gersh(B)=8.48576, is_stable(A)=True, is_stable(B)=True, time=11.1s)
Eval  Epoch #2: (det(A)=995.262, det(B)=9256607.

In [75]:
# Call ddl.train_policy_one_epoch 24 times, passing both ddl_model and
# ddl_policy, and print the train / eval statistics returned by each call.
# NOTE(review): presumably only ddl_policy is updated here while ddl_model
# stays fixed -- confirm in ddl.train_policy_one_epoch.
for i_epoch in range(24):
    train_stats, eval_stats = ddl.train_policy_one_epoch(
        ddl_model,
        ddl_policy,
        preprocessor,
        df_act_train, df_obs_train, df_obs_next_train,
        df_act_val, df_obs_val, df_obs_next_val,
        lr=1e-3, batch_size=32
    )
    print(f"Train Epoch #{i_epoch}: {train_stats}")
    print(f"Eval  Epoch #{i_epoch}: {eval_stats}")

Train Epoch #0: (loss=2.9432009, time=7.14s)
Eval  Epoch #0: (loss=2.6546783, time=0.57s)
Train Epoch #1: (loss=2.4812912, time=6.42s)
Eval  Epoch #1: (loss=2.2648967, time=0.58s)
Train Epoch #2: (loss=2.1813376, time=6.52s)
Eval  Epoch #2: (loss=2.1803170, time=0.58s)
Train Epoch #3: (loss=2.0918030, time=6.70s)
Eval  Epoch #3: (loss=2.0035190, time=0.59s)
Train Epoch #4: (loss=2.0352445, time=6.49s)
Eval  Epoch #4: (loss=1.9267732, time=0.63s)
Train Epoch #5: (loss=2.0016060, time=6.68s)
Eval  Epoch #5: (loss=1.9122217, time=0.60s)
Train Epoch #6: (loss=1.9510232, time=6.67s)
Eval  Epoch #6: (loss=1.9864523, time=0.65s)
Train Epoch #7: (loss=1.8987660, time=6.71s)
Eval  Epoch #7: (loss=1.8322712, time=0.59s)
Train Epoch #8: (loss=1.8628444, time=7.09s)
Eval  Epoch #8: (loss=1.7706153, time=0.74s)
Train Epoch #9: (loss=1.8241644, time=7.69s)
Eval  Epoch #9: (loss=1.8140022, time=0.72s)
Train Epoch #10: (loss=1.7982441, time=7.80s)
Eval  Epoch #10: (loss=1.7357274, time=0.70s)
Train Ep

In [76]:
# Evaluate the DDL agent with the same call used for the expert
# (Hopper-v3, n = 300); only the mean reward is kept.
# Note: this rebinds mean_rew, replacing the expert's value from the earlier cell.
mean_rew, _, _ = recording.evaluate_n_episodes("Hopper-v3", ddl_agent, 300)
print(f"Mean reward (DDL): {mean_rew}")

100%|█████████████████████████████████████████| 300/300 [00:03<00:00, 79.98it/s]

Mean reward (DDL): 48.13361213910028





In [14]:
import numpy as np

In [15]:
# Random-action baseline: the agent is built from the environment's action space.
# NOTE(review): an environment object is passed to evaluate_n_episodes here,
# whereas the other cells pass the id string "Hopper-v3" -- confirm that the
# function is meant to accept both.
env = gym.make("Hopper-v3")
random_baseline_agent = recording.RandomAgent(env.action_space)
mean_rew, _, _ = recording.evaluate_n_episodes(env, random_baseline_agent, 300)
print(f"Mean reward (Random): {mean_rew}")

100%|████████████████████████████████████████| 300/300 [00:01<00:00, 252.61it/s]

Mean reward (Random): 19.048736695626665



