<a href="https://colab.research.google.com/github/yasu-k2/adapt-rl/blob/master/adapt_rl.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Adapt RL

## Requirements

In [None]:
# Refresh the apt package index, then install the system packages listed below.
!sudo apt update
# xvfb: X virtual framebuffer -- presumably so rendering works without a physical display (confirm).
!sudo apt install -y xvfb
# python-opengl: OpenGL bindings for Python -- presumably needed by gym's rendering (confirm).
!sudo apt install -y python-opengl
# libopencv-dev: OpenCV development files -- no direct use is visible in this notebook (confirm it is still needed).
!sudo apt install -y libopencv-dev

In [None]:
# Python dependencies used by the cells below.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
%pip install matplotlib
%pip install tensorflow
%pip install torch torchvision
%pip install gym
%pip install pybullet
# Quoted so the shell never treats the brackets of the extra as a glob pattern.
%pip install "ray[rllib]"
%pip install gym-notebook-wrapper

In [None]:
# Clone the Ray repository only when ./ray does not exist yet, so re-running this cell is harmless.
!test -d ./ray || git clone https://github.com/ray-project/ray.git
# -p: create the log/tmp directories without failing when they already exist.
!mkdir -p ./ray/logdir ./ray/tmp

In [None]:
# Work from inside the clone so the relative ./logdir and ./tmp paths used below resolve to
# the directories created in the previous cell. Run this cell only once per session: a second
# run would try to enter ./ray relative to the new working directory.
%cd ./ray

In [None]:
# fix
# In-place patch of the installed torch package: on line 486 of serialization.py the first
# occurrence of `None` is replaced with `'cpu'`.
# NOTE(review): presumably this changes a default `map_location` so checkpoints load on CPU -- confirm.
# The edit is tied to a fixed line number and to the python3.6 dist-packages path, so it only
# does what was intended for the torch version this notebook was written against.
# Uncomment the next line to inspect the file (with line numbers) before patching.
#!cat -n /usr/local/lib/python3.6/dist-packages/torch/serialization.py
!sed -i -e "486s/None/'cpu'/" /usr/local/lib/python3.6/dist-packages/torch/serialization.py

## Function Definition

In [None]:
import copy

import matplotlib.pyplot as plt
import gym
import gnwrapper
import ray
from ray import tune
from ray.tune.registry import register_env
from ray.rllib.agents import ppo

# Raise matplotlib's size limit for animations embedded in notebook output
# (the value is in MB according to the matplotlib rcParams documentation).
plt.rcParams["animation.embed_limit"] = 100.0

In [None]:
class HumanoidBulletEnv_alt(gym.Env):
    """gym.Env wrapper around ``HumanoidBulletEnv-v0`` with an optional action hack.

    The underlying environment is created with ``gym.make`` and wrapped in
    ``gnwrapper.LoopAnimation`` so that rendered frames can be shown in the
    notebook through :meth:`display`.

    Args:
        env_config: Mapping that must contain the key ``"hack"``. When its
            value is truthy, component 10 of every action is forced to 0.0
            before the action is passed to the wrapped environment.

    Attributes set in ``__init__``: ``env`` (the wrapped environment),
    ``action_space`` / ``observation_space`` (copied from it) and ``hack``.
    """

    # Imported only for its side effect: nothing else in this class references
    # the module, but it is what makes the "HumanoidBulletEnv-v0" id known to gym.
    import pybullet_envs

    def __init__(self, env_config):
        env = gym.make("HumanoidBulletEnv-v0")
        # Alternative kept from the original notebook (records videos instead of animating):
        #env = gnwrapper.Monitor(env, directory='./tmp/', force=True, video_callable=lambda ep: True)
        env = gnwrapper.LoopAnimation(env)
        self.env = env
        self.action_space = self.env.action_space
        self.observation_space = self.env.observation_space
        self.hack = env_config["hack"]

    def reset(self):
        """Reset the wrapped environment and return its initial observation."""
        return self.env.reset()

    def step(self, action):
        """Step the wrapped environment, applying the action hack when enabled.

        The caller's ``action`` object is never modified: when the hack is
        active the change is applied to a shallow copy.
        """
        if self.hack:
            # Copy first so the array owned by the caller (e.g. the policy's
            # sampled action) is not altered behind its back.
            action = copy.copy(action)
            # Hack 17 DOF (original author's note): force action component 10 to zero.
            action[10] = 0.0
        return self.env.step(action)

    def render(self, mode="human"):
        """Forward the render call, including ``mode``, to the wrapped environment."""
        return self.env.render(mode)

    def display(self):
        """Show what the wrapped environment has recorded via its ``display()``."""
        # Variant kept from the original notebook:
        #return self.env.display(reset=True)
        return self.env.display()

    def close(self):
        """Close the wrapped environment.

        Without this override ``close()`` would resolve to the base class
        implementation and never reach ``self.env``.
        """
        return self.env.close()

In [None]:
def ppo_humanoid_train(config, stop, log_dir='./logdir'):
    """Train PPO through ``tune.run`` and report the best trial's checkpoints.

    Args:
        config: Trainer configuration handed to ``tune.run`` unchanged.
        stop: Stopping criteria handed to ``tune.run`` unchanged.
        log_dir: Directory passed to ``tune.run`` as ``local_dir``.

    Returns:
        Whatever ``get_trial_checkpoints_paths`` yields for the trial selected
        by ``get_best_trial("episode_reward_mean")``; the same value is also
        printed.
    """
    metric = "episode_reward_mean"
    analysis = tune.run(
        "PPO",
        config=config,
        stop=stop,
        local_dir=log_dir,
        checkpoint_freq=100,  # periodic checkpoints ...
        checkpoint_at_end=True,  # ... plus one when the run finishes
    )
    best_trial = analysis.get_best_trial(metric)
    checkpoint_paths = analysis.get_trial_checkpoints_paths(trial=best_trial, metric=metric)
    print(checkpoint_paths)
    return checkpoint_paths

In [None]:
def ppo_humanoid_train_altenv(config, stop, chkdir, log_dir='./logdir'):
    """Continue PPO training from an existing checkpoint.

    Args:
        config: Trainer configuration handed to ``tune.run`` unchanged.
        stop: Stopping criteria handed to ``tune.run`` unchanged.
        chkdir: Checkpoint to restore from; passed to ``tune.run`` as ``restore``.
        log_dir: Directory passed to ``tune.run`` as ``local_dir``.

    Returns:
        Whatever ``get_trial_checkpoints_paths`` yields for the trial selected
        by ``get_best_trial("episode_reward_mean")``; the same value is also
        printed.
    """
    # Use the `chkdir` argument. The previous code read a global named `chk_dir`,
    # so the argument was ignored and the call failed when that global was undefined.
    results = tune.run(
        ppo.PPOTrainer,
        config=config,
        stop=stop,
        local_dir=log_dir,
        checkpoint_at_end=True,
        checkpoint_freq=10,
        restore=chkdir,
    )
    checkpoints = results.get_trial_checkpoints_paths(
        trial=results.get_best_trial("episode_reward_mean"),
        metric="episode_reward_mean",
    )
    print(checkpoints)
    return checkpoints

In [None]:
def ppo_humanoid_test(config, chk_dir):
    """Roll out one episode with a restored PPO agent and display it.

    Args:
        config: Trainer configuration; ``config["env_config"]`` is also used
            to build the local ``HumanoidBulletEnv_alt`` instance.
        chk_dir: Checkpoint path handed to ``agent.restore``.

    Prints the total episode reward and calls ``env.display()``. Returns ``None``.
    The environment is closed and the trainer stopped even when the rollout
    raises, so a failed run does not leave them alive in the kernel.
    """
    agent = ppo.PPOTrainer(config=config, env="HumanoidBulletEnv_alt")
    try:
        agent.restore(chk_dir)
        env = HumanoidBulletEnv_alt(config["env_config"])
        try:
            episode_reward = 0.0
            done = False
            # render() is deliberately called once before reset(), as in the original notebook.
            env.render()
            obs = env.reset()
            while not done:
                env.render()
                action = agent.compute_action(obs)
                obs, reward, done, info = env.step(action)
                episode_reward += reward
            print("Episode Reward:", episode_reward)
            env.display()
        finally:
            env.close()
    finally:
        # Explicitly release the trainer's resources. The former trailing
        # `del agent` / `del env` only dropped local names that vanish on return anyway.
        agent.stop()

## Execution

In [None]:
# Start the Ray runtime. ignore_reinit_error=True lets this cell be re-run in a live
# kernel without raising because Ray is already initialised.
ray.init(ignore_reinit_error=True)
# Register the environment under the name used as config["env"]; the creator forwards the
# config object it receives to HumanoidBulletEnv_alt, which reads its "hack" entry.
register_env("HumanoidBulletEnv_alt", lambda config: HumanoidBulletEnv_alt(config))

In [None]:
# Handed to HumanoidBulletEnv_alt as `env_config`; "hack": False leaves actions untouched.
env_config = {"hack": False}
# config: reduced num_workers, num_gpus
# Trainer configuration passed to tune.run / ppo.PPOTrainer by the helper functions above.
config = {
    # Name under which the environment is registered via register_env.
    "env": "HumanoidBulletEnv_alt",
    "env_config": env_config,
    "model": {
        "free_log_std": True,
    },
    "lr": .0001,
    # Use the PyTorch implementation of the policy.
    "framework": "torch",
    "gamma": 0.995,
    "lambda": 0.95,
    "clip_param": 0.2,
    "kl_coeff": 1.0,
    "num_sgd_iter": 20,
    "sgd_minibatch_size": 32768,
    "horizon": 5000,
    "train_batch_size": 320000,
    # Resource settings (the values the note above says were reduced).
    "num_workers": 1,
    "num_gpus": 1,
    "batch_mode": "complete_episodes",
    "observation_filter": "MeanStdFilter",
}
# stop: reduced episode_reward_mean
# Passed to tune.run as `stop`: the criterion is on the mean episode reward.
stop = {
    "episode_reward_mean": 1200,
}

In [None]:
# Train on the unmodified environment ("hack": False); `chks` receives the checkpoint
# listing that ppo_humanoid_train prints and returns.
chks = ppo_humanoid_train(config, stop)

In [None]:
#!rm ./tmp/*
# Evaluate the checkpoint with the highest mean episode reward from the training run.
# `chks` holds [checkpoint_path, episode_reward_mean] pairs, so the path is taken from it
# instead of a hand-edited placeholder and the notebook can run top to bottom.
# To evaluate a different checkpoint, assign its path to `chk_dir` manually.
chk_dir = max(chks, key=lambda path_and_reward: path_and_reward[1])[0]
ppo_humanoid_test(config, chk_dir)

In [None]:
# Build an independent configuration for the hacked environment. A deep copy is required:
# plain assignment would alias `config`, so enabling the hack here would also flip it in
# `config` and `env_config` and change what the earlier cells do when re-run.
config_altenv = copy.deepcopy(config)
config_altenv["env_config"]["hack"] = True
# Same stopping criterion, held in its own dict.
stop_altenv = dict(stop)

In [None]:
# Train with the hack enabled, passing `chk_dir` (the checkpoint evaluated above) as the
# checkpoint to restore from; `chks_altenv` receives the resulting checkpoint listing.
chks_altenv = ppo_humanoid_train_altenv(config_altenv, stop_altenv, chk_dir)

In [None]:
#!rm ./tmp/*
# Evaluate the checkpoint with the highest mean episode reward from the hacked-environment run.
# `chks_altenv` holds [checkpoint_path, episode_reward_mean] pairs, so the path is taken from
# it instead of a hand-edited placeholder.
# To evaluate a different checkpoint, assign its path to `chk_dir_altenv` manually.
chk_dir_altenv = max(chks_altenv, key=lambda path_and_reward: path_and_reward[1])[0]
ppo_humanoid_test(config_altenv, chk_dir_altenv)

In [None]:
# Shut down the Ray runtime started by ray.init() earlier in this section.
ray.shutdown()