In [None]:
import os
import gym
import ray
import time
import math
import numpy as np
import pandas as pd
from ray import tune
from ppo_torch_policy import SimpleTorchPolicy
from ray.rllib.algorithms.ppo import PPOConfig, PPO
from SimpleTorchModel import SimpleCustomTorchModel
from ray.rllib.utils.framework import try_import_torch
from normalize_advantages import NormalizeAdvantagesCallback

from ray.tune.tune_config import TuneConfig
from ray.tune.tuner import Tuner
from ray.air.config import ScalingConfig

In [None]:
# Working directory of the notebook kernel.
path = os.getcwd()
# RLlib helper that imports torch lazily; yields the `torch` and `torch.nn` modules.
torch, nn = try_import_torch()
# Start the local Ray runtime with the resource budget used by the training cell.
# `ignore_reinit_error=True` makes this cell safe to re-run: if Ray is still up
# (e.g. the training cell failed before reaching `ray.shutdown()`), the call is
# a no-op instead of raising a RuntimeError.
ray.init(num_cpus = 32, num_gpus = 1, ignore_reinit_error = True)

In [None]:
%%time
# NOTE: this env does not support multi-agent so it doesn't run
# this was used as only a base example of how to run such a policy, model, and callback
env_name = 'HalfCheetah-v4'
# Instantiate the env once in the driver, only to read its spaces; RLlib creates
# its own env copies from `env_name`.
env = gym.make(env_name)
obs_space = env.observation_space
action_space = env.action_space
# The probe env is not needed any more; close it so the simulator instance is
# released instead of living for the whole kernel session.
env.close()

# Policies that PPO trains. Every id the mapping function can return must be a
# key of this dict; returning an undeclared id (the previous lambda returned
# 'policy_2') makes RLlib fail when it looks the policy up.
policies = {
    'policy_1': (SimpleTorchPolicy, obs_space, action_space, {}),
}
policy_ids = sorted(policies)


def policy_mapping_fn(agent_id, episode=None, worker=None, **kwargs):
    """Return the id of the policy that controls `agent_id`.

    Integer agent ids are distributed round-robin over `policy_ids`; with two
    declared policies this gives even ids -> first policy, odd ids -> second.
    Any other id type (for example a string id) is mapped to the first policy,
    because `%` on a string is string formatting and would raise TypeError.

    `episode`, `worker` and `**kwargs` are accepted and ignored: RLlib calls
    the mapping function with more than just the agent id.
    """
    if isinstance(agent_id, (int, np.integer)):
        return policy_ids[agent_id % len(policy_ids)]
    return policy_ids[0]


config = (
    PPOConfig()
    .training(
        gamma = 0.99,
        lambda_ = 0.95,
        # kl_coeff = 0.5,
        num_sgd_iter = 30,
        # Piecewise learning-rate schedule as [timestep, lr] pairs.
        lr_schedule = [[0, 0.0003], [15_000_000, 0.00025], [30_000_000, 0.0002], [50_000_000, 0.0001]],
        vf_loss_coeff = 0.5,
        vf_clip_param = 15.0,
        clip_param = 0.2,
        grad_clip_by = 'norm',
        train_batch_size = 65_000,
        sgd_minibatch_size = 4_096,
        grad_clip = 0.5,
        model = {
            # NOTE(review): referenced by name, so the model must be registered
            # with RLlib's ModelCatalog under this name - presumably done when
            # `SimpleTorchModel` is imported; confirm.
            'custom_model': 'SimpleCustomTorchModel',
            'vf_share_layers': False,
            'fcnet_hiddens': [256, 256],
            'fcnet_activation': 'LeakyReLU',
            # this isn't used for some models, but doesn't hurt to keep it
            'custom_model_config': {
                'num_gaussians': 2,
                'num_outputs': action_space.shape[0],
            },
        },
    )
    .environment(env = env_name)
    .rollouts(num_rollout_workers = 28)
    .resources(num_gpus = 1)
    .callbacks(NormalizeAdvantagesCallback)
    .multi_agent(
        policies = policies,
        policy_mapping_fn = policy_mapping_fn,
    )
)


# NOTE: no standalone `config.build()` here. The built Algorithm was never used,
# but its 28 rollout workers would keep their CPUs reserved; the Tune trial
# below needs those same workers plus a driver, which does not fit into the
# 32 CPUs given to `ray.init`, so the trial could never be scheduled.

# Run a single PPO trial through Tune using the config assembled above.
analysis = tune.run(
    "PPO",
    config=config.to_dict(),
    stop={"training_iteration": 1},  # stop after one training iteration
    checkpoint_freq=10,
    checkpoint_at_end=True,          # guarantees a checkpoint even for short runs
    local_dir="./ray_results",
)

num_iterations = 1
results = []

# Collect [mean reward, mean episode length] from the last row of the Tune
# results frame, once per iteration, and report the reward.
for i in range(num_iterations):
    result = analysis.results_df
    mean_reward, mean_length = (
        result[column].iloc[-1]
        for column in ('episode_reward_mean', 'episode_len_mean')
    )
    print(f"Iteration: {i}, Mean Reward: {mean_reward}")
    results.append([mean_reward, mean_length])


# results_df = pd.DataFrame(results)

# Tear down the local Ray runtime started earlier in the notebook.
ray.shutdown()
