In [None]:
import os
import ray
import time
import math
import numpy as np
import pandas as pd
from ray.rllib.algorithms.ppo import PPOConfig
from SimpleTorchModel import SimpleCustomTorchModel
from ray.rllib.utils.framework import try_import_torch
from normalize_advantages import NormalizeAdvantagesCallback

# Working directory of the kernel at the time this cell runs.
path = os.getcwd()

# RLlib's helper returns the torch module and torch.nn.
torch, nn = try_import_torch()

# Re-running this cell on a live kernel must not fail: without
# ignore_reinit_error, a second ray.init() in the same process raises.
ray.init(ignore_reinit_error=True)

In [None]:
%%time
# Learning-rate schedule passed to PPO as a list of two-element breakpoints.
lr_breakpoints = [
    [0, 0.0003],
    [15_000_000, 0.00025],
    [30_000_000, 0.0002],
    [50_000_000, 0.0001],
]

# Model settings handed to RLlib. The custom model is referenced by name.
# NOTE(review): no registration of 'SimpleCustomTorchModel' is visible in this
# notebook chunk -- presumably the imported module registers it; confirm.
model_settings = {
    'custom_model': 'SimpleCustomTorchModel',
    'vf_share_layers': False,
    'fcnet_hiddens': [256, 256],
    'fcnet_activation': 'LeakyReLU',
    # Some models ignore this entry, but keeping it is harmless.
    'custom_model_config': {
        'num_gaussians': 2,
    },
}

# Assemble the PPO configuration one builder call at a time.
config = PPOConfig()
config = config.training(
    gamma=0.99,
    lambda_=0.95,
    # kl_coeff = 0.5,
    num_sgd_iter=30,
    lr_schedule=lr_breakpoints,
    vf_loss_coeff=0.5,
    vf_clip_param=15.0,
    clip_param=0.2,
    grad_clip_by='norm',
    train_batch_size=65_000,
    sgd_minibatch_size=4_096,
    grad_clip=0.5,
    model=model_settings,
)
config = config.environment(env='HalfCheetah-v4')
config = config.rollouts(num_rollout_workers=28)
config = config.resources(num_gpus=1)
config = config.callbacks(NormalizeAdvantagesCallback)


num_iterations = 1
results = []

# Build and train inside try/finally so the Ray runtime is always shut down,
# even when build() or train() raises. Otherwise a failed run would leave the
# rollout workers and the GPU reservation alive in the background.
try:
    algo = config.build()

    for i in range(num_iterations):
        result = algo.train()
        print(f"Iteration: {i}, Mean Reward: {result['episode_reward_mean']}")
        # One row per iteration: [mean episode reward, mean episode length].
        results.append([result['episode_reward_mean'], result['episode_len_mean']])
finally:
    ray.shutdown()

# Column 0 holds the mean episode reward, column 1 the mean episode length.
# Default integer column labels are kept so positional access keeps working.
results_df = pd.DataFrame(results)
