In [1]:
import ray
import time
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.algorithms.callbacks import DefaultCallbacks
from ray.rllib.models import ModelCatalog
from ray.rllib.utils.framework import try_import_torch
from ray.rllib.algorithms.ppo import PPOConfig
from ray import tune, air
from ray.rllib.core.models.configs import MLPHeadConfig
from ray.rllib.core.models.catalog import Catalog
from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFC
from ray.rllib.utils.annotations import OverrideToImplementCustomLogic
from gymnasium.spaces import Box
from ray.train.torch import TorchTrainer
from ray.train import ScalingConfig
import ray.rllib.algorithms.ppo as ppo
from ray.rllib.utils.typing import Dict, TensorType, List, ModelConfigDict
import gymnasium as gym
import matplotlib.pyplot as plt
from ray.rllib.models.torch.misc import SlimFC, AppendBiasLayer
from ray.rllib.policy.torch_policy import TorchPolicy
from ray.rllib.policy.policy_template import build_policy_class
from ray.rllib.policy.sample_batch import SampleBatch
import numpy as np
import pandas as pd
from ray import tune
import math
from torch.distributions.normal import Normal
from ray.tune.schedulers import ASHAScheduler
from torch.utils.tensorboard import SummaryWriter
import pandas as pd
import os
import plotly.express as px
import plotly.graph_objects as go
import shutil

### PPO MoG Implementation

### Data flow: obs -> forward() -> model_out \-> value_function() -> V(s)

In [2]:
path = os.getcwd()

In [3]:
torch, nn = try_import_torch()

In [4]:
ray.init()

2024-03-19 12:18:04,917	INFO worker.py:1724 -- Started a local Ray instance.


0,1
Python version:,3.10.13
Ray version:,2.9.2




### MOG and NLL

In [5]:
global adder
adder = 1.000001
global num_gaussians
num_gaussians = 2
global parquet_file_name
parquet_file_name = 'logs/nll_2gaussians_65M.parquet'

class CustomTorchModelMOG(TorchModelV2, nn.Module):
    def __init__(self, obs_space, action_space, num_outputs, model_config, name):
        super(CustomTorchModelMOG, self).__init__(obs_space, action_space, num_outputs, model_config, name)
        
        TorchModelV2.__init__(self, obs_space, action_space, num_outputs, model_config, name)
        
        nn.Module.__init__(self)

        self.action_space = action_space
        
        self.actor_fcnet = TorchFC(obs_space, action_space, action_space.shape[0]*2, model_config, name + "_actor")
        #the output dimensions for the actor is a function of the action space!!!! 
        #it's not batch_size, 2 for mean / variance, but instead 2*(the number of action dimensions) 
        #so to as model the distributinons for each action
        
        self.critic_fcnet = TorchFC(obs_space, action_space, num_gaussians*3, model_config, name + "_critic")

        torch.autograd.set_detect_anomaly(True)

        # self.log_data = pd.DataFrame(columns = ['timestep', 'logp', 'loga', 'sigma_clamped', 'td_targets_expanded', 'mu_pred', 'mus', 'summing_log'])
        
        # self.log_step = 0
    
    @OverrideToImplementCustomLogic
    def forward(self, input_dict, state, seq_lens):
        # Actor forward pass
        raw_action_logits, _ = self.actor_fcnet(input_dict, state, seq_lens)

        '''
        According to Ray, we do not need to process the action logits and can send the raw action logits to the policy
        '''

        elu = torch.nn.ELU()
        
        '''
        These components represent the critic's prediction on the current state:
        -So when calling "value_output, _ = self.critic_fcnet(input_dict, state, seq_lens)" we get components respective of how many gaussians we want
        
        -For example, if we want 3 gaussians to make up the value distribution then this will output a tensor[batch_size, 9] where we can slice based 
        on the number of mus, variances, and weights respectively

        -We can then access these to 
        '''

        #critic forward pass
        value_output, _ = self.critic_fcnet(input_dict, state, seq_lens)

        i = num_gaussians
        means = value_output[:, :i]
        self._u = means

        
        sigmas = value_output[:, i:i*2]
        sigmas = elu(sigmas) + adder
        self._sigmas = sigmas
        
        alphas = value_output[:, i*2:]
        alphas = torch.nn.functional.softmax(alphas, dim=-1)
        self._alphas = alphas

        return raw_action_logits, state

    @OverrideToImplementCustomLogic
    def value_function(self):
        multiply = self._u * self._alphas
        values = torch.sum(multiply, dim = 1)
        return values
    
    '''
    -We need the sample from MoG as a means of doing the n-step return which is common for reducing
    variance, but at the cost of increasing bias...fortunately, betwen 3-7 samples is enough to 
    approximately have the bias/variance tradeoff nearly equal
    --This is important to do to increase convergence and also decrease computation time compared to
    the Monte Carlo approach which is sampling a bunch of trajectories
    
    -From the sample we need to compute the log probabilites of the samples
    '''

    def predict_gmm_params(self, cur_obs):
        input_dict = {'obs': cur_obs}
        state = []  
        seq_lens = None
        value_output, _ = self.critic_fcnet(input_dict, state, seq_lens)

        elu = torch.nn.ELU()

        i = num_gaussians
        
        means = value_output[:, :i]
        
        sigmas_prev = value_output[:, i:i*2]
        sigmas = elu(sigmas_prev) + adder
        
        alphas = value_output[:, i*2:]
        
        return means, sigmas, alphas
    
    def compute_log_likelihood(self, td_targets, mu_pred, sigma_pred, alphas_pred):
        
        td_targets_expanded = td_targets.unsqueeze(1)
        
        sigma_clamped = torch.clamp(sigma_pred, 1e-9, None)
        # alphas_clamped = torch.clamp(alpha_pred, 1e-30, 1e5)
        
        log_2_pi = torch.log(2*torch.tensor(math.pi))
        
        mus = td_targets_expanded - mu_pred
        
        logp = torch.clamp(-torch.log(sigma_clamped) - .5 * log_2_pi - torch.square(mus) / (2*torch.square(sigma_clamped)), -1e9, None)
        loga = torch.nn.functional.log_softmax(alphas_pred, dim=-1)

        summing_log = -torch.logsumexp(logp + loga, dim=-1)

        # self.log_to_dataframe(logp = logp, loga = loga, sigma_clamped = sigma_clamped, td_targets_expanded = td_targets_expanded, mu_pred = mu_pred, mus = mus, 
        #                      summing_log = summing_log)
        
        return summing_log


    @OverrideToImplementCustomLogic
    def custom_loss(self, policy_loss, sample_batch):
        gamma = 0.99
        cur_obs = sample_batch[SampleBatch.CUR_OBS]
        next_states = sample_batch[SampleBatch.NEXT_OBS]
        rewards = sample_batch[SampleBatch.REWARDS]
        dones = sample_batch[SampleBatch.DONES]

        mu_pred, sigma_pred, w_pred = self.predict_gmm_params(cur_obs)
        mu_target, sigma_target, w_target = self.predict_gmm_params(next_states)
        w_target = torch.nn.functional.softmax(w_target, dim = -1)

        
        next_state_value = torch.sum(mu_target * w_target, dim = 1).clone().detach()
        td_targets = rewards + gamma * next_state_value * (1 - dones.float())
        
        log_likelihood = self.compute_log_likelihood(td_targets, mu_pred, sigma_pred, w_pred)
        log_likelihood = torch.clamp(log_likelihood, -10, 80)
        nll_loss = torch.mean(log_likelihood)
        
        total_loss = [loss + nll_loss for loss in policy_loss]
    
        return total_loss

    # def log_to_dataframe(self, logp, loga, sigma_clamped, td_targets_expanded, mu_pred, mus, summing_log):
    #     new_log_entry = pd.DataFrame({
    #         'timestep': [self.log_step],
    #         'logp': [logp.mean().item() if logp is not None else np.nan],
    #         'loga': [loga.mean().item() if loga is not None else np.nan],
    #         'sigma_clamped': [sigma_clamped.mean().item() if sigma_clamped is not None else np.nan],
    #         'td_targets_expanded': [td_targets_expanded.mean().item() if td_targets_expanded is not None else np.nan],
    #         'mu_pred': [mu_pred.mean().item() if mu_pred is not None else np.nan],
    #         'mus': [mus.mean().item() if mus is not None else np.nan],
    #         'summing_log': [summing_log.min().item() if summing_log is not None else np.nan]
    #     })

    #     self.log_data = pd.concat([self.log_data, new_log_entry], ignore_index = True)

    #     if self.log_step % 1000 == 0:
    #         self.save_to_parquet()
    #     self.log_step +=1

    # def save_to_parquet(self):
    #     self.log_data.to_parquet(parquet_file_name)

'''
Fixes:
(1)Target sigmas were not strictly positive: added the elu activation along with a positive constant
(2)Fixed the critic network's sigmas to be elu with an added constant isntead of squares
(3)Added the cdf of the normal distribution (second term) to the loss function as per GMAC's paper
(3->)this fixed the negative distance values that were happening since this should not have happened
(3->)due to the way the expectation is taken between dists will always be positive
(4)Added internal dispersion back into the energy distance once the delta method was fixed with (3) above
(4->)This has an appropriate magnitude for the loss compared to the policy loss so no scaling is needed
(5)The result of the four main points above gives results of 1_000+ in the Cheetah-v4 env. after 100 its
(6)Fixed the loss energy loss equation to incorporate the erf with the corrected equation from wikipedia (folded normal distribution)
(6->)Having the sqrt of the variances within the erf gives less reward and therefore it not used (conflict between wiki and research)
(6->)Adding the erf made a massive difference in performance and much more stability
(7)Added clamp to the logsumexp term (backward pass gave nans)
'''


ModelCatalog.register_custom_model("custom_torch_model_mog", CustomTorchModelMOG)

In [6]:
%%time

config = PPOConfig().training(
    gamma = 0.99,
    lambda_ = 0.95,
    # kl_coeff = 0.5,
    num_sgd_iter = 30,
    lr_schedule = [[0, 0.0003], [15_000_000, 0.00025], [30_000_000, 0.0002], [50_000_000, 0.0001]],
    vf_loss_coeff = 1.0,
    vf_clip_param = 15.0,
    clip_param = 0.3,
    grad_clip_by='norm', 
    train_batch_size=65_500, 
    sgd_minibatch_size=4_096,
    grad_clip = 1.0,
    model={'custom_model': 'custom_torch_model_mog', 'vf_share_layers': False, 'fcnet_hiddens': [2048,2048],'fcnet_activation': 'LeakyReLU'},
).environment(env='HalfCheetah-v4'
).rollouts(
num_rollout_workers = 28
# num_envs_per_worker = 4,
).resources(num_gpus = 1)
#.callbacks(MyCustomCallback
#)

algo = config.build()

num_iterations = 1000
results = []

for i in range(num_iterations):
    result = algo.train()
    print(f"Iteration: {i}, Mean Reward: {result['episode_reward_mean']}")
    results.append(result['episode_reward_mean'])


results_df = pd.DataFrame(results)
results_df.to_csv(path + '/results/nll/65M_run_2gaussians.csv')
    
ray.shutdown()


`UnifiedLogger` will be removed in Ray 2.7.
  return UnifiedLogger(config, logdir, loggers=None)
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
  self.actor_fcnet = TorchFC(obs_space, action_space, action_space.shape[0]*2, model_config, name + "_actor")
  self.critic_fcnet = TorchFC(obs_space, action_space, num_gaussians*3, model_config, name + "_critic")


Iteration: 0, Mean Reward: -346.455060381677
Iteration: 1, Mean Reward: -341.0677681875303
Iteration: 2, Mean Reward: -326.5555595896037
Iteration: 3, Mean Reward: -311.95730853814985
Iteration: 4, Mean Reward: -292.88575104210076
Iteration: 5, Mean Reward: -280.15038664721
Iteration: 6, Mean Reward: -269.59837117891567
Iteration: 7, Mean Reward: -275.50928472144557
Iteration: 8, Mean Reward: -263.8139485614666
Iteration: 9, Mean Reward: -249.99821590543527
Iteration: 10, Mean Reward: -243.11622935386825
Iteration: 11, Mean Reward: -240.05116588953058
Iteration: 12, Mean Reward: -224.60351434577703
Iteration: 13, Mean Reward: -212.75370817146646
Iteration: 14, Mean Reward: -199.4989335172376
Iteration: 15, Mean Reward: -187.14852084094866
Iteration: 16, Mean Reward: -177.06711426671595
Iteration: 17, Mean Reward: -161.15269495355943
Iteration: 18, Mean Reward: -150.04339737516747
Iteration: 19, Mean Reward: -135.51913792059202
Iteration: 20, Mean Reward: -120.16439049651875
Iteration: 

  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.10/dist-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelapp.py", line 739, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.10/dist-packages/tornado/platform/asyncio.py", line 205, in start
    self.asyncio_loop.run_forever()
  File "/usr/lib/python3.10/asyncio/base_events.py", line 603, in run_forever
    self._run_once()
  File "/usr/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once
    handle._run()
  File "/usr/lib/python3.10/asyncio/events.py", line 80, in _run
    self._context.run(self._call

ValueError: Function 'ExpBackward0' returned nan values in its 0th output.
 tracebackTraceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/ray/rllib/policy/torch_policy_v2.py", line 1392, in _worker
    loss_out[opt_idx].backward(retain_graph=True)
  File "/usr/local/lib/python3.10/dist-packages/torch/_tensor.py", line 396, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
  File "/usr/local/lib/python3.10/dist-packages/torch/autograd/__init__.py", line 173, in backward
    Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
RuntimeError: Function 'ExpBackward0' returned nan values in its 0th output.

In tower 0 on device cuda:0