In [None]:
import os
import sys
import ray
import torch
import pandas as pd
import torch.nn as nn
sys.path.append(os.getcwd())
from models.cbp_model import CBPModel
from models.cbp_model import CBPModel
import plotly.graph_objects as go
from ray.rllib.policy import sample_batch
from ray.rllib.models import ModelCatalog
from lop.algos.cbp_linear import CBPLinear
from ray.rllib.models.torch.misc import SlimFC
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.utils.annotations import override
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.utils.framework import try_import_torch
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFC

In [None]:
# Working directory of the kernel (the same directory the import cell appends to sys.path).
path = os.getcwd()
# RLlib's import helper; rebinding keeps `torch` / `nn` pointing at the modules RLlib resolved.
torch, nn = try_import_torch()
# ignore_reinit_error=True keeps this cell re-runnable: a bare ray.init() raises
# when Ray has already been initialised in the current kernel.
ray.init(ignore_reinit_error=True)

In [None]:
class CBPModel(TorchModelV2, nn.Module):
    """RLlib Torch model with a separate actor network and a CBP-wrapped critic.

    * Actor: an RLlib ``FullyConnectedNetwork`` emitting ``2 * action_dim`` values,
      split in ``forward`` into means and log-stds.
    * Critic: four ``nn.Linear`` layers with LeakyReLU activations; a ``CBPLinear``
      module is constructed for each pair of consecutive layers and applied to the
      activations flowing between them.

    NOTE: this definition shadows the ``CBPModel`` imported from
    ``models.cbp_model`` in the import cell.
    """

    def __init__(self, obs_space, action_space, num_outputs, model_config, name):
        """Build the actor network, the critic layers and their CBP wrappers.

        Only the first entry of ``model_config['fcnet_hiddens']`` is used; it sets
        the width of every hidden critic layer.
        """
        TorchModelV2.__init__(self, obs_space, action_space, num_outputs, model_config, name)
        nn.Module.__init__(self)

        # --- actor: one mean and one log-std per action dimension ---
        actor_outputs = action_space.shape[0] * 2
        self.actor_fcnet = TorchFC(obs_space, action_space, actor_outputs, model_config, name + "_actor")

        # --- critic: fixed three-hidden-layer stack with a scalar output ---
        width = model_config['fcnet_hiddens'][0]
        obs_dim = obs_space.shape[0]
        self.act = nn.LeakyReLU()
        self.fc1 = nn.Linear(obs_dim, width)
        self.fc2 = nn.Linear(width, width)
        self.fc3 = nn.Linear(width, width)
        self.fc4 = nn.Linear(width, 1)

        # Every CBP wrapper shares the same hyper-parameters.
        cbp_options = dict(
            replacement_rate=1e-4,
            maturity_threshold=100,
            init='kaiming',
            act_type='leaky_relu',
        )
        self.cbp1 = CBPLinear(self.fc1, self.fc2, **cbp_options)
        self.cbp2 = CBPLinear(self.fc2, self.fc3, **cbp_options)
        self.cbp3 = CBPLinear(self.fc3, self.fc4, **cbp_options)

    @override(TorchModelV2)
    def forward(self, input_dict, state, seq_lens):
        """Return clamped policy outputs and cache the critic's value estimate.

        Means are clamped to [-1, 1] and log-stds to [-10, 0] before being
        concatenated back together.  The critic output is stored on
        ``self.value`` for ``value_function``.  ``state`` is returned unchanged.
        """
        actor_out, _ = self.actor_fcnet(input_dict, state, seq_lens)
        mu, log_sigma = torch.chunk(actor_out, 2, -1)
        policy_out = torch.cat(
            (torch.clamp(mu, -1, 1), torch.clamp(log_sigma, -10, 0)),
            dim=-1,
        )

        # Critic pass: linear -> activation -> CBP module, three times over.
        hidden = input_dict['obs']
        critic_stages = (
            (self.fc1, self.cbp1),
            (self.fc2, self.cbp2),
            (self.fc3, self.cbp3),
        )
        for linear, cbp in critic_stages:
            hidden = cbp(self.act(linear(hidden)))

        # The output layer has no activation: it produces the scalar value estimate.
        self.value = self.fc4(hidden)
        return policy_out, state

    @override(TorchModelV2)
    def value_function(self):
        """Return the value cached by the latest ``forward`` call, last dim squeezed."""
        return torch.squeeze(self.value, -1)


# Make the model selectable by name ("CBPModel") in RLlib model configs.
ModelCatalog.register_custom_model("CBPModel", CBPModel)

In [None]:
class SimpleCustomTorchModel(TorchModelV2, nn.Module):
    """Baseline RLlib Torch model: same actor and critic layout as ``CBPModel``, no CBP.

    The critic is built by hand (rather than with a second ``FullyConnectedNetwork``)
    so that its layers match the CBP variant for a like-for-like comparison.
    """

    def __init__(self, obs_space, action_space, num_outputs, model_config, name):
        """Build the actor network and the plain four-layer critic.

        Only the first entry of ``model_config['fcnet_hiddens']`` is used; it sets
        the width of every hidden critic layer.
        """
        TorchModelV2.__init__(self, obs_space, action_space, num_outputs, model_config, name)
        nn.Module.__init__(self)

        # --- actor: one mean and one log-std per action dimension ---
        actor_outputs = action_space.shape[0] * 2
        self.actor_fcnet = TorchFC(obs_space, action_space, actor_outputs, model_config, name + "_actor")

        # --- critic: fixed three-hidden-layer stack with a scalar output ---
        width = model_config['fcnet_hiddens'][0]
        obs_dim = obs_space.shape[0]
        self.act = nn.LeakyReLU()
        self.fc1 = nn.Linear(obs_dim, width)
        self.fc2 = nn.Linear(width, width)
        self.fc3 = nn.Linear(width, width)
        self.fc4 = nn.Linear(width, 1)

    @override(TorchModelV2)
    def forward(self, input_dict, state, seq_lens):
        """Return clamped policy outputs and cache the critic's value estimate.

        The critic output is stored on ``self.value`` for ``value_function``.
        ``state`` is returned unchanged.
        """
        actor_out, _ = self.actor_fcnet(input_dict, state, seq_lens)
        mu, log_sigma = torch.chunk(actor_out, 2, -1)

        # Means are assumed to be normalised to [-1, 1].
        mu = torch.clamp(mu, -1, 1)
        # With means in [-1, 1], log-stds are limited to [-10, 0],
        # i.e. a std-dev between exp(-10) and exp(0) = 1.
        log_sigma = torch.clamp(log_sigma, -10, 0)
        policy_out = torch.cat((mu, log_sigma), dim=-1)

        # Critic pass: three LeakyReLU hidden layers, then a linear scalar head.
        hidden = input_dict['obs']
        for linear in (self.fc1, self.fc2, self.fc3):
            hidden = self.act(linear(hidden))
        self.value = self.fc4(hidden)
        return policy_out, state

    @override(TorchModelV2)
    def value_function(self):
        """Return the value cached by the latest ``forward`` call, last dim squeezed."""
        return torch.squeeze(self.value, -1)


# Make the model selectable by name ("SimpleCustomTorchModel") in RLlib model configs.
ModelCatalog.register_custom_model("SimpleCustomTorchModel", SimpleCustomTorchModel)

In [None]:
# Repeats the registration made at the end of the cell that defines CBPModel:
# binds the name "CBPModel" to whichever CBPModel class is currently in scope.
ModelCatalog.register_custom_model("CBPModel", CBPModel)

In [None]:
%%time
config = PPOConfig().training(
    gamma = 0.99,
    lambda_ = 0.95,
    # kl_coeff = 0.5,
    num_sgd_iter = 15,
    lr_schedule = [[0, 0.0003], [15_000_000, 0.00025], [30_000_000, 0.0002], [50_000_000, 0.0001]],
    vf_loss_coeff = 0.5,
    vf_clip_param = 15.0,
    clip_param = 0.2,
    grad_clip_by ='norm', 
    train_batch_size = 16_000, 
    sgd_minibatch_size = 4_000,
    grad_clip = 0.5,
    model = {'custom_model': 'CBPModel', 
           'vf_share_layers': False,
           'fcnet_hiddens': [256,256],
           'fcnet_activation': 'LeakyReLU',
             #this isn't used for some models, but doesn't hurt to keep it
           'custom_model_config': {
                'num_gaussians': 2,
           }
            }
).environment(env = 'HalfCheetah-v4'
).rollouts(
num_rollout_workers = 28
).resources(num_gpus = 1
)


algo = config.build()

num_iterations = 625
results = []

for i in range(num_iterations):
    result = algo.train()
    print(f"Iteration: {i}, Mean Reward: {result['episode_reward_mean']}")
    results.append([result['episode_reward_mean'], result['episode_len_mean']])


# results_df = pd.DataFrame(results)
    
ray.shutdown()

In [None]:
# progress.csv files from two earlier PPO HalfCheetah-v4 runs.
# NOTE(review): these absolute paths are specific to the machine that produced the
# runs; presumably the first is the CBPModel run and the second the
# SimpleCustomTorchModel baseline - confirm against the run configs.
CBP_PROGRESS_CSV = "/root/ray_results/PPO_HalfCheetah-v4_2024-10-03_13-15-08rf5l5dxr/progress.csv"
NORMAL_PROGRESS_CSV = "/root/ray_results/PPO_HalfCheetah-v4_2024-10-03_14-29-26yrfio2ms/progress.csv"

cbp_first = pd.read_csv(CBP_PROGRESS_CSV)
normal_model = pd.read_csv(NORMAL_PROGRESS_CSV)

In [None]:
# Compare the two runs: mean episode reward on the left axis, value-function
# loss on the right axis. Each value is plotted against its row position in
# progress.csv, i.e. one point per training iteration.
REWARD_COLUMN = 'episode_reward_mean'
VF_LOSS_COLUMN = 'info/learner/default_policy/learner_stats/vf_loss'

# (data frame, column, legend label, y-axis id) - order fixes legend and colour order.
trace_specs = [
    (cbp_first, REWARD_COLUMN, 'Mean Reward CBP', 'y1'),
    (normal_model, REWARD_COLUMN, 'Mean Reward Normal', 'y1'),
    (cbp_first, VF_LOSS_COLUMN, 'VF Loss CBP', 'y2'),
    (normal_model, VF_LOSS_COLUMN, 'VF Loss normal', 'y2'),
]

fig = go.Figure()
for frame, column, label, axis_id in trace_specs:
    fig.add_trace(go.Scatter(y=frame[column], mode='lines', name=label, yaxis=axis_id))

fig.update_layout(
    title='Episode Reward Mean and VF Loss Over Time',
    # Rows of progress.csv are training iterations, not individual episodes.
    xaxis=dict(title='Training Iteration'),
    yaxis=dict(title='Episode Reward Mean', side='left'),
    yaxis2=dict(title='VF Loss', side='right', overlaying='y', anchor='x'),
    legend=dict(x=0.78, y=0.1),
    width=1000,
    height=600,
)

fig.show()