In [None]:
import os
import ray
import time
import math
import numpy as np
import pandas as pd
from ray import tune
import seaborn as sns
import gymnasium as gym
from copy import deepcopy
import plotly.express as px
from gymnasium import spaces
from pettingzoo import AECEnv
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from typing import Any, Dict, List
from ray.rllib.env import PettingZooEnv
from ray.tune.logger import pretty_print
from models.MOGTorchModel import MOGTorchModel
from ray.rllib.algorithms.ppo import PPOConfig
from policies.ppo_sb3_loss import CustomLossPolicy
# from models.PyFlytModel_MOG import PyFlytModel_MOG
# from models.PyFlytModel_ENN import PyFlytModel_ENN
from ray.rllib.utils.framework import try_import_torch
from policies.ppo_torch_policy import SimpleTorchPolicy
from ray.rllib.env.multi_agent_env import MultiAgentEnv
from models.CentralCriticModel import CentralStackedModel
from models.SimpleTorchModel import SimpleCustomTorchModel
from models.SimpleTorchModel_param import SimpleCustomTorchModelParam
from policies.basic_centralized_critic import CentralCriticPolicy
from add_ons.normalize_advantages import NormalizeAdvantagesCallback
from ray.rllib.algorithms.ppo.ppo_torch_policy import PPOTorchPolicy
from policies.centralized_critic_stack_state import StackedCentralPolicy

import PyFlyt.gym_envs
from ray.tune.registry import register_env
from PyFlyt.gym_envs import FlattenWaypointEnv
from PyFlyt.gym_envs.quadx_envs import quadx_hover_env, quadx_waypoints_env
from PyFlyt.pz_envs.fixedwing_envs.ma_fixedwing_dogfight_env import MAFixedwingDogfightEnv

In [None]:
# Directory the notebook was started from; later cells build log paths from it.
path = os.getcwd()
# RLlib helper; its two return values are bound to `torch` and `nn` here.
torch, nn = try_import_torch()
# Start the local Ray runtime. `ignore_reinit_error=True` keeps this cell safe
# to re-run in a live kernel, where a second bare `ray.init()` would raise.
ray.init(ignore_reinit_error=True)

In [None]:
class CustomDogfightEnv(MultiAgentEnv):
    """Adapter exposing a PyFlyt dogfight environment through RLlib's ``MultiAgentEnv``.

    The wrapped environment is driven with dict-based calls: ``reset()`` must
    return ``(observations, infos)`` and ``step(action_dict)`` must return
    ``(observations, rewards, terminations, truncations, infos)``, each keyed
    by agent id.

    Args:
        config: Environment configuration handed in by the caller / RLlib.
            NOTE(review): it is accepted but never forwarded -- the default
            environment is built as ``MAFixedwingDogfightEnv()`` with its own
            defaults. Confirm whether these settings should be passed through.
        env: Optional pre-built environment to wrap. When ``None`` a new
            ``MAFixedwingDogfightEnv()`` is created.
            NOTE(review): annotated as ``AECEnv`` but used with dict-returning
            ``reset``/``step`` calls -- confirm the intended type.
    """

    # Agents that always receive an info entry, even when the wrapped
    # environment reports none for them.
    _INFO_AGENT_IDS = ('uav_0', 'uav_1')

    def __init__(self,
                 config,
                 env: AECEnv = None):
        super().__init__()
        self.env = MAFixedwingDogfightEnv() if env is None else env
        # Reset once so that `self.env.agents` is populated before the spaces
        # are queried below.
        self.env.reset()
        self.agent_ids = self.env.possible_agents

        reference_agent = self.env.agents[0]
        self.observation_space = self.env.observation_space(reference_agent)
        self.action_space = self.env.action_space(reference_agent)

        # self.custom_reward_wrapper = CustomRewardWrapper(self.env)

        # A single observation/action space is exposed for the whole env, so
        # every agent has to share it.
        assert all(
            self.env.observation_space(agent) == self.observation_space
            for agent in self.env.agents
        ), (
            "Observation spaces for all agents must be identical. Perhaps "
            "SuperSuit's pad_observations wrapper can help (useage: "
            "`supersuit.aec_wrappers.pad_observations(env)`"
        )

        assert all(
            self.env.action_space(agent) == self.action_space
            for agent in self.env.agents
        ), (
            "Action spaces for all agents must be identical. Perhaps "
            "SuperSuit's pad_action_space wrapper can help (usage: "
            "`supersuit.aec_wrappers.pad_action_space(env)`"
        )
        self._agent_ids = set(self.env.agents)

    @staticmethod
    def _default_info():
        """Return a fresh placeholder info entry (no wins, both healths at 1.0)."""
        return {
            'wins': np.array([False, False]),
            'healths': np.array([1., 1.])
        }

    def reset(self, seed=None, options=None):
        """Reset the wrapped environment.

        Args:
            seed: Optional seed, forwarded to the wrapped environment's
                ``reset``.
            options: Accepted for API compatibility; not forwarded.

        Returns:
            ``(obs_dict, infos)``. ``infos`` always contains a non-empty entry
            for ``'uav_0'`` and ``'uav_1'``; missing or empty entries are
            replaced by placeholders.
        """
        observations, infos = self.env.reset(seed=seed)

        obs_dict = dict(observations)
        # Work on a copy so the wrapped environment's own dict is not mutated;
        # this also tolerates an environment that returns no infos at all.
        infos = dict(infos) if infos else {}
        for agent_id in self._INFO_AGENT_IDS:
            if not infos.get(agent_id):
                infos[agent_id] = self._default_info()

        return obs_dict, infos

    def step(self, action_dict):
        """Advance the wrapped environment by one step.

        Args:
            action_dict: Mapping of agent id to action, passed straight to the
                wrapped environment.

        Returns:
            ``(obs, rewards, terminations, truncations, infos)`` restricted to
            the agents present in the returned observations. The termination
            and truncation dicts additionally carry an ``'__all__'`` flag, and
            ``infos`` gets a placeholder entry for ``'uav_0'`` / ``'uav_1'``
            when the environment supplied none.
        """
        observations, rewards, terminations, truncations, infos = self.env.step(action_dict)

        obs_dict = {}
        reward_dict = {}
        termination_dict = {}
        truncation_dict = {}
        info_dict = {}

        for agent_id in observations:
            obs_dict[agent_id] = observations[agent_id]
            reward_dict[agent_id] = rewards[agent_id]
            termination_dict[agent_id] = terminations[agent_id]
            truncation_dict[agent_id] = truncations[agent_id]
            if agent_id in infos:
                info_dict[agent_id] = infos[agent_id]

        # Aggregate the episode-level flags once, after the per-agent values
        # are known. If no agent was observed, fall back to the flags reported
        # by the environment so that '__all__' is never missing.
        termination_dict['__all__'] = any((termination_dict or terminations).values())
        truncation_dict['__all__'] = any((truncation_dict or truncations).values())

        # Fill in placeholders only for the agents without an info entry, so
        # real infos of the remaining agents are preserved.
        for agent_id in self._INFO_AGENT_IDS:
            if agent_id not in info_dict:
                info_dict[agent_id] = self._default_info()

        # processed_rewards = {
        #     agent_id: self.custom_reward_wrapper.reward(reward)
        #     for agent_id, reward in rewards.items()
        # }

        return obs_dict, reward_dict, termination_dict, truncation_dict, info_dict


def env_creator(config):
    """Factory used by the env registry: wrap a new dogfight env for `config`."""
    wrapped_env = CustomDogfightEnv(config)
    return wrapped_env


# Make the environment available under the name used in the PPO config.
register_env('MAFixedwingDogfightEnv', env_creator)

In [None]:
def policy_mapping_fn(agent_id, episode, worker, **kwargs):
    """Assign an agent to a policy by the parity of its numeric index.

    `agent_id` is either a purely numeric string (``'3'``) or an
    underscore-separated id whose second field is the index (``'uav_1'``).
    Even indices map to ``'policy_1'``, odd indices to ``'policy_2'``.
    `episode`, `worker` and `kwargs` are accepted but not used.
    """
    # Pick the text that carries the index: the id itself when it is all
    # digits, otherwise the field after the first underscore.
    index_text = agent_id if agent_id.isdigit() else agent_id.split('_')[1]
    if int(index_text) % 2:
        return 'policy_2'
    return 'policy_1'

In [None]:
# Settings handed to RLlib as `env_config` and to `CustomDogfightEnv(config=...)`.
# NOTE(review): `CustomDogfightEnv` builds `MAFixedwingDogfightEnv()` without
# arguments, so these values may not reach the simulator -- confirm.
env_config = dict(
    spawn_height=5.0,
    damage_per_hit=0.02,
    lethal_distance=15.0,
    lethal_angle_radians=0.1,
    assisted_flight=True,
    sparse_reward=False,
    flight_dome_size=150.0,
    max_duration_seconds=60.0,
    agent_hz=30,
    render_mode=None,
)

In [None]:
# %%time

# env_example = env_creator(env_config)
# obs_space = env_example.observation_space
# action_space = env_example.action_space

# config = PPOConfig().training(
#     gamma = 0.99,
#     lambda_ = 0.95,
#     # kl_coeff = 0.5,
#     num_sgd_iter = 30,
#     lr_schedule = [[0, 0.0003], [5_000_000, 0.00020], [10_000_000, 0.00015], [15_000_000, 0.0001]],
#     # lr = 0.0003,
#     vf_loss_coeff = 0.5,
#     # vf_clip_param = 1.0,
#     clip_param = 0.3,
#     grad_clip_by ='norm', 
#     train_batch_size = 2_000, 
#     sgd_minibatch_size = 500,
#     grad_clip = 0.5,
#     # kl_coeff = 0.01,
#     # entropy_coeff = 0.001,
#     optimizer = {
#         'weight_decay': 0.001
#     },
#     model = {'custom_model': 'SimpleCustomTorchModel', #SimpleCustomTorchModel MOGTorchModel
#            'vf_share_layers': False,
#            'fcnet_hiddens': [256,256],
#            'fcnet_activation': 'LeakyReLU',
#            'custom_model_config': {
#                 'num_gaussians': 3,
#                 'num_layers': 2,
#                 # 'num_outputs': action_space.shape[0],
#                 # 'parquet_file_name': 'logs/critic_logging_sigma.parquet',
#            }
#             }
# ).environment(
#     env = 'MAFixedwingDogfightEnv',
#     env_config = env_config
# ).rollouts(
# num_rollout_workers = 10
# ).resources(num_gpus = 1
# ).multi_agent(
#     policies = {
#         'policy_1': (CustomLossPolicy, obs_space, action_space, {}),
#         'policy_2': (CustomLossPolicy, obs_space, action_space, {}),
#     },
#     policy_mapping_fn=policy_mapping_fn
# )

# # .callbacks(NormalizeAdvantagesCallback
# # )

# # analysis = tune.run(
# #     'PPO',
# #     config=config.to_dict(),
# #     stop={'training_iteration':300},
# #     checkpoint_freq=10,
# #     checkpoint_at_end=True,
# #     # local_dir='./ray_results'
# # )


# algo = config.build()

# num_iterations = 1500
# results = []

# for i in range(num_iterations):
#     result = algo.train()
#     if i % 10 == 0:
#         # print(f"Iteration: {i}, Mean Reward: {result['env_runners']['episode_reward_mean']} episode length: {result['env_runners']['episode_len_mean']}")
#         print(f"Iteration: {i}, Policy 1 Mean Reward: {result['env_runners']['policy_reward_mean']['policy_1']} loss: {result['info']['learner']['policy_1']['learner_stats']['total_loss']}\n"
#               f"Iteration: {i}, Policy 2 Mean Reward: {result['env_runners']['policy_reward_mean']['policy_2']} loss: {result['info']['learner']['policy_2']['learner_stats']['total_loss']}\n"
#               f"Iteration: {i}, episode length: {result['env_runners']['episode_len_mean']}\n"
#         )

#     results.append([result['env_runners']['episode_reward_mean'], result['env_runners']['episode_len_mean']])

# results_df = pd.DataFrame(results)

# ray.shutdown()


In [None]:
%%time

# Build a two-policy PPO setup (both policies use StackedCentralPolicy), train
# it, and collect per-iteration episode statistics in `results`.
# Change the model and policy configuration here.

env_example = env_creator(env_config)
obs_space = env_example.observation_space
action_space = env_example.action_space

# Model settings shared by both policies. They are defined once so the two
# policies cannot drift apart through copy-paste edits.
# NOTE(review): 'CentralStackedModel' is referenced by name; no model
# registration is visible in this notebook -- confirm it is registered on import.
central_model_settings = {
    'custom_model': 'CentralStackedModel',
    'vf_share_layers': False,
    'fcnet_hiddens': [256, 256],
    'fcnet_activation': 'LeakyReLU',
    'custom_model_config': {
        'num_gaussians': 2,
        'num_layers': 2,
        'num_agents': 2,
        'num_frames': 5,
        'vf_clipped_loss': 0.5,
        'opp_action_in_cc': False,
        'global_state_flag': False,
        'gamma': 0.99,
    },
}


def _central_policy_spec():
    """Return a policy spec tuple with its own private copy of the model settings."""
    return (
        StackedCentralPolicy,
        obs_space,
        action_space,
        {'model': deepcopy(central_model_settings)},
    )


config = PPOConfig().training(
    gamma = 0.99,
    lambda_ = 0.95,
    # kl_coeff = 0.5,
    num_sgd_iter = 30,
    lr_schedule = [[0, 0.0003], [5_000_000, 0.00020], [10_000_000, 0.00015], [15_000_000, 0.0001]],
    # lr = 0.0003,
    vf_loss_coeff = 0.5,
    # vf_clip_param = 1.0,
    clip_param = 0.3,
    grad_clip_by = 'norm',
    train_batch_size = 1_500,
    sgd_minibatch_size = 250,
    grad_clip = 0.5,
    # kl_coeff = 0.01,
    entropy_coeff = 0.01,
    # optimizer = {
    #     'weight_decay': 0.001
    # },
).environment(
    env = 'MAFixedwingDogfightEnv',
    env_config = env_config
).rollouts(
    num_rollout_workers = 10
).resources(
    num_gpus = 1
).multi_agent(
    policies = {
        'policy_1': _central_policy_spec(),
        'policy_2': _central_policy_spec(),
    },
    policy_mapping_fn = policy_mapping_fn
)


algo = config.build()

num_iterations = 1500
# One [episode_reward_mean, episode_len_mean] row per training iteration.
results = []
# Printed in place of a per-policy reward that is not reported.
missing_reward = float('nan')

for i in range(num_iterations):
    result = algo.train()
    runner_stats = result['env_runners']

    if i % 10 == 0:
        # Per-policy reward means may be absent early in training (presumably
        # until an episode has completed -- confirm), so look them up
        # defensively instead of failing the whole run on a KeyError.
        policy_rewards = runner_stats['policy_reward_mean']
        learner_info = result['info']['learner']
        report_lines = [
            f"Iteration: {i}, {label} Mean Reward: {policy_rewards.get(policy_id, missing_reward)} "
            f"loss: {learner_info[policy_id]['learner_stats']['total_loss']}"
            for label, policy_id in (('Policy 1', 'policy_1'), ('Policy 2', 'policy_2'))
        ]
        report_lines.append(f"Iteration: {i}, episode length: {runner_stats['episode_len_mean']}")
        print('\n'.join(report_lines) + '\n')

    results.append([runner_stats['episode_reward_mean'], runner_stats['episode_len_mean']])

results_df = pd.DataFrame(results)

# NOTE(review): Ray is shut down here although a later cell still queries
# `algo.get_policy(...)`; confirm that works after shutdown.
ray.shutdown()


In [None]:
# Load the logged values from log_std_list.csv into a DataFrame.
# NOTE(review): absolute path is hard-coded, unlike other cells that build
# paths from `path`.
data1 = pd.read_csv("/workspace/pyflyt/logs/log_std_list.csv")

In [None]:
# Preview the first rows of the loaded log.
data1.head()

In [None]:
import seaborn as sns

In [None]:
# Keep only the entries matching each condition; every other cell becomes NaN.
below_minus_one = data1 < -1
near_zero = (data1 > -0.0001) & (data1 < 0.0001)
above_ten = data1 > 10

filtered = data1.where(below_minus_one)
filtered2 = data1.where(near_zero)
filtered3 = data1.where(above_ten)

In [None]:
# Show only the rows that contain at least one value below -1.
filtered.dropna(how='all')

In [None]:
# Show only the rows that contain at least one value within (-0.0001, 0.0001).
filtered2.dropna(how='all')

In [None]:
# Show only the rows that contain at least one value above 10.
filtered3.dropna(how='all')

In [None]:
# Kernel density estimate of the entries of `data1` that are greater than 10
# (all other cells of `filtered3` are NaN).
sns.kdeplot(filtered3)

In [None]:
# Column '0' of the >10 selection, with the NaN placeholders removed.
action_0 = filtered3['0'].dropna()

In [None]:
# Kernel density estimate of the column-'0' values that are greater than 10.
sns.kdeplot(action_0)

In [None]:
# Load the logged values from logp_list.csv into a DataFrame.
# NOTE(review): absolute path is hard-coded, unlike other cells that build
# paths from `path`.
data_logp = pd.read_csv("/workspace/pyflyt/logs/logp_list.csv")

In [None]:
# Keep only the entries below -50; every other cell becomes NaN.
filtered_logp = data_logp.where(data_logp < -50)

In [None]:
# Show only the rows that contain at least one value below -50.
filtered_logp.dropna(how='all')

In [None]:
# Kernel density estimate of the entries of `data_logp` that are below -50
# (all other cells of `filtered_logp` are NaN).
sns.kdeplot(filtered_logp)

In [None]:
# Persist the per-iteration [episode_reward_mean, episode_len_mean] rows.
results_df = pd.DataFrame(results)
experiment_type = 'reg_critic_models'

# Create the log directory first: `to_csv` does not create missing parent
# directories and would otherwise fail on a fresh checkout.
logs_dir = os.path.join(path, 'logs')
os.makedirs(logs_dir, exist_ok=True)

# NOTE(review): the row index is written as the first CSV column (pandas
# default), so positional readers (`iloc[:, 0]`) see the index, not the reward.
results_df.to_csv(os.path.join(logs_dir, experiment_type + '.csv'))

In [None]:
# `results` holds one [episode_reward_mean, episode_len_mean] row per
# iteration, so two curves are drawn. Label them explicitly: without a legend
# the episode-length curve reads as a second reward curve.
results_array = np.asarray(results, dtype=float).reshape(-1, 2)

plt.plot(results_array[:, 0], label='episode_reward_mean')
plt.plot(results_array[:, 1], label='episode_len_mean')
plt.title('Training Progress - Mean Reward per Episode')
plt.xlabel('Iteration')
plt.ylabel('Mean Reward')
plt.legend()
# plt.savefig('Basic PPO - HalfCheetah-v4')
plt.show()

In [None]:
# Roll out one episode with the trained policies and record observations,
# rewards and actions for every step.

env = CustomDogfightEnv(config=env_config)

obs_list = []
obs, info = env.reset()
obs_list += [obs]

reward_list = []
action_list = []
start = time.time()
for i in range(10*40):
    action_dict = {}
    for agent_id in obs:
        # Use the same agent -> policy mapping as training.
        policy_id = policy_mapping_fn(agent_id, None, None)
        input_dict = {
            # Batch of one observation, as a float32 tensor.
            "obs": torch.as_tensor(np.expand_dims(obs[agent_id], axis=0), dtype=torch.float32)
        }
        compute_action = algo.get_policy(policy_id).compute_actions_from_input_dict(input_dict)
        # First element of the result is the action batch; take its only row
        # and clip it to the valid range.
        action_dict[agent_id] = np.clip(compute_action[0][0], -1.0, 1.0)

    obs, reward, terminated, truncated, info = env.step(action_dict)

    obs_list += [obs]
    reward_list += [reward]
    action_list += [action_dict]

    # Stop once the episode is over for any reason. The truncation flag is
    # checked as well so a truncated episode is not stepped any further.
    episode_over = terminated["__all__"] or truncated["__all__"]
    if episode_over or any(info[agent_id].get('collision', False) or info[agent_id].get('out_of_bounds', False) for agent_id in obs):
        break

# Each list entry is a dict keyed by agent id, so these are object arrays of
# dicts; per-agent numeric arrays have to be extracted from `obs_list`.
arrays = np.array(obs_list)
obs_array = np.vstack(arrays)
reward_array = np.array(reward_list)
action_array = np.array(action_list)
env.close()

In [None]:
# Shape of the stacked per-step observation records from the rollout.
obs_array.shape

In [None]:
# Inspect the record at index 20 (fails with IndexError if fewer records were collected).
obs_array[20][0]

In [None]:
def _stacked_observations(agent_id):
    """Stack the recorded observations of one agent into a single array, one row per step."""
    per_step = np.array([step_obs[agent_id] for step_obs in obs_list])
    # vstack makes sure the result has one row per recorded step.
    return np.vstack(per_step)


obs_array_uav_0 = _stacked_observations('uav_0')
obs_array_uav_1 = _stacked_observations('uav_1')

In [None]:
def _trajectory_trace(observations, name, color):
    """Build a 3D marker trace from observation columns 10, 11 and 12.

    NOTE(review): these columns are presumably the x/y/z position -- confirm
    against the environment's observation layout.
    """
    return go.Scatter3d(
        x=observations[:, 10],
        y=observations[:, 11],
        z=observations[:, 12],
        mode='markers',
        marker=dict(
            size=5,
            color=color,
            opacity=0.6
        ),
        name=name
    )


scatter_uav_0 = _trajectory_trace(obs_array_uav_0, 'uav_0', 'red')
scatter_uav_1 = _trajectory_trace(obs_array_uav_1, 'uav_1', 'blue')
fig = go.Figure(data=[scatter_uav_0, scatter_uav_1])

# Save the plot as an HTML file. The target directory is created first so the
# export does not fail on a fresh checkout.
render_path = '3D_renders/3d_drone_space4_experiment.html'
os.makedirs(os.path.dirname(render_path), exist_ok=True)
fig.write_html(render_path)

In [None]:
import seaborn as sns

In [None]:
# Overlay a KDE of one logged column for every CSV under logs/test_runs.
test_runs_dir = os.path.join(path, 'logs', 'test_runs')

# Sort the listing: os.listdir returns names in arbitrary order, which would
# make colours and legend order change between runs.
dataframes = {}
for filename in sorted(os.listdir(test_runs_dir)):
    if filename.endswith('.csv'):
        key = os.path.splitext(filename)[0]
        dataframes[key] = pd.read_csv(os.path.join(test_runs_dir, filename))

# 'reward' selects the first CSV column; any other value selects the second
# column and is labelled "length".
# NOTE(review): if these files were written with the row index as first
# column, the positional columns are shifted by one -- confirm the file layout.
output_desired = 'length'
if output_desired == 'reward':
    column_index, label_prefix = 0, 'reward'
else:
    column_index, label_prefix = 1, 'length'

data_list = []
labels = []
for key, df in dataframes.items():
    data_list.append(df.iloc[:, column_index])
    labels.append(f"{label_prefix} for {key}")

# Attach each label to its own curve so the legend stays correct even if one
# of the series produces no artist.
for data, label in zip(data_list, labels):
    sns.kdeplot(data, fill = True, label = label)

plt.legend(title = 'Modes')
plt.title(f"{output_desired}")
plt.show()

In [None]:
# Scatter the first CSV column against the second for every run under
# logs/test_runs.
test_runs_dir = os.path.join(path, 'logs', 'test_runs')

# Sort the listing: os.listdir returns names in arbitrary order, which would
# make colours and legend order change between runs.
dataframes = {}
for filename in sorted(os.listdir(test_runs_dir)):
    if filename.endswith('.csv'):
        key = os.path.splitext(filename)[0]
        dataframes[key] = pd.read_csv(os.path.join(test_runs_dir, filename))

labels = []
output_desired = 'reward' # only used in the plot title

# NOTE(review): the legend says "length" while the title says "reward over
# time"; one of the two is likely a copy-paste leftover -- confirm which
# quantity the second column holds.
for key, df in dataframes.items():
    label = f"length for {key}"
    # Label each series directly so the legend cannot get out of step with the data.
    plt.scatter(df.iloc[:,0], df.iloc[:,1], label = label)
    labels.append(label)

plt.legend(title = 'Different runs')
plt.title(f"{output_desired} over time")
plt.show()