In [None]:
import os
import ray
import time
import math
import numpy as np
import pandas as pd
from ray import tune
import seaborn as sns
import gymnasium as gym
import plotly.express as px
import matplotlib.pyplot as plt
from ray.rllib.env import PettingZooEnv
from ray.tune.logger import pretty_print
from models.PyFlytModel import PyFlytModel
from ray.rllib.algorithms.ppo import PPOConfig
from models.PyFlytModel_MOG import PyFlytModel_MOG
from models.PyFlytModel_ENN import PyFlytModel_ENN
from ray.rllib.utils.framework import try_import_torch
from policies.ppo_torch_policy import SimpleTorchPolicy
from ray.rllib.env.multi_agent_env import MultiAgentEnv
from models.SimpleTorchModel import SimpleCustomTorchModel
from utils.normalize_advantages import NormalizeAdvantagesCallback


import PyFlyt.gym_envs
from ray.tune.registry import register_env
from PyFlyt.gym_envs import FlattenWaypointEnv
from PyFlyt.gym_envs.quadx_envs import quadx_hover_env, quadx_waypoints_env
from PyFlyt.pz_envs.fixedwing_envs.ma_fixedwing_dogfight_env import MAFixedwingDogfightEnv

In [None]:
# Working directory of the kernel; later cells build their log and render
# paths relative to it.
path = os.getcwd()
# Bind the torch module and its nn submodule through RLlib's import helper.
torch, nn = try_import_torch()
# ignore_reinit_error=True keeps this cell re-runnable: a plain ray.init()
# raises when Ray is already initialised in the current kernel.
ray.init(ignore_reinit_error=True)

In [None]:
class RewardWrapper(gym.RewardWrapper):
    """Reward wrapper that shrinks large-magnitude rewards by a factor of ten.

    A reward whose absolute value is at least 99.0 is divided by 10; every
    other reward is passed through untouched.
    """

    def __init__(self, env):
        super().__init__(env)

    def reward(self, reward):
        """Return ``reward / 10`` when ``|reward| >= 99.0``, otherwise ``reward``."""
        is_large = abs(reward) >= 99.0
        return reward / 10 if is_large else reward

In [None]:
class CustomDogfightEnv(MultiAgentEnv):
    """RLlib ``MultiAgentEnv`` adapter around ``MAFixedwingDogfightEnv``.

    The wrapped environment is built from ``config`` (forwarded as keyword
    arguments). The observation and action spaces exposed here are the
    per-agent spaces of the first entry in ``possible_agents``.
    NOTE(review): this assumes every agent shares the same spaces — confirm.

    Args:
        config: Mapping of keyword arguments for ``MAFixedwingDogfightEnv``.
    """

    def __init__(self, config):
        # Let the RLlib base class set up its own bookkeeping attributes.
        super().__init__()

        self.env = MAFixedwingDogfightEnv(**config)

        self.agent_ids = self.env.possible_agents
        # RLlib's MultiAgentEnv reads the known agent ids from `_agent_ids`.
        self._agent_ids = set(self.agent_ids)

        first_agent = self.env.possible_agents[0]
        self.observation_space = self.env.observation_space(first_agent)
        self.action_space = self.env.action_space(first_agent)

    def reset(self, *, seed=None, options=None):
        """Reset the wrapped environment.

        RLlib invokes ``reset(seed=..., options=...)``, so both keywords must
        be accepted. Each one is forwarded only when it is not ``None``; a
        bare ``reset()`` therefore still calls the wrapped ``reset()`` with
        its own defaults.

        Returns:
            The ``(observations, infos)`` pair returned by the wrapped env.
        """
        reset_kwargs = {}
        if seed is not None:
            reset_kwargs["seed"] = seed
        if options is not None:
            reset_kwargs["options"] = options
        observations, infos = self.env.reset(**reset_kwargs)
        return observations, infos

    def step(self, actions):
        """Step the wrapped environment with a per-agent action dict.

        Returns:
            ``(observations, rewards, terminations, truncations, infos)`` from
            the wrapped env, with an ``"__all__"`` entry added to the
            termination and truncation dicts when it is not already present.
        """
        observations, rewards, terminations, truncations, infos = self.env.step(actions)

        # Work on copies so the wrapped env's own dicts are never mutated.
        terminations = dict(terminations)
        truncations = dict(truncations)
        # RLlib decides whether the episode is over from the "__all__" key;
        # this mirrors what RLlib's own parallel PettingZoo wrapper does.
        terminations.setdefault("__all__", all(terminations.values()))
        truncations.setdefault("__all__", all(truncations.values()))

        return observations, rewards, terminations, truncations, infos

# Register the environment
def env_creator(config):
    """Build a ``CustomDogfightEnv`` from the given environment config."""
    dogfight_env = CustomDogfightEnv(config)
    return dogfight_env


# Make the creator available to RLlib under the name "MAFixedwingDogfightEnv".
register_env("MAFixedwingDogfightEnv", env_creator)


In [None]:
# def env_creator(env_config):
#     return MAFixedwingDogfightEnv(assisted_flight = True)
# register_env("MAFixedwingDogfightEnv", env_creator)

In [None]:
def policy_mapping_fn(agent_id, episode, worker, **kwargs):
    """Map an agent id to ``'policy_1'`` (even index) or ``'policy_2'`` (odd index).

    Args:
        agent_id: Either a purely numeric string such as ``'3'`` or an
            underscore-separated id such as ``'uav_3'``; in the second form
            the token after the first underscore is used as the index.
        episode: Unused.
        worker: Unused.
        **kwargs: Ignored.

    Returns:
        ``'policy_1'`` when the agent index is even, otherwise ``'policy_2'``.
    """
    # Bare numeric ids are used as-is; otherwise take the second '_' token.
    index_text = agent_id if agent_id.isdigit() else agent_id.split('_')[1]
    return ('policy_1', 'policy_2')[int(index_text) % 2]

In [None]:
# def env_creator(config):
#     return MAFixedwingDogfightEnv(**config)
# register_env('MAFixedwingDogfightEnv', lambda config: PettingZooEnv(env_creator(config)))

In [None]:
# env_creator = lambda config: MAFixedwingDogfightEnv(**config)

# register_env('MAFixedwingDogfightEnv', lambda config: PettingZooEnv(env_creator(config)))

In [None]:
# def env_creator(args):
#     env = MAFixedwingDogfightEnv.parallel_env(
#         spawn_height = 15.0,
#         damage_per_hit = 0.02,
#         lethal_distance = 15.0,
#         lethal_angle_radians = 0.1,
#         assisted_flight = True,
#         sparse_reward = False,
#         flight_dome_size = 150.0,
#         max_duration_seconds = 60.0,
#         agent_hz = 30,
#         render_mode= None,
#     )
#     return env

In [None]:
# Keyword arguments for the dogfight environment: CustomDogfightEnv unpacks
# this mapping straight into MAFixedwingDogfightEnv(**config).
env_config = dict(
    spawn_height=5.0,
    damage_per_hit=0.02,
    lethal_distance=15.0,
    lethal_angle_radians=0.1,
    assisted_flight=True,
    sparse_reward=False,
    flight_dome_size=150.0,
    max_duration_seconds=60.0,
    agent_hz=30,
    render_mode=None,
)

In [None]:
%%time

# Build one environment instance only to read the per-agent observation and
# action spaces that the policy specs below require.
env_example = env_creator(env_config)
obs_space = env_example.observation_space
action_space = env_example.action_space

# Learning-rate schedule handed to PPO as [timestep, learning-rate] pairs.
lr_schedule = [
    [0, 0.0003],
    [15_000_000, 0.00025],
    [30_000_000, 0.0002],
    [50_000_000, 0.0001],
]

# Model settings shared by both policies.
# NOTE(review): 'custom_model' is referenced by name; no ModelCatalog
# registration is visible in this notebook — confirm it happens on import.
model_config = {
    'custom_model': 'SimpleCustomTorchModel',
    'vf_share_layers': False,
    'fcnet_hiddens': [256, 256],
    'fcnet_activation': 'LeakyReLU',
    # this isn't used for some models, but doesn't hurt to keep it
    'custom_model_config': {
        'num_gaussians': 2,
        'num_outputs': action_space.shape[0],
    },
}

# Two policies with identical specs; policy_mapping_fn decides which agent
# uses which one.
policy_specs = {
    'policy_1': (SimpleTorchPolicy, obs_space, action_space, {}),
    'policy_2': (SimpleTorchPolicy, obs_space, action_space, {}),
}

config = (
    PPOConfig()
    .training(
        gamma=0.99,
        lambda_=0.95,
        # kl_coeff = 0.5,
        num_sgd_iter=30,
        lr_schedule=lr_schedule,
        vf_loss_coeff=0.5,
        vf_clip_param=15.0,
        clip_param=0.2,
        grad_clip_by='norm',
        train_batch_size=65_000,
        sgd_minibatch_size=4_096,
        grad_clip=0.5,
        model=model_config,
    )
    .environment(env='MAFixedwingDogfightEnv', env_config=env_config)
    .rollouts(num_rollout_workers=28)
    .resources(num_gpus=1)
    .callbacks(NormalizeAdvantagesCallback)
    .multi_agent(policies=policy_specs, policy_mapping_fn=policy_mapping_fn)
)

# analysis = tune.run(
#     'PPO',
#     config=config.to_dict(),
#     stop={'training_iteration':300},
#     checkpoint_freq=10,
#     checkpoint_at_end=True,
#     # local_dir='./ray_results'
# )


algo = config.build()

num_iterations = 300
# One [mean episode reward, mean episode length] row per training iteration.
results = []

for i in range(num_iterations):
    result = algo.train()
    mean_reward = result['episode_reward_mean']
    print(f"Iteration: {i}, Mean Reward: {mean_reward}")
    results.append([mean_reward, result['episode_len_mean']])


results_df = pd.DataFrame(results)

ray.shutdown()


In [None]:
# One row per training iteration: column 0 is the mean episode reward and
# column 1 the mean episode length (see the training loop).
results_df = pd.DataFrame(results)
experiment_type = 'enn_2dim'

# to_csv does not create missing directories, so make sure the target exists
# before writing (otherwise a fresh checkout fails here).
log_dir = os.path.join(path, 'logs', 'test_runs')
os.makedirs(log_dir, exist_ok=True)

# The row index (the iteration number) is written as the first CSV column,
# which is pandas' default; readers must account for that extra column.
results_df.to_csv(os.path.join(log_dir, experiment_type + '.csv'))

In [None]:
# Display the raw per-iteration [mean reward, mean episode length] rows.
results

In [None]:
# Each row of `results` is [mean reward, mean episode length]. Plotting the
# rows directly would draw both columns as separate curves under a
# "Mean Reward" title, so select the reward column explicitly.
mean_rewards = [row[0] for row in results]

plt.plot(mean_rewards)
plt.title('Training Progress - Mean Reward per Episode')
plt.xlabel('Iteration')
plt.ylabel('Mean Reward')
# plt.savefig('Basic PPO - HalfCheetah-v4')
plt.show()

In [None]:
# Show the trained algorithm's `logdir` attribute (presumably the directory
# RLlib writes this run's logs to — confirm against the Ray version in use).
algo.logdir

In [None]:
# Roll the trained algorithm out in a flattened QuadX-Waypoints environment
# and record observations, rewards and actions for the 3D render below.
# NOTE(review): `algo` in this notebook is built for 'MAFixedwingDogfightEnv'
# with policies 'policy_1'/'policy_2', while this cell feeds it
# QuadX-Waypoints observations under the agent id 'default'. Confirm that
# compute_actions can resolve a policy for these observations.
env = FlattenWaypointEnv(gym.make(id='PyFlyt/QuadX-Waypoints-v1', flight_mode=-1), context_length=1)

obs_list = []
obs, info = env.reset()
# env.env.env.env.env.drones[0].set_mode(-1)
targets = env.unwrapped.waypoints.targets
# obs[10:13] is presumably the drone's xyz position (it is plotted as x/y/z
# in the render cell) — TODO confirm against the env's observation layout.
points = np.concatenate((obs[10:13].reshape(-1,3), targets))
obs = {'default': obs}
obs_list.append(obs)

reward_list = []
action_list = []
start = time.time()
try:
    for i in range(10*40):
        compute_action = algo.compute_actions(obs)
        action = compute_action['default']
        # obs, reward, terminated, truncated, info = env.step(np.zeros((4))+.79)
        obs, reward, terminated, truncated, info = env.step(action)

        obs = {'default': obs}

        obs_list.append(obs)
        reward_list.append(reward)
        action_list.append(action)

        # Stop on truncation as well as termination: stepping an episode that
        # has already ended would record meaningless transitions.
        # The 4 is presumably the number of waypoints in the env — TODO confirm.
        if terminated or truncated or info['num_targets_reached'] == 4:
            break
finally:
    # Always release the environment, even if the rollout raises.
    env.close()

arrays = [d['default'] for d in obs_list]
obs_array = np.vstack(arrays)
reward_array = np.array(reward_list)
action_array = np.array(action_list)

In [None]:
# 3D scatter of observation columns 10-12 for every recorded step, coloured
# by step number, with the waypoint targets overlaid as open green squares.
plotly_figure = px.scatter_3d(
    x=obs_array[:,10],
    y=obs_array[:,11],
    z=obs_array[:,12],
    opacity=.6,
    color=np.arange(len(obs_array)),
)
plotly_figure.add_scatter3d(
    x=targets[:,0],
    y=targets[:,1],
    z=targets[:,2],
    marker={'color':'green', 'symbol':'square-open', 'size':25, 'line':{'width':10}},
    mode='markers',
)

# write_html does not create missing directories, so create the output
# folder first.
render_dir = os.path.join(path, '3D_renders')
os.makedirs(render_dir, exist_ok=True)
plotly_figure.write_html(os.path.join(render_dir, '3d_drone_space4_' + experiment_type + '.html'))

In [None]:
import seaborn as sns

In [None]:
# Load every saved run log, keyed by file name without the extension.
log_dir = os.path.join(path, 'logs', 'test_runs')
dataframes = {}
for filename in sorted(os.listdir(log_dir)):
    if filename.endswith('.csv'):
        file_path = os.path.join(log_dir, filename)
        # The logs are written with the row index as their first column.
        # index_col=0 consumes it, so position 0 is the mean reward and
        # position 1 the mean episode length. Without it both selections
        # below would be shifted by one column.
        df = pd.read_csv(file_path, index_col=0)
        key = os.path.splitext(filename)[0]
        dataframes[key] = df


data_list = []
labels = []
output_desired = 'length' # 'reward' plots mean reward; anything else plots episode length

metric_name = 'reward' if output_desired == 'reward' else 'length'
column_position = 0 if output_desired == 'reward' else 1

for key, df in dataframes.items():
    data_list.append(df.iloc[:, column_position])
    labels.append(f"{metric_name} for {key}")

# Label each density directly so the legend cannot drift out of step with
# the order in which the curves were drawn.
for data, label in zip(data_list, labels):
    sns.kdeplot(data, fill = True, label = label)

plt.legend(title = 'Modes')
plt.title(f"{output_desired}")
plt.xlabel(metric_name)
plt.show()

In [None]:
# Load every saved run log, keyed by file name without the extension.
log_dir = os.path.join(path, 'logs', 'test_runs')
dataframes = {}
for filename in sorted(os.listdir(log_dir)):
    if filename.endswith('.csv'):
        file_path = os.path.join(log_dir, filename)
        # The logs are written with the row index (training iteration) as
        # their first column; index_col=0 turns it back into the index so
        # position 0 is the mean reward and position 1 the episode length.
        df = pd.read_csv(file_path, index_col=0)
        key = os.path.splitext(filename)[0]
        dataframes[key] = df


reward = []  # unused in this cell; kept in case a later cell expects the name
labels = []
output_desired = 'reward' # anything else plots episode length

metric_name = 'reward' if output_desired == 'reward' else 'length'
column_position = 0 if output_desired == 'reward' else 1

for key, df in dataframes.items():
    label = f"{metric_name} for {key}"
    # x is the saved training iteration, y the selected metric.
    plt.scatter(df.index, df.iloc[:, column_position], label = label)
    labels.append(label)

plt.legend(title = 'Different runs')
plt.title(f"{output_desired} over time")
plt.xlabel('Iteration')
plt.ylabel(metric_name)
plt.show()