In [None]:
# Training setup: imports and module reloads for the training cells.
import sys
# Put ./tennis on the import path. This has to run before `import agent` and
# `import utils` below (presumably those modules live in that folder — confirm).
sys.path.append('./tennis')
from unityagents import UnityEnvironment
import agent
import utils
import importlib
# Reload the local modules so source edits made since the first import are
# picked up without restarting the kernel.
importlib.reload(agent)
importlib.reload(utils)
import numpy as np

# Start the Unity Tennis build; the commented line is the Windows alternative.
env = UnityEnvironment(file_name="tennis/Tennis.app")
# env = UnityEnvironment(file_name="Tennis_Windows_x86_64/Tennis.exe")

# Everything below talks to the first brain the environment exposes.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Reset once in training mode to read the problem dimensions off the env.
env_info = env.reset(train_mode=True)[brain_name]
num_agents = len(env_info.agents)
obs_dim = env_info.vector_observations.shape[1]
print("state size: ", obs_dim)
act_dim = brain.vector_action_space_size
print("action size: ", act_dim)
# Size of the flattened all-agent observation vector.
state_dim = obs_dim * num_agents

# MAPPO hyperparameters, collected in one place so they are easy to scan.
mappo_hparams = dict(
    gamma=0.99,
    clip_param=0.2,
    ppo_epochs=10,
    lr=1e-4,
    batch_size=512,
    gae_lambda=0.9,
    entropy_coef=0.02,
)

# Build the agent and the rollout buffer it learns from.
mappo = agent.MAPPO(obs_dim, act_dim, state_dim, num_agents, **mappo_hparams)
buffer = utils.ReplayBufferMAPPO()


In [None]:
import os
from collections import deque

# Intel OpenMP workaround: tolerate more than one OpenMP runtime being loaded
# into this process instead of aborting.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

# Training parameters
max_episodes = 25000         # Upper bound on the number of training episodes
max_steps = 1000             # Upper bound on environment steps per episode
update_freq = 2048           # Trigger a MAPPO update once the buffer holds this many entries
print_interval = 20          # Print stats every 20 episodes
save_interval = 100          # Periodic-save cadence; NOTE: not referenced anywhere in this notebook
solved_threshold = 2.0       # Stop early once the 100-episode average reward reaches 2.0

# Track rewards
episode_rewards = []               # One score per episode (max over agents)
moving_avg = []                    # Rolling mean of the score after each episode
moving_var = []                    # Rolling variance of the score after each episode
best_avg_reward = -np.inf          # Best rolling mean seen so far (drives "save best")
reward_window = deque(maxlen=100)  # Last 100 scores, for the rolling statistics

# Create directory for model saves (no-op when it already exists)
os.makedirs('models', exist_ok=True)

# Training loop
# The loop runs inside try/finally so the Unity environment is always closed,
# even when training is interrupted (e.g. the cell is stopped) or raises
# part-way through. On normal completion the order is unchanged: close the
# environment, then write the final checkpoint.
try:
    for episode in range(max_episodes):
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        episode_reward = np.zeros(num_agents)  # running per-agent return for this episode

        for step in range(max_steps):
            # Flatten all agents' observations into a single vector; stored
            # in the buffer as the global state.
            global_state = states.reshape(-1)
            actions = []
            log_probs = []

            # Get actions from all agents
            for agent_id in range(num_agents):
                action, log_prob = mappo.act(states[agent_id], agent_id)
                actions.append(action)
                log_probs.append(log_prob)

            # Environment step
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done

            # Store experiences
            episode_reward += rewards
            next_global_state = next_states.reshape(-1)

            for agent_id in range(num_agents):
                buffer.add(
                    global_state=global_state,
                    individual_obs=states[agent_id],
                    action=actions[agent_id],
                    # Each agent is credited with its own reward; the shared
                    # team-reward alternative would be sum(rewards).
                    reward=rewards[agent_id],
                    next_global_state=next_global_state,
                    done=dones[agent_id],
                    log_prob=log_probs[agent_id],
                    agent_idx=agent_id
                )

            states = next_states

            # Update when buffer is full
            # NOTE(review): presumably mappo.update() empties the buffer;
            # otherwise this condition stays true on every later step — confirm.
            if len(buffer.rewards) >= update_freq:
                mappo.update(buffer, episode, max_episodes)

            # The episode ends as soon as any agent reports done.
            if np.any(dones):
                break

        # Update reward tracking (episode score = best agent's return)
        episode_rewards.append(max(episode_reward))
        reward_window.append(max(episode_reward))
        current_avg = np.mean(reward_window)
        current_var = np.var(list(reward_window))  # Calculate variance
        moving_avg.append(current_avg)
        moving_var.append(current_var)  # Store variance

        # Save best model: only when the rolling mean beats the previous best
        # by more than 0.05, so small fluctuations do not trigger a save.
        if current_avg > (best_avg_reward + 0.05):
            best_avg_reward = current_avg
            mappo.save(episode=episode)

        # Print statistics
        if (episode + 1) % print_interval == 0:
            print(f"Episode {episode + 1}, Current Reward: {max(episode_reward):.2f}, Average Reward (Last 100): {current_avg:.2f}, Reward Var (Last 100): {current_var:.2f}, Max Reward: {np.max(episode_rewards[-print_interval:]):.2f}, Min Reward: {np.min(episode_rewards[-print_interval:]):.2f}")

        # Early stopping if solved
        if current_avg >= solved_threshold:
            print(f"\nEnvironment solved in {episode + 1} episodes!")
            print(f"Average Reward: {current_avg:.2f}")
            break
finally:
    env.close()

# Final checkpoint. The episode argument is always max_episodes, even when the
# loop above stopped early.
mappo.save(episode=max_episodes)

# Plot the training curve: raw episode scores, the 100-episode rolling mean,
# and a shaded band of +/- 2 standard deviations around that mean.
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 6))

mean_curve = np.array(moving_avg)
# Standard deviation has the same units as the reward, so it (not the raw
# variance) is what gets added to and subtracted from the mean.
std_dev = np.sqrt(np.array(moving_var))

# Plot main reward curve and average
plt.plot(episode_rewards, alpha=0.3, color='blue', label='Episode Reward')
plt.plot(mean_curve, linewidth=2, color='darkblue', label='100-Episode Average')

# Shade the spread of the last-100-episode window around the mean
plt.fill_between(range(len(mean_curve)),
                 mean_curve - 2. * std_dev,
                 mean_curve + 2. * std_dev,
                 color='skyblue', alpha=0.7, label='+/- 2 Std Dev')

plt.xlabel('Episode')
plt.ylabel('Reward')
plt.title('Training Progress with Reward Variance')
plt.legend()
plt.grid(True)
plt.tight_layout()
# Save before show() so the file is written from the fully drawn figure.
plt.savefig('reward_with_variance.png')
plt.show()

In [None]:
# Test the model

# Imports and module reloads for the evaluation cell. They repeat the training
# cell's imports so this cell can be run on its own.
import sys
# Put ./tennis on the import path. This has to run before `import agent` and
# `import utils` below (presumably those modules live in that folder — confirm).
sys.path.append('./tennis')
from unityagents import UnityEnvironment
import agent
import utils
import importlib
# Reload the local modules so source edits made since the first import are
# picked up without restarting the kernel.
importlib.reload(agent)
importlib.reload(utils)
import numpy as np
# Start the Unity Tennis build; the commented line is the Windows alternative.
env = UnityEnvironment(file_name="tennis/Tennis.app")
# env = UnityEnvironment(file_name="Tennis_Windows_x86_64/Tennis.exe")
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Environment parameters (read from a reset with train_mode=False)
env_info = env.reset(train_mode=False)[brain_name]
num_agents = len(env_info.agents)
obs_dim = env_info.vector_observations.shape[1]
print("state size: ", obs_dim)
act_dim = brain.vector_action_space_size
print("action size: ", act_dim)
# Size of the flattened all-agent observation vector.
state_dim = obs_dim * num_agents

# Initialize MAPPO with the same hyperparameters as the training cell.
mappo = agent.MAPPO(obs_dim, act_dim, state_dim, num_agents, gamma=0.99, clip_param=0.2, ppo_epochs=10, lr=1e-4,
                 batch_size=512, gae_lambda=0.9, entropy_coef=0.02)
# Load from checkpoint. The optimizer state is skipped because this cell only
# runs the policy and never trains.
loaded_episode = mappo.load("tennis/mappo_models/mappo_checkpoint_ep25000.pth", load_optimizer=False)
print(f"Loaded checkpoint from episode {loaded_episode} for evaluation")

# Run the loaded policy for a fixed number of environment steps.
# try/finally guarantees the Unity environment is closed even if the rollout
# is interrupted or raises.
try:
    env_info = env.reset(train_mode=False)[brain_name] # reset the environment
    states = env_info.vector_observations              # get the current state
    for _ in range(3000):
        actions = []
        for agent_id in range(num_agents):
            # The log-probability is only needed for training, so it is discarded here.
            action, _log_prob = mappo.act(states[agent_id], agent_id)
            actions.append(action)
        env_info = env.step(actions)[brain_name]
        # NOTE(review): there is no explicit reset when an episode ends; this
        # presumably relies on the environment handling episode boundaries — confirm.
        states = env_info.vector_observations
finally:
    env.close()

