# Independently Training Machines/Agents (IPPO)

Each machine is an agent that learns on its own: agents do not collaborate and are not trained together. This is Decentralized Training, Decentralized Execution.

A separate PPO model is trained for each agent. While one agent is being trained, it chooses its own actions and all other agents take random actions.

In [None]:
# %pip install tensorboard
# %pip install stable-baselines3
# %pip install stable-baselines3[extra]
# %pip install torch
# %pip install gymnasium
# %pip install ipywidgets

In [1]:
import sys
from pathlib import Path

import json

# Add the project root to the Python path so that `backend.*` imports resolve.
# The root is taken to be three directories above the current working
# directory, so this cell assumes the notebook is run from its own folder.
project_root = Path().resolve().parent.parent.parent
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

# Scheduling input shared by the training and evaluation cells.
data_file = project_root / "data" / "Input_JSON_Schedule_Optimization.json"
# JSON is UTF-8 by specification; be explicit so decoding does not depend on
# the platform's locale encoding.
with open(data_file, encoding="utf-8") as f:
    data = json.load(f)  # smoke test: fails early if the file is missing or invalid

In [9]:
"""
Train independent agents (one per machine) using PPO.
Each agent learns independently without knowing about other agents.
This is the simplest MARL approach.
"""

import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback
import numpy as np
from typing import Dict
import os

from backend.schedulers.marl.factory_gym_env import FactoryMultiAgentEnv
from backend.factory_logic_loader import FactoryLogicLoader
from backend.job_builder import JobBuilder
from backend.schemas import ProductRequest


class IndependentAgentWrapper(gym.Env):
    """
    Single-agent view of one machine inside a multi-agent factory environment.

    Used to train each agent independently with Stable-Baselines3: the wrapped
    agent receives the action passed to ``step``, while every other active
    agent is driven by an action sampled from its own action space.

    Args:
        multi_agent_env: Environment exposing per-agent ``observation_spaces``
            and ``action_spaces`` mappings plus dict-based ``reset``/``step``.
        agent_id: Key of the agent this wrapper controls.
    """

    def __init__(self, multi_agent_env: FactoryMultiAgentEnv, agent_id: str):
        super().__init__()
        self.env = multi_agent_env
        self.agent_id = agent_id  # the one agent that is being trained
        self.observation_space = multi_agent_env.observation_spaces[agent_id]
        self.action_space = multi_agent_env.action_spaces[agent_id]

    def reset(self, seed=None, options=None):
        """Reset the wrapped environment and return this agent's observation.

        The per-agent infos returned by the wrapped environment are discarded;
        an empty info dict is returned instead.
        """
        observations, _ = self.env.reset(seed=seed, options=options)
        return observations[self.agent_id], {}

    def step(self, action):
        """Advance the wrapped environment by one step.

        Args:
            action: Action chosen by the learner for ``self.agent_id``.

        Returns:
            ``(obs, reward, terminated, truncated, info)`` for this agent only,
            picked out of the per-agent dicts returned by the wrapped env.
        """
        # Only the *other* agents act randomly; no sample is drawn for the
        # controlled agent, since it would be discarded anyway. The dict keeps
        # the order of `env.agents` in case the environment relies on it.
        actions = {
            agent: (
                action
                if agent == self.agent_id
                else self.env.action_spaces[agent].sample()
            )
            for agent in self.env.agents
        }
        # Always submit the learned action, even if this agent is not (or is
        # no longer) listed in `env.agents`.
        actions.setdefault(self.agent_id, action)

        observations, rewards, terminations, truncations, infos = self.env.step(actions)

        obs = observations[self.agent_id]
        reward = rewards[self.agent_id]
        terminated = terminations[self.agent_id]
        truncated = truncations[self.agent_id]
        info = infos[self.agent_id]
        return obs, reward, terminated, truncated, info


def train_independent_agents(
    factory_logic,
    jobs,
    total_timesteps=1152,
    save_dir="models/marl_independent",
    max_steps=1152,
):
    """Train one PPO agent per machine, each in its own environment.

    For every agent id in the environment's ``possible_agents`` a fresh
    ``FactoryMultiAgentEnv`` is built, wrapped in ``IndependentAgentWrapper``
    and used to train a separate PPO model. The final model is saved as
    ``<save_dir>/<agent_id>_final``; checkpoints and TensorBoard logs go into
    per-agent subdirectories of ``save_dir``.

    Args:
        factory_logic: Factory description passed through to the environment.
        jobs: Initial jobs passed through to the environment.
        total_timesteps: Training budget handed to ``PPO.learn`` per agent.
        save_dir: Directory for models, checkpoints and TensorBoard logs.
            Created if it does not exist.
        max_steps: Episode step limit passed to every environment
            (previously hard-coded to 1152).

    Returns:
        Dict mapping each agent id to its trained PPO model.
    """
    os.makedirs(save_dir, exist_ok=True)

    # This environment is only used to discover the agent ids.
    base_env = FactoryMultiAgentEnv(
        factory_logic=factory_logic,
        initial_jobs=jobs,
        max_steps=max_steps
    )

    agents = {}

    for agent_id in base_env.possible_agents:
        print(f"\n{'='*50}")
        print(f"Training agent: {agent_id}")
        print(f"{'='*50}")

        # Each agent trains against its own environment instance.
        env = FactoryMultiAgentEnv(
            factory_logic=factory_logic,
            initial_jobs=jobs,
            max_steps=max_steps
        )
        single_agent_env = IndependentAgentWrapper(env, agent_id)
        # DummyVecEnv expects a list of environment factories.
        vec_env = DummyVecEnv([lambda: single_agent_env])

        # NOTE(review): PPO gathers rollouts of n_steps=2048 transitions, so a
        # total_timesteps below that presumably still runs one full rollout -
        # confirm against the Stable-Baselines3 documentation.
        model = PPO(
            "MlpPolicy",
            vec_env,
            verbose=1,
            learning_rate=3e-4,
            n_steps=2048,
            batch_size=64,
            n_epochs=10,
            gamma=0.99,
            tensorboard_log=f"{save_dir}/tensorboard/{agent_id}"
        )

        # Periodic checkpoints; with save_freq=10_000, short runs end before
        # the first checkpoint is due.
        checkpoint_callback = CheckpointCallback(
            save_freq=10_000,
            save_path=f"{save_dir}/checkpoints/{agent_id}",
            name_prefix=f"ppo_{agent_id}"
        )

        model.learn(
            total_timesteps=total_timesteps,
            callback=checkpoint_callback,
            progress_bar=True
        )

        # The "_final" suffix is what load_agents() strips to recover the id.
        model.save(f"{save_dir}/{agent_id}_final")
        print(f"Saved model for {agent_id}")

        agents[agent_id] = model

    return agents


def evaluate_trained_agents(agents: Dict, factory_logic, jobs, num_episodes=10, max_steps=1152):
    """Run the trained agents together and print per-episode and summary stats.

    Each episode builds a fresh ``FactoryMultiAgentEnv`` and lets every active
    agent act deterministically with its own model until all agents are
    finished. Reward (averaged over agents per step, summed over the episode),
    the environment's ``total_episode_power_cost`` and the fraction of jobs
    flagged ``done`` are printed per episode and aggregated at the end.

    Args:
        agents: Mapping from agent id to a model exposing
            ``predict(obs, deterministic=True)``. Must contain every id in
            the environment's ``agents``.
        factory_logic: Factory description passed through to the environment.
        jobs: Initial jobs passed through to the environment.
        num_episodes: Number of evaluation episodes.
        max_steps: Episode step limit passed to the environment
            (previously hard-coded to 1152).
    """
    print(f"\n{'='*50}")
    print("EVALUATING TRAINED AGENTS")
    print(f"{'='*50}")

    episode_rewards = []
    episode_costs = []
    episode_completion_rates = []

    for episode in range(num_episodes):
        env = FactoryMultiAgentEnv(
            factory_logic=factory_logic,
            initial_jobs=jobs,
            max_steps=max_steps
        )

        obs, _ = env.reset()
        done = False
        episode_reward = 0.0

        while not done:
            # Each agent predicts its own action from its own observation.
            actions = {}
            for agent_id in env.agents:
                action, _ = agents[agent_id].predict(obs[agent_id], deterministic=True)
                actions[agent_id] = action

            obs, rewards, terms, truncs, _ = env.step(actions)

            # Average reward over agents for this step.
            episode_reward += sum(rewards.values()) / max(len(rewards), 1)

            # The episode is over once every agent is terminated OR truncated.
            # Checking the two dicts separately would never finish when some
            # agents terminate while the remaining ones are truncated.
            done = all(
                terms.get(agent_id, False) or truncs.get(agent_id, False)
                for agent_id in terms.keys() | truncs.keys()
            )

        # Episode finished -> compute completion from the factory state.
        factory_state = env.factory.get_factory_state()
        total_jobs = len(factory_state.jobs)
        done_jobs = sum(1 for j in factory_state.jobs if j.done)
        completion_rate = done_jobs / max(total_jobs, 1)

        episode_rewards.append(episode_reward)
        episode_costs.append(env.total_episode_power_cost)
        episode_completion_rates.append(completion_rate)

        print(
            f"Episode {episode + 1}: "
            f"Reward={episode_reward:.2f}, "
            f"Cost=${env.total_episode_power_cost:.2f}, "
            f"Completion={completion_rate*100:.1f}%"
        )

    print(f"\n{'='*50}")
    print("RESULTS:")
    if episode_rewards:
        print(f"  Avg Reward: {np.mean(episode_rewards):.2f} ± {np.std(episode_rewards):.2f}")
        print(f"  Avg Cost: ${np.mean(episode_costs):.2f} ± {np.std(episode_costs):.2f}")
        print(f"  Avg Completion: {np.mean(episode_completion_rates)*100:.1f}%")
    else:
        # Nothing ran (num_episodes <= 0); avoid nan output and numpy warnings.
        print("  No episodes were evaluated.")
    print(f"{'='*50}")



def load_agents(save_dir="models/marl_independent"):
    """Load every saved PPO model found directly inside ``save_dir``.

    Only ``*.zip`` entries at the top level of the directory are considered
    (subdirectories are not searched). A trailing ``_final`` on the file stem
    is stripped so the key matches the environment's agent id, e.g.
    ``MAQ118_final.zip`` -> ``"MAQ118"``.

    Args:
        save_dir: Directory containing the saved models.

    Returns:
        Dict mapping agent id to the loaded PPO model. Empty when the
        directory holds no ``.zip`` files.

    Raises:
        FileNotFoundError: If ``save_dir`` does not exist.
    """
    extension = ".zip"
    final_suffix = "_final"

    agents = {}
    # Sorted so the result does not depend on the OS's directory order; if
    # both "X.zip" and "X_final.zip" exist, the "_final" model is loaded last
    # and therefore wins.
    for filename in sorted(os.listdir(save_dir)):
        if not filename.endswith(extension):
            continue

        # Strip only the extension; splitting on the first "." would cut off
        # agent ids that contain a dot themselves.
        base = filename[: -len(extension)]
        if base.endswith(final_suffix):
            agent_id = base[: -len(final_suffix)]
        else:
            agent_id = base

        agents[agent_id] = PPO.load(os.path.join(save_dir, filename))

    return agents


## Train IPPO

In [None]:
# Re-read the input on every run: the frontend rewrites this file
# (specifically `product_requests` and the steps).
# JSON is UTF-8 by specification; be explicit so decoding does not depend on
# the platform's locale encoding.
with open(data_file, encoding="utf-8") as f:
    data = json.load(f)

product_requests_data = data["product_requests"]
product_requests = [ProductRequest(**pr) for pr in product_requests_data]

# Build the factory model and the job objects that the environment schedules.
factory_logic = FactoryLogicLoader.load_from_file(filepath=data_file)
job_builder = JobBuilder(factory_logic=factory_logic)
jobs = job_builder.build_jobs(product_requests=product_requests)  # job objects

print(f"Training with {len(jobs)} jobs")
print(f"Machines: {list(factory_logic.machines.keys())}")

# Train one PPO model per machine; models are written to `save_dir`.
agents = train_independent_agents(
    factory_logic=factory_logic,
    jobs=jobs,
    total_timesteps=100_000,  # Start small, increase later
    save_dir="models/marl_independent"
)

## Evaluate IPPO

In [None]:
# Re-read the input on every run: the frontend rewrites this file
# (specifically `product_requests` and the steps).
# JSON is UTF-8 by specification; be explicit so decoding does not depend on
# the platform's locale encoding.
with open(data_file, encoding="utf-8") as f:
    data = json.load(f)

product_requests_data = data["product_requests"]
product_requests = [ProductRequest(**pr) for pr in product_requests_data]

# Rebuild the factory model and jobs so this cell can run without the
# training cell having been executed in the same session.
factory_logic = FactoryLogicLoader.load_from_file(filepath=data_file)
job_builder = JobBuilder(factory_logic=factory_logic)
jobs = job_builder.build_jobs(product_requests=product_requests)  # job objects


# Load the saved per-machine models from disk.
agents = load_agents(save_dir="models/marl_independent")

# Evaluate all agents acting together in one environment.
evaluate_trained_agents(agents, factory_logic, jobs, num_episodes=1)


EVALUATING TRAINED AGENTS
Episode 1: Reward=1253.43, Cost=$0.00, Completion=99.2%

RESULTS:
  Avg Reward: 1253.43 ± 0.00
  Avg Cost: $0.00 ± 0.00
  Avg Completion: 99.2%


: 