# Training statistics of a single experiment (PPO)

This notebook presents training statistics for the policies that have been trained with the PPO algorithm.

## Experiment loading

In [None]:
# Common imports.
from pathlib import Path

%matplotlib widget
import base

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import networkx as nx

import dfaas_env
import dfaas_utils

In [None]:
# Base directory where the experiments are located. The relative path is
# converted to an absolute one so that the summary printed below does not
# depend on how the path was written here.
prefix_dir = Path("../results/").resolve().absolute()

# Directory of the experiment analysed by this notebook. Change only this name
# to inspect a different experiment stored under the same base directory.
exp_dir = prefix_dir / "DF_20250929_144721_PPO_5_agents_expanded_actions"

# Alternatively, point directly to an absolute experiment directory. This is
# not recommended because the path is specific to a single machine.
# exp_dir = Path("/home/emanuele/marl-dfaas/results/DF_20250929_144725_PPO_5_agents")
# prefix_dir = exp_dir.parent

# Raw data loaded from the experiment's "result.json". The following cells
# index it by training iteration (one entry per iteration).
raw_exp_data = dfaas_utils.parse_result_file(exp_dir / "result.json")

# Reference environment of the experiment. The following cells read its
# "agents" and "network" attributes.
env = base.get_env(exp_dir)

In [None]:
# Short summary of the loaded experiment: where it is stored, its name, the
# agents defined by the reference environment and the number of entries found
# in the raw result data.
print(f"Experiment prefix dir: {prefix_dir.as_posix()!r}")
print(f"Experiment name:       {exp_dir.name!r}")
print(f"Agents:                {env.agents} ({len(env.agents)})")
print(f"Iterations:            {len(raw_exp_data)}")

In [None]:
def make_networkx_plot(graph):
    """Draw the given graph in a dedicated figure named "networkx".

    Args:
        graph: Graph object forwarded as-is to ``nx.draw_networkx``.

    A figure already registered under the same name is closed before the new
    one is created.
    """
    figure_id = "networkx"

    plt.close(fig=figure_id)
    figure = plt.figure(num=figure_id, layout="constrained")
    figure.canvas.header_visible = False

    axes = figure.subplots()
    axes.axis("off")  # Only the graph is shown: no frame, ticks or labels.

    # White nodes with a black border.
    nx.draw_networkx(
        graph,
        ax=axes,
        node_size=2500,
        node_color="white",
        edgecolors="black",
    )

    axes.set_title("Network topology")
    axes.set_axisbelow(True)


make_networkx_plot(env.network)

## Reward statistics

### Reward per episode

In [None]:
def get_reward_per_episode(raw_exp_data, env):
    """Compute the average episode reward of every training iteration.

    Args:
        raw_exp_data: Sequence of per-iteration result dictionaries. Each one
            must provide ``["env_runners"]["hist_stats"]`` with the key
            "episode_reward" and one "policy_policy_<agent>_reward" key for
            each agent.
        env: Object exposing the agent names through its ``agents`` attribute.

    Returns:
        A dictionary whose first key is "all" (reward of the whole episode)
        followed by one key per agent. Each value is a NumPy array with one
        average reward per iteration.
    """
    series_names = ["all", *env.agents]
    reward_sum = {name: np.empty(len(raw_exp_data)) for name in series_names}

    for idx, iteration in enumerate(raw_exp_data):
        hist_stats = iteration["env_runners"]["hist_stats"]

        # Each entry of hist_stats is a list of values: average it to get a
        # single number for the iteration.
        reward_sum["all"][idx] = np.average(hist_stats["episode_reward"])
        for agent in env.agents:
            reward_sum[agent][idx] = np.average(hist_stats[f"policy_policy_{agent}_reward"])

    return reward_sum


stats_reward_episode = get_reward_per_episode(raw_exp_data, env)

In [None]:
def make_reward_per_episode_plot(stats):
    """Plot the average reward per episode, one figure for each entry of stats.

    Args:
        stats: Mapping from a series name ("all" or an agent name) to the
            sequence of average rewards, one value per training iteration.

    Each figure is registered under the name "reward_per_episode_<name>". A
    figure already registered under that name is closed first, so that running
    the cell again replaces the figure instead of drawing over the old one.
    """
    for agent, reward in stats.items():
        # The same name must be used to close and to create the figure,
        # otherwise the old figure is reused and gets a second axes on top.
        fig_name = f"reward_per_episode_{agent}"
        plt.close(fig=fig_name)
        fig = plt.figure(num=fig_name, layout="constrained")
        fig.canvas.header_visible = False
        ax = fig.subplots()

        ax.plot(reward)

        ax.set_title(f"Average reward per episode ({agent = })")
        ax.set_ylabel("Reward")
        ax.set_xlabel("Iteration")

        ax.grid(axis="both")
        ax.set_axisbelow(True)  # By default the axis is over the content.


make_reward_per_episode_plot(stats_reward_episode)

## Policy/Training Statistics

### Total Loss

The overall loss is used to update the policy network in a single gradient step. It is a combination of:

* Policy loss
* Value function loss
* Differential Entropy
* KL divergence penalty

The range depends on the reward scale.

In [None]:
def get_total_loss(raw_exp_data):
    """Collect the total loss of every policy across the training iterations.

    Args:
        raw_exp_data: Non-empty sequence of per-iteration result dictionaries.
            The policies are the keys of ``["info"]["learner"]`` in the first
            iteration.

    Returns:
        A dictionary mapping each policy to a NumPy array holding one
        "total_loss" value per iteration.
    """
    n_iterations = len(raw_exp_data)
    policy_names = raw_exp_data[0]["info"]["learner"]

    total_loss_by_policy = {name: np.empty(n_iterations) for name in policy_names}

    for idx, result in enumerate(raw_exp_data):
        for name in policy_names:
            total_loss_by_policy[name][idx] = result["info"]["learner"][name]["learner_stats"]["total_loss"]

    return total_loss_by_policy


total_loss = get_total_loss(raw_exp_data)

In [None]:
def make_total_loss_plot(total_loss):
    """Plot the total loss over the training iterations, one figure per policy.

    Args:
        total_loss: Mapping from policy name to the sequence of total loss
            values, one value per iteration.

    Each figure is registered as "total_loss_<policy>". A figure already
    registered under that name is closed before the new one is created.
    """
    for policy_name, series in total_loss.items():
        figure_id = f"total_loss_{policy_name}"

        plt.close(fig=figure_id)
        figure = plt.figure(num=figure_id, layout="constrained")
        figure.canvas.header_visible = False

        axes = figure.subplots()
        axes.plot(series)

        axes.grid(axis="both")
        axes.set_axisbelow(True)  # Keep the grid behind the plotted line.

        axes.set_xlabel("Iteration")
        axes.set_ylabel("Loss")
        axes.set_title(f"Total loss for {policy_name}")


make_total_loss_plot(total_loss)

### Policy loss

This is the loss associated with the policy (actor) network.

In [None]:
def get_policy_loss(raw_exp_data):
    """Collect the policy (actor) loss of every policy across the iterations.

    Args:
        raw_exp_data: Non-empty sequence of per-iteration result dictionaries.
            The policies are the keys of ``["info"]["learner"]`` in the first
            iteration.

    Returns:
        A dictionary mapping each policy to a NumPy array holding one
        "policy_loss" value per iteration.
    """
    n_iterations = len(raw_exp_data)
    policy_names = raw_exp_data[0]["info"]["learner"]

    # One pre-allocated array per policy, filled iteration by iteration.
    policy_loss_by_policy = {}
    for name in policy_names:
        policy_loss_by_policy[name] = np.empty(n_iterations)

    for idx, result in enumerate(raw_exp_data):
        for name in policy_names:
            policy_loss_by_policy[name][idx] = result["info"]["learner"][name]["learner_stats"]["policy_loss"]

    return policy_loss_by_policy


policy_loss = get_policy_loss(raw_exp_data)

In [None]:
def make_policy_loss_plot(policy_loss):
    """Plot the policy loss over the training iterations, one figure per policy.

    Args:
        policy_loss: Mapping from policy name to the sequence of policy loss
            values, one value per iteration.

    Each figure is registered as "policy_loss_<policy>". A figure already
    registered under that name is closed before the new one is created.
    """
    for policy_name, series in policy_loss.items():
        figure_id = f"policy_loss_{policy_name}"

        plt.close(fig=figure_id)
        figure = plt.figure(num=figure_id, layout="constrained")
        figure.canvas.header_visible = False

        axes = figure.subplots()
        axes.plot(series)

        axes.grid(axis="both")
        axes.set_axisbelow(True)  # Keep the grid behind the plotted line.

        axes.set_xlabel("Iteration")
        axes.set_ylabel("Loss")
        axes.set_title(f"Policy loss for {policy_name}")


make_policy_loss_plot(policy_loss)

### Value Function Loss

This is the loss associated with the value (critic) network. It measures how closely the predictions of the value network match the actual returns observed during training.

In [None]:
def get_value_function_loss(raw_exp_data):
    """Collect the value function (critic) loss of every policy across the iterations.

    Args:
        raw_exp_data: Non-empty sequence of per-iteration result dictionaries.
            The policies are the keys of ``["info"]["learner"]`` in the first
            iteration.

    Returns:
        A dictionary mapping each policy to a NumPy array holding one
        "vf_loss" value per iteration.
    """
    n_iterations = len(raw_exp_data)
    policy_names = raw_exp_data[0]["info"]["learner"]

    vf_loss_by_policy = {name: np.empty(n_iterations) for name in policy_names}

    for idx, result in enumerate(raw_exp_data):
        for name, values in vf_loss_by_policy.items():
            values[idx] = result["info"]["learner"][name]["learner_stats"]["vf_loss"]

    return vf_loss_by_policy


value_loss = get_value_function_loss(raw_exp_data)

In [None]:
def make_value_loss_plot(value_loss):
    """Plot the value function loss over the iterations, one figure per policy.

    Args:
        value_loss: Mapping from policy name to the sequence of value function
            loss values, one value per iteration.

    Each figure is registered as "value_loss_<policy>". A figure already
    registered under that name is closed before the new one is created.
    """
    for policy_name, series in value_loss.items():
        figure_id = f"value_loss_{policy_name}"

        plt.close(fig=figure_id)
        figure = plt.figure(num=figure_id, layout="constrained")
        figure.canvas.header_visible = False

        axes = figure.subplots()
        axes.plot(series)

        axes.grid(axis="both")
        axes.set_axisbelow(True)  # Keep the grid behind the plotted line.

        axes.set_xlabel("Iteration")
        axes.set_ylabel("Loss")
        axes.set_title(f"Value loss for {policy_name}")


make_value_loss_plot(value_loss)

### Value Function Explained Variance

This is a normalised measure of how well the value function's predictions explain the variation in actual returns. Its value is at most 1 and can be arbitrarily negative; values closer to 1 are better.

* 1: perfect prediction.
* 0: the predictions are no better than the mean of the targets.
* <0: predictions are worse than just using the mean

In [None]:
def get_value_explained_var(raw_exp_data):
    """Collect the value function explained variance of every policy across the iterations.

    Args:
        raw_exp_data: Non-empty sequence of per-iteration result dictionaries.
            The policies are the keys of ``["info"]["learner"]`` in the first
            iteration.

    Returns:
        A dictionary mapping each policy to a NumPy array holding one
        "vf_explained_var" value per iteration.
    """
    n_iterations = len(raw_exp_data)
    policy_names = raw_exp_data[0]["info"]["learner"]

    explained_var_by_policy = {name: np.empty(n_iterations) for name in policy_names}

    for idx, result in enumerate(raw_exp_data):
        for name in policy_names:
            learner_stats = result["info"]["learner"][name]["learner_stats"]
            explained_var_by_policy[name][idx] = learner_stats["vf_explained_var"]

    return explained_var_by_policy


value_explained_var = get_value_explained_var(raw_exp_data)

In [None]:
def make_value_explained_var_plot(value_explained_var):
    """Plot the value function explained variance, one figure per policy.

    Args:
        value_explained_var: Mapping from policy name to the sequence of
            explained variance values, one value per iteration.

    Each figure is registered as "value_explained_var_<policy>". A figure
    already registered under that name is closed before the new one is created.
    """
    for policy_name, series in value_explained_var.items():
        figure_id = f"value_explained_var_{policy_name}"

        plt.close(fig=figure_id)
        figure = plt.figure(num=figure_id, layout="constrained")
        figure.canvas.header_visible = False

        axes = figure.subplots()
        axes.plot(series)

        axes.grid(axis="both")
        axes.set_axisbelow(True)  # Keep the grid behind the plotted line.

        axes.set_xlabel("Iteration")
        axes.set_ylabel("Variance")
        axes.set_title(f"Value Function Explained Variance for {policy_name}")


make_value_explained_var_plot(value_explained_var)

### Differential Entropy

**Warning**: in the context of a continuous distribution like [Dirichlet](https://en.wikipedia.org/wiki/Dirichlet_distribution#Entropy), the entropy is actually the [differential entropy](https://en.wikipedia.org/wiki/Differential_entropy). Unlike the entropy of a discrete probability distribution, the differential entropy [is not](https://github.com/pytorch/pytorch/issues/152845#issuecomment-2860403912) a measure of the level of exploration.

The differential entropy of the policy measures information in terms of probability density. The output of the actor is the concentration parameters for a Dirichlet distribution, and the final action is sampled from this distribution.

* **Very negative values** indicate that the density is high in a small volume, meaning the **distribution is highly concentrated**.
* **Values closer to zero** or positive indicate a more diffuse distribution, meaning the **density is lower but spread over a larger volume**.

Also:

* If one or more parameters are small, the distribution is concentrated near the corners of the simplex, so the differential entropy is negative.
* If one or more parameters are larger, the distribution spreads toward the center of the simplex, and the differential entropy increases, meaning the distribution is more spread out.

Note that the plot shows the average differential entropy for each training iteration.

In [None]:
def get_entropy(raw_exp_data):
    """Collect the entropy reported for every policy across the training iterations.

    Args:
        raw_exp_data: Non-empty sequence of per-iteration result dictionaries.
            The policies are the keys of ``["info"]["learner"]`` in the first
            iteration.

    Returns:
        A dictionary mapping each policy to a NumPy array holding one
        "entropy" value per iteration.
    """
    n_iterations = len(raw_exp_data)
    policy_names = raw_exp_data[0]["info"]["learner"]

    entropy_by_policy = {name: np.empty(n_iterations) for name in policy_names}

    for idx, result in enumerate(raw_exp_data):
        for name in policy_names:
            entropy_by_policy[name][idx] = result["info"]["learner"][name]["learner_stats"]["entropy"]

    return entropy_by_policy


entropy = get_entropy(raw_exp_data)

In [None]:
def make_entropy_plot(entropy):
    """Plot the entropy over the training iterations, one figure per policy.

    Args:
        entropy: Mapping from policy name to the sequence of entropy values,
            one value per iteration.

    Each figure is registered as "entropy_<policy>". A figure already
    registered under that name is closed before the new one is created.
    """
    for policy, policy_entropy in entropy.items():
        # The name must be unique to this plot: reusing the name of another
        # plot would close that plot's figure and take its place.
        fig_name = f"entropy_{policy}"
        plt.close(fig=fig_name)
        fig = plt.figure(num=fig_name, layout="constrained")
        fig.canvas.header_visible = False
        ax = fig.subplots()

        ax.plot(policy_entropy)

        ax.set_title(f"Entropy for {policy}")
        ax.set_ylabel("Entropy")
        ax.set_xlabel("Iteration")

        ax.grid(axis="both")
        ax.set_axisbelow(True)  # By default the axis is over the content.


make_entropy_plot(entropy)

### KL divergence

The KL divergence measures how much the new policy has changed compared to the old policy.

* A higher mean KL means the policy is changing a lot in one update (possibly too much).
* A lower mean KL means the policy is not changing much (possibly learning too slowly).

It is a non-negative metric.

In [None]:
def get_kl_divergence(raw_exp_data):
    """Collect the KL divergence of every policy across the training iterations.

    Args:
        raw_exp_data: Non-empty sequence of per-iteration result dictionaries.
            The policies are the keys of ``["info"]["learner"]`` in the first
            iteration.

    Returns:
        A dictionary mapping each policy to a NumPy array holding one "kl"
        value per iteration.
    """
    n_iterations = len(raw_exp_data)
    policy_names = raw_exp_data[0]["info"]["learner"]

    kl_by_policy = {name: np.empty(n_iterations) for name in policy_names}

    for idx, result in enumerate(raw_exp_data):
        for name in policy_names:
            kl_by_policy[name][idx] = result["info"]["learner"][name]["learner_stats"]["kl"]

    return kl_by_policy


kl_divergence = get_kl_divergence(raw_exp_data)

In [None]:
def make_kl_divergence_plot(kl_divergence):
    """Plot the KL divergence over the training iterations, one figure per policy.

    Args:
        kl_divergence: Mapping from policy name to the sequence of KL
            divergence values, one value per iteration.

    Each figure is registered as "kl_divergence_<policy>". A figure already
    registered under that name is closed before the new one is created.
    """
    for policy_name, series in kl_divergence.items():
        figure_id = f"kl_divergence_{policy_name}"

        plt.close(fig=figure_id)
        figure = plt.figure(num=figure_id, layout="constrained")
        figure.canvas.header_visible = False

        axes = figure.subplots()
        axes.plot(series)

        axes.grid(axis="both")
        axes.set_axisbelow(True)  # Keep the grid behind the plotted line.

        axes.set_xlabel("Iteration")
        axes.set_ylabel("KL divergence")
        axes.set_title(f"KL divergence for {policy_name}")


make_kl_divergence_plot(kl_divergence)

### Global Gradient Norm

It is the Euclidean norm of the gradients computed during a single optimisation step. It measures the size of the policy/value gradients during learning.

* A very large value can indicate unstable learning or exploding gradients.
* A very small value (close to zero) means the model's parameters are barely changing (possibly due to vanishing gradients or convergence).

The scale of the value is influenced by the network's structure and the distribution of actions. Fluctuations are normal.

In [None]:
def get_grad_gnorm(raw_exp_data):
    """Collect the global gradient norm of every policy across the training iterations.

    Args:
        raw_exp_data: Non-empty sequence of per-iteration result dictionaries.
            The policies are the keys of ``["info"]["learner"]`` in the first
            iteration.

    Returns:
        A dictionary mapping each policy to a NumPy array holding one
        "grad_gnorm" value per iteration.
    """
    n_iterations = len(raw_exp_data)
    policy_names = raw_exp_data[0]["info"]["learner"]

    gnorm_by_policy = {name: np.empty(n_iterations) for name in policy_names}

    for idx, result in enumerate(raw_exp_data):
        for name in policy_names:
            gnorm_by_policy[name][idx] = result["info"]["learner"][name]["learner_stats"]["grad_gnorm"]

    return gnorm_by_policy


grad_gnorm = get_grad_gnorm(raw_exp_data)

In [None]:
def make_grad_gnorm_plot(grad_gnorm):
    """Plot the global gradient norm over the iterations, one figure per policy.

    Args:
        grad_gnorm: Mapping from policy name to the sequence of global
            gradient norm values, one value per iteration.

    Each figure is registered as "grad_gnorm_<policy>". A figure already
    registered under that name is closed before the new one is created.
    """
    for policy_name, series in grad_gnorm.items():
        figure_id = f"grad_gnorm_{policy_name}"

        plt.close(fig=figure_id)
        figure = plt.figure(num=figure_id, layout="constrained")
        figure.canvas.header_visible = False

        axes = figure.subplots()
        axes.plot(series)

        axes.grid(axis="both")
        axes.set_axisbelow(True)  # Keep the grid behind the plotted line.

        axes.set_xlabel("Iteration")
        axes.set_ylabel("Global Gradient Norm")
        axes.set_title(f"Global Gradient Norm for {policy_name}")


make_grad_gnorm_plot(grad_gnorm)