# Train statistics of a single experiment (SAC)

This notebook presents training statistics for the policies that have been trained with the SAC algorithm.

## Experiment loading

In [None]:
# Common imports.
from pathlib import Path

%matplotlib widget
import base

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import dfaas_env
import dfaas_utils

In [None]:
# Directory that contains the experiment result folders.
# NOTE(review): this is an absolute, machine-specific path; adjust it (or make
# it configurable) before running the notebook on another machine.
prefix_dir = Path("/home/emanuele/marl-dfaas/results/")

# Folder of the single experiment analysed by this notebook.
exp_dir = prefix_dir / "DF_20250619_102139_SAC_initial_test_10000"

# Raw data dictionary "result.json". The following cells index the parsed
# object by training iteration (raw_exp_data[i]) and use len() on it.
raw_exp_data = dfaas_utils.parse_result_file(exp_dir / "result.json")

# Reference environment. Only its "agents" attribute is read by this notebook.
env = base.get_env(exp_dir)

In [None]:
# Short summary of the loaded experiment: location, name, agents and the
# number of entries (training iterations) found in the parsed result file.
print(f"Experiment prefix dir: {prefix_dir.as_posix()!r}")
print(f"Experiment name:       {exp_dir.name!r}")
print(f"Agents:                {env.agents} ({len(env.agents)})")
print(f"Iterations:            {len(raw_exp_data)}")

## Reward statistics

In [None]:
def get_reward_per_episode(raw_exp_data, env):
    """Average the episode rewards of every training iteration.

    For each iteration the "hist_stats" lists found under "env_runners" are
    averaged with ``np.average``: "episode_reward" for the global reward and
    "policy_policy_<agent>_reward" for each agent in ``env.agents``.

    Returns a dict whose first key is "all", followed by one key per agent.
    Every value is a NumPy array with one entry per training iteration.
    """
    n_iterations = len(raw_exp_data)

    # "all" comes first so that consumers iterating the dict see the global
    # reward before the per-agent ones.
    averages = {key: np.empty(n_iterations) for key in ["all", *env.agents]}

    for idx in range(n_iterations):
        hist_stats = raw_exp_data[idx]["env_runners"]["hist_stats"]

        averages["all"][idx] = np.average(hist_stats["episode_reward"])
        for agent in env.agents:
            averages[agent][idx] = np.average(hist_stats[f"policy_policy_{agent}_reward"])

    return averages


stats_reward_episode = get_reward_per_episode(raw_exp_data, env)

In [None]:
def make_reward_per_episode_plot(stats):
    """Draw one figure per entry of ``stats`` with the average reward per episode.

    ``stats`` maps a label (an agent name or "all") to a sequence with one
    value per training iteration.

    Each figure is named ``reward_per_episode_<label>``. The same name is used
    both to close a figure left over from a previous run of the cell and to
    create the new one; otherwise the stale figure would be reused and a second
    axes would be stacked on top of the old plot.
    """
    for agent, reward in stats.items():
        fig_name = f"reward_per_episode_{agent}"

        # Close and re-create under the SAME name so re-running the cell
        # always starts from a clean figure.
        plt.close(fig=fig_name)
        fig = plt.figure(num=fig_name, layout="constrained")
        fig.canvas.header_visible = False
        ax = fig.subplots()

        ax.plot(reward)

        ax.set_title(f"Average reward per episode ({agent = })")
        ax.set_ylabel("Reward")
        ax.set_xlabel("Iteration")

        ax.grid(axis="both")
        ax.set_axisbelow(True)  # By default the axis is over the content.


make_reward_per_episode_plot(stats_reward_episode)

## Losses

### Total Loss

The total loss in the SAC algorithm is the combination of three components:

1. **Policy Loss**, also called Actor Loss,
2. **Q-function Loss**, also called Critic Loss,
3. **Entropy temperature loss**, also called Alpha Loss.

In [None]:
def get_total_loss(raw_exp_data):
    """Compute the total SAC loss of every trained policy, one value per iteration.

    The total is the sum of the "actor_loss", "critic_loss" and "alpha_loss"
    entries found in the policy's "learner_stats" (the result file has no
    ready-made total loss for SAC). If any lookup fails with ``KeyError`` the
    iteration is recorded as 0 (presumably no training was done, e.g. the
    replay buffer was not yet full).

    Returns a dict mapping each policy in "policies_to_train" (read from the
    first iteration's config) to a NumPy array with one entry per iteration.
    """
    n_iterations = len(raw_exp_data)
    policies = raw_exp_data[0]["config"]["policies_to_train"]

    total = {policy: np.empty(n_iterations) for policy in policies}

    for idx in range(n_iterations):
        for policy in policies:
            try:
                stats = raw_exp_data[idx]["info"]["learner"][policy]["learner_stats"]
                value = stats["actor_loss"] + stats["critic_loss"] + stats["alpha_loss"]
            except KeyError:
                # No learner statistics for this policy in this iteration.
                value = 0
            total[policy][idx] = value

    return total


total_loss = get_total_loss(raw_exp_data)

In [None]:
def make_total_loss_plot(total_loss):
    """Draw one figure per policy with its total loss at every iteration.

    ``total_loss`` maps a policy name to a sequence of values. Each figure is
    named ``total_loss_<policy>``; a figure with that name left over from a
    previous run is closed before the new one is created.
    """
    for policy, series in total_loss.items():
        name = f"total_loss_{policy}"

        # Start from a clean figure when the cell is re-run.
        plt.close(fig=name)
        fig, ax = plt.subplots(num=name, layout="constrained")
        fig.canvas.header_visible = False

        ax.plot(series)
        ax.set(title=f"Total loss for {policy}", ylabel="Loss", xlabel="Iteration")

        ax.grid(axis="both")
        # Keep the grid lines behind the plotted data.
        ax.set_axisbelow(True)


make_total_loss_plot(total_loss)

### Policy Loss

This is the loss associated with the policy (actor) network. Also called Actor Loss.

In [None]:
def get_policy_loss(raw_exp_data):
    """Collect the policy (actor) loss of every trained policy, one value per iteration.

    The value is the "actor_loss" entry of the policy's "learner_stats". If
    the lookup fails with ``KeyError`` the iteration is recorded as 0
    (presumably no training was done, e.g. the replay buffer was not yet full).

    Returns a dict mapping each policy in "policies_to_train" (read from the
    first iteration's config) to a NumPy array with one entry per iteration.
    """
    n_iterations = len(raw_exp_data)
    policies = raw_exp_data[0]["config"]["policies_to_train"]

    actor = {policy: np.empty(n_iterations) for policy in policies}

    for idx in range(n_iterations):
        for policy in policies:
            try:
                stats = raw_exp_data[idx]["info"]["learner"][policy]["learner_stats"]
                value = stats["actor_loss"]
            except KeyError:
                # No learner statistics for this policy in this iteration.
                value = 0
            actor[policy][idx] = value

    return actor


policy_loss = get_policy_loss(raw_exp_data)

In [None]:
def make_policy_loss_plot(policy_loss):
    """Draw one figure per policy with its policy (actor) loss at every iteration.

    ``policy_loss`` maps a policy name to a sequence of values. Each figure is
    named ``policy_loss_<policy>``; a figure with that name left over from a
    previous run is closed before the new one is created.
    """
    for policy, series in policy_loss.items():
        name = f"policy_loss_{policy}"

        # Start from a clean figure when the cell is re-run.
        plt.close(fig=name)
        fig, ax = plt.subplots(num=name, layout="constrained")
        fig.canvas.header_visible = False

        ax.plot(series)
        ax.set(title=f"Policy loss (Actor Loss) for {policy}", ylabel="Loss", xlabel="Iteration")

        ax.grid(axis="both")
        # Keep the grid lines behind the plotted data.
        ax.set_axisbelow(True)


make_policy_loss_plot(policy_loss)

### Q-function Loss

This is the loss associated with the critic network(s). Also called Critic Loss.

In [None]:
def get_critic_loss(raw_exp_data):
    """Collect the critic (Q-function) loss of every trained policy, one value per iteration.

    The value is the "critic_loss" entry of the policy's "learner_stats". If
    the lookup fails with ``KeyError`` the iteration is recorded as 0
    (presumably no training was done, e.g. the replay buffer was not yet full).

    Returns a dict mapping each policy in "policies_to_train" (read from the
    first iteration's config) to a NumPy array with one entry per iteration.
    """
    n_iterations = len(raw_exp_data)
    policies = raw_exp_data[0]["config"]["policies_to_train"]

    critic = {policy: np.empty(n_iterations) for policy in policies}

    for idx in range(n_iterations):
        for policy in policies:
            try:
                stats = raw_exp_data[idx]["info"]["learner"][policy]["learner_stats"]
                value = stats["critic_loss"]
            except KeyError:
                # No learner statistics for this policy in this iteration.
                value = 0
            critic[policy][idx] = value

    return critic


critic_loss = get_critic_loss(raw_exp_data)

In [None]:
def make_critic_loss_plot(critic_loss):
    """Draw one figure per policy with its Q-function (critic) loss at every iteration.

    ``critic_loss`` maps a policy name to a sequence of values. Each figure is
    named ``critic_loss_<policy>``; a figure with that name left over from a
    previous run is closed before the new one is created.
    """
    for policy, series in critic_loss.items():
        name = f"critic_loss_{policy}"

        # Start from a clean figure when the cell is re-run.
        plt.close(fig=name)
        fig, ax = plt.subplots(num=name, layout="constrained")
        fig.canvas.header_visible = False

        ax.plot(series)
        ax.set(title=f"Q-function loss (Critic Loss) for {policy}", ylabel="Loss", xlabel="Iteration")

        ax.grid(axis="both")
        # Keep the grid lines behind the plotted data.
        ax.set_axisbelow(True)


make_critic_loss_plot(critic_loss)

### Entropy Temperature Loss

This is the loss associated with the dynamic learning of the entropy parameter alpha (which balances the trade-off between exploration and exploitation). Also called Alpha Loss. The learning is driven by the target entropy, which is a predefined value.

If the predefined value is "auto", the value is automatically set based on the action space dimensionality:

* For continuous action spaces: `-dim(A)` (A is the action space),
* For discrete action spaces: `-log(1/|A|)` (`|A|` is the number of possible actions).

In [None]:
def get_alpha_loss(raw_exp_data):
    """Collect the entropy temperature (alpha) loss of every trained policy, one value per iteration.

    The value is the "alpha_loss" entry of the policy's "learner_stats". If
    the lookup fails with ``KeyError`` the iteration is recorded as 0
    (presumably no training was done, e.g. the replay buffer was not yet full).

    Returns a dict mapping each policy in "policies_to_train" (read from the
    first iteration's config) to a NumPy array with one entry per iteration.
    """
    n_iterations = len(raw_exp_data)
    policies = raw_exp_data[0]["config"]["policies_to_train"]

    alpha = {policy: np.empty(n_iterations) for policy in policies}

    for idx in range(n_iterations):
        for policy in policies:
            try:
                stats = raw_exp_data[idx]["info"]["learner"][policy]["learner_stats"]
                value = stats["alpha_loss"]
            except KeyError:
                # No learner statistics for this policy in this iteration.
                value = 0
            alpha[policy][idx] = value

    return alpha


alpha_loss = get_alpha_loss(raw_exp_data)

In [None]:
def get_target_entropy(raw_exp_data, env):
    """Return the target entropy of every trained policy.

    For each policy the iterations are scanned in order and the first one that
    exposes "target_entropy" in the policy's "learner_stats" is used; its
    first element is stored. Iterations where the lookup fails with
    ``KeyError`` (presumably no training was done yet) are skipped.

    The lookup uses the loop index. A previous version always read a fixed
    iteration (15), which raised ``IndexError`` on shorter experiments and
    never found a value if that iteration had no learner statistics.

    Args:
        raw_exp_data: Per-iteration result entries, indexable by iteration.
        env: Unused; kept so that existing callers keep working.

    Returns:
        A dict mapping each policy name to its target entropy. A policy for
        which no iteration exposes the value is absent from the dict.
    """
    iters = len(raw_exp_data)
    policies = raw_exp_data[0]["config"]["policies_to_train"]

    target_entropy = {}

    for policy in policies:
        for iteration in range(iters):
            try:
                stats = raw_exp_data[iteration]["info"]["learner"][policy]["learner_stats"]
                target_entropy[policy] = stats["target_entropy"][0]
                break
            except KeyError:
                # No learner statistics for this iteration: try the next one.
                continue

    return target_entropy


target_entropy = get_target_entropy(raw_exp_data, env)

In [None]:
# Show the target entropy collected for every trained policy.
for policy in raw_exp_data[0]["config"]["policies_to_train"]:
    print(f"Target entropy for {policy!r}: {target_entropy[policy]}")

In [None]:
def make_alpha_loss_plot(alpha_loss):
    """Draw one figure per policy with its entropy temperature (alpha) loss at every iteration.

    ``alpha_loss`` maps a policy name to a sequence of values. Each figure is
    named ``alpha_loss_<policy>``; a figure with that name left over from a
    previous run is closed before the new one is created.
    """
    for policy, series in alpha_loss.items():
        name = f"alpha_loss_{policy}"

        # Start from a clean figure when the cell is re-run.
        plt.close(fig=name)
        fig, ax = plt.subplots(num=name, layout="constrained")
        fig.canvas.header_visible = False

        ax.plot(series)
        ax.set(title=f"Entropy Temperature Loss (Alpha Loss) for {policy}", ylabel="Loss", xlabel="Iteration")

        ax.grid(axis="both")
        # Keep the grid lines behind the plotted data.
        ax.set_axisbelow(True)


make_alpha_loss_plot(alpha_loss)

## Alpha value

Since the entropy temperature parameter alpha is automatically tuned in every training iteration, we log its value. This parameter controls how much importance is placed on policy entropy: a higher value means more exploration.

In [None]:
def get_alpha_value(raw_exp_data):
    """Collect the entropy temperature parameter (alpha) of every trained policy, one value per iteration.

    The "alpha_value" entry of the policy's "learner_stats" is a sequence;
    exactly one element is supported (enforced with an assertion) and that
    element is stored. If a lookup fails with ``KeyError`` the iteration is
    recorded as 0 (presumably no training was done, e.g. the replay buffer was
    not yet full).

    Returns a dict mapping each policy in "policies_to_train" (read from the
    first iteration's config) to a NumPy array with one entry per iteration.
    """
    n_iterations = len(raw_exp_data)
    policies = raw_exp_data[0]["config"]["policies_to_train"]

    alpha = {policy: np.empty(n_iterations) for policy in policies}

    for idx in range(n_iterations):
        for policy in policies:
            series = alpha[policy]
            try:
                stats = raw_exp_data[idx]["info"]["learner"][policy]["learner_stats"]

                # The result file stores alpha as a sequence; only the
                # single-value case is handled here.
                alpha_values = stats["alpha_value"]
                assert (
                    len(alpha_values) == 1
                ), f"Only one entropy temperature parameter is supported, found {len(alpha_values)}"

                series[idx] = alpha_values[0]
            except KeyError:
                # No learner statistics for this policy in this iteration.
                series[idx] = 0

    return alpha


alpha_value = get_alpha_value(raw_exp_data)

In [None]:
def make_alpha_value_plot(alpha_value):
    """Draw one figure per policy with its entropy temperature parameter (alpha) at every iteration.

    ``alpha_value`` maps a policy name to a sequence of values. Each figure is
    named ``alpha_value_<policy>``; a figure with that name left over from a
    previous run is closed before the new one is created.
    """
    for policy, series in alpha_value.items():
        name = f"alpha_value_{policy}"

        # Start from a clean figure when the cell is re-run.
        plt.close(fig=name)
        fig, ax = plt.subplots(num=name, layout="constrained")
        fig.canvas.header_visible = False

        ax.plot(series)
        ax.set(title=f"Entropy Temperature Parameter (alpha) for {policy}", ylabel="Value", xlabel="Iteration")

        ax.grid(axis="both")
        # Keep the grid lines behind the plotted data.
        ax.set_axisbelow(True)


make_alpha_value_plot(alpha_value)

## Q-value Mean/Max/Min

Average/Min/Max Q-values (expected returns) predicted by the critic network(s) for the sampled batch of data in each training iteration.

The average Q-value should increase over time, this means the agent is learning higher rewards. The min/max shows the spread (variance) of the estimates. In the long term, the Q-values should not go to infinity (diverge) or become stuck at a single value.

In [None]:
def get_q_value_stats(raw_exp_data):
    """Collect the mean/min/max Q-values of every trained policy, one value per iteration.

    The values are the "mean_q", "min_q" and "max_q" entries of the policy's
    "learner_stats", read in that order. Arrays start zero-filled; when a
    lookup fails with ``KeyError`` the remaining statistics of that iteration
    are left untouched (presumably no training was done, e.g. the replay
    buffer was not yet full).

    Returns a dict mapping each policy in "policies_to_train" (read from the
    first iteration's config) to a dict with the keys "max", "min" and "mean",
    each holding a NumPy array with one entry per iteration.
    """
    n_iterations = len(raw_exp_data)
    policies = raw_exp_data[0]["config"]["policies_to_train"]

    q_stats = {
        policy: {
            "max": np.zeros(n_iterations),
            "min": np.zeros(n_iterations),
            "mean": np.zeros(n_iterations),
        }
        for policy in policies
    }

    # (output key, key in the result file), in the order they are read.
    sources = (("mean", "mean_q"), ("min", "min_q"), ("max", "max_q"))

    for idx in range(n_iterations):
        for policy in policies:
            try:
                stats = raw_exp_data[idx]["info"]["learner"][policy]["learner_stats"]
                for name, key in sources:
                    q_stats[policy][name][idx] = stats[key]
            except KeyError:
                # No (or incomplete) learner statistics: keep the zeros.
                pass

    return q_stats


q_value = get_q_value_stats(raw_exp_data)

In [None]:
def make_q_value_plot(q_value):
    """Draw one figure per policy with the mean Q-value and its min/max band.

    ``q_value`` maps a policy name to a dict with the keys "mean", "min" and
    "max", each a sequence with one value per iteration. The mean is drawn as
    a line and the area between min and max is shaded in the same color. Each
    figure is named ``q_value_<policy>``; a figure with that name left over
    from a previous run is closed before the new one is created.
    """
    for policy, stats in q_value.items():
        name = f"q_value_{policy}"

        # Start from a clean figure when the cell is re-run.
        plt.close(fig=name)
        fig, ax = plt.subplots(num=name, layout="constrained")
        fig.canvas.header_visible = False

        # Line and band share the first color of the default cycle.
        color = plt.rcParams["axes.prop_cycle"].by_key()["color"][0]
        x_values = np.arange(len(stats["mean"]))

        ax.plot(stats["mean"], label="Mean", color=color)
        ax.fill_between(x_values, stats["min"], stats["max"], color=color, alpha=0.4, label="Min/Max")

        ax.set(title=f"Predicted Q-values for {policy}", ylabel="Reward", xlabel="Iteration")

        ax.legend()
        ax.grid(axis="both")
        # Keep the grid lines behind the plotted data.
        ax.set_axisbelow(True)


make_q_value_plot(q_value)

## Mean/Dist TD Error

It is the average of **Temporal Difference** (TD) errors of the critic network(s) over the current training batch. The single TD error is how much the predicted Q-value differs from the "target" value computed using the observed reward and the target network.

* A large average TD error (positive or negative) can signal instability, underfitting, or divergence.
* An average TD error near zero means the critic's predictions are close to the targets, which is typically desired as training progresses.

It should generally decrease and stabilize (but may fluctuate) as the agent learns.

There is also a plot of the distribution of the TD errors, since Ray RLlib logs them.

In [None]:
def get_mean_td_error(raw_exp_data):
    """Collect the mean Temporal Difference error of every trained policy, one value per iteration.

    The value is the "mean_td_error" entry stored directly under the policy's
    learner info. Arrays start zero-filled and an iteration is left at zero
    when the lookup fails with ``KeyError`` (presumably no training was done,
    e.g. the replay buffer was not yet full).

    Returns a dict mapping each policy in "policies_to_train" (read from the
    first iteration's config) to a NumPy array with one entry per iteration.
    """
    n_iterations = len(raw_exp_data)
    policies = raw_exp_data[0]["config"]["policies_to_train"]

    result = {policy: np.zeros(n_iterations) for policy in policies}

    for idx in range(n_iterations):
        for policy in policies:
            try:
                learner_info = raw_exp_data[idx]["info"]["learner"][policy]
                result[policy][idx] = learner_info["mean_td_error"]
            except KeyError:
                # No learner info for this policy in this iteration: keep 0.
                pass

    return result


mean_td_error = get_mean_td_error(raw_exp_data)

In [None]:
def get_td_errors(raw_exp_data):
    """Collect the Temporal Difference errors of every trained policy for every iteration.

    The errors are the "td_error" sequence stored directly under the policy's
    learner info. The number of errors per iteration is taken from the first
    iteration that has them and is assumed to be the same for all the others;
    an assertion fails if no positive size is found. Rows start zero-filled
    and are left at zero when the lookup fails with ``KeyError`` (presumably
    no training was done, e.g. the replay buffer was not yet full).

    Returns a dict mapping each policy in "policies_to_train" (read from the
    first iteration's config) to a 2-D NumPy array with one row per iteration.
    """
    n_iterations = len(raw_exp_data)
    policies = raw_exp_data[0]["config"]["policies_to_train"]

    def read_errors(idx, policy):
        return raw_exp_data[idx]["info"]["learner"][policy]["td_error"]

    errors = {}

    # First pass: size every array from the first iteration with TD errors.
    for policy in policies:
        row_size = 0
        for idx in range(n_iterations):
            try:
                row_size = len(read_errors(idx, policy))
            except KeyError:
                # No learner info for this iteration: try the next one.
                continue
            break

        assert row_size > 0
        errors[policy] = np.zeros(shape=(n_iterations, row_size))

    # Second pass: copy the errors, one row per iteration.
    for idx in range(n_iterations):
        for policy in policies:
            try:
                errors[policy][idx] = read_errors(idx, policy)
            except KeyError:
                # No learner info for this iteration: keep the zero row.
                pass

    return errors


td_errors = get_td_errors(raw_exp_data)

In [None]:
def make_mean_td_error_plot(mean_td_error):
    """Draw one figure per policy with its average TD error at every iteration.

    ``mean_td_error`` maps a policy name to a sequence of values. Each figure
    is named ``mean_td_error_<policy>``; a figure with that name left over
    from a previous run is closed before the new one is created.
    """
    for policy, series in mean_td_error.items():
        name = f"mean_td_error_{policy}"

        # Start from a clean figure when the cell is re-run.
        plt.close(fig=name)
        fig, ax = plt.subplots(num=name, layout="constrained")
        fig.canvas.header_visible = False

        ax.plot(series)
        ax.set(title=f"Average TD error for {policy}", ylabel="Error", xlabel="Iteration")

        ax.grid(axis="both")
        # Keep the grid lines behind the plotted data.
        ax.set_axisbelow(True)


make_mean_td_error_plot(mean_td_error)

In [None]:
def make_td_errors_plot(td_errors):
    """Draw one figure per policy with a violin plot of the TD errors.

    ``td_errors`` maps a policy name to a 2-D array that is transposed before
    being handed to ``violinplot``. Each figure is named
    ``td_errors_<policy>``; a figure with that name left over from a previous
    run is closed before the new one is created.
    """
    for policy, errors in td_errors.items():
        name = f"td_errors_{policy}"

        # Start from a clean figure when the cell is re-run.
        plt.close(fig=name)
        fig, ax = plt.subplots(num=name, layout="constrained")
        fig.canvas.header_visible = False

        # violinplot draws one violin per column of a 2-D array, hence the
        # transpose of the array received from the caller.
        ax.violinplot(errors.T, showmedians=True, bw_method=0.8)

        ax.set(title=f"TD error for {policy}", ylabel="Error", xlabel="Iteration")

        # Only integer ticks on the x axis.
        ax.xaxis.set_major_locator(ticker.MaxNLocator(integer=True))

        ax.grid(axis="both")
        # Keep the grid lines behind the plotted data.
        ax.set_axisbelow(True)


make_td_errors_plot(td_errors)

## Gradient and Optimization Stats

### Global Gradient Norm

It is the Euclidean norm of the gradients computed during a single optimisation step. It measures the size of the policy/value gradients during learning.

* A very large value can indicate unstable learning or exploding gradients.
* A very small value (close to zero) means the model's parameters are barely changing (possibly due to vanishing gradients or convergence).

The scale of the value is influenced by the network's structure and the distribution of actions. Fluctuations are normal.

In [None]:
def get_grad_gnorm(raw_exp_data):
    """Collect the global gradient norm of every trained policy, one value per iteration.

    The value is the "grad_gnorm" entry of the policy's "learner_stats". If
    the lookup fails with ``KeyError`` the iteration is recorded as 0
    (presumably no training was done, e.g. the replay buffer was not yet full).

    Returns a dict mapping each policy in "policies_to_train" (read from the
    first iteration's config) to a NumPy array with one entry per iteration.
    """
    n_iterations = len(raw_exp_data)
    policies = raw_exp_data[0]["config"]["policies_to_train"]

    gnorm = {policy: np.empty(n_iterations) for policy in policies}

    for idx in range(n_iterations):
        for policy in policies:
            try:
                stats = raw_exp_data[idx]["info"]["learner"][policy]["learner_stats"]
                value = stats["grad_gnorm"]
            except KeyError:
                # No learner statistics for this policy in this iteration.
                value = 0
            gnorm[policy][idx] = value

    return gnorm


grad_gnorm = get_grad_gnorm(raw_exp_data)

In [None]:
def make_grad_gnorm_plot(grad_gnorm):
    """Draw one figure per policy with its global gradient norm at every iteration.

    ``grad_gnorm`` maps a policy name to a sequence of values. Each figure is
    named ``grad_gnorm_<policy>``; a figure with that name left over from a
    previous run is closed before the new one is created.
    """
    for policy, series in grad_gnorm.items():
        name = f"grad_gnorm_{policy}"

        # Start from a clean figure when the cell is re-run.
        plt.close(fig=name)
        fig, ax = plt.subplots(num=name, layout="constrained")
        fig.canvas.header_visible = False

        ax.plot(series)
        ax.set(title=f"Global Gradient Norm for {policy}", ylabel="Global Gradient Norm", xlabel="Iteration")

        ax.grid(axis="both")
        # Keep the grid lines behind the plotted data.
        ax.set_axisbelow(True)


make_grad_gnorm_plot(grad_gnorm)

### Gradient Updates

The total number of gradient updates (optimizer steps) performed by the learner for this policy since the beginning of training. Since in SAC the number of SGD passes per batch can change over time, it can be helpful to see how this count grows.

In [None]:
def get_grad_updates(raw_exp_data):
    """Collect the lifetime gradient update count of every trained policy, one value per iteration.

    The value is the "num_grad_updates_lifetime" entry stored directly under
    the policy's learner info. If the lookup fails with ``KeyError`` the
    iteration is recorded as 0 (presumably no training was done, e.g. the
    replay buffer was not yet full).

    Returns a dict mapping each policy in "policies_to_train" (read from the
    first iteration's config) to a NumPy array with one entry per iteration.
    """
    n_iterations = len(raw_exp_data)
    policies = raw_exp_data[0]["config"]["policies_to_train"]

    counts = {policy: np.empty(n_iterations) for policy in policies}

    for idx in range(n_iterations):
        for policy in policies:
            try:
                learner_info = raw_exp_data[idx]["info"]["learner"][policy]
                value = learner_info["num_grad_updates_lifetime"]
            except KeyError:
                # No learner info for this policy in this iteration.
                value = 0
            counts[policy][idx] = value

    return counts


grad_updates = get_grad_updates(raw_exp_data)

In [None]:
def make_grad_updates_plot(grad_updates):
    """Draw one figure per policy with its cumulative gradient updates at every iteration.

    ``grad_updates`` maps a policy name to a sequence of values. Each figure
    is named ``grad_updates_<policy>``; a figure with that name left over from
    a previous run is closed before the new one is created.
    """
    for policy, series in grad_updates.items():
        name = f"grad_updates_{policy}"

        # Start from a clean figure when the cell is re-run.
        plt.close(fig=name)
        fig, ax = plt.subplots(num=name, layout="constrained")
        fig.canvas.header_visible = False

        ax.plot(series)
        ax.set(
            title=f"Cumulative gradient updates since beginning for {policy}",
            ylabel="Updates",
            xlabel="Iteration",
        )

        ax.grid(axis="both")
        # Keep the grid lines behind the plotted data.
        ax.set_axisbelow(True)


make_grad_updates_plot(grad_updates)

### Diff. Gradient Updates over Sample Policy

It is the difference between the number of gradient updates that have been performed and the number of environment timesteps collected for policy sampling.

WIP

In [None]:
def get_diff_grad_updates(raw_exp_data):
    """Collect the gradient-updates-vs-sampler-policy difference of every trained policy, one value per iteration.

    The value is the "diff_num_grad_updates_vs_sampler_policy" entry stored
    directly under the policy's learner info. If the lookup fails with
    ``KeyError`` the iteration is recorded as 0 (presumably no training was
    done, e.g. the replay buffer was not yet full).

    Returns a dict mapping each policy in "policies_to_train" (read from the
    first iteration's config) to a NumPy array with one entry per iteration.
    """
    n_iterations = len(raw_exp_data)
    policies = raw_exp_data[0]["config"]["policies_to_train"]

    differences = {policy: np.empty(n_iterations) for policy in policies}

    for idx in range(n_iterations):
        for policy in policies:
            try:
                learner_info = raw_exp_data[idx]["info"]["learner"][policy]
                value = learner_info["diff_num_grad_updates_vs_sampler_policy"]
            except KeyError:
                # No learner info for this policy in this iteration.
                value = 0
            differences[policy][idx] = value

    return differences


diff_grad = get_diff_grad_updates(raw_exp_data)

In [None]:
def make_diff_grad_updates_plot(diff_grad):
    """Draw one figure per policy with its updates-vs-samples difference at every iteration.

    ``diff_grad`` maps a policy name to a sequence of values. Each figure is
    named ``diff_grad_updates_<policy>``; a figure with that name left over
    from a previous run is closed before the new one is created.
    """
    for policy, series in diff_grad.items():
        name = f"diff_grad_updates_{policy}"

        # Start from a clean figure when the cell is re-run.
        plt.close(fig=name)
        fig, ax = plt.subplots(num=name, layout="constrained")
        fig.canvas.header_visible = False

        ax.plot(series)
        ax.set(title=f"Difference in Updates vs. Samples for {policy}", ylabel="Difference", xlabel="Iteration")

        ax.grid(axis="both")
        # Keep the grid lines behind the plotted data.
        ax.set_axisbelow(True)


make_diff_grad_updates_plot(diff_grad)