## Imports

In [1]:
import logging
import os
import random
import sys
from collections import deque
from operator import itemgetter

import gym_donkeycar
import gymnasium as gym
import imageio
import ipywidgets as widgets
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from IPython.display import display
from ipywidgets import HBox, VBox
from matplotlib import pyplot as plt
from PIL import Image
from ruamel.yaml import YAML
from tensorboard import notebook
from tensorboard.backend.event_processing.event_accumulator import \
    EventAccumulator
from torch import distributions as dist
from torch.distributions import Categorical, Normal
from torch.utils.tensorboard import SummaryWriter
from torchvision import transforms
from tqdm import tqdm

# suppress warnings
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
os.environ["IMAGEIO_IGNORE_WARNINGS"] = "True"

import gym.spaces as gym_spaces
import gymnasium as gym  # overwrite OpenAI gym
import stable_baselines3 as sb3
from gym_donkeycar.envs.donkey_env import DonkeyEnv
from gymnasium import spaces
from gymnasium.spaces import Box
from src.actor_critic import ContinuousActorCritic
from src.blocks import CategoricalStraightThrough, ConvBlock
from src.categorical_vae import CategoricalVAE
from src.imagination_env import ImaginationEnv
from src.mlp import MLP
from src.preprocessing import grayscale_transform as transform
from src.replay_buffer import ReplayBuffer
from src.rssm import RSSM
from src.utils import load_config, save_image_and_reconstruction, to_np, make_env
from stable_baselines3 import A2C, PPO, SAC
from stable_baselines3.common import env_checker
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.results_plotter import load_results, ts2xy
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv

torch.cuda.empty_cache()
%matplotlib inline

## Load the config

In [2]:
# Load the experiment configuration once; `config` remains available as a dict.
config = load_config()

# Expose every config entry as a notebook-level variable (e.g. `env_id`),
# because later cells refer to these names directly. `globals().update` states
# that intent explicitly instead of writing through `locals()`, and it does not
# leave a stray `key` loop variable behind in the namespace.
globals().update(config)

print(config)

{'device': device(type='cuda', index=0), 'A': 2, 'Z': 1024, 'debug': False, 'show_inline_tensorboard': False, 'log_dir': 'logs/', 'seed': 0, 'exe_path': '/home/till/Desktop/Thesis/donkeycar_sim/DonkeySimLinux/donkey_sim.x86_64', 'env_id': 'donkey-minimonaco-track-v0', 'port': 9091, 'frame_skip': 2, 'max_cte': 4.0, 'body_style': 'f1', 'body_rgb': [255, 255, 255], 'car_name': 'RL-Racer', 'font_size': 30, 'toy_env': False, 'vectorized': True, 'monitor': False, 'n_envs': 2, 'size': [64, 64], 'grayscale': False, 'start_episode': 1000, 'n_seed_episodes': 1000, 'n_training_episodes': 5000, 'max_episode_steps': 1000, 'max_imagination_episode_steps': 15, 'imagination_timesteps_per_model_update': 200, 'max_grad_norm': 1, 'rssm_lr': 0.0001, 'rssm_l2_regularization': 1e-06, 'batch_size': 1, 'H': 512, 'uniform_ratio': 0.01, 'buffer_size': 50000, 'activation': 'silu', 'num_categoricals': 32, 'num_classes': 32, 'channels': [64, 128, 256, 512, 256], 'kernel_size': 3, 'stride': 2, 'padding': 1, 'conv_b

## Create the environment

In [4]:
# Build the environment with the project helper (called without arguments) and
# keep it in `env`; the training cell below passes this object to the agent.
env = make_env()

Making a real sim env.
Making 2 vectorized envs.
starting DonkeyGym env
donkey subprocess started
Found path: /home/till/Desktop/Thesis/donkeycar_sim/DonkeySimLinux/donkey_sim.x86_64


INFO:gym_donkeycar.core.client:connecting to localhost:9091 


loading scene mini_monaco


INFO:gym_donkeycar.envs.donkey_sim:on need car config
INFO:gym_donkeycar.envs.donkey_sim:sending car config.
INFO:gym_donkeycar.envs.donkey_sim:sim started!


starting DonkeyGym env
donkey subprocess started
Found path: /home/till/Desktop/Thesis/donkeycar_sim/DonkeySimLinux/donkey_sim.x86_64


INFO:gym_donkeycar.core.client:connecting to localhost:9091 
INFO:gym_donkeycar.envs.donkey_sim:on need car config
INFO:gym_donkeycar.envs.donkey_sim:sending car config.
INFO:gym_donkeycar.envs.donkey_sim:sim started!


## Train an agent

In [None]:
# Toggle: set to False to skip the training run in this cell.
train_agent = True

if train_agent:
    # PPO with the "CnnPolicy" on the environment created above;
    # TensorBoard event files are written below baseline_logs/.
    agent = PPO(
        policy="CnnPolicy",
        env=env,
        tensorboard_log="baseline_logs/",
        verbose=1,
    )
    agent.learn(total_timesteps=500_000)

Using cuda device
Wrapping the env in a VecTransposeImage.
Logging to baseline_logs/PPO_8
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 43.5     |
|    ep_rew_mean     | 9.68     |
| time/              |          |
|    fps             | 24       |
|    iterations      | 1        |
|    time_elapsed    | 170      |
|    total_timesteps | 4096     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 46.5        |
|    ep_rew_mean          | 11.7        |
| time/                   |             |
|    fps                  | 24          |
|    iterations           | 2           |
|    time_elapsed         | 331         |
|    total_timesteps      | 8192        |
| train/                  |             |
|    approx_kl            | 0.014259405 |
|    clip_fraction        | 0.137       |
|    clip_range           | 0.2         |
|    entropy_loss       

INFO:gym_donkeycar.envs.donkey_sim:New lap time: 3.88 seconds


In [None]:
# Toggle: set to True to write the trained `agent` to disk.
save_agent = False

if save_agent:
    # The target name is a placeholder; rename it before saving a real run.
    agent.save(path="baseline_weights/AGENT_NAME_FILLER")

In [None]:
# Toggle: set to True to restore a previously trained A2C agent into `eval_agent`.
load_agent = False

if load_agent:
    # `load` is a classmethod that builds and RETURNS a new model from the
    # checkpoint. Calling it on an already constructed instance and ignoring the
    # return value leaves that instance with its freshly initialised weights, so
    # the restored model has to be bound to `eval_agent` here.
    eval_agent = A2C.load("baseline_weights/A2C_500k", env=env)

## Trained Policy Evaluation

In [None]:
# Toggle: set to False to skip the evaluation rollouts in this cell.
evaluate_trained_policy = True

# CURRENTLY SELECTING RANDOM ACTIONS TO TEST THE PLOTTING.
# Flip to False to drive the car with `eval_agent` instead; while it is True the
# policy network is not queried at all.
use_random_actions = True

if evaluate_trained_policy:
    n_trials = 30

    # Simulator settings are read from the loaded config instead of being
    # repeated here as machine-specific literals.
    sim_config = {
        "exe_path": config["exe_path"],
        "port": config["port"],
    }
    # NOTE(review): this rebinds `env` without closing an environment created
    # earlier in the notebook - confirm the simulator tolerates that on the same port.
    env = gym.make(
        "GymV21Environment-v0",
        env_id=env_id,
        max_episode_steps=1500,
        make_kwargs={"conf": sim_config},
    )

    # create the agent
    eval_agent = SAC(
        "CnnPolicy",
        env,
        buffer_size=0,
        verbose=1,
        # tensorboard_log="baseline_logs/"
    )

    # To evaluate trained weights, use the classmethod form, which returns the
    # restored model (calling `.load` on the instance discards the result):
    # eval_agent = SAC.load("baseline_weights/SAC_500k", env=env)

    episode_rewards = []
    episode_lengths = []

    for i in range(n_trials):
        print("Starting eval episode", i)
        episode_reward = 0.0
        episode_length = 0

        obs, info = env.reset()
        done = False
        while not done:
            if use_random_actions:
                # NOTE(review): np.random.rand() draws from [0, 1); if the first
                # action component is a steering value with a symmetric range,
                # this never steers in the negative direction - confirm.
                action = [np.random.rand(), np.random.rand()]
            else:
                # predict() returns (action, state); only the action is stepped.
                action, _ = eval_agent.predict(obs, deterministic=True)

            obs, reward, terminated, truncated, info = env.step(action)
            episode_reward += reward
            episode_length += 1
            done = terminated or truncated

        episode_rewards.append(episode_reward)
        episode_lengths.append(episode_length)

    # episode_rewards, episode_lengths = evaluate_policy(eval_agent, env, n_eval_episodes=30, return_episode_rewards=True, deterministic=True)
    print(f"mean_reward={np.mean(episode_rewards):.3f} +/- {np.std(episode_rewards):.3f}")
    print(f"mean_length={np.mean(episode_lengths):.3f} +/- {np.std(episode_lengths):.3f}")
    
    

In [None]:
# Close the environment currently bound to `env` once the rollouts are done.
env.close()

In [None]:
# Toggle: set to True to write the collected per-episode results to disk.
save_files = False

if save_files:
    # Target file -> values collected by the evaluation cell.
    outputs = {
        "baseline_eval/RANDOM_AGENT_episode_rewards.npy": episode_rewards,
        "baseline_eval/RANDOM_AGENT_episode_lengths.npy": episode_lengths,
    }
    for target_path, episode_values in outputs.items():
        with open(target_path, 'wb') as handle:
            np.save(handle, episode_values)

In [None]:
# Read each saved array once; all three entries below are built from them.
random_rewards = np.load("baseline_eval/RANDOM_AGENT_episode_rewards.npy")
random_lengths = np.load("baseline_eval/RANDOM_AGENT_episode_lengths.npy")

# "Random Agent 2" and "Random Agent 3" are rescaled copies of the random-agent
# arrays, not separate evaluation runs.
data = {
    "Random Agent": {
        "episode_rewards": random_rewards,
        "episode_lengths": random_lengths,
    },
    "Random Agent 2": {
        "episode_rewards": random_rewards * 2,
        "episode_lengths": random_lengths * 10 - 100,
    },
    "Random Agent 3": {
        "episode_rewards": random_rewards * 1.5,
        "episode_lengths": random_lengths * 1.5,
    },
}

In [None]:
num_bootstrap_samples = 2000
bootstrap_seed = 0  # fixed seed so the figure is identical on every re-run


def bootstrap_means(samples, n_resamples, rng):
    """Return bootstrap estimates of the mean of `samples`.

    Args:
        samples: array-like of observations; it is flattened to 1-D.
        n_resamples: number of bootstrap resamples to draw.
        rng: a `numpy.random.Generator` used for the resampling.

    Returns:
        1-D array of length `n_resamples`; each entry is the mean of one
        resample drawn with replacement and of the same size as `samples`.
    """
    samples = np.asarray(samples).ravel()
    # Draw all resamples in one call: one row per bootstrap resample.
    resampled = rng.choice(samples, size=(n_resamples, samples.size), replace=True)
    return resampled.mean(axis=1)


rng = np.random.default_rng(bootstrap_seed)
fig, axs = plt.subplots(1, 2, figsize=(12, 5))

# Local names are used for the per-agent arrays so that the `episode_rewards`
# and `episode_lengths` lists produced by the evaluation cell are not overwritten.
for key, results in data.items():
    reward_means = bootstrap_means(results["episode_rewards"], num_bootstrap_samples, rng)
    length_means = bootstrap_means(results["episode_lengths"], num_bootstrap_samples, rng)

    axs[0].hist(reward_means, bins="auto", alpha=0.7, density=True, label=key)
    axs[1].hist(length_means, bins="auto", alpha=0.7, density=True, label=key)

axs[0].set_xlabel("Bootstrapped Mean Episode Reward")
axs[1].set_xlabel("Bootstrapped Mean Episode Length")

# Formatting shared by both panels.
for ax in axs:
    ax.set_ylabel("Probability Density")
    ax.legend()
    ax.grid(True, alpha=0.2)

# savefig does not create missing directories.
os.makedirs("figs", exist_ok=True)

plt.tight_layout()
plt.savefig("figs/Bootstrapped Episode Reward and Length.pdf")
plt.show()


In [None]:
# Simulator settings are read from the loaded config instead of being repeated
# here as machine-specific literals.
sim_config = {
    "exe_path": config["exe_path"],
    "port": config["port"],
}
env = gym.make(
    "GymV21Environment-v0",
    env_id=env_id,
    max_episode_steps=1500,
    make_kwargs={"conf": sim_config},
)

# `load` is a classmethod that returns the restored model. Constructing a PPO
# instance and then calling `.load(...)` on it discards the restored model and
# keeps the untrained one, so the returned object is bound to `eval_agent` here.
# NOTE(review): the checkpoint is loaded against this `env`; if the training
# environment applied extra wrappers the spaces may not match - confirm.
eval_agent = PPO.load("baseline_weights/PPO_500k", env=env)

In [None]:
# (Re)bind `eval_agent` to the trained PPO checkpoint. Constructing a fresh
# `PPO("CnnPolicy", env, ...)` here would replace the agent with randomly
# initialised weights right before the rollout cell below. `load` is a
# classmethod, so its return value is the restored model.
eval_agent = PPO.load("baseline_weights/PPO_500k", env=env)

In [None]:
# Accumulators for this single rollout; initialised here so the cell does not
# depend on values left over from an earlier cell.
episode_reward = 0.0
episode_length = 0

obs, info = env.reset()

done = False
while not done:
    # predict() returns (action, state); only the action may be passed to
    # env.step. deterministic=True matches the other evaluation code in this
    # notebook.
    action, _ = eval_agent.predict(obs, deterministic=True)
    obs, reward, terminated, truncated, info = env.step(action)
    episode_reward += reward
    episode_length += 1
    done = terminated or truncated

In [None]:
# Close the environment currently bound to `env` after the rollout.
env.close()

In [None]:
# eval_rewards, eval_lengths = evaluate_policy(eval_agent, env, n_eval_episodes=100, return_episode_rewards=True, deterministic=True)

In [None]:
# np.save("eval_rewards", np.array(eval_rewards))
# np.save("eval_lengths", np.array(eval_lengths))

In [None]:
# agent = SAC(
#     "CnnPolicy", 
#     env,
#     buffer_size=20_000,
#     verbose=1, 
#     tensorboard_log="logs/")
# 
# eval_callback = EvalCallback(
#     env, 
#     best_model_save_path='weights/', 
#     log_path='logs/', 
#     eval_freq=500,
#     n_eval_episodes=1)
#
# agent.learn(total_timesteps=30_000, callback=eval_callback)

## Plot the training results

In [None]:
# Canonical metric names: used as row labels of every frame and by the plotting cell.
metrics = [
    "rollout/ep_rew_mean",     # SAC: ✅, PPO: ✅, A2C: ✅
    "rollout/ep_len_mean",     # SAC: ✅, PPO: ✅, A2C: ✅
    "train/critic_loss",       # SAC: critic_loss, PPO: value_loss, A2C: value_loss
    "train/actor_loss",        # SAC: actor_loss, PPO: policy_gradient_loss, A2C: policy_loss
    "train/entropy_loss",     # SAC: ent_coef_loss, PPO: entropy_loss, A2C: entropy_loss
    "time/fps" # SAC: ✅, PPO: ✅, A2C: ✅
]

baselines = {
    "A2C_500k": "baseline_logs/A2C_500k/events.out.tfevents.1685255857.z.54420.0",
    "SAC_500k": "baseline_logs/SAC_500k/events.out.tfevents.1685211019.z.35749.0",
    "PPO_500k": "baseline_logs/PPO_500k/events.out.tfevents.1685229886.z.44451.0",
}

# TensorBoard tags that differ from the canonical metric name, per algorithm.
# An algorithm applies to a run when its name occurs in the run key.
tag_overrides = {
    "SAC": {
        "train/entropy_loss": "train/ent_coef_loss",
    },
    "PPO": {
        "train/critic_loss": "train/value_loss",
        "train/actor_loss": "train/policy_gradient_loss",
    },
    "A2C": {
        "train/critic_loss": "train/value_loss",
        "train/actor_loss": "train/policy_loss",
    },
}


def load_scalar_frame(log_file, metric_names, overrides):
    """Read scalar series from one TensorBoard event file into a DataFrame.

    Args:
        log_file: path of the event file to read.
        metric_names: canonical metric names; they become the row labels.
        overrides: mapping canonical name -> tag actually stored in the file;
            names without an entry are looked up unchanged.

    Returns:
        DataFrame with one row per metric and one column per logged step.
        Each value sits under the step it was logged at; a metric that has no
        entry for a step holds NaN there.
    """
    event_acc = EventAccumulator(log_file)
    event_acc.Reload()

    per_metric = {}
    for metric in metric_names:
        events = event_acc.Scalars(overrides.get(metric, metric))
        series = pd.Series(
            [event.value for event in events],
            index=[event.step for event in events],
            dtype=float,
        )
        # Alignment needs unique step labels; keep the last value per step.
        per_metric[metric] = series[~series.index.duplicated(keep="last")]

    # Building the frame from per-metric Series aligns on the step index, so
    # metrics with a different number of entries are not shifted onto the
    # steps of another metric.
    return pd.DataFrame(per_metric).sort_index().T


dataframes = {}

for key, log_file in baselines.items():
    overrides = next(
        (mapping for algo, mapping in tag_overrides.items() if algo in key),
        {},
    )
    dataframes[key] = load_scalar_frame(log_file, metrics, overrides)

print("Created dataframes:", list(dataframes.keys()))

# Transposed views (rows = steps, columns = metrics) for the summary cells.
sac_500k_df = dataframes["SAC_500k"].T
ppo_500k_df = dataframes["PPO_500k"].T
a2c_500k_df = dataframes["A2C_500k"].T

In [None]:
# Summary statistics for the SAC run; in this transposed frame each column is one metric.
sac_500k_df.describe()

In [None]:
# Summary statistics for the PPO run; in this transposed frame each column is one metric.
ppo_500k_df.describe()

In [None]:
# Summary statistics for the A2C run; in this transposed frame each column is one metric.
a2c_500k_df.describe()

In [None]:
# Print the name of every loaded run (the keys of `dataframes`).
for key, df in dataframes.items():
    print(key)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_filter1d

# Human-readable panel titles for the canonical metric names.
title_dict = {
    "rollout/ep_rew_mean": "Mean Episode Reward",
    "rollout/ep_len_mean": "Mean Episode Length",
    "train/critic_loss": "Critic Loss",
    "train/actor_loss": "Actor Loss",
    "train/entropy_loss": "Entropy Loss",
    "time/fps": "FPS",
}

fig, axs = plt.subplots(2, 3, figsize=(12, 8))
# fig.suptitle("Training Metrics with a CNN Policy", fontsize=14)

colors = ["tab:blue", "tab:orange", "tab:green"]


def format_sci_tick(x, pos):
    """Tick label in scientific notation; the zero tick gets an empty label."""
    return "{:.0e}".format(x) if x != 0 else ""


for i, metric in enumerate(metrics):
    row, col = divmod(i, 3)
    ax = axs[row, col]

    ax.set_title(title_dict[metric])
    ax.set_ylabel(title_dict[metric])
    ax.set_xlabel('Environment Step')

    for j, (key, df) in enumerate(dataframes.items()):
        # Drop missing entries first: a NaN inside the series would be smeared
        # over its neighbours by the Gaussian filter.
        values = df.loc[metric].dropna()
        if values.empty:
            continue
        steps = values.index

        label = key.split("_")[0]  # cut off everything after the first underscore

        # The A2C actor loss gets a wider smoothing kernel than the other curves.
        sigma = 4 if ("actor_loss" in metric and "A2C" in key) else 2
        smoothed_values = gaussian_filter1d(values.to_numpy(dtype=float), sigma=sigma)

        color = colors[j % len(colors)]
        ax.plot(steps, smoothed_values, label=label, alpha=1.0, color=color)
        ax.plot(steps, values, alpha=0.25, color=color)  # raw curve, faint

    # Build the legend once per panel, after all runs have been drawn.
    ax.legend(loc="upper right")

    ax.xaxis.set_major_locator(plt.MaxNLocator(4))
    ax.yaxis.set_major_locator(plt.MaxNLocator(8))

    if "critic_loss" in metric or "actor_loss" in metric:
        ax.set_yscale("symlog")
        ax.get_yaxis().set_major_formatter(plt.FuncFormatter(format_sci_tick))

    ax.grid(True, linestyle='--', linewidth=0.5, color='lightgray')
    ax.set_facecolor('white')

# savefig does not create missing directories.
os.makedirs("figs", exist_ok=True)

plt.tight_layout()
plt.savefig("figs/Training Metrics with a CNN Policy.pdf")
plt.show()

In [None]:
# TODO:
# add a plot for the explained variance for A2C

## Plot the Explained Variance for A2C

In [None]:
# NOTE: this cell rebinds `metrics`, `baselines` and `dataframes`, replacing the
# values created for the training-metrics figure.
metrics = [
    "train/explained_variance", # A2C
]

baselines = {
    "A2C_500k": "baseline_logs/A2C_500k/events.out.tfevents.1685255857.z.54420.0",
}

dataframes = {}

for run_name, event_file in baselines.items():
    accumulator = EventAccumulator(event_file)
    accumulator.Reload()

    # Fetch the scalar events of every requested tag once.
    scalar_events = {tag: accumulator.Scalars(tag) for tag in metrics}

    # One row of values per metric; the column labels are the steps of the
    # last metric in the list.
    value_rows = [[event.value for event in scalar_events[tag]] for tag in metrics]
    step_columns = [event.step for event in scalar_events[metrics[-1]]]

    dataframes[run_name] = pd.DataFrame(value_rows, index=metrics, columns=step_columns)

print("Created dataframes:", list(dataframes.keys()))

# Transposed view: rows = steps, single column = explained variance.
explvar_a2c_500k_df = dataframes["A2C_500k"].T

In [None]:
# Display the transposed explained-variance frame (rows = logged steps).
explvar_a2c_500k_df

In [None]:
# Quick unsmoothed look at the series; the formatted figure is built in the next cell.
plt.plot(explvar_a2c_500k_df)

In [None]:
title_dict = {
    "train/explained_variance": "Explained Variance"
}

fig, ax = plt.subplots(1, 1, figsize=(8, 6))

colors = ["tab:blue", "tab:orange", "tab:green"]

metric = "train/explained_variance"


def format_sci_tick(x, pos):
    """Tick label in scientific notation; the zero tick gets an empty label."""
    return '{:.0e}'.format(x) if x != 0 else ""


# ax.set_title(title_dict[metric])
ax.set_ylabel(title_dict[metric])
ax.set_xlabel('Environment Step')

for j, (key, df) in enumerate(dataframes.items()):
    # Drop missing entries so the Gaussian filter does not spread NaN values.
    values = df.loc[metric].dropna()
    if values.empty:
        continue
    steps = values.index

    label = key.split("_")[0]  # Cut off everything after the first underscore

    # Smooth every run. Previously the smoothed curve was only computed when
    # the key contained "A2C", so any other run would have plotted a stale or
    # undefined `smoothed_values`.
    smoothed_values = gaussian_filter1d(values.to_numpy(dtype=float), sigma=2)

    color = colors[j % len(colors)]
    ax.plot(steps, smoothed_values, label=label, alpha=1.0, color=color)
    ax.plot(steps, values, alpha=0.25, color=color)  # raw curve, faint

# Build the legend once, after all runs have been drawn.
ax.legend(loc="upper right")

ax.xaxis.set_major_locator(plt.MaxNLocator(4))
ax.yaxis.set_major_locator(plt.MaxNLocator(8))

ax.set_yscale("symlog")
ax.get_yaxis().set_major_formatter(plt.FuncFormatter(format_sci_tick))

ax.grid(True, linestyle='--', linewidth=0.5, color='lightgray')
ax.set_facecolor('white')

# savefig does not create missing directories.
os.makedirs("figs", exist_ok=True)

plt.tight_layout()
plt.savefig("figs/Explained Variance of A2C.pdf")
plt.show()
