In [1]:
from __future__ import annotations
from typing import Dict, List, Union

import logging
import os
import random
import sys
from collections import deque
from operator import itemgetter

import gym_donkeycar
import gymnasium as gym
import imageio
import ipywidgets as widgets
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from IPython.display import display
from ipywidgets import HBox, VBox
from matplotlib import pyplot as plt
from PIL import Image
from ruamel.yaml import YAML
from scipy.ndimage import gaussian_filter1d
from scipy.stats import norm
from tensorboard import notebook
from tensorboard.backend.event_processing.event_accumulator import \
    EventAccumulator
from torch import distributions as dist
from torch.distributions import Categorical, Normal
from torch.utils.tensorboard import SummaryWriter
from torchvision import transforms
from tqdm import tqdm

import gymnasium as gym

# Quiet noisy third-party output before the heavier libraries below are imported.
import warnings

warnings.filterwarnings(
    "ignore",
    category=UserWarning,
    module="gymnasium.spaces.box",  # alternative (broader) filter: module="gymnasium"
)

# Environment flags; judging by their names they control TensorFlow C++ log
# verbosity and imageio warnings -- presumably read when those libraries load.
os.environ.update({
    'TF_CPP_MIN_LOG_LEVEL': '2',
    "IMAGEIO_IGNORE_WARNINGS": "True",
})

import stable_baselines3 as sb3
from gym_donkeycar.envs.donkey_env import DonkeyEnv
from gymnasium import spaces
from gymnasium.spaces import Box
from stable_baselines3 import A2C, PPO, SAC
from stable_baselines3.common import env_checker
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.results_plotter import load_results, ts2xy
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv

import src
from src.actor_critic_discrete import DiscreteActorCritic
from src.actor_critic_dreamer import ActorCriticDreamer
from src.actor_critic import ContinuousActorCritic
from src.blocks import CategoricalStraightThrough, ConvBlock
from src.categorical_vae import CategoricalVAE
from src.imagination_env import make_imagination_env
from src.mlp import MLP
from src.preprocessing import transform
from src.replay_buffer import ReplayBuffer
from src.rssm import RSSM
from src.utils import (load_config, make_env, save_image_and_reconstruction,
                       to_np, symlog, symexp, twohot_encode, ExponentialMovingAvg,
                       ActionExponentialMovingAvg, MetricsTracker)
from src.vae import VAE

torch.cuda.empty_cache()
%matplotlib inline
%load_ext autoreload
%autoreload 2

# Load the config and expose every entry (e.g. `device`, `env_id`) as a
# notebook-level variable. `globals()` is used explicitly: the dictionary
# returned by `locals()` is documented as not to be modified, and only happens
# to alias the module namespace at the top level of a cell.
config = load_config()
globals().update(config)

In [2]:
# (Re)load the config and expose every entry as a notebook-level variable.
# `globals()` is used explicitly: the dictionary returned by `locals()` is
# documented as not to be modified and only aliases the module namespace at
# the top level of a cell.
config = load_config()
globals().update(config)

print(config)

{'device': device(type='cuda', index=0), 'A': 3, 'Z': 1024, 'debug': False, 'log_dir': 'logs/', 'log_interval': 20, 'seed': 0, 'exe_path': '/home/till/Desktop/Thesis/donkeycar_sim/DonkeySimLinux/donkey_sim.x86_64', 'env_id': 'donkey-minimonaco-track-v0', 'port': 9091, 'max_cte': 4.0, 'frame_skip': 2, 'steer_limit': 1.0, 'throttle_min': -0.2, 'throttle_max': 0.2, 'body_style': 'f1', 'body_rgb': [255, 255, 255], 'car_name': 'RL-Racer', 'font_size': 30, 'sb3_monitor': False, 'toy_env': True, 'vectorized': True, 'n_envs': 1, 'size': [64, 64], 'grayscale': False, 'start_phase': 1000, 'n_seed_phases': 1000, 'n_model_updates': 7812, 'n_steps_per_model_update': 64, 'agent_update_phases_per_model_update': 5, 'max_imagination_episode_steps': 16, 'imagination_steps_per_agent_update': 128, 'max_grad_norm': 100, 'rssm_lr': 0.0001, 'rssm_l2_regularization': 1e-06, 'batch_size': 1, 'H': 512, 'uniform_ratio': 0.01, 'buffer_size': 50000, 'activation': 'silu', 'num_categoricals': 32, 'num_classes': 32, 

## Init the environment

## Playground  \~( ˘▾˘~)

## AC training loop for Pendulum

In [None]:
# Rollout settings: number of parallel envs, number of parameter updates and
# number of environment steps collected before each update.
n_envs = 1
n_updates = 30000
n_steps_per_update = 128

# Constructor arguments of the continuous actor-critic agent.
agent_kwargs = dict(
    n_features=3,
    n_actions=1,
    n_envs=n_envs,
    gamma=0.999,
    lam=0.95,
    entropy_coeff=0.01,
    critic_lr=5e-4,  # it's very sensitive to higher learning rates (gets nans)
    actor_lr=1e-4,
    action_clip=2,
)
agent = ContinuousActorCritic(**agent_kwargs).to(device)

In [None]:
# Create the vectorized environment and wrap it so that the returns of finished
# episodes are collected in `envs_wrapper.return_queue` (used for logging below).
envs = gym.vector.make("Pendulum-v1", num_envs=n_envs, max_episode_steps=200)
envs_wrapper = gym.wrappers.RecordEpisodeStatistics(envs, deque_size=n_envs*n_updates)

# Logging
log_dir = "logs/"
writer = SummaryWriter(log_dir)
notebook.start(f"--logdir={log_dir}")

# Per-update histories (one entry per sample phase), kept for offline plotting.
critic_losses = []
actor_losses = []
entropies = []

# Reset exactly once: afterwards we don't have to reset the envs, they just
# continue playing until the episode is over and then reset automatically.
obs, info = envs_wrapper.reset(seed=0)
obs = torch.Tensor(obs)

for sample_phase in tqdm(range(n_updates)):

    # Rollout buffers of shape (n_steps_per_update, n_envs) for this sample phase.
    ep_value_preds = torch.zeros(n_steps_per_update, n_envs, device=device)
    ep_rewards = torch.zeros(n_steps_per_update, n_envs, device=device)
    ep_log_probs = torch.zeros(n_steps_per_update, n_envs, device=device)
    ep_entropies = torch.zeros(n_steps_per_update, n_envs, device=device)
    ep_masks = torch.zeros(n_steps_per_update, n_envs, device=device)

    for step in range(n_steps_per_update):

        # get action and value
        action, log_prob, actor_entropy = agent.get_action(obs)
        value_pred = agent.critic(obs)

        # env step
        obs, reward, terminated, truncated, infos = envs_wrapper.step(to_np(action))
        obs = torch.Tensor(obs)

        ep_value_preds[step] = value_pred.squeeze()
        ep_rewards[step] = torch.tensor(reward, device=device)
        ep_log_probs[step] = log_prob.squeeze()
        ep_entropies[step] = actor_entropy.squeeze()

        # Mask for the return calculation later: 1 while the episode is ongoing,
        # 0 on the step where it ended (terminated OR truncated).
        # NOTE(review): an earlier comment said truncation should not zero the
        # mask; the code has always used `dones`, which is kept here -- confirm.
        dones = np.logical_or(terminated, truncated)
        # `torch.as_tensor` instead of the legacy `torch.Tensor(..., device=...)`
        # constructor, which raises for non-CPU devices.
        ep_masks[step] = torch.as_tensor(np.logical_not(dones), dtype=torch.float32, device=device)

    # calculate the losses for actor and critic
    last_value_pred = agent.critic(obs)
    critic_loss, actor_loss = agent.get_loss(ep_rewards, ep_log_probs, ep_value_preds, last_value_pred,
                                             ep_entropies, ep_masks)

    # update the actor and critic networks
    agent.update_parameters(critic_loss, actor_loss)

    # log the losses and entropy (the episode return only once an episode has finished)
    if envs_wrapper.return_queue:
        writer.add_scalar("episode_return", envs_wrapper.return_queue[-1], global_step=sample_phase)
    mean_entropy = to_np(ep_entropies.mean())
    writer.add_scalar("actor_loss", to_np(actor_loss), global_step=sample_phase)
    writer.add_scalar("critic_loss", to_np(critic_loss), global_step=sample_phase)
    writer.add_scalar("actor_entropy", mean_entropy, global_step=sample_phase)
    critic_losses.append(to_np(critic_loss))
    actor_losses.append(to_np(actor_loss))
    entropies.append(mean_entropy)

# Make sure all pending scalars are written to the event file.
writer.flush()

In [None]:
""" Plot the results """

# Smoothing window: 1/20 of the number of recorded episodes, but at least 1.
rolling_length = max(1, int(len(envs_wrapper.return_queue) / 20))
box_kernel = np.ones(rolling_length)

fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(12, 5))
fig.suptitle(f"Training plots for {agent.__class__.__name__} with n_envs={n_envs}, n_steps_per_update={n_steps_per_update}")

# Moving averages ("valid" convolution with a box kernel) of every tracked series.
episode_returns_moving_average = np.convolve(np.array(envs_wrapper.return_queue).flatten(), box_kernel, mode="valid") / rolling_length
entropy_moving_average = np.convolve(np.array(entropies), box_kernel, mode="valid") / rolling_length
critic_losses_moving_average = np.convolve(np.array(critic_losses).flatten(), box_kernel, mode="valid") / rolling_length
actor_losses_moving_average = np.convolve(np.array(actor_losses).flatten(), box_kernel, mode="valid") / rolling_length

# The episode-return curve gets an explicit x-axis scaled by the number of envs;
# the other curves are plotted against their index.
episode_axis = np.arange(len(episode_returns_moving_average)) / n_envs

# (axes, title, x-label, positional arguments for `plot`)
panels = [
    (axs[0, 0], "Episode Returns", "Number of episodes", (episode_axis, episode_returns_moving_average)),
    (axs[1, 0], "Entropy", "Number of updates", (entropy_moving_average,)),
    (axs[0, 1], "Critic Loss", "Number of updates", (critic_losses_moving_average,)),
    (axs[1, 1], "Actor Loss", "Number of updates", (actor_losses_moving_average,)),
]
for panel_ax, panel_title, panel_xlabel, plot_args in panels:
    panel_ax.set_title(panel_title)
    panel_ax.plot(*plot_args)
    panel_ax.set_xlabel(panel_xlabel)

plt.tight_layout()
plt.show()

In [None]:
def _rolling_mean_std(values, window):
    """Return the rolling mean and rolling standard deviation of a 1-D series.

    Only full windows are used, so each result has ``len(values) - window + 1``
    entries. ``window`` is clamped to ``[1, len(values)]``; an empty input
    yields two empty arrays.
    """
    values = np.asarray(values, dtype=float).reshape(-1)
    if values.size == 0:
        return values.copy(), values.copy()
    window = max(1, min(int(window), values.size))
    windows = np.lib.stride_tricks.sliding_window_view(values, window)
    return windows.mean(axis=1), windows.std(axis=1)


def _plot_mean_with_band(ax, values, window, title, xlabel, x_scale=1, ylim_percentiles=None):
    """Plot the rolling mean of ``values`` with a +/- one rolling-std band on ``ax``.

    ``x_scale`` divides the x-axis index. ``ylim_percentiles`` is an optional
    ``(low, high)`` percentile pair of the raw values used to clip the y-axis.
    """
    values = np.asarray(values, dtype=float).reshape(-1)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    if values.size == 0:
        return  # nothing recorded yet; leave the panel empty instead of raising

    mean, std = _rolling_mean_std(values, window)
    x = np.arange(len(mean)) / x_scale
    ax.plot(x, mean)
    ax.fill_between(x, mean - std, mean + std, alpha=0.3)

    if ylim_percentiles is not None:
        low, high = np.percentile(values, ylim_percentiles)
        if low < high:  # a degenerate range would collapse the axis
            ax.set_ylim(low, high)


# Smoothing window: 1/50 of the number of recorded episodes, but at least 1.
rolling_length = max(1, int(len(envs_wrapper.return_queue) / 50))

fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(12, 7))
fig.suptitle(f"Training plots for {agent.__class__.__name__} with n_envs={n_envs}, n_steps_per_update={n_steps_per_update}")

# The helpers work on copies, so `critic_losses` / `actor_losses` stay the plain
# lists created by the training cell and this cell can be re-run safely.
_plot_mean_with_band(axs[0, 0], envs_wrapper.return_queue, rolling_length,
                     "Episode Returns", "Number of episodes",
                     x_scale=n_envs, ylim_percentiles=(2, 99))
_plot_mean_with_band(axs[1, 0], entropies, rolling_length,
                     "Entropy", "Number of updates")
_plot_mean_with_band(axs[0, 1], critic_losses, rolling_length,
                     "Critic Loss", "Number of updates",
                     ylim_percentiles=(1, 98.5))
_plot_mean_with_band(axs[1, 1], actor_losses, rolling_length,
                     "Actor Loss", "Number of updates",
                     ylim_percentiles=(2, 98.5))

plt.tight_layout()
plt.show()

## Showcase

In [None]:
# Render the trained agent for `n_steps_per_update` steps in a human-visible window.
envs = gym.vector.make("Pendulum-v1", num_envs=1, max_episode_steps=200, render_mode="human")

try:
    obs, info = envs.reset(seed=0)
    obs = torch.Tensor(obs)

    # Pure inference: no gradients are needed while showcasing.
    with torch.no_grad():
        for step in range(n_steps_per_update):

            # get action (log-probability and entropy are not needed here)
            action, _, _ = agent.get_action(obs)

            # env step
            obs, rewards, terminated, truncated, infos = envs.step(to_np(action))
            obs = torch.Tensor(obs)
finally:
    # Always release the render window, even if the loop above raises.
    envs.close()

## Stable Baselines 3

In [None]:
# Train a Stable-Baselines3 SAC baseline on a single seeded Pendulum env.
vec_env = make_vec_env("Pendulum-v1", n_envs=1, seed=0)

agent = SAC(
    policy="MlpPolicy",
    env=vec_env,
    train_freq=1,
    gradient_steps=-1,
    verbose=1,
    tensorboard_log="logs/",
)
agent.learn(total_timesteps=500)

In [None]:
# Evaluate the agent on its own training env for 3 episodes and display the
# resulting (mean, std) of the episode reward.
eval_stats = evaluate_policy(agent, agent.get_env(), n_eval_episodes=3)
mean_reward, std_reward = eval_stats
mean_reward, std_reward

In [None]:
# showcase: run the trained SB3 agent deterministically in a rendered env
showcase_env = gym.make("Pendulum-v1", render_mode="human")

n_steps = 100
try:
    obs, info = showcase_env.reset()
    for step in range(n_steps):
        action, _ = agent.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, info = showcase_env.step(action)
        showcase_env.render()
        # The episode is over on termination OR truncation; start a new one so
        # that we never step an environment that has already finished.
        if terminated or truncated:
            obs, info = showcase_env.reset()
finally:
    # Always release the render window, even if the loop above raises.
    showcase_env.close()

In [None]:
# Tensorboard Logging
# log_dir = "logs/"
# notebook.start(f"--logdir={log_dir}")

In [None]:
# env = DonkeyEnv(env_id)

In [3]:
# Create the environment.
# The simulator location and port come from the loaded config (exposed as
# notebook-level variables, like `env_id` below) instead of being hardcoded
# here, so the notebook is not tied to one machine.
# NOTE(review): other simulator settings present in the config (e.g. `max_cte`,
# `frame_skip`, throttle limits) are not forwarded, so the simulator defaults
# apply -- confirm this is intended.
sim_config = {
    "exe_path": exe_path,
    "port": port,
}
env = gym.make(
    "GymV21Environment-v0",
    env_id=env_id,
    max_episode_steps=1000,
    make_kwargs={
        "conf": sim_config
    })

starting DonkeyGym env
Setting default: start_delay 5.0
Setting default: max_cte 8.0
Setting default: frame_skip 1
Setting default: cam_resolution (120, 160, 3)
Setting default: log_level 20
Setting default: host localhost
Setting default: steer_limit 1.0
Setting default: throttle_min 0.0
Setting default: throttle_max 1.0
donkey subprocess started
Found path: /home/till/Desktop/Thesis/donkeycar_sim/DonkeySimLinux/donkey_sim.x86_64


INFO:gym_donkeycar.core.client:connecting to localhost:9091 
  logger.warn(


loading scene mini_monaco


INFO:gym_donkeycar.envs.donkey_sim:on need car config
INFO:gym_donkeycar.envs.donkey_sim:sending car config.
INFO:gym_donkeycar.envs.donkey_sim:sim started!


In [4]:
# SAC with a CNN policy on the environment created above; metrics go to TensorBoard.
sac_kwargs = dict(
    buffer_size=20_000,
    verbose=1,
    tensorboard_log="logs/",
)
agent = SAC("CnnPolicy", env, **sac_kwargs)

# Optional periodic evaluation (currently disabled):
# eval_callback = EvalCallback(
#     env,
#     best_model_save_path='weights/',
#     log_path='logs/',
#     eval_freq=500,
#     n_eval_episodes=1)

agent.learn(total_timesteps=30_000)  # callback=eval_callback

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
Logging to logs/SAC_27
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 36.2     |
|    ep_rew_mean     | 26.7     |
| time/              |          |
|    episodes        | 4        |
|    fps             | 9        |
|    time_elapsed    | 15       |
|    total_timesteps | 145      |
| train/             |          |
|    actor_loss      | -3.43    |
|    critic_loss     | 0.296    |
|    ent_coef        | 0.987    |
|    ent_coef_loss   | -0.0433  |
|    learning_rate   | 0.0003   |
|    n_updates       | 44       |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 55.8     |
|    ep_rew_mean     | 57.8     |
| time/              |          |
|    episodes        | 8        |
|    fps             | 12       |
|    time_elapsed    | 35

KeyboardInterrupt: 

In [None]:
# Read the monitor results stored under logs/ and convert them to
# (timesteps, rewards) arrays for plotting.
results = load_results('logs/')
x, y = ts2xy(results, 'timesteps')

# Draw the training curve using the explicit figure/axes interface.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_xlabel('Timesteps')
ax.set_ylabel('Rewards')
ax.set_title('Training Curve')
plt.show()

In [None]:
# envs = gym.vector.make("Pendulum-v1", num_envs=1, max_episode_steps=200, render_mode="human")
# 
# obs, info = envs.reset(seed=0)
# obs = torch.Tensor(obs)
# 
# for step in range(n_steps_per_update):
#     
#     # get action and value
#     action, state_value, action_log_probs, actor_entropy = agent.select_action(obs)
#     continuous_action = []
#     for idx in actions:
#         continuous_action.append([idx_to_action[idx.item()]])
#     #print(continuous_action)
#     # env step
#     obs, rewards, terminated, truncated, infos = envs.step(continuous_action)
#     obs = torch.Tensor(obs)
# 
# envs.close()

In [None]:
from skopt.space import Real, Integer
from skopt.utils import use_named_args
from skopt import gp_minimize
from skopt.plots import plot_convergence, plot_objective

In [None]:
# Search space for the hyperparameter optimization. The order of the dimensions
# is the order in which the optimizer's result vector is unpacked later on:
# gamma, lam, ent_coef, actor_lr, critic_lr.
space = [
    # uniform priors on [0, 1]
    Real(0, 1, name="gamma"),
    Real(0, 1, name="lam"),
    # log-uniform priors spanning several orders of magnitude
    Real(10**-10, 10**4, prior="log-uniform", name="ent_coef"),
    Real(10**-5, 10**0, prior="log-uniform", name="actor_lr"),
    Real(10**-5, 10**0, prior="log-uniform", name="critic_lr"),
]

In [None]:
# # environment hyperparams
# n_envs = 1
# n_updates = 1000
# n_steps_per_update = 128
# 
# # agent hyperparams
# gamma = 0.999
# lam = 0.95  # hyperparameter for GAE
# ent_coef = 0.01  # coefficient for the entropy bonus (to encourage exploration)
# actor_lr = 0.001
# critic_lr = 0.005
# 
# agent = A2C(3, 9, device, critic_lr, actor_lr, n_envs)

In [None]:
@use_named_args(space)
def objective(**params):
    """Objective of the hyperparameter search (the optimizer MINIMIZES it).

    Prints the sampled hyperparameters, builds an agent from them, runs a
    training of 1000 updates and returns the negated sum of the returned
    ``return_queue`` -- so a larger total return gives a smaller objective.

    Args:
        **params: one value per dimension of ``space``, keyed by dimension
            name (``gamma``, ``lam``, ``ent_coef``, ``actor_lr``, ``critic_lr``).

    Returns:
        The negated sum of the entries of ``return_queue``.
    """
    print(params)

    # environment hyperparams (kept short: this runs once per optimizer call)
    n_envs, n_updates, n_steps_per_update = 1, 1000, 128

    # agent hyperparams chosen by the optimizer
    hyperparam_names = ("gamma", "lam", "ent_coef", "actor_lr", "critic_lr")
    gamma, lam, ent_coef, actor_lr, critic_lr = (params[name] for name in hyperparam_names)

    # NOTE(review): this call signature and the `train` helper below do not match
    # anything visible in this notebook (`A2C` is imported from stable_baselines3,
    # and `train` is not handed the agent) -- presumably both relied on earlier,
    # since-removed definitions; confirm before running.
    agent = A2C(3, 9, device, critic_lr, actor_lr, n_envs, gamma, lam, ent_coef)

    # the collected returns are what we want to maximize ...
    return_queue, critic_losses, actor_losses, entropies = train(
        n_envs, n_updates, n_steps_per_update)

    # ... so their negated sum is what gets minimized
    return -sum(return_queue)

In [None]:
from skopt import gp_minimize
# Run the search: 20 evaluations of `objective` over `space`, seeded for
# reproducibility.
search_kwargs = dict(n_calls=20, random_state=0)
gp_result = gp_minimize(objective, space, **search_kwargs)

# `fun` holds the lowest objective value found.
print("Best score:", gp_result.fun)

In [None]:
# Convergence plot (skopt.plots) for the finished hyperparameter search stored in `gp_result`.
plot_convergence(gp_result)

In [None]:
# Objective plot (skopt.plots) for the finished hyperparameter search stored in `gp_result`.
plot_objective(gp_result)

In [None]:
""" train an agent with the best params """

print("best parameters:", gp_result.x)

# Set a longer training time. These are defined BEFORE the agent is built so
# that the cell does not depend on an `n_envs` left over from an earlier cell.
n_envs = 1
n_updates = 3000
n_steps_per_update = 128

# `gp_result.x` lists the best values in the order of the `space` dimensions.
gamma, lam, ent_coef, actor_lr, critic_lr = gp_result.x

# NOTE(review): this `A2C` signature and the `train` helper are not defined
# anywhere visible in this notebook (`A2C` is imported from stable_baselines3)
# -- presumably both relied on earlier, since-removed definitions; confirm.
agent = A2C(3, 9, device, critic_lr, actor_lr, n_envs, gamma, lam, ent_coef)

return_queue, critic_losses, actor_losses, entropies = train(n_envs, n_updates, n_steps_per_update)

In [None]:
# Render the discrete-action agent for `n_steps_per_update` steps.
envs = gym.vector.make("Pendulum-v1", num_envs=1, max_episode_steps=200, render_mode="human")

try:
    obs, info = envs.reset(seed=0)
    obs = torch.Tensor(obs)

    for step in range(n_steps_per_update):

        # get action and value; the result is bound to `actions` so that the
        # conversion below iterates a defined name (it was bound to `action`
        # before, which made the loop raise a NameError).
        actions, state_value, action_log_probs, actor_entropy = agent.select_action(obs)

        # Map every discrete action index to its continuous value, one
        # single-element action per env.
        # NOTE(review): `idx_to_action` is not defined anywhere visible in this
        # notebook -- presumably a lookup from action index to torque; confirm.
        continuous_action = [[idx_to_action[idx.item()]] for idx in actions]

        # env step
        obs, rewards, terminated, truncated, infos = envs.step(continuous_action)
        obs = torch.Tensor(obs)
finally:
    # Always release the render window, even if the loop above raises.
    envs.close()