In [1]:
import sys

import gym
import numpy as np
import scipy.integrate as sc_integrate

from stable_baselines3.common.env_checker import check_env
from stable_baselines3 import A2C, DQN

# using wredsen's symbtools fork (https://github.com/wredsen/symbtools @ DAE_statefeedback), assuming repos SA-Wrede and symbtools share the same parent directory
sys.path.append('../../symbtools/')
import symbtools as st
import sympy as sp
import pickle

## Mathematical system description with SymPy / symbtools

In [2]:
# Symbols for the two inputs; they are combined with mod.g when states_dot is assembled below.
F1, F2 = sp.symbols('F1 F2')

# Symbolic model parameters.
params = sp.symbols('m1, m2, l1, g')
# make_global presumably injects the symbols as global names: m1, m2, l1 and g are used
# on the next line without being assigned in this notebook -- confirm against symbtools.
st.make_global(params)
# (symbol, value) pairs, passed to .subs() further down to insert the numeric values.
params_values = [(m1, 1.0), (m2, 0.1), (l1, 0.5), (g, 9.81)]

In [3]:
# Load the model etc. from the pickle written by the flatness analysis notebook.
# NOTE(review): pickle.load can execute arbitrary code -- only open files from a trusted source.
with open("../flatness_notebooks/single_crane_model.pcl", "rb") as pfile:
    data = pickle.load(pfile)
    # Copy the pickled entries into the current namespace; `mod`, used by the following
    # cells, presumably comes from here -- TODO confirm the contents of the pickle.
    locals().update(data)

In [4]:
# Show the state vector of the loaded model (see the cell output).
mod.xx ##:

Matrix([
[   p1],
[   p2],
[   q1],
[pdot1],
[pdot2],
[qdot1]])

In [5]:
# Presumably rebuilds the state equations from scratch (force_recalculation=True) -- confirm in symbtools.
mod.calc_state_eq(force_recalculation=True)
# Display the model equations (see the cell output).
mod.eqns

Matrix([
[       m2*pddot1 - tau2*(p1 - q1)/sqrt(p2**2 + (p1 - q1)**2)],
[       g*m2 + m2*pddot2 - p2*tau2/sqrt(p2**2 + (p1 - q1)**2)],
[m1*qddot1 - tau1 + tau2*(p1 - q1)/sqrt(p2**2 + (p1 - q1)**2)]])

In [6]:
# State derivative assembled as mod.f + mod.g * [F1, F2]^T, i.e. affine in the inputs F1 and F2.
states_dot = mod.f + mod.g * sp.Matrix([F1, F2]) ##:

In [7]:
# Insert the numeric parameter values listed in params_values.
states_dot_wo_params = states_dot.subs(params_values)

In [8]:
# Turn the expression into a callable whose arguments are the states followed by F1 and F2
# (presumably a numeric function -- see symbtools' expr_to_func).
states_dot_func = st.expr_to_func([*mod.xx, F1, F2], states_dot_wo_params)

## pygent utility functions

In [2]:
def observation(x, xIsAngle):
    """ Builds the observation vector that belongs to a state.

    Every entry of `x` flagged in `xIsAngle` is replaced by the pair
    (cos, sin) of that entry; all other entries are copied unchanged.

    Args:
        x (iterable): state
        xIsAngle (sequence): truthy at index i if x[i] is an angle

    Returns:
        ndarray: observation, one element longer than `x` per angle entry

    """
    features = []
    for idx, value in enumerate(x):
        if not xIsAngle[idx]:
            features.append(value)
            continue
        # an angle contributes two entries: cosine first, then sine
        features.extend((np.cos(value), np.sin(value)))
    return np.array(features)

def mapAngles(xIsAngle, x, mod=np):
    """ Wraps the angle entries of a state into the interval [-pi, pi).

    Args:
        xIsAngle (sequence): truthy at index i if x[i] is an angle
        x (iterable): state
        mod (module): provides `pi` (numpy by default)

    Returns:
        list: copy of `x` in which every angle entry is wrapped, all other
        entries are passed through unchanged

    """
    return [
        # shift by pi, wrap to one full turn, shift back
        (value + mod.pi) % (2*mod.pi) - mod.pi if xIsAngle[idx] else value
        for idx, value in enumerate(x)
    ]

## Gym environment

In [23]:
from abc import abstractmethod
import gym
from gym import logger, spaces
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import animation
import matplotlib.patches as patches
from matplotlib.ticker import FuncFormatter, MultipleLocator
from scipy.integrate import solve_ivp
import inspect
import pickle
import copy

class Environment(gym.Env):
    """ Environment base class.

    Args:
        x0 (array, list, callable): initial state, or a callable that returns an
            initial state (it is called once here and again on every reset)
        uDim (int): number of inputs
        dt (int, float): stored as `self.dt`

    Attributes:

        x (array): current state x[k] (size = n)
        x_ (array): previous state x[k-1](size = n)
        history (array): previous states (x[0],x[1],...,x[k-1])
        tt (list): time vector (corresponding to history)
        terminated (bool): True, if environment is in a terminal state

    """

    def __init__(self, x0, uDim, dt):
        if callable(x0):
            self.x0 = x0  # keep the callable, reset() draws a fresh initial state from it
            x0 = x0()
        else:
            x0 = list(x0)
            self.x0 = x0
        self.x = x0  # current state
        self.x_ = x0 # previous state x[k-1]
        self.xDim = len(x0) # state dimension
        self.oDim = self.xDim # observation dimension
        self.uDim = uDim # inputs
        self.xIsAngle = np.zeros([self.xDim], dtype=bool)
        self.history = np.array([x0])
        self.tt = [0]
        self.terminated = False
        self.uMax = np.ones(uDim)
        self.dt = dt

    def get_state(self):
        """ Returns the current state x[k]. """
        return self.x

    def reset(self):
        """ Resets the environment to its initial state and clears the history.

        If `x0` was given as a callable, a new initial state is drawn from it;
        otherwise the fixed initial state is restored.

        Returns:
            ndarray: initial state as float32

        """
        if callable(self.x0):
            x0 = self.x0()
        else:
            # fixed initial state: previously `x0` was unbound on this path;
            # use a copy so that the stored initial state is never aliased
            x0 = list(self.x0)
        self.history = np.array([x0])
        self.x_ = x0
        self.x = x0
        self.tt = [0]
        self.terminated = False
        return np.array(self.x, dtype=np.float32)

    @abstractmethod
    def step(self, *args):
        """ Advances the environment by one step; implemented by subclasses. """
        return

    def plot(self):
        """ Plots the environments history

        Returns:
            fig (matplotlib.pyplot.figure)
            ax (matploltib.pyplot.axes)

        """

        fig, ax = plt.subplots(len(self.x), 1, dpi=300, sharex=True)
        # Plot state trajectories
        if len(self.x) > 1:
            for i in range(len(self.x)):
                ax[i].step(self.tt, self.history[:, i], 'b',  lw=1)
                ax[i].set_ylabel(r'$x_'+str(i+1)+'$')
                ax[i].grid(True)
                if self.xIsAngle[i]:
                    # label angle axes in multiples of pi (raw string: '\p' is not a valid escape)
                    ax[i].yaxis.set_major_formatter(FuncFormatter(
                        lambda val, pos: r'{:.0g}$\pi$'.format(val / np.pi) if val != 0 else '0'))
                    ax[i].yaxis.set_major_locator(MultipleLocator(base=np.pi))
        else:
            # plt.subplots returns a single Axes (not an array) for one subplot
            ax.step(self.tt, self.history[:, 0], 'b',  lw=1)
            ax.grid(True)
            plt.ylabel(r'$x_1$')
        fig.align_ylabels(ax)
        plt.xlabel(r't[s]')
        plt.tight_layout()
        # Todo: save data in numpy arrays
        return fig, ax

    def save_history(self, filename, path):
        """ Pickles the time vector and the state history to `path + filename + '.p'`.

        Args:
            filename (str): file name without extension
            path (str): prefix that is put in front of `filename` unchanged

        """
        history_dict = {'tt': self.tt, 'xx': self.history}
        # context manager so that the file is flushed and closed deterministically
        with open(path + filename + '.p', 'wb') as history_file:
            pickle.dump(history_dict, history_file)

    def animation(self):
        """ Placeholder, does nothing in the base class. """
        pass

    def observe(self, x):
        """ Returns the observation for state `x` (here: the state itself). """
        return x

class StateSpaceModel(Environment):
    """ Environment subclass that uses a state space model of the form dx/dt = f(x, u)
    to represent the environments dynamics.

    Args:
        ode (callable): right-hand side, called as ode(t, x, u)
        cost (callable): cost function taking one to five of the arguments
            (x_, u_, x, t, mod); parameters named 't' and 'mod' are detected by name
        x0 (array, list, callable): initial state or callable returning one
        uDim (int): number of inputs
        dt (int, float): default duration of one step
        terminal_cost (float): added to the cost while the state is terminal

    Attributes:
        ode (function): ODE for simulation
        cost (function): cost function (returns scalar), normalized to the
            signature cost(x_, u_, x, t, mod)
        xIsAngle (ndarray): 'True' if state is an angle, 'False' otherwise
        o: current observation
        o_: previous observation
        oDim: length of the observation

    Raises:
        ValueError: if `cost` takes no parameter or more than five parameters

    """

    def __init__(self, ode, cost, x0, uDim, dt,
                 terminal_cost=0.):
        super(StateSpaceModel, self).__init__(x0, uDim, dt)
        self.ode = ode
        # Wrap the user's cost function so that it can always be called as
        # cost(x_, u_, x, t, mod), whatever subset of arguments it declares.
        params = inspect.signature(cost).parameters
        cost_args = len(params)
        if cost_args == 1:
            self.cost = lambda x_, u_, x, t, mod: cost(x_)
        elif cost_args == 2:
            if 'mod' in params:
                self.cost = lambda x_, u_, x, t, mod: cost(x_, mod)
            elif 't' in params:
                self.cost = lambda x_, u_, x, t, mod: cost(x_, t)
            else:
                self.cost = lambda x_, u_, x, t, mod: cost(x_, u_)
        elif cost_args == 3:
            if 'mod' in params:
                self.cost = lambda x_, u_, x, t, mod: cost(x_, u_, mod)
            elif 't' in params:
                self.cost = lambda x_, u_, x, t, mod: cost(x_, u_, t)
            else:
                self.cost = lambda x_, u_, x, t, mod: cost(x_, u_, x)
        elif cost_args == 4:
            if 'mod' in params and 't' in params:
                self.cost = lambda x_, u_, x, t, mod: cost(x_, u_, t, mod)
            elif 'mod' in params and 't' not in params:
                self.cost = lambda x_, u_, x, t, mod: cost(x_, u_, x, mod)
            else:
                self.cost = lambda x_, u_, x, t, mod: cost(x_, u_, x, t)
        elif cost_args == 5:
            self.cost = cost
        else:
            # Fail at construction time: the former `assert(True)` never fired, which
            # left `self.cost` unset and deferred the failure to the first step.
            raise ValueError('Cost function must to be of the form c(x_, u_, x, t, mod), where mod is numpy/sympy.')
        self.xIsAngle = np.zeros([len(self.x_)], dtype=bool)
        self.o = self.x
        self.o_ = self.x_
        self.oDim = len(self.o)  # observation dimensions
        self.terminal_cost = terminal_cost

    def _split_step_args(self, args):
        """ Returns (u, dt) from the positional arguments (u,) or (u, dt) of a step call. """
        if len(args) == 2:
            return args[0], args[1]
        if len(args) == 1:
            return args[0], self.dt
        raise TypeError('expected (u) or (u, dt), got {} positional arguments'.format(len(args)))

    def _record_transition(self, u, dt):
        """ Bookkeeping after `self.x` has been updated; returns the cost of the transition. """
        self.o = self.observe(self.x)
        self.history = np.concatenate((self.history, np.array([self.x])))  # save current state
        self.tt.append(self.tt[-1] + dt)  # increment simulation time
        self.terminated = self.terminate(self.x)
        t = self.tt[-1]
        # cost rate (plus terminal cost while terminated), scaled by the step duration
        return (self.cost(self.x_, u, self.x, t, np) + self.terminal_cost * self.terminated) * dt

    def step(self, *args):
        """ Simulates the environment for one step of duration dt (RK45 integration).

        Args:
            u (array): control/action
            dt (int, float, optional): duration of step (not solver step size),
                defaults to `self.dt`

        Returns:
            tuple: (state as float32 ndarray, reward = negative cost of the
            transition, terminated flag, info dict)

        Raises:
            TypeError: if not called with (u) or (u, dt)

        """
        u, dt = self._split_step_args(args)
        self.x_ = self.x  # shift state (x[k-1] = x[k])
        self.o_ = self.o

        # system simulation
        sol = solve_ivp(lambda t, x: self.ode(t, x, u), (0, dt), self.x_, 'RK45')
        # todo: only output value of the last timestep
        self.x = list(sol.y[:, -1])  # extract simulation result
        c = self._record_transition(u, dt)
        return np.array(self.x, dtype=np.float32), -c, self.terminated, {"info": False}

    def terminate(self, x):
        """ Check if a 'terminal' state is reached.

            Args:
                x (ndarray, list): state

            Returns:
                terminated (bool): 'True' if 'x' is a terminal state
                (always 'False' here; subclasses override this). """

        return False


    def fast_step(self, *args):
        """ Simulates the environment for one step of duration dt, using Euler forward integration.

        Args:
            u (array): control/action
            dt (int, float, optional): duration of step, defaults to `self.dt`

        Returns:
            c (float): cost of state transition

        Raises:
            TypeError: if not called with (u) or (u, dt)

        """
        u, dt = self._split_step_args(args)

        self.x_ = self.x  # shift state (x[k-1] := x[k])
        self.o_ = self.o

        # Euler forward step; asarray so that list-valued states and ODE results add element-wise
        self.x = np.asarray(self.x_) + dt*np.asarray(self.ode(None, self.x_, u))
        return self._record_transition(u, dt)

    def observe(self, x):
        """ Returns the observation of state `x`, with angles expanded by `observation`. """
        return observation(x, self.xIsAngle)

class Pendulum(StateSpaceModel):
    """ Pendulum with one input and the state x = (x1, x2), where x1 is flagged as an angle
    and x2 is its time derivative.

    Args:
        cost (callable): cost function, see StateSpaceModel
        x0 (array, list, callable): initial state or callable returning one
        dt (int, float): default duration of one step

    """
    metadata = {}

    def __init__(self, cost, x0, dt):
        super(Pendulum, self).__init__(self.ode, cost, x0, 1, dt)
        self.xIsAngle = [True, False]
        self.o = self.observe(self.x)
        self.o_ = self.o
        self.oDim = len(self.o)  # observation dimensions
        self.uMax = 3.5*np.ones(1)

        # box bounds of the two state entries that step() and reset() return
        high_obs = np.array(
            [
                100.0,
                100.0
            ],
            dtype=np.float32,
        )
        self.observation_space = spaces.Box(-high_obs, high_obs, dtype=np.float32)

        # NOTE(review): the action bound is 100 although uMax is 3.5 -- confirm which limit is intended
        high_act = np.array(
            [
                100.0
            ],
            dtype=np.float32,
        )
        self.action_space = spaces.Box(-high_act, high_act, dtype=np.float32)

    @staticmethod
    def ode(t, x, u):
        """ Right-hand side dx/dt = f(x, u) of the pendulum; `t` is not used.

        Args:
            t: time (ignored)
            x (iterable): state (x1, x2)
            u (iterable): single-element input

        Returns:
            ndarray: (dx1/dt, dx2/dt)

        """

        g = 9.81  # gravity
        b = 0.02  # dissipation
        u1, = u  # torque
        x1, x2 = x

        dx1dt = x2
        dx2dt = u1 + g*np.sin(x1) - b*x2

        return np.array([dx1dt, dx2dt])

    def terminate(self, x):
        """ A state is terminal if |x2| > 10 or |x1| > 8*pi. """
        x1, x2 = x
        if abs(x2) > 10 or abs(x1)>8*np.pi:
            return True
        else:
            return False


    def animation(self):
        """ Animates the recorded history as a rotating pole.

        Returns:
            matplotlib.animation.FuncAnimation

        """
        # mapping from theta and s to the x,y-plane (definition of the line points, that represent the pole)
        def pendulum_plot(l, xt):
            x_pole_end = -l * np.sin(xt[:, 0])
            y_pole_end = l * np.cos(xt[:, 0])

            return x_pole_end, y_pole_end

        # line and text
        def animate(t):
            thisx = [0, x_pole_end[t]]
            thisy = [0, y_pole_end[t]]

            pole.set_data(thisx, thisy)
            time_text.set_text(time_template % self.tt[t])
            return pole, time_text,

        x_pole_end, y_pole_end  = pendulum_plot(0.5, self.history)
        fig, ax = plt.subplots()
        ax.set_aspect('equal')
        plt.ylim((-.6, .6))
        plt.xlim((-.6, .6))
        plt.title('Pendulum')
        plt.xticks([], [])
        plt.yticks([], [])
        time_template = 'time = %.1fs'
        time_text = ax.text(0.05, 1.05, '', transform=ax.transAxes)
        pole, = ax.plot([], [], 'b-', zorder=1, lw=3)
        circ = patches.Circle((0, 0), 0.03, fc='b', zorder=1)
        ax.add_artist(circ)
        # Frame interval in ms: duration of the first recorded step. Before any step has been
        # taken self.tt holds a single entry, so fall back to the default step size instead
        # of raising an IndexError.
        frame_dt = self.tt[1] if len(self.tt) > 1 else self.dt
        # animation using matplotlibs animation library
        ani = animation.FuncAnimation(fig, animate, np.arange(len(self.tt)), interval=frame_dt * 1000,
                                      blit=True)
        return ani

## Cost function, initial values

In [24]:
# define the incremental cost
def c_k(x, u):
    """ Incremental cost of a state/input pair.

    The first state entry is wrapped with mapAngles before it is penalized,
    the second entry is passed through unchanged.

    Args:
        x (iterable): state with two entries
        u (iterable): single-element input

    Returns:
        weighted sum of squares x1**2 + 0.1*x2**2 + 0.05*u1**2

    """
    angle, rate = mapAngles([1,0], x)
    (torque,) = u
    return angle**2 + 0.1*rate**2 + 0.05*torque**2

# define the function, that represents the initial value distribution p(x_0)
def p_x0():
    """ Draws an initial state from the initial value distribution p(x_0).

    Returns:
        list: [x1, x2] with x1 uniform in [0.999*pi, 1.001*pi) and
        x2 uniform in [-0.001, 0.001)

    """
    # draw order matters for reproducibility with a seeded generator: x1 first, then x2
    first = np.random.uniform(0.999*np.pi, 1.001*np.pi)
    second = np.random.uniform(-0.001,0.001)
    return [first, second]

def ode(t, x, u):
    """ Right-hand side of the pendulum dynamics (same equations as Pendulum.ode).

    Args:
        t: time (not used)
        x (iterable): state (x1, x2)
        u (iterable): single-element input

    Returns:
        ndarray: (dx1/dt, dx2/dt)

    """
    gravity = 9.81
    dissipation = 0.02
    (torque,) = u
    angle, rate = x

    # dx1/dt is the second state; dx2/dt collects input, gravity and damping terms
    accel = torque + gravity*np.sin(angle) - dissipation*rate
    return np.array([rate, accel])


t = 10  # time of an episode (NOTE(review): not referenced by any cell visible in this notebook)
dt = 0.05  # time step-size
learning_steps = 100_000  # training duration in time steps; an int, since it is passed as `total_timesteps`

## Creating the Gym environment

In [25]:
# environment without renderings for training
# Training environment (no rendering): incremental cost c_k, initial states drawn from p_x0, step size dt.
env = Pendulum(c_k, p_x0, dt)

In [26]:
# Smoke test: reset the environment and apply one randomly sampled action.
env.reset()
env.step(env.action_space.sample())
# Bare expression that is not the last statement of the cell, so its value is not displayed.
env.observation_space
# Run the stable-baselines3 environment checker on the custom environment.
check_env(env)



## Learning the model

In [27]:
%%time
# Train an A2C agent with the 'MlpPolicy' policy on the pendulum environment.
# NOTE(review): the tensorboard directory is called "a2c_cartpole_tensorboard" although env is a Pendulum.
model = A2C('MlpPolicy', env, verbose=1, tensorboard_log="./a2c_cartpole_tensorboard/")
model.learn(total_timesteps=learning_steps)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to ./a2c_cartpole_tensorboard/A2C_20
------------------------------------
| time/                 |          |
|    fps                | 1164     |
|    iterations         | 100      |
|    time_elapsed       | 0        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.44    |
|    explained_variance | -0.0417  |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -1.08    |
|    std                | 1.02     |
|    value_loss         | 0.835    |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 1204     |
|    iterations         | 200      |
|    time_elapsed       | 0        |
|    total_timesteps    | 1000     |
| train/                |          |
|    entropy_loss       | -1.44    |
|    expla

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.62e+03 |
|    ep_rew_mean        | -3.1e+03 |
| time/                 |          |
|    fps                | 1240     |
|    iterations         | 1600     |
|    time_elapsed       | 6        |
|    total_timesteps    | 8000     |
| train/                |          |
|    entropy_loss       | -1.4     |
|    explained_variance | -64.9    |
|    learning_rate      | 0.0007   |
|    n_updates          | 1599     |
|    policy_loss        | 0.147    |
|    std                | 0.982    |
|    value_loss         | 0.0309   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.21e+03 |
|    ep_rew_mean        | -1.7e+03 |
| time/                 |          |
|    fps                | 1241     |
|    iterations         | 1700     |
|    time_elapsed       | 6        |
|    total_timesteps    | 8500     |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 1.01e+03 |
|    ep_rew_mean        | -387     |
| time/                 |          |
|    fps                | 1250     |
|    iterations         | 2900     |
|    time_elapsed       | 11       |
|    total_timesteps    | 14500    |
| train/                |          |
|    entropy_loss       | -1.39    |
|    explained_variance | 0.141    |
|    learning_rate      | 0.0007   |
|    n_updates          | 2899     |
|    policy_loss        | -0.851   |
|    std                | 0.972    |
|    value_loss         | 0.419    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 935      |
|    ep_rew_mean        | -358     |
| time/                 |          |
|    fps                | 1251     |
|    iterations         | 3000     |
|    time_elapsed       | 11       |
|    total_timesteps    | 15000    |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 511      |
|    ep_rew_mean        | -193     |
| time/                 |          |
|    fps                | 1256     |
|    iterations         | 4200     |
|    time_elapsed       | 16       |
|    total_timesteps    | 21000    |
| train/                |          |
|    entropy_loss       | -1.37    |
|    explained_variance | -32.7    |
|    learning_rate      | 0.0007   |
|    n_updates          | 4199     |
|    policy_loss        | 0.671    |
|    std                | 0.952    |
|    value_loss         | 0.297    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 476      |
|    ep_rew_mean        | -180     |
| time/                 |          |
|    fps                | 1256     |
|    iterations         | 4300     |
|    time_elapsed       | 17       |
|    total_timesteps    | 21500    |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 77       |
|    ep_rew_mean        | -29.7    |
| time/                 |          |
|    fps                | 1257     |
|    iterations         | 5500     |
|    time_elapsed       | 21       |
|    total_timesteps    | 27500    |
| train/                |          |
|    entropy_loss       | -1.36    |
|    explained_variance | 0.944    |
|    learning_rate      | 0.0007   |
|    n_updates          | 5499     |
|    policy_loss        | -0.127   |
|    std                | 0.944    |
|    value_loss         | 0.0108   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 69       |
|    ep_rew_mean        | -27.1    |
| time/                 |          |
|    fps                | 1257     |
|    iterations         | 5600     |
|    time_elapsed       | 22       |
|    total_timesteps    | 28000    |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 62       |
|    ep_rew_mean        | -25.4    |
| time/                 |          |
|    fps                | 1258     |
|    iterations         | 6800     |
|    time_elapsed       | 27       |
|    total_timesteps    | 34000    |
| train/                |          |
|    entropy_loss       | -1.32    |
|    explained_variance | -289     |
|    learning_rate      | 0.0007   |
|    n_updates          | 6799     |
|    policy_loss        | 0.382    |
|    std                | 0.902    |
|    value_loss         | 0.161    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 63.2     |
|    ep_rew_mean        | -25.8    |
| time/                 |          |
|    fps                | 1258     |
|    iterations         | 6900     |
|    time_elapsed       | 27       |
|    total_timesteps    | 34500    |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 79.7     |
|    ep_rew_mean        | -30.1    |
| time/                 |          |
|    fps                | 1259     |
|    iterations         | 8100     |
|    time_elapsed       | 32       |
|    total_timesteps    | 40500    |
| train/                |          |
|    entropy_loss       | -1.3     |
|    explained_variance | 0.947    |
|    learning_rate      | 0.0007   |
|    n_updates          | 8099     |
|    policy_loss        | -2.37    |
|    std                | 0.891    |
|    value_loss         | 15.2     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 81.2     |
|    ep_rew_mean        | -30.6    |
| time/                 |          |
|    fps                | 1259     |
|    iterations         | 8200     |
|    time_elapsed       | 32       |
|    total_timesteps    | 41000    |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 96       |
|    ep_rew_mean        | -33.5    |
| time/                 |          |
|    fps                | 1259     |
|    iterations         | 9400     |
|    time_elapsed       | 37       |
|    total_timesteps    | 47000    |
| train/                |          |
|    entropy_loss       | -1.27    |
|    explained_variance | -1.55    |
|    learning_rate      | 0.0007   |
|    n_updates          | 9399     |
|    policy_loss        | -0.429   |
|    std                | 0.866    |
|    value_loss         | 0.31     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 98       |
|    ep_rew_mean        | -34      |
| time/                 |          |
|    fps                | 1259     |
|    iterations         | 9500     |
|    time_elapsed       | 37       |
|    total_timesteps    | 47500    |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 134      |
|    ep_rew_mean        | -44.1    |
| time/                 |          |
|    fps                | 1259     |
|    iterations         | 10700    |
|    time_elapsed       | 42       |
|    total_timesteps    | 53500    |
| train/                |          |
|    entropy_loss       | -1.24    |
|    explained_variance | 0.989    |
|    learning_rate      | 0.0007   |
|    n_updates          | 10699    |
|    policy_loss        | 0.0623   |
|    std                | 0.838    |
|    value_loss         | 0.0038   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 136      |
|    ep_rew_mean        | -44.6    |
| time/                 |          |
|    fps                | 1259     |
|    iterations         | 10800    |
|    time_elapsed       | 42       |
|    total_timesteps    | 54000    |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 146      |
|    ep_rew_mean        | -46.7    |
| time/                 |          |
|    fps                | 1257     |
|    iterations         | 12000    |
|    time_elapsed       | 47       |
|    total_timesteps    | 60000    |
| train/                |          |
|    entropy_loss       | -1.03    |
|    explained_variance | -6.21    |
|    learning_rate      | 0.0007   |
|    n_updates          | 11999    |
|    policy_loss        | -0.0125  |
|    std                | 0.678    |
|    value_loss         | 0.000261 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 146      |
|    ep_rew_mean        | -46.7    |
| time/                 |          |
|    fps                | 1257     |
|    iterations         | 12100    |
|    time_elapsed       | 48       |
|    total_timesteps    | 60500    |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 146      |
|    ep_rew_mean        | -46.7    |
| time/                 |          |
|    fps                | 1250     |
|    iterations         | 13300    |
|    time_elapsed       | 53       |
|    total_timesteps    | 66500    |
| train/                |          |
|    entropy_loss       | -0.683   |
|    explained_variance | 0.485    |
|    learning_rate      | 0.0007   |
|    n_updates          | 13299    |
|    policy_loss        | 0.00252  |
|    std                | 0.479    |
|    value_loss         | 2.29e-05 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 146      |
|    ep_rew_mean        | -46.7    |
| time/                 |          |
|    fps                | 1249     |
|    iterations         | 13400    |
|    time_elapsed       | 53       |
|    total_timesteps    | 67000    |
|

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 146       |
|    ep_rew_mean        | -46.7     |
| time/                 |           |
|    fps                | 1244      |
|    iterations         | 14600     |
|    time_elapsed       | 58        |
|    total_timesteps    | 73000     |
| train/                |           |
|    entropy_loss       | -0.367    |
|    explained_variance | 0.862     |
|    learning_rate      | 0.0007    |
|    n_updates          | 14599     |
|    policy_loss        | -0.000109 |
|    std                | 0.349     |
|    value_loss         | 2.12e-07  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 146      |
|    ep_rew_mean        | -46.7    |
| time/                 |          |
|    fps                | 1243     |
|    iterations         | 14700    |
|    time_elapsed       | 59       |
|    total_timesteps

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 146      |
|    ep_rew_mean        | -46.7    |
| time/                 |          |
|    fps                | 1236     |
|    iterations         | 15900    |
|    time_elapsed       | 64       |
|    total_timesteps    | 79500    |
| train/                |          |
|    entropy_loss       | -0.086   |
|    explained_variance | 0.274    |
|    learning_rate      | 0.0007   |
|    n_updates          | 15899    |
|    policy_loss        | -8.3e-05 |
|    std                | 0.264    |
|    value_loss         | 1.92e-06 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 146       |
|    ep_rew_mean        | -46.7     |
| time/                 |           |
|    fps                | 1236      |
|    iterations         | 16000     |
|    time_elapsed       | 64        |
|    total_timesteps    | 8000

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 146       |
|    ep_rew_mean        | -46.7     |
| time/                 |           |
|    fps                | 1230      |
|    iterations         | 17200     |
|    time_elapsed       | 69        |
|    total_timesteps    | 86000     |
| train/                |           |
|    entropy_loss       | 0.133     |
|    explained_variance | 0.124     |
|    learning_rate      | 0.0007    |
|    n_updates          | 17199     |
|    policy_loss        | -0.000177 |
|    std                | 0.212     |
|    value_loss         | 1.89e-07  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 146      |
|    ep_rew_mean        | -46.7    |
| time/                 |          |
|    fps                | 1230     |
|    iterations         | 17300    |
|    time_elapsed       | 70       |
|    total_timesteps

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 146      |
|    ep_rew_mean        | -46.7    |
| time/                 |          |
|    fps                | 1227     |
|    iterations         | 18500    |
|    time_elapsed       | 75       |
|    total_timesteps    | 92500    |
| train/                |          |
|    entropy_loss       | 0.301    |
|    explained_variance | 0.39     |
|    learning_rate      | 0.0007   |
|    n_updates          | 18499    |
|    policy_loss        | 7.33e-05 |
|    std                | 0.179    |
|    value_loss         | 2.43e-07 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 146       |
|    ep_rew_mean        | -46.7     |
| time/                 |           |
|    fps                | 1226      |
|    iterations         | 18600     |
|    time_elapsed       | 75        |
|    total_timesteps    | 9300

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 146       |
|    ep_rew_mean        | -46.7     |
| time/                 |           |
|    fps                | 1219      |
|    iterations         | 19800     |
|    time_elapsed       | 81        |
|    total_timesteps    | 99000     |
| train/                |           |
|    entropy_loss       | 0.465     |
|    explained_variance | -2.13     |
|    learning_rate      | 0.0007    |
|    n_updates          | 19799     |
|    policy_loss        | -0.000467 |
|    std                | 0.152     |
|    value_loss         | 4.56e-07  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 146      |
|    ep_rew_mean        | -46.7    |
| time/                 |          |
|    fps                | 1218     |
|    iterations         | 19900    |
|    time_elapsed       | 81       |
|    total_timesteps

<stable_baselines3.a2c.a2c.A2C at 0x7f66e409b7c0>

## Testing the model with rendering

In [119]:
# Roll out the trained policy for 1000 steps with rendering, restarting whenever an episode ends.
# NOTE(review): `env_rendering` is not defined in any cell visible in this notebook -- presumably
# left over from an earlier kernel session; it has to be created before this cell for Run All to work.
obs = env_rendering.reset()
for i in range(1000):
    # deterministic=False presumably samples the action from the policy -- confirm in the SB3 docs
    action, _state = model.predict(obs, deterministic=False)
    obs, reward, done, info = env_rendering.step(action)
    env_rendering.render()
    if done:
      obs = env_rendering.reset()

ALSA lib confmisc.c:767:(parse_card) cannot find card '0'
ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_card_driver returned error: No such file or directory
ALSA lib confmisc.c:392:(snd_func_concat) error evaluating strings
ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_concat returned error: No such file or directory
ALSA lib confmisc.c:1246:(snd_func_refer) error evaluating name
ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_refer returned error: No such file or directory
ALSA lib conf.c:5220:(snd_config_expand) Evaluate error: No such file or directory
ALSA lib pcm.c:2642:(snd_pcm_open_noupdate) Unknown PCM default


In [120]:
# Release both environments.
# NOTE(review): `env_rendering` is not defined in any cell visible in this notebook -- confirm where it is created.
env.close()
env_rendering.close()