In [1]:
import sys

import gym
import numpy as np
import scipy.integrate as sc_integrate

from stable_baselines3.common.env_checker import check_env
from stable_baselines3 import A2C, DQN

# using wredsen's symbtools fork (https://github.com/wredsen/symbtools @ DAE_statefeedback), assuming repos SA-Wrede and symbtools share the same parent directory
sys.path.append('../../symbtools/')
import symbtools as st
import sympy as sp
import pickle

## Mathematical system description with SymPy / symbtools

In [37]:
F1, F2 = sp.symbols('F1 F2')

params = sp.symbols('m1, m2, l1, g')
st.make_global(params)
params_values = [(m1, 1.0), (m2, 0.1), (l1, 0.5), (g, 9.81)]

In [38]:
# load model etc. from pickle of flatness analysis notebook
with open("../flatness_notebooks/single_crane_model.pcl", "rb") as pfile:
    data = pickle.load(pfile)
    locals().update(data)

In [39]:
mod.xx ##:

Matrix([
[   p1],
[   p2],
[   q1],
[pdot1],
[pdot2],
[qdot1]])

In [40]:
mod.calc_state_eq(force_recalculation=True)
mod.eqns

Matrix([
[       m2*pddot1 - tau2*(p1 - q1)/sqrt(p2**2 + (p1 - q1)**2)],
[       g*m2 + m2*pddot2 - p2*tau2/sqrt(p2**2 + (p1 - q1)**2)],
[m1*qddot1 - tau1 + tau2*(p1 - q1)/sqrt(p2**2 + (p1 - q1)**2)]])

In [41]:
states_dot = mod.f + mod.g * sp.Matrix([F1, F2]) ##:

In [42]:
states_dot_wo_params = states_dot.subs(params_values)

In [43]:
states_dot_func = st.expr_to_func([*mod.xx, F1, F2], states_dot_wo_params)

## Utility functions for Gym environment

In [44]:
"""
Utility functions used for classic control environments.
"""

from typing import Optional, SupportsFloat, Tuple


def verify_number_and_cast(x: SupportsFloat) -> float:
    """Verify parameter is a single number and cast to a float."""
    try:
        x = float(x)
    except (ValueError, TypeError):
        raise ValueError(f"An option ({x}) could not be converted to a float.")
    return x


def maybe_parse_reset_bounds(
    options: Optional[dict], default_low: float, default_high: float
) -> Tuple[float, float]:
    """
    This function can be called during a reset() to customize the sampling
    ranges for setting the initial state distributions.

    Args:
      options: Options passed in to reset().
      default_low: Default lower limit to use, if none specified in options.
      default_high: Default upper limit to use, if none specified in options.

    Returns:
      Tuple of the lower and upper limits.
    """
    if options is None:
        return default_low, default_high

    low = options.get("low") if "low" in options else default_low
    high = options.get("high") if "high" in options else default_high

    # We expect only numerical inputs.
    low = verify_number_and_cast(low)
    high = verify_number_and_cast(high)
    if low > high:
        raise ValueError(
            f"Lower bound ({low}) must be lower than higher bound ({high})."
        )

    return low, high

In [45]:
"""A utility class to collect render frames from a function that computes a single frame."""
from typing import Any, Callable, List, Optional, Set

# list of modes with which render function returns None
NO_RETURNS_RENDER = {"human"}

# list of modes with which render returns just a single frame of the current state
SINGLE_RENDER = {"single_rgb_array", "single_depth_array", "single_state_pixels"}


class Renderer:
    """This class serves to easily integrate collection of renders for environments that can computes a single render.

    To use this function:
    - instantiate this class with the mode and the function that computes a single frame
    - call render_step method each time the frame should be saved in the list
      (usually at the end of the step and reset methods)
    - call get_renders whenever you want to retrieve renders
      (usually in the render method)
    - call reset to clean the render list
      (usually in the reset method of the environment)
    """

    def __init__(
        self,
        mode: Optional[str],
        render: Callable[[str], Any],
        no_returns_render: Optional[Set[str]] = None,
        single_render: Optional[Set[str]] = None,
    ):
        """Instantiates a Renderer object.

        Args:
            mode (Optional[str]): Way to render
            render (Callable[[str], Any]): Function that receives the mode and computes a single frame
            no_returns_render (Optional[Set[str]]): Set of render modes that don't return any value.
                The default value is the set {"human"}.
            single_render (Optional[Set[str]]): Set of render modes that should return a single frame.
                The default value is the set {"single_rgb_array", "single_depth_array", "single_state_pixels"}.
        """
        if no_returns_render is None:
            no_returns_render = NO_RETURNS_RENDER
        if single_render is None:
            single_render = SINGLE_RENDER

        self.no_returns_render = no_returns_render
        self.single_render = single_render
        self.mode = mode
        self.render = render
        self.render_list = []

    def render_step(self) -> None:
        """Computes a frame and save it to the render collection list.

        This method should be usually called inside environment's step and reset method.
        """
        if self.mode is not None and self.mode not in self.single_render:
            render_return = self.render(self.mode)
            if self.mode not in self.no_returns_render:
                self.render_list.append(render_return)

    def get_renders(self) -> Optional[List]:
        """Pops all the frames from the render collection list.

        This method should be usually called in the environment's render method to retrieve the frames collected till this time step.
        """
        if self.mode in self.single_render:
            return self.render(self.mode)
        elif self.mode is not None and self.mode not in self.no_returns_render:
            renders = self.render_list
            self.render_list = []
            return renders

    def reset(self):
        """Resets the render collection list.

        This method should be usually called inside environment's reset method.
        """
        self.render_list = []

## Gym environment for crane

In [49]:
"""
Classic cart-pole system implemented by Rich Sutton et al.
Copied from http://incompleteideas.net/sutton/book/code/pole.c
permalink: https://perma.cc/C9ZM-652R
"""
import math
from typing import Optional, Union

import numpy as np

import gym
from gym import logger, spaces
#from gym.envs.classic_control import utils
from gym.error import DependencyNotInstalled
#from gym.utils.renderer import Renderer


class CartPoleEnv(gym.Env):
    """
    ### Description

    This environment corresponds to the version of the cart-pole problem described by Barto, Sutton, and Anderson in
    ["Neuronlike Adaptive Elements That Can Solve Difficult Learning Control Problem"](https://ieeexplore.ieee.org/document/6313077).
    A pole is attached by an un-actuated joint to a cart, which moves along a frictionless track.
    The pendulum is placed upright on the cart and the goal is to balance the pole by applying forces
     in the left and right direction on the cart.

    ### Action Space

    The action is a `ndarray` with shape `(1,)` which can take values `{0, 1}` indicating the direction
     of the fixed force the cart is pushed with.

    | Num | Action                 |
    |-----|------------------------|
    | 0   | Push cart to the left  |
    | 1   | Push cart to the right |

    **Note**: The velocity that is reduced or increased by the applied force is not fixed and it depends on the angle
     the pole is pointing. The center of gravity of the pole varies the amount of energy needed to move the cart underneath it

    ### Observation Space

    The observation is a `ndarray` with shape `(4,)` with the values corresponding to the following positions and velocities:

    | Num | Observation           | Min                 | Max               |
    |-----|-----------------------|---------------------|-------------------|
    | 0   | Cart Position         | -4.8                | 4.8               |
    | 1   | Cart Velocity         | -Inf                | Inf               |
    | 2   | Pole Angle            | ~ -0.418 rad (-24°) | ~ 0.418 rad (24°) |
    | 3   | Pole Angular Velocity | -Inf                | Inf               |

    **Note:** While the ranges above denote the possible values for observation space of each element,
        it is not reflective of the allowed values of the state space in an unterminated episode. Particularly:
    -  The cart x-position (index 0) can be take values between `(-4.8, 4.8)`, but the episode terminates
       if the cart leaves the `(-2.4, 2.4)` range.
    -  The pole angle can be observed between  `(-.418, .418)` radians (or **±24°**), but the episode terminates
       if the pole angle is not in the range `(-.2095, .2095)` (or **±12°**)

    ### Rewards

    Since the goal is to keep the pole upright for as long as possible, a reward of `+1` for every step taken,
    including the termination step, is allotted. The threshold for rewards is 475 for v1.

    ### Starting State

    All observations are assigned a uniformly random value in `(-0.05, 0.05)`

    ### Episode End

    The episode ends if any one of the following occurs:

    1. Termination: Pole Angle is greater than ±12°
    2. Termination: Cart Position is greater than ±2.4 (center of the cart reaches the edge of the display)
    3. Truncation: Episode length is greater than 500 (200 for v0)

    ### Arguments

    ```
    gym.make('CartPole-v1')
    ```

    No additional arguments are currently supported.
    """

    metadata = {
        "render_modes": ["human", "rgb_array", "single_rgb_array"],
        "render_fps": 50,
    }

    def __init__(self, render_mode: Optional[str] = None):
    
        # geometrics for rendering
        self.length = 0.5  # actually half the pole's length
        
        # magnitude for discrete force applied
        self.force_mag1 = 0.10
        self.force_mag2 = 0.10
        
        #simulation step width
        self.deltaT = 0.02  # seconds between state updates

        # thresholds to fail the episode
        self.p1_threshold = 0.5
        self.p2_threshold = 0.5
        self.q1_threshold = 2.4

        high_act = np.array(
            [ 
                1.0,
                1.0
            ],
            dtype=np.float32,
        )
        self.action_space = spaces.Box(-high_act, high_act, dtype=np.float32)
        
        # observation limits set to 2 * thresholds so failing observation
        # is still within bounds.
        high_obs = np.array(
            [
                2*self.p1_threshold,
                np.finfo(np.float32).max,
                2*self.p2_threshold,
                np.finfo(np.float32).max,
                2*self.q1_threshold,
                np.finfo(np.float32).max
            ],
            dtype=np.float32,
        )
        self.observation_space = spaces.Box(-high_obs, high_obs, dtype=np.float32)

        self.render_mode = render_mode
        self.renderer = Renderer(self.render_mode, self._render)

        self.screen_width = 600
        self.screen_height = 400
        self.screen = None
        self.clock = None
        self.isopen = True
        self.state = None

        self.steps_beyond_terminated = None

    def step(self, action):
        err_msg = f"{action!r} ({type(action)}) invalid"
        assert self.action_space.contains(action), err_msg
        assert self.state is not None, "Call reset before using step method."
        p1, p2, q1, p1_dot, p2_dot, q1_dot = self.state
        force1 = self.force1_mag * action[0]
        force2 = self.force2_mag * action[1]
        
        states_dot_now = states_dot_func(*self.state, force1, force2)
        p1, p2, q1, p1_dot, p2_dot, q1_dot = self.state + self.deltaT * states_dot_now
        
        self.state = (p1, p2, q1, p1_dot, p2_dot, q1_dot)

        terminated = bool(
            q1 < -self.q1_threshold
            or q1 > self.q1_threshold
            or p1 < -self.p1_threshold
            or p1 > self.p1_threshold
            or p2 < -self.p2_threshold
            or p2 > self.p2_threshold
        )

        if not terminated:
            reward = 1.0
        elif self.steps_beyond_terminated is None:
            # Pole just fell!
            self.steps_beyond_terminated = 0
            reward = 1.0
        else:
            if self.steps_beyond_terminated == 0:
                logger.warn(
                    "You are calling 'step()' even though this "
                    "environment has already returned terminated = True. You "
                    "should always call 'reset()' once you receive 'terminated = "
                    "True' -- any further steps are undefined behavior."
                )
            self.steps_beyond_terminated += 1
            reward = 0.0

        self.renderer.render_step()
        return np.array(self.state, dtype=np.float32), reward, terminated, {"info": False}#False, {}

    def reset(
        self,
        *,
        seed: Optional[int] = None,
        options: Optional[dict] = None,
    ):
        #super().reset()(seed=seed)
        # Note that if you use custom reset bounds, it may lead to out-of-bound
        # state/observations.
        low, high = maybe_parse_reset_bounds(
            options, -0.05, 0.05  # default low
        )  # default high
        self.state = np.random.uniform(low=low, high=high, size=(4,))
        self.steps_beyond_terminated = None
        self.renderer.reset()
        self.renderer.render_step()
        return np.array(self.state, dtype=np.float32)#, {}

    def render(self):
        return self.renderer.get_renders()

    def _render(self, mode="human"):
        assert mode in self.metadata["render_modes"]
        try:
            import pygame
            from pygame import gfxdraw
        except ImportError:
            raise DependencyNotInstalled(
                "pygame is not installed, run `pip install gym[classic_control]`"
            )

        if self.screen is None:
            pygame.init()
            if mode == "human":
                pygame.display.init()
                self.screen = pygame.display.set_mode(
                    (self.screen_width, self.screen_height)
                )
            else:  # mode in {"rgb_array", "single_rgb_array"}
                self.screen = pygame.Surface((self.screen_width, self.screen_height))
        if self.clock is None:
            self.clock = pygame.time.Clock()

        world_width = self.q1_threshold * 2
        scale = self.screen_width / world_width
        polewidth = 10.0
        polelen = scale * (2 * self.length)
        cartwidth = 50.0
        cartheight = 30.0

        if self.state is None:
            return None

        x = self.state

        self.surf = pygame.Surface((self.screen_width, self.screen_height))
        self.surf.fill((255, 255, 255))

        l, r, t, b = -cartwidth / 2, cartwidth / 2, cartheight / 2, -cartheight / 2
        axleoffset = cartheight / 4.0
        cartx = x[0] * scale + self.screen_width / 2.0  # MIDDLE OF CART
        carty = 100  # TOP OF CART
        cart_coords = [(l, b), (l, t), (r, t), (r, b)]
        cart_coords = [(c[0] + cartx, c[1] + carty) for c in cart_coords]
        gfxdraw.aapolygon(self.surf, cart_coords, (0, 0, 0))
        gfxdraw.filled_polygon(self.surf, cart_coords, (0, 0, 0))

        l, r, t, b = (
            -polewidth / 2,
            polewidth / 2,
            polelen - polewidth / 2,
            -polewidth / 2,
        )

        pole_coords = []
        for coord in [(l, b), (l, t), (r, t), (r, b)]:
            coord = pygame.math.Vector2(coord).rotate_rad(-x[2])
            coord = (coord[0] + cartx, coord[1] + carty + axleoffset)
            pole_coords.append(coord)
        gfxdraw.aapolygon(self.surf, pole_coords, (202, 152, 101))
        gfxdraw.filled_polygon(self.surf, pole_coords, (202, 152, 101))

        gfxdraw.aacircle(
            self.surf,
            int(cartx),
            int(carty + axleoffset),
            int(polewidth / 2),
            (129, 132, 203),
        )
        gfxdraw.filled_circle(
            self.surf,
            int(cartx),
            int(carty + axleoffset),
            int(polewidth / 2),
            (129, 132, 203),
        )

        gfxdraw.hline(self.surf, 0, self.screen_width, carty, (0, 0, 0))

        self.surf = pygame.transform.flip(self.surf, False, True)
        self.screen.blit(self.surf, (0, 0))
        if mode == "human":
            pygame.event.pump()
            self.clock.tick(self.metadata["render_fps"])
            pygame.display.flip()

        elif mode in {"rgb_array", "single_rgb_array"}:
            return np.transpose(
                np.array(pygame.surfarray.pixels3d(self.screen)), axes=(1, 0, 2)
            )

    def close(self):
        if self.screen is not None:
            import pygame

            pygame.display.quit()
            pygame.quit()
            self.isopen = False

In [50]:
# environment without renderings for training
env = CartPoleEnv()
# environment with renderings for validating
env_rendering = CartPoleEnv(render_mode = "human")

In [51]:
env.reset()
env.step(env.action_space.sample())
env.observation_space
check_env(env)

# TODO: error may be through incomplete reset state choice

ValueError: not enough values to unpack (expected 6, got 4)

## Learning the model

In [14]:
%%time
# Learning!
model = A2C('MlpPolicy', env, verbose=1, tensorboard_log="./a2c_cartpole_tensorboard/")
model.learn(total_timesteps=100000)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


2022-09-02 15:57:16.035086: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-09-02 15:57:16.077653: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/kwrede/.local/lib/python3.8/site-packages/cv2/../../lib64:
2022-09-02 15:57:16.077666: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


Logging to ./a2c_cartpole_tensorboard/A2C_21
------------------------------------
| time/                 |          |
|    fps                | 191      |
|    iterations         | 100      |
|    time_elapsed       | 2        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.41    |
|    explained_variance | 0.0159   |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 4.4      |
|    std                | 0.997    |
|    value_loss         | 7.44     |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 333      |
|    iterations         | 200      |
|    time_elapsed       | 3        |
|    total_timesteps    | 1000     |
| train/                |          |
|    entropy_loss       | -1.42    |
|    explained_variance | -0.00333 |
|    learning_rate      | 0.0007   |
|    n_updates          | 199 

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.72e+03 |
|    ep_rew_mean        | 5.72e+03 |
| time/                 |          |
|    fps                | 908      |
|    iterations         | 1500     |
|    time_elapsed       | 8        |
|    total_timesteps    | 7500     |
| train/                |          |
|    entropy_loss       | -1.39    |
|    explained_variance | 2.84e-05 |
|    learning_rate      | 0.0007   |
|    n_updates          | 1499     |
|    policy_loss        | 0.483    |
|    std                | 0.972    |
|    value_loss         | 0.295    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.72e+03 |
|    ep_rew_mean        | 5.72e+03 |
| time/                 |          |
|    fps                | 925      |
|    iterations         | 1600     |
|    time_elapsed       | 8        |
|    total_timesteps    | 8000     |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.29e+03 |
|    ep_rew_mean        | 5.29e+03 |
| time/                 |          |
|    fps                | 1048     |
|    iterations         | 2800     |
|    time_elapsed       | 13       |
|    total_timesteps    | 14000    |
| train/                |          |
|    entropy_loss       | -1.39    |
|    explained_variance | 4.67e-05 |
|    learning_rate      | 0.0007   |
|    n_updates          | 2799     |
|    policy_loss        | -0.00328 |
|    std                | 0.975    |
|    value_loss         | 6.95e-06 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 5.29e+03  |
|    ep_rew_mean        | 5.29e+03  |
| time/                 |           |
|    fps                | 1054      |
|    iterations         | 2900      |
|    time_elapsed       | 13        |
|    total_timesteps    | 1450

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.29e+03 |
|    ep_rew_mean        | 5.29e+03 |
| time/                 |          |
|    fps                | 1106     |
|    iterations         | 4100     |
|    time_elapsed       | 18       |
|    total_timesteps    | 20500    |
| train/                |          |
|    entropy_loss       | -1.4     |
|    explained_variance | 0.000112 |
|    learning_rate      | 0.0007   |
|    n_updates          | 4099     |
|    policy_loss        | -0.00104 |
|    std                | 0.982    |
|    value_loss         | 1.05e-06 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.29e+03 |
|    ep_rew_mean        | 5.29e+03 |
| time/                 |          |
|    fps                | 1110     |
|    iterations         | 4200     |
|    time_elapsed       | 18       |
|    total_timesteps    | 21000    |
|

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 5.29e+03  |
|    ep_rew_mean        | 5.29e+03  |
| time/                 |           |
|    fps                | 1147      |
|    iterations         | 5400      |
|    time_elapsed       | 23        |
|    total_timesteps    | 27000     |
| train/                |           |
|    entropy_loss       | -1.4      |
|    explained_variance | 2.71e-05  |
|    learning_rate      | 0.0007    |
|    n_updates          | 5399      |
|    policy_loss        | -0.000986 |
|    std                | 0.98      |
|    value_loss         | 4.76e-07  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 5.29e+03  |
|    ep_rew_mean        | 5.29e+03  |
| time/                 |           |
|    fps                | 1149      |
|    iterations         | 5500      |
|    time_elapsed       | 23        |
|    total_t

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.29e+03 |
|    ep_rew_mean        | 5.29e+03 |
| time/                 |          |
|    fps                | 1174     |
|    iterations         | 6700     |
|    time_elapsed       | 28       |
|    total_timesteps    | 33500    |
| train/                |          |
|    entropy_loss       | -1.37    |
|    explained_variance | 3.62e-05 |
|    learning_rate      | 0.0007   |
|    n_updates          | 6699     |
|    policy_loss        | -0.00103 |
|    std                | 0.948    |
|    value_loss         | 3.65e-07 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.29e+03 |
|    ep_rew_mean        | 5.29e+03 |
| time/                 |          |
|    fps                | 1176     |
|    iterations         | 6800     |
|    time_elapsed       | 28       |
|    total_timesteps    | 34000    |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.29e+03 |
|    ep_rew_mean        | 5.29e+03 |
| time/                 |          |
|    fps                | 1192     |
|    iterations         | 8000     |
|    time_elapsed       | 33       |
|    total_timesteps    | 40000    |
| train/                |          |
|    entropy_loss       | -1.36    |
|    explained_variance | 0.000615 |
|    learning_rate      | 0.0007   |
|    n_updates          | 7999     |
|    policy_loss        | -0.00141 |
|    std                | 0.938    |
|    value_loss         | 7.55e-07 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 5.29e+03  |
|    ep_rew_mean        | 5.29e+03  |
| time/                 |           |
|    fps                | 1194      |
|    iterations         | 8100      |
|    time_elapsed       | 33        |
|    total_timesteps    | 4050

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 5.29e+03  |
|    ep_rew_mean        | 5.29e+03  |
| time/                 |           |
|    fps                | 1206      |
|    iterations         | 9300      |
|    time_elapsed       | 38        |
|    total_timesteps    | 46500     |
| train/                |           |
|    entropy_loss       | -1.42     |
|    explained_variance | 5.25e-05  |
|    learning_rate      | 0.0007    |
|    n_updates          | 9299      |
|    policy_loss        | -0.000866 |
|    std                | 1         |
|    value_loss         | 2.48e-07  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 5.29e+03  |
|    ep_rew_mean        | 5.29e+03  |
| time/                 |           |
|    fps                | 1208      |
|    iterations         | 9400      |
|    time_elapsed       | 38        |
|    total_t

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.29e+03 |
|    ep_rew_mean        | 5.29e+03 |
| time/                 |          |
|    fps                | 1218     |
|    iterations         | 10600    |
|    time_elapsed       | 43       |
|    total_timesteps    | 53000    |
| train/                |          |
|    entropy_loss       | -1.39    |
|    explained_variance | 1.01e-05 |
|    learning_rate      | 0.0007   |
|    n_updates          | 10599    |
|    policy_loss        | -0.00119 |
|    std                | 0.976    |
|    value_loss         | 1.29e-06 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 5.29e+03  |
|    ep_rew_mean        | 5.29e+03  |
| time/                 |           |
|    fps                | 1219      |
|    iterations         | 10700     |
|    time_elapsed       | 43        |
|    total_timesteps    | 5350

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 5.29e+03  |
|    ep_rew_mean        | 5.29e+03  |
| time/                 |           |
|    fps                | 1224      |
|    iterations         | 11900     |
|    time_elapsed       | 48        |
|    total_timesteps    | 59500     |
| train/                |           |
|    entropy_loss       | -1.4      |
|    explained_variance | 4.76e-05  |
|    learning_rate      | 0.0007    |
|    n_updates          | 11899     |
|    policy_loss        | -0.000736 |
|    std                | 0.982     |
|    value_loss         | 2.61e-07  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 5.29e+03  |
|    ep_rew_mean        | 5.29e+03  |
| time/                 |           |
|    fps                | 1225      |
|    iterations         | 12000     |
|    time_elapsed       | 48        |
|    total_t

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.29e+03 |
|    ep_rew_mean        | 5.29e+03 |
| time/                 |          |
|    fps                | 1224     |
|    iterations         | 13200    |
|    time_elapsed       | 53       |
|    total_timesteps    | 66000    |
| train/                |          |
|    entropy_loss       | -1.44    |
|    explained_variance | 2.35e-05 |
|    learning_rate      | 0.0007   |
|    n_updates          | 13199    |
|    policy_loss        | -0.00109 |
|    std                | 1.02     |
|    value_loss         | 5.58e-07 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.29e+03 |
|    ep_rew_mean        | 5.29e+03 |
| time/                 |          |
|    fps                | 1224     |
|    iterations         | 13300    |
|    time_elapsed       | 54       |
|    total_timesteps    | 66500    |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.29e+03 |
|    ep_rew_mean        | 5.29e+03 |
| time/                 |          |
|    fps                | 1230     |
|    iterations         | 14500    |
|    time_elapsed       | 58       |
|    total_timesteps    | 72500    |
| train/                |          |
|    entropy_loss       | -1.43    |
|    explained_variance | 4.99e-05 |
|    learning_rate      | 0.0007   |
|    n_updates          | 14499    |
|    policy_loss        | -0.00133 |
|    std                | 1.01     |
|    value_loss         | 1.06e-06 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.29e+03 |
|    ep_rew_mean        | 5.29e+03 |
| time/                 |          |
|    fps                | 1231     |
|    iterations         | 14600    |
|    time_elapsed       | 59       |
|    total_timesteps    | 73000    |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.29e+03 |
|    ep_rew_mean        | 5.29e+03 |
| time/                 |          |
|    fps                | 1236     |
|    iterations         | 15800    |
|    time_elapsed       | 63       |
|    total_timesteps    | 79000    |
| train/                |          |
|    entropy_loss       | -1.46    |
|    explained_variance | 0.000632 |
|    learning_rate      | 0.0007   |
|    n_updates          | 15799    |
|    policy_loss        | -0.0014  |
|    std                | 1.04     |
|    value_loss         | 3.3e-07  |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 5.29e+03  |
|    ep_rew_mean        | 5.29e+03  |
| time/                 |           |
|    fps                | 1236      |
|    iterations         | 15900     |
|    time_elapsed       | 64        |
|    total_timesteps    | 7950

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 5.29e+03  |
|    ep_rew_mean        | 5.29e+03  |
| time/                 |           |
|    fps                | 1240      |
|    iterations         | 17100     |
|    time_elapsed       | 68        |
|    total_timesteps    | 85500     |
| train/                |           |
|    entropy_loss       | -1.45     |
|    explained_variance | 0.0013    |
|    learning_rate      | 0.0007    |
|    n_updates          | 17099     |
|    policy_loss        | -0.000859 |
|    std                | 1.03      |
|    value_loss         | 3.59e-07  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 5.29e+03  |
|    ep_rew_mean        | 5.29e+03  |
| time/                 |           |
|    fps                | 1241      |
|    iterations         | 17200     |
|    time_elapsed       | 69        |
|    total_t

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 5.29e+03  |
|    ep_rew_mean        | 5.29e+03  |
| time/                 |           |
|    fps                | 1244      |
|    iterations         | 18400     |
|    time_elapsed       | 73        |
|    total_timesteps    | 92000     |
| train/                |           |
|    entropy_loss       | -1.46     |
|    explained_variance | 6.53e-05  |
|    learning_rate      | 0.0007    |
|    n_updates          | 18399     |
|    policy_loss        | -0.000702 |
|    std                | 1.05      |
|    value_loss         | 1.95e-07  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 5.29e+03 |
|    ep_rew_mean        | 5.29e+03 |
| time/                 |          |
|    fps                | 1244     |
|    iterations         | 18500    |
|    time_elapsed       | 74       |
|    total_timesteps

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 2.42e+04 |
|    ep_rew_mean        | 2.42e+04 |
| time/                 |          |
|    fps                | 1248     |
|    iterations         | 19700    |
|    time_elapsed       | 78       |
|    total_timesteps    | 98500    |
| train/                |          |
|    entropy_loss       | -1.48    |
|    explained_variance | nan      |
|    learning_rate      | 0.0007   |
|    n_updates          | 19699    |
|    policy_loss        | -0       |
|    std                | 1.06     |
|    value_loss         | 0        |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 2.42e+04 |
|    ep_rew_mean        | 2.42e+04 |
| time/                 |          |
|    fps                | 1248     |
|    iterations         | 19800    |
|    time_elapsed       | 79       |
|    total_timesteps    | 99000    |
|

<stable_baselines3.a2c.a2c.A2C at 0x7fce4077b6a0>

## Testing the model with rendering

In [15]:
obs = env_rendering.reset()
for i in range(1000):
    action, _state = model.predict(obs, deterministic=False)
    obs, reward, done, info = env_rendering.step(action)
    env_rendering.render()
    if done:
      obs = env_rendering.reset()

pygame 2.1.0 (SDL 2.0.16, Python 3.8.10)
Hello from the pygame community. https://www.pygame.org/contribute.html


ALSA lib confmisc.c:767:(parse_card) cannot find card '0'
ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_card_driver returned error: No such file or directory
ALSA lib confmisc.c:392:(snd_func_concat) error evaluating strings
ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_concat returned error: No such file or directory
ALSA lib confmisc.c:1246:(snd_func_refer) error evaluating name
ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_refer returned error: No such file or directory
ALSA lib conf.c:5220:(snd_config_expand) Evaluate error: No such file or directory
ALSA lib pcm.c:2642:(snd_pcm_open_noupdate) Unknown PCM default


In [16]:
env.close()
env_rendering.close()