# Setup 

In [4]:
!pip install gym_minigrid
!pip install stable_baselines3
!pip install --upgrade --quiet cloudpickle pickle5
!pip install wandb

Collecting gym_minigrid
  Downloading gym_minigrid-1.0.2-py3-none-any.whl (47 kB)
[?25l[K     |██████▉                         | 10 kB 23.6 MB/s eta 0:00:01[K     |█████████████▊                  | 20 kB 17.5 MB/s eta 0:00:01[K     |████████████████████▋           | 30 kB 11.7 MB/s eta 0:00:01[K     |███████████████████████████▍    | 40 kB 10.0 MB/s eta 0:00:01[K     |████████████████████████████████| 47 kB 2.4 MB/s 
Installing collected packages: gym-minigrid
Successfully installed gym-minigrid-1.0.2
Collecting stable_baselines3
  Downloading stable_baselines3-1.3.0-py3-none-any.whl (174 kB)
[K     |████████████████████████████████| 174 kB 5.5 MB/s 
Installing collected packages: stable-baselines3
Successfully installed stable-baselines3-1.3.0
[K     |████████████████████████████████| 256 kB 8.2 MB/s 
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conf

In [5]:
from gym_minigrid.wrappers import *
from gym_minigrid.minigrid import *
import gym

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.utils import set_random_seed
from stable_baselines3.common.evaluation import evaluate_policy

import torch.nn as nn
import numpy as np
import shutil #to remove folder

import itertools
import random

import matplotlib.pyplot as plt

import wandb
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import VecVideoRecorder
from wandb.integration.sb3 import WandbCallback

%matplotlib notebook

# Replacement objects handed to PPO.load(..., custom_objects=...) further down:
# both entries are constant functions that ignore their single argument.
custom_objects = dict(
    lr_schedule=lambda x: .003,
    clip_range=lambda x: .02,
)

import imageio as iio
import os

In [6]:
# Create gif out of environment and agent. Possibly upload to W&B
# gif_name: ["hacked", "initial_changed_env"] (have mappings in W&B) or custom
def save_gif(gif_name, max_frames, max_episodes, model, env, log_to_wb = True):
  """Roll out `model` in `env`, store the rendered steps as `<gif_name>.gif`, optionally log it to W&B.

  One frame is produced per environment step: the environment is rendered with
  `env.render("rgb_array", highlight=False)` and drawn on a Matplotlib figure
  whose title shows the action, reward and done flag of that step. Frames are
  written as JPEGs into the `gif/` directory and then assembled into the GIF.

  Args:
    gif_name: Base name of the GIF file (".gif" is appended); also used as the
      key under which the video is logged to W&B.
    max_frames: Upper bound on the total number of frames over all episodes.
    max_episodes: Upper bound on the number of episodes that are played.
    model: Object exposing `predict(obs) -> (action, state)`.
    env: Environment exposing `reset()`, `step(action)` returning a 4-tuple
      `(obs, reward, done, info)`, and `render(...)`.
    log_to_wb: When truthy, upload the GIF with `wandb.log`.
  """
  path = 'gif'

  # Start from a clean frame directory. A missing directory is the normal case
  # on the first call, so it is not reported as an error.
  shutil.rmtree(path, ignore_errors=True)
  os.makedirs(path, exist_ok=True)

  gif_path = gif_name + ".gif"

  def frame_file(index):
    # Same naming scheme as before: gif/<gif_name><index>.jpg
    return path + "/" + gif_name + str(index) + ".jpg"

  frame_count = 0
  for _episode in range(max_episodes):
    if frame_count >= max_frames:
      # Frame budget exhausted: no point in resetting the environment again.
      break
    obs = env.reset()
    while frame_count < max_frames:
      action, _ = model.predict(obs)
      obs, r, done, _ = env.step(action)
      fig, ax1 = plt.subplots(1, 1, figsize=(5, 5))
      try:
        ax1.imshow(env.render("rgb_array", highlight=False))
        ax1.set_title(f"Hacked obs: Action = {action}, Reward = {r}, \n Done = {done}")
        fig.savefig(frame_file(frame_count))
      finally:
        # Close the figure: clearing the axes alone keeps every figure alive
        # and, in the notebook backend, leaves one widget per frame behind.
        plt.close(fig)
      frame_count += 1
      if done:
        break

  if frame_count == 0:
    # Nothing was rendered (max_frames or max_episodes is 0): there is no GIF to build.
    print("save_gif: no frames rendered, skipping '%s'" % gif_path)
    return

  # Only the frames that were actually written are read back.
  with iio.get_writer(gif_path, mode='I') as writer:
    for index in range(frame_count):
      writer.append_data(iio.imread(frame_file(index)))

  if log_to_wb:
    wandb.log({gif_name: wandb.Video(gif_path)})

In [7]:
#Vanilla environment for first agent
class EmptyBallRoom(MiniGridEnv):
    """Walled square room containing the agent and one ball.

    Observations returned by `reset` and `step` are RGB renders of the whole
    grid. Picking up the ball ends the episode with the reward from `_reward()`.

    Args:
        size: Side length of the grid, in tiles (walls included).
        ball_color: Colour name of the ball placed in the room.
        tile_size: Pixels per tile in the rendered observation.
    """

    def __init__(self, size=9, ball_color="green", tile_size=8):
        # Both attributes are assigned before the base constructor runs
        # (presumably because it already builds the grid / resets - TODO confirm).
        self.ball_color = ball_color
        self.tile_size = tile_size
        super().__init__(
            grid_size=size,
            max_steps=4*size*size,
            # Set this to True for maximum speed
            see_through_walls=True
        )
        # Replace the observation space with one describing the rendered image.
        side_pixels = size * tile_size
        self.observation_space = spaces.Box(
            low=0,
            high=255,
            shape=(side_pixels, side_pixels, 3),
            dtype='uint8'
        )

    def _rgb_observation(self):
        """Render the grid, without highlighting, at this env's tile size."""
        return self.render("rgb_image", highlight=False, tile_size=self.tile_size)

    def reset(self):
        """Reset the underlying environment and return the rendered grid."""
        super().reset()
        return self._rgb_observation()

    def _gen_grid(self, width, height):
        """Build the room: outer walls, the agent and one ball of `self.ball_color`."""
        self.grid = Grid(width, height)
        self.grid.wall_rect(0, 0, width, height)

        # Positions are chosen by the library helpers rather than fixed coordinates.
        self.place_agent()

        self.ball = Ball(self.ball_color)
        self.place_obj(self.ball)
        self.mission = f"Pick up {self.ball_color} {self.ball.type}"

    def step(self, action):
        """Advance one step; a pickup that leaves the agent carrying the ball ends the episode."""
        _, reward, done, info = super().step(action)

        picked_up_ball = (
            action == self.actions.pickup
            and self.carrying
            and self.carrying == self.ball
        )
        if picked_up_ball:
            reward = self._reward()
            done = True

        return self._rgb_observation(), reward, done, info

In [8]:
#Wrapper to change starting ball color randomly
class RandomBallColorWrapper(gym.core.Wrapper):
    """Wrapper that assigns the wrapped env a randomly drawn `ball_color` before every reset."""

    def __init__(self, env):
        super().__init__(env)

    def reset(self):
        """Draw one colour from the palette, store it on the wrapped env, then reset."""
        palette = ["red", "green", "blue", "yellow", "purple"]
        (drawn_color,) = random.sample(palette, 1)
        self.env.ball_color = drawn_color
        return super().reset()

In [9]:
# Hacked environment wrapper
class HackedEnvironment(gym.core.Wrapper):
    """Environment for the second ("hacker") agent, wrapped around the first agent's world.

    The second agent's action is a colour index. If the ball currently has that
    colour, the ball is shown as `goal_color` in the image handed to the frozen
    first agent; the first agent's predicted action is then executed on the
    real, unmodified environment.

    Args:
        env: Wrapped environment; it must expose `ball`, `tile_size` and a
            `render(mode, highlight=..., tile_size=...)` method.
        first_agent: Object with `predict(obs) -> (action, state)`.
        goal_color: Colour the ball is repainted to when the hack matches.
            Defaults to "green", the value that was previously hard-coded.
    """
    COLOR_TO_INDEX = {"red": 0, "green": 1, "blue": 2, "purple": 3, "yellow": 4}
    INDEX_TO_COLOR = {0: "red", 1: "green", 2: "blue", 3: "purple", 4: "yellow"}

    def __init__(self, env, first_agent, goal_color="green"):
        self.first_agent = first_agent
        self.goal_color = goal_color
        super().__init__(env)
        # One action per colour the second agent may try to repaint.
        self.action_space = spaces.Discrete(len(self.COLOR_TO_INDEX))

        self.transformed_obs = np.array([])
        self.original_obs = np.array([])

        self.transformation_action = None
        self.environment_action = None

        self.reset()

    def _render_rgb(self):
        """Render the wrapped environment's grid without highlighting."""
        return self.env.render("rgb_image", highlight=False, tile_size=self.env.tile_size)

    def _apply_hack(self, color_index):
        """Repaint the ball to the goal colour if it currently has the chosen colour."""
        if self.ball.color == self.INDEX_TO_COLOR[color_index]:
            self.ball.color = self.goal_color

    def reset(self):
        """Reset the wrapped env, remember the ball's real colour and return the rendered grid."""
        self.env.reset()
        self.original_ball_color = self.env.ball.color
        # Forget the previous episode's actions so render("agent") does not
        # replay a stale hack on the fresh episode.
        self.transformation_action = None
        self.environment_action = None
        self.original_obs = self.transformed_obs = self._render_rgb()
        return self.original_obs

    def step(self, first_action):
        """
        This agent changes one of the predefined colours to the goal colour. If the ball
        does not have that colour then nothing changes.

        1) 2nd agent makes an action and (possibly) changes the ball's colour
        2) The changed environment is rendered and passed to the first agent's model
        3) The ball colour is changed back to the real ball colour
        4) The first agent's action is performed on the real environment

        Returns the real (unhacked) observation, reward, done flag and info
        produced by the wrapped environment's step.
        """
        # Normalise NumPy integer / 0-d array actions so they work as dict keys.
        first_action = int(first_action)
        self.transformation_action = first_action
        try:
            self._apply_hack(first_action)
            self.transformed_obs = self._render_rgb()
            second_action, _ = self.first_agent.predict(self.transformed_obs)
        finally:
            # Always restore the real colour, even if rendering or prediction fails.
            self.ball.color = self.original_ball_color
        self.environment_action = second_action

        self.original_obs, reward, done, info = self.env.step(second_action)

        return self.original_obs, reward, done, info

    def render(self, pov="agent", highlight=False, **kwargs):
        """
        pov : agent / original

        "agent" renders the environment with the last hack applied (what the
        first agent was shown); any other value, or "agent" before any step has
        been taken in the episode, renders the real environment. Remaining
        keyword arguments are forwarded to the wrapped environment's render.
        """
        if pov == "agent" and self.transformation_action is not None:
            try:
                self._apply_hack(self.transformation_action)
                return self.env.render(highlight=highlight, **kwargs)
            finally:
                self.ball.color = self.original_ball_color

        return self.env.render(highlight=highlight, **kwargs)

# Experiments

This section sets up the following experiments:


1.   train_and_log_baseline - trains 1st agent and logs related data to W&B
2.   validate_baseline - validates 1st agent on the second environment. 
3.   train_2nd_agent_and_validate - trains second agent and validates it on second environment. 

Note: only experiments 2 and 3 are expected to change when the environment is modified. Both also require a previously trained first agent to be available for loading.


## Experiment definitions

In [10]:
def train_and_log_baseline(trial_description, trial_name, config):
  """Train the first agent on `EmptyBallRoom`, save it as "first_agent" and log validation results to W&B.

  Args:
    trial_description: Not used in the body; callers also store it in `config`.
    trial_name: Name of the W&B run.
    config: Dict with the keys read below: "ROOM_SIZE", "BALL_COLOR",
      "TILE_SIZE", "policy_type" and "total_timesteps". It is also uploaded
      as the W&B run config.
  """
  ### Initiate W&B connection
  project_name = "learning-by-hacking"
  run = wandb.init(
    project=project_name,
    entity="learning-by-hacking",
    name = trial_name,
    config=config,
    sync_tensorboard=True,  # auto-upload sb3's tensorboard metrics
    monitor_gym=True,  # auto-upload the videos of agents playing the game
    save_code=True,  # optional
  )

  first_env_vector = None
  try:
    ### Create env
    first_env_vector = make_vec_env(EmptyBallRoom,
                                    n_envs=12,
                                    vec_env_cls=SubprocVecEnv,
                                    env_kwargs= {
                                      "size": config["ROOM_SIZE"],
                                      "ball_color": config["BALL_COLOR"],
                                      "tile_size": config["TILE_SIZE"]
                                      },
                                    monitor_dir="env_monitor")
    first_env_vector = VecVideoRecorder(first_env_vector, f"videos/{run.id}", record_video_trigger=lambda x: x % 2000 == 0, video_length=200) # for video loading to W&B

    policy_kwargs = dict(activation_fn=nn.ReLU)
    first_agent = PPO(
      config["policy_type"],
      first_env_vector,
      policy_kwargs=policy_kwargs,
      verbose=1,
      tensorboard_log="TB_first_agent",
      ent_coef = 0
      )
    first_agent.learn(total_timesteps=config["total_timesteps"],
                      callback=WandbCallback( #callback for W&B
                                            model_save_path=f"models/{run.id}",
                                            verbose=2,
                                            ),
                      )
    first_agent.save("first_agent")

    ### Validate
    # Same tile size as the training envs, so the observation shape matches
    # what the policy was trained on even when TILE_SIZE is not the default.
    test_env = EmptyBallRoom(
      ball_color = config["BALL_COLOR"],
      size=config["ROOM_SIZE"],
      tile_size=config["TILE_SIZE"],
    )

    n_val_episodes = 1000
    mean_reward, std_reward = evaluate_policy(first_agent, test_env, n_eval_episodes=n_val_episodes)
    wandb.log({'val_mean_reward': mean_reward, 'val_std_reward': std_reward}) #log mean reward
    save_gif("First agent on first environment", 100, 10, first_agent, test_env, log_to_wb = True) #log gif to W&B
  finally:
    # Shut down the worker processes and always close the W&B run,
    # including when training or validation raised.
    if first_env_vector is not None:
      first_env_vector.close()
    wandb.finish()


In [11]:
def validate_baseline(trial_description, trial_name, config, modelName = "first_agent", randomColor = False):
  """Evaluate a saved first agent on the second environment and log the results to W&B.

  Args:
    trial_description: Not used in the body; callers also store it in `config`.
    trial_name: Name of the W&B run.
    config: Dict providing "ROOM_SIZE", "2ND_BALL_COLOR" and "TILE_SIZE";
      it is also uploaded as the W&B run config.
    modelName: Path/name passed to `PPO.load`.
    randomColor: When equal to False the plain `EmptyBallRoom` is used,
      otherwise it is wrapped in `RandomBallColorWrapper`.
  """
  ### Initiate W&B connection
  wandb_settings = dict(
    project="learning-by-hacking",
    entity="learning-by-hacking",
    name=trial_name,
    config=config,
    sync_tensorboard=True,  # auto-upload sb3's tensorboard metrics
    monitor_gym=True,       # auto-upload the videos of agents playing the game
    save_code=True,         # optional
  )
  wandb.init(**wandb_settings)

  ### Load agent
  first_agent = PPO.load(modelName, custom_objects=custom_objects)
  print("loaded:", "gamma =", first_agent.gamma, "n_steps =", first_agent.n_steps)

  ##############################################################################################################
  # Only this section is expected to get changed per different experiments

  ### Second environment, optionally with a random ball colour on every reset
  base_env = EmptyBallRoom(
    size=config["ROOM_SIZE"],
    ball_color=config["2ND_BALL_COLOR"],
    tile_size=config["TILE_SIZE"],
  )
  second_env = base_env if randomColor == False else RandomBallColorWrapper(base_env)

  ##############################################################################################################

  ### Validate
  n_val_episodes = 1000
  mean_reward, std_reward = evaluate_policy(first_agent, second_env, n_eval_episodes=n_val_episodes)
  validation_metrics = {'val_mean_reward': mean_reward, 'val_std_reward': std_reward}
  wandb.log(validation_metrics)
  # Also upload a short rollout as a GIF
  save_gif("First agent on second environment", 100, 10, first_agent, second_env, log_to_wb = True)

  wandb.finish()


In [3]:
def train_2nd_agent_and_validate(trial_description, trial_name, config, modelName = "first_agent", randomColor = False):
  """Train the second ("hacker") agent against a saved first agent, save it and log validation to W&B.

  Args:
    trial_description: Not used in the body; callers also store it in `config`.
    trial_name: Name of the W&B run.
    config: Dict providing "ROOM_SIZE", "2ND_BALL_COLOR", "TILE_SIZE",
      "policy_type" and "total_timesteps"; it is also uploaded as the W&B run config.
    modelName: Path/name of the first agent passed to `PPO.load`.
    randomColor: When equal to False the plain `EmptyBallRoom` is used,
      otherwise it is wrapped in `RandomBallColorWrapper`.
  """
  ### Initiate W&B connection
  project_name = "learning-by-hacking"
  run = wandb.init(
    project=project_name,
    entity="learning-by-hacking",
    name = trial_name,
    config=config,
    sync_tensorboard=True,  # auto-upload sb3's tensorboard metrics
    monitor_gym=True,  # auto-upload the videos of agents playing the game
    save_code=True,  # optional
  )

  second_env_vector = None
  try:
    ### Load agent
    first_agent = PPO.load(modelName, custom_objects=custom_objects)
    print("loaded:", "gamma =", first_agent.gamma, "n_steps =", first_agent.n_steps)

    ##############################################################################################################
    # Only this section is expected to get changed per different experiments

    ### choose random color or not
    second_env_kwargs = {"size": config["ROOM_SIZE"], "ball_color": config["2ND_BALL_COLOR"], "tile_size": config["TILE_SIZE"]}
    if randomColor == False:
      second_env = EmptyBallRoom(**second_env_kwargs) # Note: this is relevant for validation as well
    else:
      second_env = RandomBallColorWrapper(EmptyBallRoom(**second_env_kwargs))

    ## vectorize for training
    second_env_vector = make_vec_env(
      HackedEnvironment,
      n_envs=2,                           # max 2 for colaboratory
      vec_env_cls=SubprocVecEnv,
      env_kwargs={"env": second_env,
                  "first_agent": first_agent},
      monitor_dir="env_monitor")

    ### train second agent
    policy_kwargs = dict(activation_fn=nn.ReLU)
    second_agent = PPO(
        config["policy_type"],
        second_env_vector,
        gamma=0.5,
        policy_kwargs=policy_kwargs,
        verbose=1,
        tensorboard_log="second_agent_tb",
        ent_coef = 0
    )
    second_agent.learn(total_timesteps=config["total_timesteps"], callback=WandbCallback( #callback for W&B
          model_save_path=f"models/{run.id}", verbose=2, ),
          )
    second_agent.save("second_agent")

    ##############################################################################################################

    ### Validate
    # Fresh, non-vectorised hacked environment for evaluation
    env = HackedEnvironment(second_env, first_agent)

    n_val_episodes = 1000
    mean_reward, std_reward = evaluate_policy(second_agent, env, n_eval_episodes=n_val_episodes)
    wandb.log({'val_mean_reward': mean_reward, 'val_std_reward': std_reward}) #log mean reward
    save_gif("Second agent on second environment", 100, 10, second_agent, env, log_to_wb = True) #log gif to W&B
  finally:
    # Shut down the training worker processes and always close the W&B run,
    # including when loading, training or validation raised.
    if second_env_vector is not None:
      second_env_vector.close()
    wandb.finish()


## Running the experiments

In [None]:
# Experiment 1: train the first agent and validate it on its own environment
trial_description = 'Training and validating first agent'
trial_name = '1st agent training'
project_name = "learning-by-hacking"

# Run configuration; also uploaded to W&B as the run's config
config = {
    "policy_type": "CnnPolicy",
    "total_timesteps": 200_000,
    "env_name": "Baseline 1",
    "ROOM_SIZE": 9,
    "BALL_COLOR": "green",
    "TILE_SIZE": 8,           # passed to the training environments as tile_size
    "Trial description": trial_description
}

train_and_log_baseline(
    trial_description=trial_description,
    trial_name=trial_name,
    config=config,
)

In [13]:
# Experiment 2: evaluate the saved first agent on the second environment (no training)
trial_description = 'Validating first agent on new, second, environment'
trial_name = '1st agent validation'
project_name = "learning-by-hacking"

# Run configuration; also uploaded to W&B as the run's config
config = {
    "policy_type": "CnnPolicy",
    "total_timesteps": 200_000,   # not read by validate_baseline
    "env_name": "Baseline 2",
    "ROOM_SIZE": 9,
    "BALL_COLOR": "green",
    "2ND_BALL_COLOR": "red",
    "TILE_SIZE": 8,
    "Trial description": trial_description
}

validate_baseline(
    trial_description=trial_description,
    trial_name=trial_name,
    config=config,
    modelName="first_agent",
    randomColor=False,
)

VBox(children=(Label(value=' 0.10MB of 0.10MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

loaded: gamma = 0.99 n_steps = 2048




Error: gif - No such file or directory.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

VBox(children=(Label(value=' 0.00MB of 4.28MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.00014114486…

0,1
val_mean_reward,▁
val_std_reward,▁

0,1
val_mean_reward,0.0
val_std_reward,0.0


In [14]:
# Experiment 3: train the second agent against the saved first agent, then validate it
trial_description = 'Validating second agent on new, second, environment, along with logging of the training'
trial_name = '2nd agent training and validation'
project_name = "learning-by-hacking"

# Run configuration; also uploaded to W&B as the run's config
config = {
    "policy_type": "CnnPolicy",
    "total_timesteps": 200_000,
    "env_name": "2nd environment",
    "ROOM_SIZE": 9,
    "BALL_COLOR": "green",
    "2ND_BALL_COLOR": "red",
    "TILE_SIZE": 8,
    "Trial description": trial_description
}

# randomColor=True wraps the environment in RandomBallColorWrapper, which
# assigns a new ball_color on every reset.
train_2nd_agent_and_validate(
    trial_description=trial_description,
    trial_name=trial_name,
    config=config,
    modelName="first_agent",
    randomColor=True,
)

[34m[1mwandb[0m: Currently logged in as: [33mktark[0m (use `wandb login --relogin` to force relogin)


loaded: gamma = 0.99 n_steps = 2048
Using cpu device
Wrapping the env in a VecTransposeImage.
Logging to second_agent_tb/PPO_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 44.4     |
|    ep_rew_mean     | 0.873    |
| time/              |          |
|    fps             | 99       |
|    iterations      | 1        |
|    time_elapsed    | 41       |
|    total_timesteps | 4096     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 52          |
|    ep_rew_mean          | 0.851       |
| time/                   |             |
|    fps                  | 54          |
|    iterations           | 2           |
|    time_elapsed         | 149         |
|    total_timesteps      | 8192        |
| train/                  |             |
|    approx_kl            | 0.015152485 |
|    clip_fraction        | 0.132       |
|    clip_range           | 0



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

VBox(children=(Label(value=' 14.77MB of 14.77MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.…

0,1
global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
rollout/ep_len_mean,▅▆▇▆█▆▆▅▅▆▃▅▅▅▆▄▃▆▅▅▄▂▃▃▃▄▅▃▂▃▃▂▂▂▂▃▃▁▂▁
rollout/ep_rew_mean,▄▃▂▃▁▃▃▄▄▃▇▄▄▅▃▅▅▃▄▃▅▇▆▆▆▅▃▆▇▆▆▇▇▆▇▆▆█▇█
time/fps,█▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/approx_kl,▁▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▄▄▅▅▅▅▆▆▆▅▅▆▇█▇▇▇▇▇▇█▇
train/clip_fraction,▁▄▄▆▆▇▇▇▇▇███▇█▇██▇▇█▇▇▇█▇█▇▆█▆██▇▇▇▇▇▇▇
train/clip_range,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/entropy_loss,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▆▆▇▇▆▇▇██
train/explained_variance,▃▃▃▃▁▄▅▅▅▅▅▅▅▆▅▆▆▆▆▆▇▇▇▇▇█▇▇▆▇▇███▇██▇▇█
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
global_step,200704.0
rollout/ep_len_mean,18.81
rollout/ep_rew_mean,0.94675
time/fps,43.0
train/approx_kl,0.10676
train/clip_fraction,0.42456
train/clip_range,0.2
train/entropy_loss,-0.8311
train/explained_variance,0.69567
train/learning_rate,0.0003
