In [None]:
# run this cell if you're running the notebook in google colab

!pip install gym_minigrid
!pip install stable_baselines3
!pip install --upgrade --quiet cloudpickle pickle5
!pip install wandb


Collecting gym_minigrid
  Downloading gym_minigrid-1.0.2-py3-none-any.whl (47 kB)
[?25l[K     |██████▉                         | 10 kB 17.8 MB/s eta 0:00:01[K     |█████████████▊                  | 20 kB 21.1 MB/s eta 0:00:01[K     |████████████████████▋           | 30 kB 23.8 MB/s eta 0:00:01[K     |███████████████████████████▍    | 40 kB 23.9 MB/s eta 0:00:01[K     |████████████████████████████████| 47 kB 3.1 MB/s 
Installing collected packages: gym-minigrid
Successfully installed gym-minigrid-1.0.2
Collecting stable_baselines3
  Downloading stable_baselines3-1.3.0-py3-none-any.whl (174 kB)
[K     |████████████████████████████████| 174 kB 11.0 MB/s 
Installing collected packages: stable-baselines3
Successfully installed stable-baselines3-1.3.0
[K     |████████████████████████████████| 256 kB 18.2 MB/s 
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency co

In [None]:
from gym_minigrid.wrappers import *
from gym_minigrid.minigrid import *
import gym

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines3.common.utils import set_random_seed
from stable_baselines3.common.evaluation import evaluate_policy

import numpy as np
import torch
import torch.nn as nn

import matplotlib.pyplot as plt

# for logging to WB
import wandb
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import VecVideoRecorder
from wandb.integration.sb3 import WandbCallback

# Passed as `custom_objects` to every PPO.load(...) call in this notebook, so
# the "lr_schedule" and "clip_range" entries of a saved model are taken from
# here instead of from the saved file.
# NOTE(review): these constants (.003 / .02) differ from the values logged
# during training (learning_rate 0.0003, clip_range 0.2). The loaded models are
# only used for prediction/evaluation in this notebook, so this looks harmless,
# but confirm before continuing training from a loaded model.
custom_objects = {
    "lr_schedule": lambda x: .003,
    "clip_range": lambda x: .02
}

%matplotlib notebook

# Create custom environment

An environment in which the ball and the agent are placed at random positions in a square grid. The ball color and the grid size can be changed.

In [None]:
class EmptyBallRoom(MiniGridEnv):
    """Square, walled room containing a single ball that the agent must pick up.

    The agent and the ball are placed with ``place_agent()`` / ``place_obj()``
    whenever the grid is generated. Picking up the ball ends the episode with
    the reward returned by ``self._reward()``.

    :param size: (int) side length of the grid, forwarded as ``grid_size``
    :param ball_color: (str) color name handed to the ``Ball`` object
    """

    def __init__(self, size=5, ball_color="green"):
        # Kept before super().__init__() as in the original ordering:
        # _gen_grid() reads this attribute (presumably already while the base
        # class is being constructed - verify before moving it).
        self.ball_color = ball_color
        # max_steps is not passed, so the MiniGridEnv default applies.
        # see_through_walls=True: "set this to True for maximum speed".
        super().__init__(grid_size=size, see_through_walls=True)

    def _gen_grid(self, width, height):
        """Create the walled room, then place the agent and the ball in it."""
        room = Grid(width, height)
        room.wall_rect(0, 0, width, height)  # surrounding walls
        self.grid = room

        self.place_agent()

        target = Ball(self.ball_color)
        self.ball = target
        self.place_obj(target)

        self.mission = f"Pick up {self.ball_color} {self.ball.type}"

    def step(self, action):
        """Run the base-class step and finish the episode once the ball is held.

        :return: (obs, reward, done, info) as produced by the base class, with
            ``reward``/``done`` overridden after a successful pickup of the ball
        """
        obs, reward, done, info = super().step(action)

        holds_ball = (
            action == self.actions.pickup
            and self.carrying
            and self.carrying == self.ball
        )
        if holds_ball:
            reward = self._reward()
            done = True

        return obs, reward, done, info

In [None]:
# Factory for the environment constructors that the vectorized environment objects are built from
def make_env(ball_color, room_size, rank, seed=0):
    """
    Build a zero-argument constructor for one monitored EmptyBallRoom.

    Calling make_env itself also calls set_random_seed(seed).

    :param ball_color: (str) color of the ball placed in the room
    :param room_size: (int) side length of the square grid
    :param rank: (int) offset added to ``seed`` when seeding the created env
    :param seed: (int) the initial seed for RNG
    :return: (callable) function that creates, seeds and wraps the environment
    """
    set_random_seed(seed)

    def _build():
        room = EmptyBallRoom(size=room_size, ball_color=ball_color)
        # RGB partial view first, then keep only the image part of the observation
        env = ImgObsWrapper(RGBImgPartialObsWrapper(room))
        env.seed(seed + rank)
        return Monitor(env)  # for monitoring env in wb

    return _build

## Helper function `save_gif`: build a GIF and optionally send it to W&B

In [None]:
import imageio as iio
import os

# Create gif out of environment and agent. Possibly upload to W&B
# gif_name: ["hacked", "initial_changed_env"] (have mappings in W&B) or custom
def save_gif(gif_name, max_frames, max_episodes, model, env, log_to_wb = True):
    """
    Roll out `model` in `env`, save one annotated frame per step under `gif/`
    and stitch the frames of this call into `<gif_name>.gif`.

    :param gif_name: (str) base name of the gif, of the frame files and the W&B key
    :param max_frames: (int) upper bound on the number of frames over all episodes
    :param max_episodes: (int) upper bound on the number of episodes
    :param model: object whose predict(obs) returns (action, state)
    :param env: environment offering reset(), step(action) and render("rgb_array")
    :param log_to_wb: (bool) upload the finished gif with wandb.log when True
    :return: None
    """
    frames_dir = 'gif'
    os.makedirs(frames_dir, exist_ok=True)

    gif_path = gif_name + ".gif"
    frames_path = frames_dir + "/" + gif_name + "{j}.jpg"

    n_frames = 0
    for _episode in range(max_episodes):
        if n_frames >= max_frames:
            # Frame budget used up: do not reset the env for nothing.
            break
        obs = env.reset()
        while n_frames < max_frames:
            action, _state = model.predict(obs)
            obs, r, done, _info = env.step(action)

            fig, ax = plt.subplots(1, 1, figsize=(5, 5))
            try:
                ax.imshow(env.render("rgb_array"))
                # Label the frame with the gif name (was hard-coded to "Hacked obs").
                ax.set_title(f"{gif_name}: Action = {action}, Reward = {r}, \n Done = {done}")
                fig.savefig(frames_path.format(j=n_frames))
            finally:
                # One figure per frame: close it, otherwise every figure stays
                # alive (and is displayed by interactive backends).
                plt.close(fig)

            n_frames += 1
            if done:
                break

    if n_frames == 0:
        # Nothing was rendered (max_frames or max_episodes is 0): no gif to build or upload.
        return

    # Only the frames written by this call are used, so leftover frame files of
    # an earlier, longer run with the same gif_name cannot leak into the gif.
    with iio.get_writer(gif_path, mode='I') as writer:
        for j in range(n_frames):
            writer.append_data(iio.imread(frames_path.format(j=j)))

    if log_to_wb == True:
        wandb.log({gif_name: wandb.Video(gif_path)})

# Reinforcement learning

### Training the first agent (can be skipped if a saved `rgb_ball_agent` model is already available)

In [None]:
# Identification of this experiment in W&B
project_name = "learning-by-hacking"
trial_name = 'Initial model'
trial_description = 'Initial model training'

# Training and environment settings; the same dict is passed to wandb.init as the run config
config = dict(
    policy_type="CnnPolicy",
    total_timesteps=50_000,
    env_name="InitialModel",
    ROOM_SIZE=9,
    BALL_COLOR="green",
)
# This key contains a space, so it cannot be given as a keyword argument above
config["Trial description"] = trial_description

In [None]:
# Start the W&B run for the first training. `run.id` is used by the following
# cells to name the video, tensorboard and model directories.
# NOTE(review): the W&B entity is hard-coded; other users presumably need to change it.
run = wandb.init(
    project=project_name,
    entity="learning-by-hacking",
    name = trial_name,
    config=config,
    sync_tensorboard=True,  # auto-upload sb3's tensorboard metrics
    monitor_gym=True,  # auto-upload the videos of agents playing the game
    save_code=True,  # optional
)

VBox(children=(Label(value=' 0.61MB of 0.61MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
global_step,▁▂▃▄▅▆▇█
rollout/ep_len_mean,██▇▆▅▄▂▁
rollout/ep_rew_mean,▁▁▂▃▄▅▇█
time/fps,█▃▂▂▁▁▁▁
train/approx_kl,▁▆▅▆▆█▇
train/clip_fraction,▁▄▄▆▇█▇
train/clip_range,▁▁▁▁▁▁▁
train/entropy_loss,▁▂▂▃▅▆█
train/explained_variance,▁▄▄▆▆▇█
train/learning_rate,▁▁▁▁▁▁▁

0,1
global_step,32768.0
rollout/ep_len_mean,31.8
rollout/ep_rew_mean,0.7068
time/fps,115.0
train/approx_kl,0.01998
train/clip_fraction,0.24133
train/clip_range,0.2
train/entropy_loss,-1.75473
train/explained_variance,0.40921
train/learning_rate,0.0003


In [None]:
# Number of "cores" to use: one environment is created per unit and handed to SubprocVecEnv
N_CPU = 2 # 2 in colab

In [None]:
# Vectorized environment to make learning faster: one constructor per worker, seeded with rank 0, 2, 4, ...
env_fns = [
    make_env(ball_color=config["BALL_COLOR"], room_size=config["ROOM_SIZE"], rank=worker_idx * 2)
    for worker_idx in range(N_CPU)
]
env = SubprocVecEnv(env_fns)

In [None]:
# Activation function to be used in CNN layers
policy_kwargs = {"activation_fn": nn.ReLU}

# Record a 200-step video whenever the recorder's step counter is a multiple of 2000.
# Note: re-running this cell wraps the already wrapped env a second time.
env = VecVideoRecorder(
    env,
    f"videos/{run.id}",
    record_video_trigger=lambda step: step % 2000 == 0,
    video_length=200,
)

In [None]:
# Create the PPO agent on the (video-recording) vectorized env; tensorboard logs go to runs/<run id>
# https://stable-baselines3.readthedocs.io/en/master/modules/ppo.html?highlight=PPO
model = PPO(config["policy_type"], env, policy_kwargs=policy_kwargs, verbose=1, tensorboard_log=f"runs/{run.id}")

Using cpu device
Wrapping the env in a VecTransposeImage.


In [None]:
# Train for the number of timesteps set in `config`; WandbCallback is given models/<run id> as its model save path
model.learn(total_timesteps=config["total_timesteps"], callback=WandbCallback( #callback for W&B
        model_save_path=f"models/{run.id}",
        verbose=2,
    ),)



Logging to runs/9vgv9a88/PPO_1
Saving video to /content/videos/9vgv9a88/rl-video-step-0-to-step-200.mp4
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 89.1     |
|    ep_rew_mean     | 0.116    |
| time/              |          |
|    fps             | 251      |
|    iterations      | 1        |
|    time_elapsed    | 16       |
|    total_timesteps | 4096     |
---------------------------------
Saving video to /content/videos/9vgv9a88/rl-video-step-2000-to-step-2200.mp4
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 87.3        |
|    ep_rew_mean          | 0.138       |
| time/                   |             |
|    fps                  | 153         |
|    iterations           | 2           |
|    time_elapsed         | 53          |
|    total_timesteps      | 8192        |
| train/                  |             |
|    approx_kl            | 0.009091127 |
|    clip_fractio

<stable_baselines3.ppo.ppo.PPO at 0x7fae2dad2790>

In [None]:
# Save the trained agent; the cells below reload it under this name
model.save("rgb_ball_agent")

In [None]:
# Reload the saved agent; `custom_objects` supplies the lr_schedule / clip_range entries
model = PPO.load("rgb_ball_agent", custom_objects=custom_objects)



In [None]:
# Evaluate the reloaded agent in a single environment (seeded with rank 5)

test_env = SubprocVecEnv([make_env(ball_color = config["BALL_COLOR"], room_size=config["ROOM_SIZE"], rank=5)])
n_val_episodes = 10
try:
    mean_reward, std_reward = evaluate_policy(model, test_env, n_eval_episodes=n_val_episodes)
    wandb.log({'val_mean_reward':mean_reward, 'val_std_reward':std_reward}) #log mean reward
    save_gif("initial", 100, 10, model, test_env, log_to_wb = True) #log gif to W&B
finally:
    # `test_env` is rebound later in the notebook; without this the worker
    # process started by SubprocVecEnv would be left running.
    test_env.close()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# End the W&B run of the first training; the next section starts a new run
wandb.finish() #close W&B run

VBox(children=(Label(value=' 9.71MB of 9.71MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
global_step,▁▂▂▃▃▄▅▅▆▆▇▇█
rollout/ep_len_mean,██▇▇▆▅▄▃▂▂▁▁▁
rollout/ep_rew_mean,▁▁▂▂▃▄▅▆▇▇███
time/fps,█▃▂▂▂▁▁▁▁▁▁▁▁
train/approx_kl,▁▄▄▄▄▅▅▅█▇▇█
train/clip_fraction,▁▃▄▅▆▇▆▆██▇█
train/clip_range,▁▁▁▁▁▁▁▁▁▁▁▁
train/entropy_loss,▁▁▁▂▂▃▄▅▅▇██
train/explained_variance,▁▃▄▅▆▆▇█▇▇█▇
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁

0,1
global_step,53248.0
rollout/ep_len_mean,14.48
rollout/ep_rew_mean,0.86968
time/fps,112.0
train/approx_kl,0.02946
train/clip_fraction,0.30972
train/clip_range,0.2
train/entropy_loss,-1.44484
train/explained_variance,0.38842
train/learning_rate,0.0003


### New environment. New W&B run.

In [None]:
# Load the saved first agent (trained with the green ball) to evaluate it in the new environment.
# Use this one when running locally via jupyter notebook or sth
#model = PPO.load("rgb_ball_agent", device="cpu")

# Use this one for google colab
model = PPO.load("rgb_ball_agent", custom_objects=custom_objects)


In [None]:
#Define trial parameters

# Defined here as well so that this section also runs on a fresh kernel when
# the cells that train the first agent were skipped.
project_name = "learning-by-hacking"

trial_description = 'Toggle ball to other colors. 2 steps: change env and toggle color of the ball, not change the environment'
trial_name = 'Blue to other colors' #use to list trials in W&B 


# Settings of the second experiment; the dict is also sent to W&B as the run config
config = {
    "policy_type": "CnnPolicy",
    "total_timesteps": 200_000,
    "env_name": "9x9Grid",
    "ROOM_SIZE": 9,
    "BALL_COLOR": "blue",
    "Trial description": trial_description
}

# New W&B run for the changed environment; `run.id` names the video, tensorboard and model directories below
run = wandb.init(
    project=project_name,
    entity="learning-by-hacking",
    name = trial_name,
    config=config,
    sync_tensorboard=True,  # auto-upload sb3's tensorboard metrics
    monitor_gym=True,  # auto-upload the videos of agents playing the game
    save_code=True,  # optional
)

[34m[1mwandb[0m: Currently logged in as: [33mktark[0m (use `wandb login --relogin` to force relogin)


# Displaying the agent in the environment

In [None]:
# Note that the original model can't find the blue ball
# make_env returns a constructor; calling it right away yields one plain (non-vectorized) monitored env
test_env = make_env(ball_color = config["BALL_COLOR"], room_size=config["ROOM_SIZE"], rank=5)()

In [None]:
# Baseline: evaluate the first agent (loaded above) in the environment with the new ball color
n_val_episodes = 10
mean_reward, std_reward = evaluate_policy(model, test_env, n_eval_episodes=n_val_episodes)
wandb.log({'initial_model_mean_reward':mean_reward, 'initial_model_std_reward':std_reward}) #Log to W&B
save_gif("initial_changed_env", 100, 10, model, test_env, log_to_wb = True) #log gif to W&B




<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# 2nd agent

This requires changing our environment, because stable_baselines3 gets the action indices from the environment. See:
https://github.com/maximecb/gym-minigrid/blob/1d4edcfd585f6b627ebf1cfd54ea5e70260fa1ee/gym_minigrid/minigrid.py#L629

https://github.com/maximecb/gym-minigrid/blob/1d4edcfd585f6b627ebf1cfd54ea5e70260fa1ee/gym_minigrid/minigrid.py#L1099
I think redefining the Actions class and the step function should be sufficient.
One possible way is to inherit from the EmptyBallRoom class, because it already has the correct reward function.

In [None]:
class HackedBallRoom(MiniGridEnv):
    """Ball room in which ``actions.toggle`` changes the color of the ball.

    Variation of the ball environment: every other action is handled by the
    base class, and picking up the ball ends the episode with the reward
    returned by ``self._reward()``. ``reset()`` restores the initial color.

    :param size: (int) side length of the grid, forwarded as ``grid_size``
    :param ball_color: (str) color of the ball at the start of every episode
    :param og_model: stored on the instance; not used by this class itself
    """
    metadata = {'render.modes': ['rgb_array','rgb_array_orig']}

    # Color that follows each color when the ball is toggled. A color that is
    # not listed here is left unchanged.
    _NEXT_BALL_COLOR = {
        "red": "blue",
        "blue": "purple",
        "purple": "green",
        "green": "yellow",
        "yellow": "red",
    }

    def __init__(self, size=5, ball_color="green", og_model=None):
        self.initial_ball_color = ball_color
        self.ball_color = ball_color
        self.og_model = og_model
        self.obs_original = None

        super().__init__(
            grid_size=size,
            max_steps=4*size*size,
            # Set this to True for maximum speed
            see_through_walls=True
        )

    def _gen_grid(self, width, height):
        """Create the walled room, then place the agent and the ball in it."""
        # Create an empty grid
        self.grid = Grid(width, height)

        # Generate the surrounding walls
        self.grid.wall_rect(0, 0, width, height)

        # Place the agent
        self.place_agent()

        # Place object
        self.ball = Ball(self.ball_color)
        self.place_obj(self.ball)

        self.mission = f"Pick up {self.ball_color} {self.ball.type}"

    def reset(self):
        """Restore the initial ball color, then reset through the base class."""
        self.ball_color = self.initial_ball_color
        return super().reset()

    def render(self, mode='rgb_array', close=False):
        """Return the base-class "rgb_array" rendering as a numpy array.

        Both modes advertised in ``metadata`` are served; previously
        'rgb_array_orig' fell through and returned None. Any other mode still
        returns None. ``close`` is accepted but not used.
        """
        if mode in self.metadata['render.modes']:
            return np.array(super().render("rgb_array")) #original environment
        return None

    def switch_ball_color(self):
        """Advance ``self.ball_color`` to the next color of the cycle."""
        self.ball_color = self._NEXT_BALL_COLOR.get(self.ball_color, self.ball_color)

    def step(self, action):
        """Perform one step; ``toggle`` recolors the ball instead of toggling a cell.

        :return: (obs, reward, done, info); for ``toggle`` the reward is 0 and
            info is an empty dict
        """
        if action != self.actions.toggle:
            obs, reward, done, info = super().step(action)

            if action == self.actions.pickup:
                if self.carrying and self.carrying == self.ball:
                    reward = self._reward()
                    done = True

            return obs, reward, done, info

        # toggle is handled entirely here, so the step counter and the
        # max_steps limit have to be maintained by hand.
        self.step_count += 1

        reward = 0
        done = False

        if self.step_count >= self.max_steps:
            done = True

        # get ball current pos and switch it with a new ball
        x, y = self.ball.cur_pos

        self.switch_ball_color()
        self.ball = Ball(self.ball_color)

        self.put_obj(self.ball, x, y)
        self.mission = f"Pick up {self.ball_color} {self.ball.type}"

        obs = self.gen_obs()

        return obs, reward, done, {}

        

In [None]:
def make_hacked_env(ball_color, room_size, og_model, rank, seed=0):
    """
    Build a zero-argument constructor for one monitored HackedBallRoom.

    Calling make_hacked_env itself also calls set_random_seed(seed).

    :param ball_color: (str) initial color of the ball
    :param room_size: (int) side length of the square grid
    :param og_model: object forwarded to HackedBallRoom as ``og_model``
    :param rank: (int) offset added to ``seed`` when seeding the created env
    :param seed: (int) the initial seed for RNG
    :return: (callable) function that creates, seeds and wraps the environment
    """
    set_random_seed(seed)

    def _build():
        room = HackedBallRoom(size=room_size, ball_color=ball_color, og_model=og_model)
        # RGB partial view first, then keep only the image part of the observation
        env = ImgObsWrapper(RGBImgPartialObsWrapper(room))
        env.seed(seed + rank)
        return Monitor(env)  # for monitoring env in W&B

    return _build

In [None]:
# Load the first agent again as `og_model`; it is handed to every HackedBallRoom created below.
# Use this one when running locally via jupyter notebook or sth
#og_model = PPO.load("rgb_ball_agent", device="cpu")

# Use this one for google colab
og_model = PPO.load("rgb_ball_agent", custom_objects=custom_objects)

In [None]:
N_CPU = 2
#Create vectorized environment to make learning faster
env = SubprocVecEnv([make_hacked_env(ball_color = config["BALL_COLOR"], room_size=config["ROOM_SIZE"], og_model=og_model, rank=i*2) for i in range(N_CPU)])

# Activation function to be used in CNN layers
policy_kwargs = dict(activation_fn=nn.ReLU)

# Record a 200-step video whenever the recorder's step counter is a multiple of 2000
env = VecVideoRecorder(env, f"videos/{run.id}", record_video_trigger=lambda x: x % 2000 == 0, video_length=200)

try:
    # https://stable-baselines3.readthedocs.io/en/master/modules/ppo.html?highlight=PPO
    model = PPO(config["policy_type"], env, policy_kwargs=policy_kwargs, verbose=1, tensorboard_log=f"runs/{run.id}")

    model.learn(total_timesteps=config["total_timesteps"], callback=WandbCallback(
            model_save_path=f"models/{run.id}",
            verbose=2,
        ),) # Note that I used very few timesteps due to time constraints

    model.save("rgb_hacked_ball_agent")
finally:
    # `env` is rebound for evaluation further down; close the training env here
    # (also after an interrupted run) so its worker processes and the video
    # recorder do not stay open. The agent is reloaded from disk afterwards.
    env.close()

In [None]:
# Reload the agent that was trained in the HackedBallRoom environment.
# Use this one when running locally via jupyter notebook or sth
#model = PPO.load("rgb_hacked_ball_agent", device="cpu")

# Use this one for google colab
model = PPO.load("rgb_hacked_ball_agent", custom_objects=custom_objects)

In [None]:
# Evaluation setup: 100 episodes in a single HackedBallRoom worker (seeded with rank 1)
n_val_episodes = 100
env = SubprocVecEnv([make_hacked_env(ball_color = config["BALL_COLOR"], room_size=config["ROOM_SIZE"], og_model=og_model, rank=1)])


In [None]:
# Evaluate the agent trained in the hacked environment, log the result and a gif, then end the run
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=n_val_episodes)
wandb.log({'hacked_model_mean_reward':mean_reward, 'hacked_model_std_reward':std_reward})
save_gif("hacked", 100, 10, model, env, log_to_wb = True)
# Last use of the evaluation env: stop its worker process before the run is finished
env.close()
wandb.finish()