# Setup 

In [None]:
!pip install gym_minigrid
!pip install stable_baselines3
!pip install --upgrade --quiet cloudpickle pickle5
!pip install wandb

Collecting gym_minigrid
  Downloading gym_minigrid-1.0.2-py3-none-any.whl (47 kB)
[?25l[K     |██████▉                         | 10 kB 19.3 MB/s eta 0:00:01[K     |█████████████▊                  | 20 kB 8.9 MB/s eta 0:00:01[K     |████████████████████▋           | 30 kB 7.8 MB/s eta 0:00:01[K     |███████████████████████████▍    | 40 kB 7.4 MB/s eta 0:00:01[K     |████████████████████████████████| 47 kB 1.8 MB/s 
Installing collected packages: gym-minigrid
Successfully installed gym-minigrid-1.0.2
Collecting stable_baselines3
  Downloading stable_baselines3-1.3.0-py3-none-any.whl (174 kB)
[K     |████████████████████████████████| 174 kB 5.1 MB/s 
Installing collected packages: stable-baselines3
Successfully installed stable-baselines3-1.3.0
[K     |████████████████████████████████| 256 kB 7.3 MB/s 
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflic

In [None]:
from gym_minigrid.wrappers import *
from gym_minigrid.minigrid import *
import gym

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.utils import set_random_seed
from stable_baselines3.common.evaluation import evaluate_policy

import torch.nn as nn
import numpy as np
import shutil #to remove folder

import itertools
import random

import matplotlib.pyplot as plt

import wandb
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import VecVideoRecorder
from wandb.integration.sb3 import WandbCallback

%matplotlib notebook

# Replacement objects handed to `PPO.load(..., custom_objects=...)`.
# Each entry is a callable that ignores its single positional argument and
# always returns the same constant.
custom_objects = dict(
    lr_schedule=lambda _progress: 0.003,
    clip_range=lambda _progress: 0.02,
)

import imageio as iio
import os

In [None]:
def save_gif2(gif_name, max_frames, max_episodes, model, env,  agent1_actions, agent2_actions, log_to_wb = True):
    """Roll out `model` in `env` and save a two-panel GIF of the episodes.

    The left panel shows `env.render(pov='agent')`, the right panel shows
    `env.render(pov='original')`. Every environment step is saved as a JPEG
    frame under the `gif/` directory and the frames are then stitched into
    `<gif_name>.gif`.

    Args:
        gif_name: Base name used for the frame files, the GIF file and the
            W&B log key.
        max_frames: Upper bound on the total number of frames recorded across
            all episodes.
        max_episodes: Maximum number of episodes to roll out.
        model: Object exposing `predict(obs)` that returns `(action, state)`.
        env: Environment exposing `reset()`, a 4-tuple `step(action)`,
            `render(pov=..., highlight=...)` and an `environment_action`
            attribute that is read after each step.
        agent1_actions: Mapping from `env.environment_action` (as int) to the
            label shown above the left panel.
        agent2_actions: Mapping from the model's action (as int) to the label
            shown above the right panel.
        log_to_wb: When truthy, upload the GIF to the active W&B run.
    """
    path = 'gif'

    # Start from an empty frame directory. `ignore_errors` keeps the first run
    # (directory does not exist yet) silent instead of printing a spurious error.
    shutil.rmtree(path, ignore_errors=True)
    os.makedirs(path, exist_ok=True)

    gif_path = gif_name + ".gif"
    frames_path = path+"/"+gif_name+"{j}.jpg"

    frames_written = 0
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
    try:
        img1 = img2 = None
        for _episode in range(max_episodes):
            # Once the frame budget is spent there is nothing left to record,
            # so do not keep resetting the environment.
            if frames_written >= max_frames:
                break

            obs = env.reset()
            agent_view = env.render(pov='agent', highlight=False)
            original_view = env.render(pov='original', highlight=False)
            if img1 is None:
                # First episode: create the image artists that are reused afterwards.
                img1 = ax1.imshow(agent_view)
                img2 = ax2.imshow(original_view)
            else:
                img1.set_data(agent_view)
                img2.set_data(original_view)

            while frames_written < max_frames:
                action, _state = model.predict(obs)
                obs, r, done, _info = env.step(action)
                img1.set_data(env.render(pov='agent', highlight=False))
                img2.set_data(env.render(pov='original', highlight=False))
                fig.suptitle(f'Reward: {r:.5f}', fontsize=16, y=1)
                # int() makes the lookup work whether the action is a Python
                # int, a NumPy integer scalar or a 0-d array (arrays are unhashable).
                ax1.set_title(f"Agent 1: {agent1_actions[int(env.environment_action)]}")
                ax2.set_title(f"Agent 2: {agent2_actions[int(action)]} ")
                fig.canvas.draw()
                fig.savefig(frames_path.format(j=frames_written))
                frames_written += 1
                if done:
                    break
    finally:
        # Release the figure even if the rollout fails, so repeated calls do
        # not accumulate open figures.
        plt.close(fig)

    # Only the frames that were actually written are read back, so a missing
    # file is a real error and is no longer silently swallowed.
    with iio.get_writer(gif_path, mode='I') as writer:
        for frame_idx in range(frames_written):
            writer.append_data(iio.imread(frames_path.format(j=frame_idx)))

    if log_to_wb:
        wandb.log({gif_name: wandb.Video(gif_path)})

In [None]:
def save_gif(gif_name, max_frames, max_episodes, model, env, log_to_wb = True):
    """Roll out `model` in `env` and save a single-panel GIF of the episodes.

    Every environment step is rendered, titled with `env.legend`, the chosen
    action and the reward, saved as a JPEG frame under the `gif/` directory,
    and finally stitched into `<gif_name>.gif`.

    Args:
        gif_name: Base name used for the frame files, the GIF file and the
            W&B log key.
        max_frames: Upper bound on the total number of frames recorded across
            all episodes.
        max_episodes: Maximum number of episodes to roll out.
        model: Object exposing `predict(obs)` that returns `(action, state)`.
        env: Environment exposing `reset()`, a 4-tuple `step(action)`,
            `render("rgb_array", highlight=...)` and a `legend` attribute.
        log_to_wb: When truthy, upload the GIF to the active W&B run.
    """
    path = 'gif'

    agent1_actions = {
    0: 'Turn left', 1: 'Turn right', 2: 'Move forward',
    3: 'Pickup', 4: 'Drop', 5:'Toggle', 6: 'Done'
    }

    # Start from an empty frame directory. `ignore_errors` keeps the first run
    # (directory does not exist yet) silent instead of printing a spurious error.
    shutil.rmtree(path, ignore_errors=True)
    os.makedirs(path, exist_ok=True)

    gif_path = gif_name + ".gif"
    frames_path = path+"/"+gif_name+"{j}.jpg"

    frames_written = 0
    # One figure is reused for every frame; previously a new figure was opened
    # per step and never closed, leaking up to `max_frames` figures per call.
    fig, ax1 = plt.subplots(1, 1, figsize=(5, 5))
    try:
        for _episode in range(max_episodes):
            # Once the frame budget is spent there is nothing left to record.
            if frames_written >= max_frames:
                break

            obs = env.reset()
            while frames_written < max_frames:
                action, _state = model.predict(obs)
                obs, r, done, info = env.step(action)
                ax1.clear()
                ax1.imshow(env.render("rgb_array", highlight=False))
                # int() makes the lookup work whether the action is a Python
                # int, a NumPy integer scalar or a 0-d array (arrays are unhashable).
                ax1.set_title(f"Env = {env.legend} \n Action = {agent1_actions[int(action)]}, Reward = {r}")
                fig.savefig(frames_path.format(j=frames_written))
                frames_written += 1
                if done:
                    break
    finally:
        # Release the figure even if the rollout fails.
        plt.close(fig)

    # Only the frames that were actually written are read back, so a missing
    # file is a real error and is no longer silently swallowed.
    with iio.get_writer(gif_path, mode='I') as writer:
        for frame_idx in range(frames_written):
            writer.append_data(iio.imread(frames_path.format(j=frame_idx)))

    if log_to_wb:
        wandb.log({gif_name: wandb.Video(gif_path)})

In [None]:
#Vanilla environment for first agent
class EmptyBallRoom(MiniGridEnv):
    """Walled, otherwise empty square room containing the agent and one ball.

    Observations returned by `reset` and `step` are full-grid renders
    (no highlight) instead of the base class' observation. The declared
    observation space is a uint8 box of shape
    `(size * tile_size, size * tile_size, 3)`. An episode ends with
    `self._reward()` as reward once the agent picks up the ball.
    """

    def __init__(self, size=9, ball_color="green", tile_size=8):
        # Set before the base constructor runs.
        # NOTE(review): presumably the base constructor triggers reset() /
        # _gen_grid(), which read these attributes - confirm.
        self.ball_color = ball_color
        self.tile_size = tile_size

        # see_through_walls=True: "Set this to True for maximum speed".
        super().__init__(grid_size=size, max_steps=4*size*size, see_through_walls=True)

        side_px = size * tile_size
        self.observation_space = spaces.Box(
            low=0, high=255, shape=(side_px, side_px, 3), dtype='uint8'
        )

    def reset(self):
        """Reset the base environment and return a render of the new grid."""
        super().reset()
        return self.render("rgb_image", highlight=False, tile_size=self.tile_size)

    def _gen_grid(self, width, height):
        """Build the room: surrounding walls, the agent and the target ball."""
        self.grid = Grid(width, height)
        self.grid.wall_rect(0, 0, width, height)

        # Neither call is given a position.
        self.place_agent()
        self.ball = Ball(self.ball_color)
        self.place_obj(self.ball)

        self.mission = f"Pick up {self.ball_color} {self.ball.type}"

    def step(self, action):
        """Apply `action`; return `(render, reward, done, info)`."""
        _base_obs, reward, done, info = super().step(action)

        # Success is a pickup action that leaves the agent carrying the ball.
        picked_up_ball = (
            action == self.actions.pickup
            and self.carrying
            and self.carrying == self.ball
        )
        if picked_up_ball:
            reward = self._reward()
            done = True

        frame = self.render("rgb_image", highlight=False, tile_size=self.tile_size)
        return frame, reward, done, info

In [None]:
#Rotates agent 180 degrees on reset
import random

# Rotates agent, not env.
class ObsRotatorSwitchWrapper(MiniGridEnv):
    """Single-ball room in which actions are executed with a turned heading.

    Despite the name, this class subclasses `MiniGridEnv` directly rather than
    wrapping another environment. On every `step` the agent's `agent_dir` is
    advanced by `rotations_n` (modulo 4) before the base step runs and
    advanced by `rotations_n` again afterwards; with `rotations_n == 2` the two
    shifts add up to a full turn. Observations are full-grid renders
    (no highlight); the declared observation space is a uint8 box of shape
    `(size * tile_size, size * tile_size, 3)`.

    NOTE(review): `reset` renders its observation while the heading is shifted,
    whereas `step` renders after the shift has been undone, so the first
    observation of an episode and the following ones use different heading
    conventions - confirm this is intended.
    """

    def __init__(self, size=9, ball_color="green", tile_size=8):
        """Store the configuration, build the base env and declare the observation space."""
        # Attributes are assigned before the base constructor runs.
        # NOTE(review): presumably because the base constructor triggers
        # reset() / _gen_grid(), which read them - confirm.
        self.ball_color = ball_color
        self.tile_size = tile_size
        self.rotations_n = 2  # heading shift; `legend` reports it as rotations_n * 90 degrees
        self.initial_agent_dir = None  # scratch: heading recorded inside step()
        self.original_agent_dir = None  # scratch: heading recorded inside reset()


        super().__init__(
            grid_size=size,
            max_steps=4*size*size,
            # Set this to True for maximum speed
            see_through_walls=True
        )
        self.observation_space = spaces.Box(
            low=0,
            high=255,
            shape=(size * tile_size, size * tile_size, 3),
            dtype='uint8'
        )
        # Human-readable description of the current rotation (overwritten in reset()).
        self.legend = 'Not rotated'

    def reset(self):
        """Reset the base env and return a render taken with the shifted heading."""
        super().reset()
        self.rotations_n = 2 # shift the heading by two steps (legend: 180 degrees)
        self.original_agent_dir = self.agent_dir
        # Shift the heading only for the duration of the render ...
        self.agent_dir = (self.original_agent_dir + self.rotations_n)%4 # turn 180 degrees
        rgb_observation = self.render("rgb_image", highlight=False, tile_size=self.tile_size)
        # ... then restore the heading produced by the base reset.
        self.agent_dir = self.original_agent_dir

        self.legend = 'rotated '+str(self.rotations_n*90)+ ' degrees'
        return rgb_observation

    def _gen_grid(self, width, height):
        """Build the room: surrounding walls, the agent and the target ball."""
        # Create an empty grid
        self.grid = Grid(width, height)
        # Generate the surrounding walls
        self.grid.wall_rect(0, 0, width, height)

        # Place the agent
        self.place_agent()

        # Place object
        self.ball = Ball(self.ball_color)
        self.place_obj(self.ball)
        self.mission = f"Pick up {self.ball_color} {self.ball.type}"

    def step(self, action):
        """Run `action` with the heading shifted; return `(render, reward, done, info)`."""

        # Shift the heading so the base step executes the action from the turned direction.
        self.initial_agent_dir = self.agent_dir
        self.agent_dir = (self.initial_agent_dir + self.rotations_n)%4 # turn 180 degrees

        obs, reward, done, info = super().step(action)

        # Shift again by the same amount; with rotations_n == 2 this undoes the
        # shift above while keeping any heading change made by the action.
        # The value stored in initial_agent_dir here is not read again in this class.
        self.initial_agent_dir = self.agent_dir
        self.agent_dir = (self.agent_dir + self.rotations_n)%4 # turn 180 degrees to undo the shift


        # Rendered after the shift was undone (compare with reset()).
        rgb_observation = self.render("rgb_image", highlight=False, tile_size=self.tile_size)

        # Success: a pickup action that leaves the agent carrying the ball.
        if action == self.actions.pickup:
            if self.carrying and self.carrying == self.ball:
                reward = self._reward()
                done = True

        return rgb_observation, reward, done, info

In [None]:
#Agent rotate 180 degrees 
def validate_baseline_rotated(trial_description, trial_name, config, modelName = "first_agent", randomColor = False):
  """Evaluate a saved first agent on `ObsRotatorSwitchWrapper` and log to W&B.

  Opens a W&B run, loads the PPO model stored under `modelName`, evaluates it
  for 1000 episodes, logs the mean / std reward and uploads a GIF of a few
  rollouts. The W&B run is always closed, even if a step in between fails.

  Args:
    trial_description: Not used by this function; kept for interface
      compatibility (the description is read from `config` by W&B).
    trial_name: Name of the W&B run.
    config: Experiment configuration; must contain "ROOM_SIZE",
      "2ND_BALL_COLOR" and "TILE_SIZE". Logged as the W&B run config.
    modelName: Path / name of the saved PPO model to load.
    randomColor: Not used by this function; kept for interface compatibility.
  """
  ### Initiate W&B connection
  project_name = "learning-by-hacking"
  wandb.init(
    project=project_name,
    entity="learning-by-hacking",
    name = trial_name,
    config=config,
    sync_tensorboard=True,  # auto-upload sb3's tensorboard metrics
    monitor_gym=True,  # auto-upload the videos of agents playing the game
    save_code=True,  # optional
  )

  # try/finally: a failure while loading, evaluating or exporting the GIF must
  # not leave the W&B run open for the rest of the kernel session.
  try:
    ### Load agent
    first_agent = PPO.load(modelName, custom_objects=custom_objects)
    print("loaded:", "gamma =", first_agent.gamma, "n_steps =", first_agent.n_steps)

    ##############################################################################################################
    # Only this section is expected to get changed per different experiments

    second_env_kwargs = {"size": config["ROOM_SIZE"], "ball_color": config["2ND_BALL_COLOR"], "tile_size": config["TILE_SIZE"]}
    second_env = ObsRotatorSwitchWrapper(**second_env_kwargs)

    ##############################################################################################################

    ### Validate
    n_val_episodes = 1000
    mean_reward, std_reward = evaluate_policy(first_agent, second_env, n_eval_episodes=n_val_episodes)
    wandb.log({'val_mean_reward': mean_reward, 'val_std_reward': std_reward}) #log mean reward
    save_gif("First agent on second environment", 100, 10, first_agent, second_env, log_to_wb = True) #log gif to W&B
  finally:
    wandb.finish()

In [None]:
# Experiment: validate the first agent on the environment that rotates the
# agent by 180 degrees.
trial_description = "Validating first agent on new, second, environment"
trial_name = "1st agent validation - rotate agent 180 degrees"
project_name = "learning-by-hacking"

# Run configuration; also uploaded to W&B as the run's config.
config = {
    "policy_type": "CnnPolicy",
    "total_timesteps": 200_000,  # irrelevant here: nothing is trained in this cell
    "env_name": "Baseline 2",
    "ROOM_SIZE": 9,
    "BALL_COLOR": "green",
    "2ND_BALL_COLOR": "green",
    "TILE_SIZE": 8,
    "Trial description": trial_description,
}

validate_baseline_rotated(
    trial_description,
    trial_name,
    config,
    modelName="first_agent",
    randomColor=False,
)

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


loaded: gamma = 0.99 n_steps = 2048




<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

VBox(children=(Label(value=' 4.28MB of 4.28MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
val_mean_reward,▁
val_std_reward,▁

0,1
val_mean_reward,0.02194
val_std_reward,0.14626


In [None]:
# Hacked environment wrapper
class Hacked180DegEnvironment(gym.core.Wrapper):
    """Wrapper in which the acting agent chooses how the first agent sees the env.

    The wrapper's action is a number of quarter turns (0-3). On each step the
    wrapped env is rendered with its `agent_dir` temporarily advanced by that
    amount, the render is given to `first_agent.predict`, and the action
    predicted by `first_agent` is what is actually executed in the wrapped env.

    Attributes:
        first_agent: Model exposing `predict(obs)` that returns `(action, state)`.
        transformation_action: Last rotation passed to `step` (None after `reset`).
        environment_action: Last action `first_agent` executed (None after `reset`).
        original_obs: Most recent observation returned to the caller.
        transformed_obs: Most recent observation shown to `first_agent`.
        original_agent_dir: `agent_dir` of the wrapped env at the start of the
            last `step`.
    """

    def __init__(self, env, first_agent):
        self.first_agent = first_agent
        super().__init__(env)
        self.action_space = spaces.Discrete(4) # actions: turn 0, 90, 180, 270 degrees

        self.transformed_obs = np.array([])
        self.original_obs = np.array([])

        self.transformation_action = None
        self.environment_action = None
        self.original_agent_dir = None

        self.reset()

    def _render_rotated(self, rotation, *args, **kwargs):
        """Render the wrapped env with `agent_dir` advanced by `rotation` (mod 4).

        The heading is restored from a saved copy (not recomputed), and the
        restore runs even if rendering raises.
        """
        true_dir = self.env.agent_dir
        self.env.agent_dir = (true_dir + rotation) % 4
        try:
            return self.env.render(*args, **kwargs)
        finally:
            self.env.agent_dir = true_dir

    def reset(self):
        """Reset the wrapped env and return an un-rotated render of it."""
        self.env.reset()
        self.original_obs = self.transformed_obs = self.env.render(
            "rgb_image",
            highlight=False,
            tile_size=self.env.tile_size
        )
        # Forget the previous episode's actions. The original code had a bare
        # `self.transformation_action` expression here, which did nothing, so
        # render(pov="agent") kept applying the stale rotation after a reset.
        self.transformation_action = None
        self.environment_action = None
        return self.original_obs

    def step(self, first_action):
        """
        Can ask to rotate agent 0, 90, 180, 270 degrees

        `first_action` is the number of quarter turns applied to the view shown
        to `first_agent`; `None` means no rotation. Returns the wrapped env's
        `(observation, reward, done, info)` for the action `first_agent` chose.
        """
        self.transformation_action = first_action
        self.original_agent_dir = self.env.agent_dir

        if first_action is not None:
            self.transformed_obs = self._render_rotated(
                first_action, "rgb_image", highlight=False, tile_size=self.env.tile_size
            )
        else:
            self.transformed_obs = self.env.render(
                "rgb_image", highlight=False, tile_size=self.env.tile_size
            )

        # The first agent acts on the (possibly rotated) view.
        second_action, _ = self.first_agent.predict(self.transformed_obs)
        self.environment_action = second_action
        self.original_obs, reward, done, info = self.env.step(second_action)

        return self.original_obs, reward, done, info

    def render(self, pov="agent", highlight=False, **kwargs):
        """
        pov : agent / original

        "agent" renders with the last rotation passed to `step` applied; any
        other value (or no rotation chosen yet) renders the wrapped env as is.
        `highlight` and extra keyword arguments are forwarded in every case.
        """
        if pov == "agent" and self.transformation_action is not None:
            return self._render_rotated(
                self.transformation_action, highlight=highlight, **kwargs
            )
        return self.env.render(highlight=highlight, **kwargs)

In [None]:
def train_2nd_agent_and_validate_rotated(trial_description, trial_name, config, modelName = "first_agent", randomColor = False, agent1_actions={}, agent2_actions={}, log_wb=False):
  """Train a second PPO agent on the hacked environment, then validate it.

  Loads the saved first agent, wraps an `ObsRotatorSwitchWrapper` in
  `Hacked180DegEnvironment`, trains a new PPO agent on it, saves the agent as
  "second_agent", evaluates it for 1000 episodes and exports a GIF.

  Args:
    trial_description: Not used by this function; kept for interface
      compatibility.
    trial_name: Name of the W&B run (only used when `log_wb` is true).
    config: Experiment configuration; must contain "ROOM_SIZE",
      "2ND_BALL_COLOR", "TILE_SIZE", "policy_type" and "total_timesteps".
    modelName: Path / name of the saved first-agent PPO model.
    randomColor: Not used by this function; kept for interface compatibility.
    agent1_actions: Labels for the first agent's actions, forwarded to
      `save_gif2`. The default dict is shared between calls but never mutated here.
    agent2_actions: Labels for the second agent's actions, forwarded to
      `save_gif2`. The default dict is shared between calls but never mutated here.
    log_wb: When true, open a W&B run, attach the W&B callback to training and
      log validation metrics and the GIF.
  """
  ### Initiate W&B connection
  project_name = "learning-by-hacking"
  run = None
  if log_wb:
    run = wandb.init(
      project=project_name,
      entity="learning-by-hacking",
      name = trial_name,
      config=config,
      sync_tensorboard=True,  # auto-upload sb3's tensorboard metrics
      monitor_gym=True,  # auto-upload the videos of agents playing the game
      save_code=True,  # optional
    )

  # try/finally: a failure during training or validation must not leave the
  # W&B run open for the rest of the kernel session.
  try:
    ### Load agent
    first_agent = PPO.load(modelName, custom_objects=custom_objects)
    print("loaded:", "gamma =", first_agent.gamma, "n_steps =", first_agent.n_steps)

    ##############################################################################################################
    # Only this section is expected to get changed per different experiments

    second_env_kwargs = {"size": config["ROOM_SIZE"], "ball_color": config["2ND_BALL_COLOR"], "tile_size": config["TILE_SIZE"]}
    second_env = ObsRotatorSwitchWrapper(**second_env_kwargs)

    ## vectorize for training
    # Train on the same wrapper class that validation uses further down. This
    # previously referenced `Hacked90DegEnvironment`, a name this notebook
    # never defines, so it only worked with leftover kernel state.
    second_env_vector = make_vec_env(
      Hacked180DegEnvironment,
      n_envs=2,                           # max 2 for colaboratory
      vec_env_cls=SubprocVecEnv,
      env_kwargs={"env": second_env,
                  "first_agent": first_agent},
      monitor_dir="env_monitor")

    ### train second agent
    policy_kwargs = dict(activation_fn=nn.ReLU)
    second_agent = PPO(
        config["policy_type"],
        second_env_vector,
        gamma=0.85,
        policy_kwargs=policy_kwargs,
        verbose=1,
        tensorboard_log="second_agent_tb",
        ent_coef = 0
    )
    if log_wb:
      second_agent.learn(
          total_timesteps=config["total_timesteps"],
          callback=WandbCallback(  # callback for W&B
              model_save_path=f"models/{run.id}",
              verbose=2,
          ),
      )
    else:
      second_agent.learn(total_timesteps=config["total_timesteps"])
    second_agent.save("second_agent")

    ##############################################################################################################

    ### Validate
    env = Hacked180DegEnvironment(second_env, first_agent)
    n_val_episodes = 1000
    mean_reward, std_reward = evaluate_policy(second_agent, env, n_eval_episodes=n_val_episodes)
    if log_wb:
      wandb.log({'val_mean_reward': mean_reward, 'val_std_reward': std_reward}) #log mean reward
    save_gif2("Second agent on second environment", 100, 10, second_agent, env, agent1_actions, agent2_actions, log_to_wb = log_wb) #log gif to W&B
  finally:
    # Only close the run this function opened.
    if run is not None:
      wandb.finish()


In [None]:
# Experiment: train the second agent on the hacked environment, then validate it.
trial_description = "Validating second agent on new, second, environment, along with logging of the training"
trial_name = "2nd agent training and validation - randomly rotate agent n*90 degrees (on reset)"
project_name = "learning-by-hacking"

# Run configuration; also uploaded to W&B as the run's config.
config = {
    "policy_type": "CnnPolicy",
    "total_timesteps": 200_000,
    "env_name": "2nd environment",
    "ROOM_SIZE": 9,
    "BALL_COLOR": "green",
    "2ND_BALL_COLOR": "green",
    "TILE_SIZE": 8,
    "Trial description": trial_description,
}

# Labels shown in the GIF for the action the first agent executes.
agent1_actions = {
    0: "Turn left",
    1: "Turn right",
    2: "Move forward",
    3: "Pickup",
    4: "Drop",
    5: "Toggle",
    6: "Done",
}

# Labels shown in the GIF for the rotation the second agent chooses.
agent2_actions = {
    0: "Rotate 0",
    1: "Rotate 90",
    2: "Rotate 180",
    3: "Rotate 270",
}

train_2nd_agent_and_validate_rotated(
    trial_description,
    trial_name,
    config,
    modelName="first_agent",
    randomColor=False,
    agent1_actions=agent1_actions,
    agent2_actions=agent2_actions,
    log_wb=True,
)

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


loaded: gamma = 0.99 n_steps = 2048
Using cuda device
Wrapping the env in a VecTransposeImage.
Logging to second_agent_tb/PPO_1
