<a href="https://colab.research.google.com/github/tmoopenn/unsupervised-object-detection-with-rpn/blob/master/Relief_R_CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import torch
print(torch.cuda.is_available())

# Use the first CUDA device when one is visible to torch, otherwise the CPU.
# `dev` keeps the device string; `device` is the torch.device built from it.
dev = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(dev)

In [0]:
# clone repo
!git clone https://github.com/mila-iqia/atari-representation-learning.git

# PyTorch and scikit learn
!pip install scikit-learn

# Baselines for Atari preprocessing
# Tensorflow is a dependency, but you don't need to install the GPU version
!pip install git+git://github.com/openai/baselines

# pytorch-a2c-ppo-acktr for RL utils
!pip install git+git://github.com/ankeshanand/pytorch-a2c-ppo-acktr-gail

# Clone and install package
!pip install -r atari-representation-learning/requirements.txt
!pip install git+git://github.com/mila-iqia/atari-representation-learning.git

Build a standard autoencoder that takes an Atari frame as input and outputs a reconstruction of that frame. During training, we are more interested in the filters the network learns than in the latent-space representation.

In [0]:
import torch.nn as nn
import torch.nn.functional as F 
import numpy as np
from collections import OrderedDict

## AutoEncoder Definition 

class AutoEncoder(nn.Module):
  """Convolutional autoencoder for single-channel images.

  The encoder is a named stack (conv1, relu1, conv2, relu2, conv3) so that
  individual layers can be looked up by name through `named_children()`.
  The decoder is an unnamed stack of transposed convolutions ending in a
  sigmoid, so reconstructions lie in [0, 1].
  """

  def __init__(self):
    super().__init__()

    # Encoder: two stride-2 3x3 convolutions followed by an unpadded 7x7 one.
    encoder_stages = OrderedDict()
    encoder_stages['conv1'] = nn.Conv2d(1, 16, 3, stride=2, padding=1)
    encoder_stages['relu1'] = nn.ReLU()
    encoder_stages['conv2'] = nn.Conv2d(16, 32, 3, stride=2, padding=1)
    encoder_stages['relu2'] = nn.ReLU()
    encoder_stages['conv3'] = nn.Conv2d(32, 64, 7)
    self.encoder = nn.Sequential(encoder_stages)

    # Decoder: the encoder's layers mirrored with transposed convolutions.
    decoder_stages = [
        nn.ConvTranspose2d(64, 32, 7),
        nn.ReLU(),
        nn.ConvTranspose2d(32, 16, 3, stride=2, padding=1, output_padding=1),
        nn.ReLU(),
        nn.ConvTranspose2d(16, 1, 3, stride=2, padding=1, output_padding=1),
        nn.Sigmoid(),
    ]
    self.decoder = nn.Sequential(*decoder_stages)

  def forward(self, x):
    """Encode `x`, decode the result, and return the reconstruction."""
    return self.decoder(self.encoder(x))



In [0]:
example_model = AutoEncoder()

## PRINT LAYERS ##
# for k,v in example_model.state_dict().items():
#   print("Layer {}".format(k))
#   print(v)

## Materialise the encoder's (layer name, layer) pairs and take the first ##
encoder_layers = list(example_model.encoder.named_children())
e1_name, e1_layer = encoder_layers[0]

## Print, in order: layer name, weights, bias, and whether it is a Conv2d ##
for shown in (e1_name, e1_layer.weight, e1_layer.bias, isinstance(e1_layer, nn.Conv2d)):
  print(shown)

In [0]:
# Print the working directory of the notebook runtime (shell escape).
!pwd

In [0]:
import os 
from os import sys 

#sys.path.append('/content/drive/My\ Drive/atari-representation-learning')
#%cd drive/My\ Drive/atari-representation-learning
from atariari.benchmark.episodes import get_ppo_rollouts, get_random_agent_rollouts 


def get_episodes(env_name,
                 steps,
                 seed=42,
                 num_processes=1,
                 num_frame_stack=1,
                 downsample=False,
                 color=False,
                 entropy_threshold=0.6,
                 collect_mode="random_agent",
                 train_mode="train_encoder",
                 checkpoint_index=-1,
                 min_episode_length=64):
    """Collect rollouts and return the episodes that are long enough.

    Args:
        env_name, steps, seed, num_processes, num_frame_stack, downsample,
            color: forwarded unchanged to the rollout helper.
        entropy_threshold: accepted for signature compatibility; not used here.
        collect_mode: "random_agent" (uses `get_random_agent_rollouts`) or
            "pretrained_ppo" (uses `get_ppo_rollouts`).
        train_mode: "train_encoder" returns a shuffled 80/20 split,
            "dry_run" returns the filtered episodes in rollout order.
        checkpoint_index: forwarded to `get_ppo_rollouts` only.
        min_episode_length: episodes must be strictly longer than this.

    Returns:
        `(train_episodes, val_episodes)` for "train_encoder", or a single list
        of episodes for "dry_run".

    Raises:
        AssertionError: unknown `collect_mode`, or fewer than two usable
            episodes in "train_encoder" mode.
        ValueError: unknown `train_mode` (previously returned None silently).
    """
    # Validate both modes before paying for the rollouts. An explicit raise is
    # used instead of `assert False` so the check survives `python -O`.
    if collect_mode not in ("random_agent", "pretrained_ppo"):
        raise AssertionError("Collect mode {} not recognized".format(collect_mode))
    if train_mode not in ("train_encoder", "dry_run"):
        raise ValueError("Train mode {} not recognized".format(train_mode))

    rollout_kwargs = dict(env_name=env_name,
                          steps=steps,
                          seed=seed,
                          num_processes=num_processes,
                          num_frame_stack=num_frame_stack,
                          downsample=downsample,
                          color=color)

    # Both helpers return (episodes, labels); the labels are not needed here.
    # Per the original comment, each episode is a list of 160x210 observations.
    if collect_mode == "random_agent":
        episodes, _ = get_random_agent_rollouts(**rollout_kwargs)
    else:
        episodes, _ = get_ppo_rollouts(checkpoint_index=checkpoint_index,
                                       **rollout_kwargs)

    # Keep only episodes strictly longer than min_episode_length.
    episodes = [ep for ep in episodes if len(ep) > min_episode_length]

    if train_mode == "dry_run":
        return episodes

    assert len(episodes) > 1, "Not enough episodes to split into train and val. You must specify enough steps to get at least two episodes"

    # Shuffle with a locally seeded generator, then actually apply the
    # permutation before splitting (the original computed it but never used it).
    inds = np.arange(len(episodes))
    rng = np.random.RandomState(seed=seed)
    rng.shuffle(inds)
    episodes = [episodes[i] for i in inds]

    split_ind = int(0.8 * len(episodes))
    tr_eps, val_eps = episodes[:split_ind], episodes[split_ind:]
    return tr_eps, val_eps


In [0]:
  from torch.utils.data import RandomSampler, BatchSampler
  import torch.nn.functional as F
  
  def generate_batch(episodes, batch_size):
      """Yield batches of randomly chosen, resized frames.

      Episodes are sampled with replacement, `sum(len(e) for e in episodes)`
      draws in total, and grouped into batches of `batch_size` episodes
      (a trailing incomplete batch is dropped). One random frame is taken from
      each sampled episode, scaled by 1/255 and resized to 160x160.

      Args:
          episodes: sequence of episodes; each episode is indexable by time
              step and yields a tensor (assumed (C, H, W) - TODO confirm).
          batch_size: number of frames per yielded batch.

      Yields:
          A float tensor on the global `device`, one stacked frame per sampled
          episode.
      """
      total_steps = sum(len(e) for e in episodes)
      print('Total Steps: {}'.format(total_steps))

      # Episode sampler: draw `total_steps` episode indices with replacement,
      # then group them `batch_size` at a time.
      sampler = BatchSampler(RandomSampler(range(len(episodes)),
                                            replacement=True, num_samples=total_steps),
                              batch_size, drop_last=True)
      for indices in sampler:
          frames = []
          for episode in (episodes[i] for i in indices):
              # One uniformly random time step per sampled episode.
              t = np.random.randint(0, len(episode))
              frame = episode[t]
              # NOTE(review): bicubic interpolation can overshoot [0, 1];
              # consider clamping if the sigmoid output range matters.
              resized_frame = F.interpolate(frame.unsqueeze(0) / 255.0, size=160, mode='bicubic').squeeze(0)
              frames.append(resized_frame)
          yield torch.stack(frames).float().to(device)

In [0]:
def train_one_epoch(model, game, total_steps, batch_size=64, learning_rate=1e-3,  train_mode="train_encoder", agent_type="random_agent"):
    """Collect fresh episodes and run one reconstruction-training pass.

    Args:
        model: autoencoder mapping a batch of frames to reconstructions.
        game: environment name forwarded to `get_episodes`.
        total_steps: number of environment steps to collect.
        batch_size: frames per optimisation step.
        learning_rate: Adam learning rate.
        train_mode: accepted for signature compatibility; not used here
            (episodes are always collected with the default train/val split).
        agent_type: passed to `get_episodes` as `collect_mode`.

    Returns:
        List of `(input_batch, reconstruction)` tuples, one per step. The
        reconstructions are detached from the autograd graph.
    """
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=learning_rate,
                                 weight_decay=1e-5)
    train_eps, validation_eps = get_episodes(game, total_steps,
                                             collect_mode=agent_type)
    print("Num episodes", len(train_eps))

    outputs = []
    for x_batch in generate_batch(train_eps, batch_size):
        recon = model(x_batch)
        loss = criterion(recon, x_batch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print('Loss:{:.4f}'.format(loss.item()))
        # Detach before storing: keeping `recon` itself would keep every
        # batch's autograd graph alive for the whole epoch.
        outputs.append((x_batch, recon.detach()))
    return outputs

In [0]:
def train(model, game, steps_per_epoch, num_epochs=2, batch_size=64, learning_rate=1e-3):
  """Train `model` for `num_epochs`, alternating the data-collecting agent.

  Even-numbered epochs collect episodes with the random agent, odd-numbered
  epochs with the pretrained PPO agent.

  Args:
    model: network passed to `train_one_epoch`.
    game: environment name.
    steps_per_epoch: environment steps collected in each epoch.
    num_epochs: number of epochs to run.
    batch_size: forwarded to `train_one_epoch` (previously ignored).
    learning_rate: forwarded to `train_one_epoch` (previously ignored).
  """
  for ep in range(num_epochs):
    agent_type = "random_agent" if ep % 2 == 0 else "pretrained_ppo"
    train_one_epoch(model, game, steps_per_epoch,
                    batch_size=batch_size,
                    learning_rate=learning_rate,
                    agent_type=agent_type)

In [0]:
## TRAIN THE MODEL ##
game = 'PitfallNoFrameskip-v4'

# Build the autoencoder and move its parameters to the selected device.
model = AutoEncoder().to(device)

# check if model on gpu
print(next(model.parameters()).is_cuda)

train(model, game, steps_per_epoch=2000)

In [0]:
def get_feature_maps(model, image, target_layer_name):
  """Run `image` through the encoder up to and including a named layer.

  Args:
    model: object whose `encoder.named_children()` yields (name, layer)
      pairs in execution order.
    image: input passed to the first encoder layer.
    target_layer_name: name of the layer whose output is wanted.

  Returns:
    The output of the layer named `target_layer_name`.

  Raises:
    ValueError: if the encoder has no layer with that name (previously an
      empty list was returned, which only failed later in the caller).
  """
  activation = image
  for layer_name, layer in model.encoder.named_children():
    activation = layer(activation)
    if layer_name == target_layer_name:
      return activation

  raise ValueError("Encoder has no layer named {!r}".format(target_layer_name))

In [0]:
def integrate_feature_maps(feature_maps):
  """Normalise each feature map by its maximum and sum over channels.

  Args:
    feature_maps: array of shape (batch_size, channels, h, w).

  Returns:
    Array of shape (batch_size, h, w): the element-wise sum over channels of
    the per-channel normalised maps.
  """
  # Per-map maximum, kept as (batch_size, channels, 1, 1) so it broadcasts.
  peak_values = np.max(feature_maps, axis=(2, 3), keepdims=True)

  # A map whose maximum is exactly 0 would produce 0/0 = NaN and poison the
  # whole integrated map; divide such maps by 1 instead.
  peak_values[peak_values == 0] = 1

  normalised = feature_maps / peak_values

  # Integrate by adding the normalised maps together element-wise.
  f_integrate = np.sum(normalised, 1)

  return f_integrate

In [0]:
def define_feature_levels(f_integrate, num_subranges):
  """Split the value range of `f_integrate` into equal-width subranges.

  Args:
    f_integrate: array of integrated feature values.
    num_subranges: number of subranges to create.

  Returns:
    `(stride, feature_levels)`: the width of one subrange, and a list with
    the lower bound of each subrange, starting at the minimum value.
  """
  lowest = np.min(f_integrate)
  highest = np.max(f_integrate)
  stride = (highest - lowest) / num_subranges

  # Lower bound of subrange k is `lowest + k * stride`.
  feature_levels = []
  for level in range(num_subranges):
    feature_levels.append(lowest + stride * level)

  return stride, feature_levels

In [0]:
def get_feature_level_indices(f_integrate, feature_levels):
  """Assign every value of `f_integrate` to a feature level.

  Args:
    f_integrate: array indexed as (batch, dim1, dim2).
    feature_levels: increasing bin edges handed to `np.digitize`.

  Returns:
    Integer array shaped (-1, dim1, dim2). Each entry is the `np.digitize`
    bin index: i where feature_levels[i-1] <= value < feature_levels[i],
    0 below the first edge and len(feature_levels) at or above the last.
  """
  batch_dim = f_integrate.shape[0]

  # Flatten each batch item, bin the values, then restore the 2D layout.
  flattened = f_integrate.reshape(batch_dim, -1)
  flat_levels = np.digitize(flattened, feature_levels)
  map_shape = (-1, f_integrate.shape[1], f_integrate.shape[2])

  return flat_levels.reshape(map_shape)

In [0]:
from queue import Queue
import itertools 

### TODO: CHANGE FROM SEARCH STYLE TO CALCULATION STYLE SO IT CAN BE BATCHED AND PARALLELIZED

def generate_small_rois(feature_level_indices, target_level):
  """Find one bounding box per connected region of `target_level` cells.

  Regions are 8-connected (diagonal neighbours count as adjacent).

  Args:
    feature_level_indices: 2D array of feature-level labels.
    target_level: the label whose regions should be boxed.

  Returns:
    Float array of shape (num_regions, 4). Each row is
    [x_min, y_min, x_max, y_max], where x indexes axis 0 and y indexes
    axis 1 of `feature_level_indices`. Shape (0, 4) when no cell matches.
  """
  from collections import deque  # local import keeps the block self-contained

  print(feature_level_indices.shape)
  # x runs along axis 0 and y along axis 1, so each is bounded by its own axis.
  x_limit, y_limit = feature_level_indices.shape[0], feature_level_indices.shape[1]
  x_indices, y_indices = np.where(feature_level_indices == target_level)

  # All eight directions around a cell; the (0, 0) self-offset is excluded.
  neighbour_offsets = [offset for offset in itertools.product((-1, 0, 1), repeat=2)
                       if offset != (0, 0)]

  visited = set()
  boxes = []
  # Seeds are consumed last-to-first, matching the pop() order the original
  # intended; every candidate is visited exactly once.
  for seed in zip(reversed(x_indices.tolist()), reversed(y_indices.tolist())):
    if seed in visited:
      continue  # already absorbed into an earlier region

    # Breadth-first flood fill from this seed, growing the box as we go.
    visited.add(seed)
    x_min = x_max = seed[0]
    y_min = y_max = seed[1]
    frontier = deque([seed])
    while frontier:
      x, y = frontier.popleft()
      for delta_x, delta_y in neighbour_offsets:
        new_x, new_y = x + delta_x, y + delta_y
        if not (0 <= new_x < x_limit and 0 <= new_y < y_limit):
          continue
        if (new_x, new_y) in visited:
          continue
        if feature_level_indices[new_x, new_y] != target_level:
          continue
        visited.add((new_x, new_y))
        frontier.append((new_x, new_y))
        x_min, x_max = min(x_min, new_x), max(x_max, new_x)
        y_min, y_max = min(y_min, new_y), max(y_max, new_y)
    boxes.append([x_min, y_min, x_max, y_max])

  # Build the array once; reshape keeps the (0, 4) shape when nothing matched.
  bboxes = np.array(boxes, dtype=float).reshape(-1, 4)
  print("PROPOSED REGIONS", bboxes.shape[0])
  return bboxes
          
  

In [0]:
def generate_one_large_roi(feature_level_indices, target_level):
  """Return the single box enclosing every cell equal to `target_level`.

  Args:
    feature_level_indices: 2D array of feature-level labels.
    target_level: the label to enclose.

  Returns:
    Array [x_min, y_min, x_max, y_max] (x indexes axis 0, y indexes axis 1),
    or an empty array when no cell matches.
  """
  matches = np.where(feature_level_indices == target_level)
  if matches[0].shape[0] == 0:
    return np.array([])

  x_indices, y_indices = matches[0], matches[1]
  lower_corner = [x_indices.min(), y_indices.min()]
  upper_corner = [x_indices.max(), y_indices.max()]
  return np.array(lower_corner + upper_corner)

In [0]:
def generate_rois(feature_level_indices, target_level):
  """Compute both the per-region boxes and the overall box for one level.

  Args:
    feature_level_indices: 2D array of feature-level labels.
    target_level: the label to generate regions of interest for.

  Returns:
    `(small_ROIs, large_ROI)`: the results of `generate_small_rois` and
    `generate_one_large_roi` for the same array and level.
  """
  small_ROIs = generate_small_rois(feature_level_indices, target_level)
  # The large ROI is searched by level label; the original passed the
  # small-ROI array here, which was compared against the label map.
  large_ROI = generate_one_large_roi(feature_level_indices, target_level)
  return small_ROIs, large_ROI

In [0]:
## Relief Region/Object Proposal Algorithm 

def relief_proposals(model, image, target_layer_name, num_subranges):
  """Relief region/object proposal: turn encoder features into ROIs.

  Steps: take the feature maps of one encoder layer, integrate them into one
  map per image, split its value range into `num_subranges` feature levels,
  label every cell with its level, and generate ROIs per level and image.

  Args:
    model: network passed to `get_feature_maps`.
    image: batch of input images for the encoder.
    target_layer_name: encoder layer whose feature maps are used.
    num_subranges: number of feature levels.

  Returns:
    Dict mapping level index k (0 .. num_subranges-1, lowest values first) to
    `(small, large)`: two lists with one entry per image in the batch, as
    returned by `generate_rois`.
  """
  ### GET FEATURE MAPS FROM ENCODER ###
  feature_maps = get_feature_maps(model, image, target_layer_name)
  feature_maps = feature_maps.to("cpu").detach().numpy()
  print("FEATURE MAPS SHAPE", feature_maps.shape)

  ### GENERATE INTEGRATED FEATURE MAP ###
  f_integrate = integrate_feature_maps(feature_maps)
  print("FINTEGRATE SHAPE", f_integrate.shape)

  ### FEATURE LEVEL DEFINITIONS ###
  stride, feature_levels = define_feature_levels(f_integrate, num_subranges)
  print("STRIDE", stride)
  print(feature_levels)

  ### FEATURE LEVEL ASSIGNMENTS ###
  feature_level_indices = get_feature_level_indices(f_integrate, feature_levels)
  print(feature_level_indices.shape)

  ### ROI GENERATION ###
  feature_level_dictionary = {}
  for level in range(num_subranges):
    # The first bin edge is the minimum value, so np.digitize labels cells
    # 1..num_subranges; label 0 never occurs. Subrange k therefore carries
    # label k + 1 (searching for k missed the top level entirely).
    level_label = level + 1
    small, large = [], []
    print("######################## LEVEL {} #####################".format(level))
    for i in range(feature_level_indices.shape[0]):
      small_rois, large_roi = generate_rois(feature_level_indices[i], level_label)
      small.append(small_rois)
      large.append(large_roi)
    feature_level_dictionary[level] = (small, large)

  ### CAN ALSO COMBINE SMALL ROIs into LARGER ROIs GIVEN CERTAIN RULES ###

  ### TODO: LOCAL SEARCH TO REFINE ROIs ###

  return feature_level_dictionary

    
  










In [0]:
# Roll out the pretrained PPO agent for 1000 steps, then materialise every
# batch of 32 resized frames produced from the training episodes.
train_eps, validation_eps = get_episodes(game, 1000, collect_mode='pretrained_ppo')
data_generator = generate_batch(train_eps, batch_size=32)
images = list(data_generator)

In [0]:
# Propose regions for the first batch from the 'conv1' features, 10 levels.
feature_level_bboxes = relief_proposals(
    model,
    images[0],
    target_layer_name='conv1',
    num_subranges=10,
)

In [0]:
import matplotlib.pyplot as plt