# Setup

In [0]:
!pip install torch torchvision



In [0]:
from google.colab import drive
from google.colab.patches import cv2_imshow

# Mount Google Drive at /content/drive (interactive authorisation prompt);
# the dataset and checkpoint paths used later are given under 'drive/My Drive/'.
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


# The model

In [0]:
import torch
import torch.nn as nn
from torchvision import models
import numpy as np

class SegNet(nn.Module):
  """Segmentation network: VGG-16 (batch-norm) encoder with a transposed-conv decoder.

  The encoder is the `features` stack of torchvision's pretrained `vgg16_bn`,
  with its first convolution re-created using a padding of 100. The decoder
  upsamples in five stride-2 transposed convolutions; after each of the first
  four, a 1x1-projected encoder feature map is centre-cropped to the decoder
  size and added (skip connection, with 2D dropout on the skip branch).
  """

  def conv(self, in_channels, out_channels, kernel_size=3, padding=1):
    """Return a Conv2d -> BatchNorm2d -> ReLU block."""
    return nn.Sequential(
        nn.Conv2d(in_channels, out_channels, kernel_size, padding=padding),
        nn.BatchNorm2d(out_channels),
        nn.ReLU())

  def skip_conv(self, in_channels, out_channels):
    """Return a 1x1 Conv2d -> BatchNorm2d projection used on skip connections."""
    return nn.Sequential(
        nn.Conv2d(in_channels, out_channels, 1), nn.BatchNorm2d(out_channels))

  def upconv(self, in_channels, out_channels):
    """Return a stride-2 ConvTranspose2d (kernel 4) -> BatchNorm2d -> ReLU block."""
    return nn.Sequential(
        nn.ConvTranspose2d(in_channels, out_channels, 4, stride=2),
        nn.BatchNorm2d(out_channels),
        nn.ReLU()
    )

  def crop_to_size(self, input, out_size):
    """Centre-crop the two trailing spatial axes of `input` to `out_size`.

    Args:
      input: 4D tensor; axes 2 and 3 are cropped.
      out_size: size of the target tensor (indexable); only entries 2 and 3
        (height and width) are used.

    Returns:
      A view of `input` whose axes 2 and 3 have sizes out_size[2] and
      out_size[3]. When the difference is odd the extra row/column is removed
      from the bottom/right.

    Raises:
      ValueError: if the target is larger than `input` along either axis.
    """
    height, width = input.size()[2], input.size()[3]
    target_h, target_w = out_size[2], out_size[3]
    if target_h > height or target_w > width:
      raise ValueError(
          'cannot crop {}x{} to the larger size {}x{}'.format(height, width, target_h, target_w))

    # Explicit start/stop indices instead of `start:-end`: a negative stop of
    # -0 would select nothing when no cropping is required, and each axis is
    # cropped by its own difference so non-square inputs are handled too.
    top = (height - target_h) // 2
    left = (width - target_w) // 2
    return input[:, :, top:top + target_h, left:left + target_w]

  def __init__(self, out_channels, in_channels=3, feature_extract=True):
    """Build the encoder, the fully-convolutional head and the decoder.

    Args:
      out_channels: number of channels produced by every decoder stage,
        including the final output.
      in_channels: channels of the input image. NOTE: the first convolution
        re-uses the pretrained VGG weights, so values other than 3 presumably
        do not match those weights — confirm before using.
      feature_extract: when True the VGG parameters are frozen
        (requires_grad=False) and only the new layers are trained.
    """
    super(SegNet, self).__init__()

    self.vgg = models.vgg16_bn(pretrained=True).features

    # Make the first conv layer have padding of 100
    new_conv = nn.Conv2d(in_channels, 64, 3, padding=100)
    new_conv.weight = self.vgg[0].weight
    new_conv.bias = self.vgg[0].bias
    self.vgg[0] = new_conv

    # Using VGG as a fixed feature extractor so we shouldn't optimize its weights
    if feature_extract:
      for param in self.vgg.parameters():
        param.requires_grad = False

    # Convolutional replacement for the classifier head: a 7x7 convolution
    # without padding followed by a 1x1 convolution.
    self.fc7_conv = nn.Sequential(
        self.conv(512, 4096, kernel_size=7, padding=0),
        self.conv(4096, 4096, kernel_size=1, padding=0)
    )

    self.upconv1 = self.upconv(4096, out_channels)
    self.upconv2 = self.upconv(out_channels, out_channels)
    self.upconv3 = self.upconv(out_channels, out_channels)
    self.upconv4 = self.upconv(out_channels, out_channels)
    self.upconv5 = self.upconv(out_channels, out_channels)

    # 1x1 projections of the encoder stages (deepest first), each paired with
    # a dropout layer applied to the skip branch only.
    self.skip_conv1 = self.skip_conv(512, out_channels)
    self.dropout1 = nn.Dropout2d()
    self.skip_conv2 = self.skip_conv(256, out_channels)
    self.dropout2 = nn.Dropout2d()
    self.skip_conv3 = self.skip_conv(128, out_channels)
    self.dropout3 = nn.Dropout2d()
    self.skip_conv4 = self.skip_conv(64, out_channels)
    self.dropout4 = nn.Dropout2d()

  def forward(self, x):
    """Compute score maps for `x`.

    Args:
      x: input batch; axes 2 and 3 are treated as height and width.

    Returns:
      Tensor with `out_channels` channels, centre-cropped to the spatial size
      of `x`.
    """
    # Encoder stages. The slice boundaries are expected to fall after the
    # pooling layers of torchvision's vgg16_bn, giving 64/128/256/512-channel
    # maps that match the skip projections above.
    out1 = self.vgg[0:7](x)
    out2 = self.vgg[7:14](out1)
    out3 = self.vgg[14:24](out2)
    out4 = self.vgg[24:34](out3)
    out5 = self.fc7_conv(self.vgg[34:](out4))

    # Decoder: upsample, then add the cropped and projected encoder map.
    upconv_out1 = self.upconv1(out5)
    skip = self.crop_to_size(self.skip_conv1(out4), upconv_out1.size())
    upconv_out1 = upconv_out1 + self.dropout1(skip)

    upconv_out2 = self.upconv2(upconv_out1)
    skip = self.crop_to_size(self.skip_conv2(out3), upconv_out2.size())
    upconv_out2 = upconv_out2 + self.dropout2(skip)

    upconv_out3 = self.upconv3(upconv_out2)
    skip = self.crop_to_size(self.skip_conv3(out2), upconv_out3.size())
    upconv_out3 = upconv_out3 + self.dropout3(skip)

    upconv_out4 = self.upconv4(upconv_out3)
    skip = self.crop_to_size(self.skip_conv4(out1), upconv_out4.size())
    upconv_out4 = upconv_out4 + self.dropout4(skip)

    # Final upsampling, cropped back to the input resolution.
    out = self.upconv5(upconv_out4)
    out = self.crop_to_size(out, x.size())

    return out


# Datasets

In [0]:
#https://www.cs.stanford.edu/~roozbeh/pascal-parts/pascal-parts.html
from torchvision import datasets, transforms
import torchvision.transforms.functional as TF
from torch.utils.data import DataLoader, Dataset
from PIL import Image
from skimage import io

from os.path import isfile, join
from os import listdir as listdir

import random

class PascalPartsDataSet(Dataset):
    """Image / part-mask pairs read from two parallel directories.

    An image `<stem><img_ext>` in `img_dir` is paired with the mask
    `<stem><msk_ext>` in `msk_dir`. Every sample is resized to 224x224; the
    image is normalised with the mean/std constants below and the mask is
    returned as a tensor of integer class ids.
    """

    def __init__(self, img_dir, msk_dir, img_ext, msk_ext, augmentation=False):
        """Index the dataset.

        Args:
            img_dir: directory containing the images.
            msk_dir: directory containing the masks.
            img_ext: image file extension including the dot, e.g. '.jpg'.
            msk_ext: mask file extension including the dot, e.g. '.png'.
            augmentation: apply random augmentation in `__getitem__`.
        """
        self.img_dir = img_dir
        self.msk_dir = msk_dir
        self.img_ext = img_ext
        self.msk_ext = msk_ext
        # Strip only the trailing extension (so stems containing dots survive),
        # ignore files of any other type, and sort for a reproducible order.
        self.fNames = [
            f[:len(f) - len(img_ext)]
            for f in sorted(listdir(img_dir))
            if f.endswith(img_ext) and isfile(join(img_dir, f))
        ]
        self.augmentation = augmentation

    def data_augmentation(self, image, mask):
        """Apply the same random geometric transform to `image` and `mask`.

        The hue shift is applied to the image only. Returns the transformed
        `(image, mask)` pair.
        """
        # Random crop
        i, j, h, w = transforms.RandomResizedCrop.get_params(
            image, scale=(0.08, 1.0), ratio=(3. / 4., 4. / 3.))
        image = TF.crop(image, i, j, h, w)
        mask = TF.crop(mask, i, j, h, w)

        # Random horizontal flip
        if random.random() > 0.5:
            image = TF.hflip(image)
            mask = TF.hflip(mask)

        # Random rotate and scale
        scale = random.uniform(0.7, 1.4)
        angle = random.randint(-30, 30)
        image = TF.affine(image, angle, (0, 0), scale, 0)
        mask = TF.affine(mask, angle, (0, 0), scale, 0)

        # Random hue shift
        hue = random.uniform(-0.1, 0.1)
        image = TF.adjust_hue(image, hue)

        return image, mask

    def __len__(self):
        """Return the number of indexed images."""
        return len(self.fNames)

    def __getitem__(self, idx):
        """Load sample `idx` and return `(img, mask)`.

        `img` is the normalised image tensor and `mask` a long tensor of class
        ids, both at 224x224.
        """
        img_name = self.fNames[idx] + self.img_ext
        msk_name = self.fNames[idx] + self.msk_ext

        # Force three channels: the normalisation below uses 3-channel statistics.
        img = Image.open(join(self.img_dir, img_name)).convert('RGB')
        # We divide by 100 so that any class will be mapped to a number in the 0-1 range
        mask = Image.fromarray(np.asarray(io.imread(join(self.msk_dir, msk_name)) / 100))

        if self.augmentation:
            img, mask = self.data_augmentation(img, mask)

        img = TF.to_tensor(TF.resize(img, (224, 224)))
        img = TF.normalize(img, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

        # Resize with nearest neighbour interpolation because we should not lerp between classes
        mask = TF.to_tensor(TF.resize(mask, (224, 224), interpolation=Image.NEAREST))[0] * 100
        # The /100 ... *100 round trip goes through floating point; round before
        # the integer cast so a value such as 2.9999998 is not truncated to 2.
        mask = mask.round().long()

        return (img, mask)


def load_pascal_parts(parent_dir, batch_size, num_workers=8):
  """Build the train and test data loaders for the dataset under `parent_dir`.

  Expects the sub-directories `train/images`, `train/masks`, `test/images`
  and `test/masks`, with '.jpg' images and '.png' masks.

  Args:
    parent_dir: root directory of the dataset.
    batch_size: batch size used by both loaders.
    num_workers: worker processes per loader.

  Returns:
    `(train_dloader, test_dloader)`; the training loader shuffles and uses
    augmentation, the test loader does neither.
  """
  train = PascalPartsDataSet(
      join(parent_dir, 'train', 'images'), join(parent_dir, 'train', 'masks'),
      '.jpg', '.png', augmentation=True)
  test = PascalPartsDataSet(
      join(parent_dir, 'test', 'images'), join(parent_dir, 'test', 'masks'),
      '.jpg', '.png')

  train_dloader = DataLoader(dataset=train, batch_size=batch_size, shuffle=True, num_workers=num_workers)
  test_dloader = DataLoader(dataset=test, batch_size=batch_size, shuffle=False, num_workers=num_workers)

  return train_dloader, test_dloader

# Training

In [0]:
from torch import optim
from tqdm import tqdm

import numpy as np

def train(lr, epochs, batch_size, parent_dir, check_point=None):
  """Train a 7-channel SegNet on the GPU and return it with its checkpoint.

  Args:
    lr: Adam learning rate.
    epochs: number of passes over the training loader.
    batch_size: batch size handed to `load_pascal_parts`.
    parent_dir: dataset root handed to `load_pascal_parts`.
    check_point: optional dict with 'model_state_dict' and
      'optimizer_state_dict' to resume from.

  Returns:
    `(model, checkpoint, train_dloader, test_dloader)` where `checkpoint`
    holds the model and optimizer state dicts after the last epoch.
  """
  net = SegNet(7)
  net.cuda()

  # Only parameters that still require gradients are handed to the optimizer.
  trainable = list(filter(lambda p: p.requires_grad, net.parameters()))
  adam = optim.Adam(trainable, lr=lr)
  loss_fn = nn.CrossEntropyLoss()

  if check_point is not None:
    net.load_state_dict(check_point['model_state_dict'])
    adam.load_state_dict(check_point['optimizer_state_dict'])

  train_dloader, test_dloader = load_pascal_parts(parent_dir, batch_size)
  num_batches = len(train_dloader)

  net.train()

  for epoch in range(epochs):
    loss_total = 0

    for batch_images, batch_masks in tqdm(train_dloader):
      batch_images = batch_images.cuda()
      batch_masks = batch_masks.cuda()

      batch_loss = loss_fn(net(batch_images), batch_masks)

      adam.zero_grad()
      batch_loss.backward()
      adam.step()

      loss_total += batch_loss.item()

    # Mean per-batch loss for this epoch.
    print('Epoch: {} '.format(epoch), 'Training loss: ', loss_total / num_batches)

  state = {'model_state_dict': net.state_dict(), 'optimizer_state_dict': adam.state_dict()}

  return net, state, train_dloader, test_dloader


def load_model(cp_dir):
  """Load a 7-channel SegNet from the checkpoint file `cp_dir`.

  The checkpoint must contain a 'model_state_dict' entry. The model is moved
  to the GPU and returned in evaluation mode.
  """
  saved = torch.load(cp_dir)

  # Module.cuda() and Module.eval() both return the module itself.
  net = SegNet(7).cuda()
  net.load_state_dict(saved['model_state_dict'])

  return net.eval()


In [0]:
# Root of the dataset; passed to train() / load_pascal_parts(), which look for
# train/ and test/ sub-directories below it.
# parent_dir = 'drive/My Drive/Colab Notebooks/CSC420 Project/PASCAL_HumanParts/'
parent_dir = 'drive/My Drive/CSC420 Project/PASCAL_HumanParts/'

# Checkpoint file read by torch.load() and overwritten by torch.save().
# cp_dir = 'drive/My Drive/Colab Notebooks/CSC420 Project/checkpoint'
cp_dir = 'drive/My Drive/CSC420 Project/checkpoint'

# Running the training

In [0]:
cp = torch.load(cp_dir)

!nvidia-smi -L # show gpu
for i in range(10): # do 10 in a row and save after each 50 epochs just to be sure
  model, cp, train_dloader, test_dloader = train(0.001, 50, 25, parent_dir, cp)
  torch.save(cp, cp_dir)


# Testing

In [0]:
import matplotlib.pyplot as plt
# Calculate IOU for each class and the average IOU over all classes
# This computes and averages the IOUs over a batch
def iou_metric(output, mask, num_classes, smooth=1e-6):
  """Per-class IoU (in percent) between predicted and ground-truth label maps.

  Class 0 is skipped. For every class t in 1..num_classes-1 the IoU is
  computed per image over the two trailing (spatial) axes and averaged over
  the batch.

  Args:
    output: integer tensor of predicted class ids, shaped like `mask`; an
      extra singleton channel axis at position 1 (as produced by
      `torch.max(..., keepdim=True)`) is accepted and removed.
    mask: integer tensor of ground-truth class ids.
    num_classes: number of classes including class 0.
    smooth: added to numerator and denominator to avoid 0/0; a class absent
      from both prediction and ground truth therefore scores 100.

  Returns:
    numpy array of length `num_classes`: the IoU of classes
    1..num_classes-1 followed by their mean, all multiplied by 100.

  Raises:
    ValueError: if `output` and `mask` cannot be brought to the same shape.
  """
  # Without this, a (B, 1, H, W) prediction broadcasts against a (B, H, W)
  # mask into (B, B, H, W) and every image is compared with every mask.
  if output.dim() == mask.dim() + 1 and output.size(1) == 1:
    output = output.squeeze(1)
  if output.shape != mask.shape:
    raise ValueError('output shape {} does not match mask shape {}'.format(
        tuple(output.shape), tuple(mask.shape)))

  class_ious = []
  for t in range(1, num_classes):
    predicted_t = output == t
    target_t = mask == t

    intersection = (predicted_t & target_t).sum(dim=(-2, -1)).float()
    union = (predicted_t | target_t).sum(dim=(-2, -1)).float()

    class_ious.append(torch.mean((intersection + smooth) / (union + smooth)).item())

  iou = np.array(class_ious)
  iou = np.append(iou, np.mean(iou))

  return iou * 100


# Evaluate the model on the test set using the IOU metric
def evaluate(model, test_dloader, num_classes):
  """Evaluate `model` on `test_dloader` with the IoU metric.

  Args:
    model: segmentation model living on the GPU; it is switched to eval mode.
    test_dloader: loader yielding `(images, masks)` batches.
    num_classes: number of classes including class 0.

  Returns:
    numpy array of length `num_classes` as returned by `iou_metric`,
    averaged over the batches of the loader (every batch counts equally).
  """
  model.eval()

  iou = np.zeros(num_classes)

  # Inference only: no autograd graph is needed.
  with torch.no_grad():
    for images, masks in test_dloader:
      output = model(images.cuda())
      # Index of the highest score along the channel axis. The channel axis is
      # dropped so the prediction has the same shape as the masks.
      predicted = torch.max(output, 1)[1].cpu()

      iou += iou_metric(predicted, masks, num_classes)

  iou /= len(test_dloader)

  return iou

In [0]:
# show some resulting images
def show_results(model, test_dloader, num_classes):
  """Plot input, prediction and ground truth for the first image of each batch.

  Args:
    model: segmentation model living on the GPU; it is switched to eval mode.
    test_dloader: loader yielding `(images, masks)` batches, with images
      normalised using the mean/std constants below.
    num_classes: unused; kept for interface compatibility.
  """
  model.eval()
  # Per-channel statistics, shaped to broadcast over a batch of 3-channel images.
  mean = torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1)
  std = torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1)

  for images, masks in test_dloader:
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
      output = model(images.cuda())
    _, predicted = torch.max(output, 1, keepdim=True)
    predicted = predicted.cpu()

    # Undo the input normalisation; clamp so rounding noise outside [0, 1]
    # cannot wrap around in the 8-bit conversion.
    images = (images.cpu() * std + mean).clamp(0, 1)

    plt.imshow(TF.to_pil_image(images[0]))
    plt.show()
    # Class ids are scaled into [0, 1] as floats before the image conversion.
    plt.imshow(TF.to_pil_image(predicted[0].float() / 100.))
    plt.show()
    plt.imshow(TF.to_pil_image(masks[0].float() / 100.))
    plt.show()


In [0]:
# Restore the trained network from the checkpoint (GPU, eval mode).
segNet_model = load_model(cp_dir)

# Rebuild the data loaders; only the test loader is used by the cells below.
train_dloader, test_dloader = load_pascal_parts(parent_dir, batch_size=25)

In [0]:
# Print the IoU values (in percent) for the 7-class model; the last entry is the mean.
print(evaluate(segNet_model, test_dloader, 7))

In [0]:
# Plot image / prediction / ground truth for the first sample of every test batch.
show_results(segNet_model, test_dloader, 7)

# Matching


In [0]:
# Segmentation network restored from the checkpoint (GPU, eval mode).
segNet_model = load_model(cp_dir)

# Separate pretrained VGG-16-BN feature stack used as descriptor extractor.
vgg = models.vgg16_bn(pretrained=True).features
vgg.cuda()
# Modules start in training mode; switch to eval so the BatchNorm layers use
# their stored running statistics instead of the statistics of each single
# image, and so those running statistics are not modified during matching.
vgg.eval()

In [0]:
import cv2 as cv
from skimage.draw import line_aa, line
import matplotlib.pyplot as plt

def vgg_feature_match(image0, image1, segNet_model, vgg, patch_size=3 , keypoint_window=11, threshold = 100):
  """Match body-part key points of `image0` to pixels of `image1`.

  Both images are resized to 224x224 and segmented with `segNet_model`. Key
  points are sampled from `image0` on a grid with stride `keypoint_window`,
  keeping only grid points whose whole window carries one non-background
  class. Each key point is matched to the pixel of the same class in `image1`
  whose surrounding patch of early VGG features has the smallest sum of
  absolute differences. The images, masks and matches are plotted.

  Args:
    image0: first image (PIL image), the source of the key points.
    image1: second image (PIL image), searched for matches.
    segNet_model: segmentation model on the GPU; classes 1..6 are body parts
      and class 0 is background.
    vgg: VGG feature stack on the GPU; its first six layers are used.
    patch_size: side length of the square feature patch that is compared.
    keypoint_window: grid stride and side length of the uniformity window.
    threshold: matches with a larger patch distance are discarded.

  Returns:
    numpy RGB array showing both resized images side by side with one red
    line drawn per accepted match.
  """
  size = 224       # side length every image is resized to
  num_parts = 6    # non-background classes predicted by the segmentation model
  mean = [0.485, 0.456, 0.406]
  std = [0.229, 0.224, 0.225]

  # makes sure image has 3 channels only and are 224 x 224
  image0 = TF.to_tensor(TF.resize(image0, (size, size)))[:3]
  image1 = TF.to_tensor(TF.resize(image1, (size, size)))[:3]
  normalized_image0 = TF.normalize(image0, mean=mean, std=std).unsqueeze(0).cuda()
  normalized_image1 = TF.normalize(image1, mean=mean, std=std).unsqueeze(0).cuda()

  # Inference only: no autograd graph is needed.
  with torch.no_grad():
    # Use VGG-16 up to the pooling layer
    features_0 = vgg[:6](normalized_image0)[0].cpu()
    features_1 = vgg[:6](normalized_image1)[0].cpu()

    _, body_mask0 = torch.max(segNet_model(normalized_image0), 1, keepdim=True)
    _, body_mask1 = torch.max(segNet_model(normalized_image1), 1, keepdim=True)

  body_mask0 = body_mask0.cpu().squeeze()
  body_mask1 = body_mask1.cpu().squeeze()

  patch_offset = patch_size // 2

  # A feature and match for each body part
  feature_map_0 = [[] for _ in range(num_parts)]
  feature_map_1 = [[] for _ in range(num_parts)]
  all_matches = [[] for _ in range(num_parts)]

  # Key points of image 0, sampled on a regular grid.
  keypoint_window_offset = keypoint_window // 2
  for i in range(keypoint_window_offset, size - keypoint_window_offset, keypoint_window):
    for j in range(keypoint_window_offset, size - keypoint_window_offset, keypoint_window):
      part = int(body_mask0[i, j])
      if part == 0:  # Ignore background pixels
        continue
      # The feature patch around the key point must lie inside the image.
      if not (patch_offset <= i < size - patch_offset and patch_offset <= j < size - patch_offset):
        continue
      # Require all pixels in a window to have the same class to consider the point
      window = body_mask0[i - keypoint_window_offset : i + keypoint_window_offset + 1,
                          j - keypoint_window_offset : j + keypoint_window_offset + 1]
      if bool((window == part).all()):
        # Bucket by the class found in image 0's own mask.
        feature_map_0[part - 1].append((i, j))

  # Candidate pixels of image 1: every labelled pixel whose patch fits inside
  # the image, listed per class in row-major order.
  inner_mask1 = body_mask1[patch_offset : size - patch_offset, patch_offset : size - patch_offset]
  for part in range(1, num_parts + 1):
    coords = torch.nonzero(inner_mask1 == part).tolist()
    feature_map_1[part - 1] = [(r + patch_offset, c + patch_offset) for r, c in coords]

  for part_idx in range(num_parts):
    candidates = feature_map_1[part_idx]
    if not candidates:
      continue
    candidate_patches = [
        features_1[:, x1 - patch_offset : x1 + patch_offset + 1, y1 - patch_offset : y1 + patch_offset + 1]
        for x1, y1 in candidates]
    for x0, y0 in feature_map_0[part_idx]:
      feature_patch0 = features_0[:, x0 - patch_offset : x0 + patch_offset + 1, y0 - patch_offset : y0 + patch_offset + 1]
      # Sum of absolute differences between the key-point patch and each candidate.
      responses = [float((candidate - feature_patch0).abs().sum()) for candidate in candidate_patches]
      match_idx = int(np.argmin(responses))
      if responses[match_idx] > threshold:  # thresholding to prevent outliers
        continue
      all_matches[part_idx].append(((x0, y0), candidates[match_idx]))

  # Both images (and both masks) side by side along the width axis.
  resultImage = torch.cat((image0.cpu(), image1.cpu()), dim=2)
  result_mask = torch.cat((body_mask0.cpu(), body_mask1.cpu()), dim=1)
  plt.imshow(TF.to_pil_image(resultImage))
  plt.show()
  plt.imshow(result_mask)
  plt.show()

  resultImage = np.array(TF.to_pil_image(resultImage).convert('RGB'))
  for part_matches in all_matches:
    for (x0, y0), (x1, y1) in part_matches:
      # OpenCV points are (column, row); image 1 starts `size` pixels to the right.
      cv.line(resultImage, (y0, x0), (y1 + size, x1), (255, 0, 0), 2)

  plt.imshow(resultImage)
  plt.show()

  return resultImage
  

In [0]:
vidcap = cv.VideoCapture(join('drive/My Drive/CSC420 Project/', '170728_Berlin_D_023.mp4'))
# vidcap = cv.VideoCapture(join('drive/My Drive/Colab Notebooks/CSC420 Project/', '170609_F_Varanasi_004.mp4'))

# The first frame is read only to find out whether the video can be decoded.
success, image = vidcap.read()
count = 0       # frames consumed by the loop below
interval = 10   # number of frames between the two images of a pair
skip = 50       # no matching until this many frames have been consumed
while success:
  success, image0 = vidcap.read()
  i = 0
  # Advance `interval` frames; image1 ends up as the frame that far after image0.
  while(success and i < interval):
    success, image1 = vidcap.read()
    i += 1

  # A failed read returns no frame: stop at the end of the video instead of
  # passing an empty frame to cvtColor.
  if not success: break

  count += (interval + 1)

  if count < skip : continue # skip the first `skip` frames
  # OpenCV decodes frames as BGR; convert to RGB before building PIL images.
  match = vgg_feature_match(Image.fromarray(cv.cvtColor(image0,cv.COLOR_BGR2RGB).astype('uint8'), 'RGB'), 
                            Image.fromarray(cv.cvtColor(image1,cv.COLOR_BGR2RGB).astype('uint8'), 'RGB'), segNet_model, vgg, patch_size=11, keypoint_window=11,threshold=500)

# Release the video file handle once all frames have been processed.
vidcap.release()
  