In [2]:
import os
import random
import math

import numpy as np
import pandas as pd
import pdb
from collections import OrderedDict
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import collections  as mc
matplotlib.rcParams['figure.figsize'] = [6, 6]
matplotlib.rcParams['figure.dpi'] = 200

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torchvision import transforms

from data_helper import UnlabeledDataset, LabeledDataset
from helper import collate_fn, draw_box

# random.seed(0)
# np.random.seed(0)
# torch.manual_seed(0);

# All the images are saved in image_folder
# All the labels are saved in the annotation_csv file
# NOTE(review): both paths are absolute and machine-specific; adjust them before running elsewhere.
image_folder = '/scratch/vr1059/self-driving-data/data'
annotation_csv = '/scratch/vr1059/self-driving-data/data/annotation.csv'

# You shouldn't change the unlabeled_scene_index
# The first 106 scenes are unlabeled
unlabeled_scene_index = np.arange(106)
# The scenes from 106 - 133 are labeled
# You should divide the labeled_scene_index into two subsets (training and validation)
# Here the labeled scenes are split three ways: 106-127 train, 128-131 validation, 132-133 test.
train_labeled_scene_index = np.arange(106, 128)
val_labeled_scene_index = np.arange(128, 132)
test_labeled_scene_index = np.arange(132, 134)

In [3]:
def unit_vector(vector):
    """Scale ``vector`` to unit length.

    The input is divided by its norm as computed by ``np.linalg.norm``.
    """
    length = np.linalg.norm(vector)
    return vector / length

def angle_between(v1, v2):
    """Return the angle in DEGREES between vectors ``v1`` and ``v2``.

    Both vectors are normalised with ``unit_vector`` and their dot product is
    clipped to [-1, 1] before ``np.arccos`` so that rounding error cannot push
    it outside the valid domain. The result is converted with ``np.degrees``,
    which is why callers in this notebook compare it against 35 (degrees).

    Examples: perpendicular vectors such as (1, 0, 0) and (0, 1, 0) give 90.0,
    identical directions give 0.0, and opposite directions give 180.0.
    """
    v1_u = unit_vector(v1)
    v2_u = unit_vector(v2)
    cosine = np.clip(np.dot(v1_u, v2_u), -1.0, 1.0)
    return np.degrees(np.arccos(cosine))

#### Mappings from bin to class, and back

In [4]:
# class_label = 1
# class_dict = dict()
# reverse_class_dict = []
# reverse_class_dict.append((-100, -100))
# for i in range(400, 800, 50):
#     for j in range(100, 700, 50):
#         class_dict[(i, j)] = class_label
#         class_label += 1
#         reverse_class_dict.append((i, j))
        
# class_dict[(-100, -100)] = 0

In [5]:
# Build the grid-cell <-> class-id lookup tables.
#   class_dict:         (i, j) cell key -> class id
#   reverse_class_dict: class id        -> (i, j) cell key
# Cell keys step by 50 with i in [400, 800) and j in [100, 700). A cell gets an id
# only if its top-right or bottom-right corner lies within 35 degrees of the
# direction (2, 0) as seen from (400, 400). Id 0 is reserved for the sentinel
# key (-100, -100). The collate functions size their target vectors at 59, i.e.
# they expect 58 cells to pass this test.
reverse_class_dict = [(-100, -100)]
class_dict = dict()
class_label = 1

# Reference direction for the angle test; it is the same for every cell.
v2 = np.array([2, 0])

for i in range(400, 800, 50):
    for j in range(100, 700, 50):
        top_right_corner = (i + 50, j)
        bottom_right_corner = (i + 50, j + 50)

        # Offsets of both right-hand corners from (400, 400); the second
        # component is written as 800 - y - 400.
        v1 = np.array([bottom_right_corner[0] - 400, 800 - bottom_right_corner[1] - 400])
        v3 = np.array([top_right_corner[0] - 400, 800 - top_right_corner[1] - 400])

        # Cells with neither right-hand corner inside the cone get no class id.
        if not any(abs(angle_between(corner, v2)) <= 35 for corner in (v1, v3)):
            continue

        class_dict[(i, j)] = class_label
        reverse_class_dict.append((i, j))
        class_label += 1

class_dict[(-100, -100)] = 0

In [6]:
def round_up(x):
    """Return the smallest multiple of 50 that is greater than or equal to ``x``."""
    steps = math.ceil(x / 50.0)
    return int(steps) * 50

def round_down(x):
    """Return the multiple of 50 immediately below ``round_up(x)``.

    Note that a value sitting exactly on a multiple of 50 maps to the previous
    multiple (e.g. 400 -> 350), so each bin covers the interval (k, k + 50].
    """
    upper = round_up(x)
    return upper - 50

In [7]:
def val_collate_fn(batch):
    """Collate labeled samples into a front-camera batch without augmentation.

    Each element ``x`` of ``batch`` is indexed as:
        x[0]  -- per-sample stack of camera images; index 1 is the front camera
        x[1]  -- dict providing ``'bounding_box'`` and ``'category'``
        x[2]  -- road image

    Returns a 5-tuple:
        front_imgs    -- stacked front-camera images
        target        -- (batch, 59) multi-hot vector; position ``class_dict[key]``
                         is 1 for every grid cell containing a kept box anchor
        road_imgs     -- stacked road images
        bbs           -- per-sample list of ``(xs, ys)`` corner tensors of kept boxes
        target_counts -- (batch, 20) one-hot vector of how many boxes were kept

    Relies on the module-level ``class_dict``, ``angle_between`` and ``round_down``.
    Raises ``KeyError`` (after printing the key) if a kept box falls in a cell
    that has no entry in ``class_dict``.
    """
    num_bins = 59     # matches the 59 outputs of SimpleModel.classification
    num_counts = 20   # matches the 20 outputs of SimpleModel.counts

    front_imgs = []
    target = []
    road_imgs = []
    bbs = []
    target_counts = []
    for x in batch:
        # input: only the front camera is batched. The front-left / front-right
        # images were previously converted too but never returned.
        front_imgs.append(torch.as_tensor(x[0][1]))
        road_imgs.append(torch.as_tensor(x[2]))

        # target
        bb_tens = x[1]['bounding_box']
        current_bbs = []
        bins = np.zeros(num_bins)
        counts = np.zeros(num_counts)
        count = 0

        for i, corners in enumerate(bb_tens):
            # Get bird's eye view coordinates: corners are reordered (0, 1, 3, 2),
            # scaled by 10 and shifted so that (400, 400) is the origin.
            point_sequence = torch.stack([corners[:, 0], corners[:, 1], corners[:, 3], corners[:, 2]])
            xs = point_sequence.T[0] * 10 + 400
            ys = -point_sequence.T[1] * 10 + 400

            # Anchor point of the box (the original notes call it the "top-center"):
            # the midpoint of corners 2/3 when corner 2 lies more than 5 units to the
            # right of corner 0, otherwise the midpoint of corners 0/1.
            # TODO (from the original notes): boxes that only peek into this view are
            # probably better left to the neighbouring camera; revisit later.
            if xs[2] - xs[0] > 5:
                top_center_x, top_center_y = 0.5*(xs[2] + xs[3]), 0.5*(ys[2] + ys[3])
            else:
                top_center_x, top_center_y = 0.5*(xs[0] + xs[1]), 0.5*(ys[0] + ys[1])

            # We do (800 - top_center_y) because matplotlib y-axis starts from the top.
            v1 = np.array([top_center_x - 400, 800 - top_center_y - 400])
            v2 = np.array([2, 0])

            # Keep boxes whose anchor is within 35 degrees of the (2, 0) direction and
            # whose category is not one of 1, 3, 6, 8 (category meanings are defined by
            # the dataset, not in this notebook).
            if abs(angle_between(v1, v2)) <= 35 and x[1]['category'][i] not in [1, 3, 6, 8]:
                current_bbs.append((xs, ys))
                # we're in the bucket of the front_img
                top_of_car = (top_center_x.item(), top_center_y.item())
                key = (round_down(top_of_car[0]), round_down(top_of_car[1]))
                if key not in class_dict:
                    print(key)
                bin_id = class_dict[key]
                bins[bin_id] = 1
                count += 1

        target.append(torch.as_tensor(bins))
        # Clamp so that a sample with num_counts or more kept boxes lands in the last
        # slot instead of raising IndexError in the middle of a batch.
        counts[min(count, num_counts - 1)] = 1
        target_counts.append(torch.as_tensor(counts))
        bbs.append(current_bbs)

    boom = torch.stack(front_imgs), torch.stack(target), torch.stack(road_imgs), bbs, torch.stack(target_counts)
    return boom

In [14]:
def front_collate_fn(batch):
    """Collate labeled samples into a front-camera training batch with random flips.

    Each element ``x`` of ``batch`` is indexed as:
        x[0]  -- per-sample stack of camera images; index 1 is the front camera
        x[1]  -- dict providing ``'bounding_box'`` and ``'category'``
        x[2]  -- road image

    With probability 0.5 the front image is mirrored left-to-right and the y
    coordinate of every kept box anchor is mirrored as ``800 - y`` before it is
    binned, so the target matches the flipped image.

    Returns a 5-tuple:
        front_imgs    -- stacked (possibly flipped) front-camera images
        target        -- (batch, 59) multi-hot vector indexed by ``class_dict`` ids
        road_imgs     -- stacked road images (NOT flipped)
        bbs           -- per-sample list of ``(xs, ys)`` corner tensors (NOT flipped)
        target_counts -- (batch, 20) one-hot vector of how many boxes were kept

    Relies on the module-level ``class_dict``, ``angle_between`` and ``round_down``.
    Raises ``KeyError`` (after printing the key) if a kept box falls in a cell
    that has no entry in ``class_dict``.
    """
    num_bins = 59     # matches the 59 outputs of SimpleModel.classification
    num_counts = 20   # matches the 20 outputs of SimpleModel.counts

    front_imgs = []
    target = []
    road_imgs = []
    bbs = []
    target_counts = []
    for x in batch:
        # input
        # Flipping with probability 0.5. The torch RNG is used because DataLoader
        # seeds it separately in every worker; NumPy's global state can be identical
        # across forked workers, which would repeat the same flip pattern.
        flip_flag = torch.rand(1).item() < 0.5
        front_img = torch.as_tensor(x[0][1])
        if flip_flag:
            # Mirror the last axis, which is what np.fliplr did on the
            # (1, 2, 0)-transposed array in the earlier implementation.
            front_img = torch.flip(front_img, dims=[-1])
        front_imgs.append(front_img)

        road_imgs.append(torch.as_tensor(x[2]))

        # target
        bb_tens = x[1]['bounding_box']
        current_bbs = []
        bins = np.zeros(num_bins)
        counts = np.zeros(num_counts)
        count = 0

        for i, corners in enumerate(bb_tens):
            # Get bird's eye view coordinates: corners are reordered (0, 1, 3, 2),
            # scaled by 10 and shifted so that (400, 400) is the origin.
            point_sequence = torch.stack([corners[:, 0], corners[:, 1], corners[:, 3], corners[:, 2]])
            xs = point_sequence.T[0] * 10 + 400
            ys = -point_sequence.T[1] * 10 + 400

            # Anchor point of the box (the original notes call it the "top-center"):
            # the midpoint of corners 2/3 when corner 2 lies more than 5 units to the
            # right of corner 0, otherwise the midpoint of corners 0/1.
            # TODO (from the original notes): boxes that only peek into this view are
            # probably better left to the neighbouring camera; revisit later.
            if xs[2] - xs[0] > 5:
                top_center_x, top_center_y = 0.5*(xs[2] + xs[3]), 0.5*(ys[2] + ys[3])
            else:
                top_center_x, top_center_y = 0.5*(xs[0] + xs[1]), 0.5*(ys[0] + ys[1])

            # We do (800 - top_center_y) because matplotlib y-axis starts from the top.
            v1 = np.array([top_center_x - 400, 800 - top_center_y - 400])
            v2 = np.array([2, 0])

            # Keep boxes whose anchor is within 35 degrees of the (2, 0) direction and
            # whose category is not one of 1, 3, 6, 8 (category meanings are defined by
            # the dataset, not in this notebook).
            if abs(angle_between(v1, v2)) <= 35 and x[1]['category'][i] not in [1, 3, 6, 8]:
                # NOTE(review): the stored corners stay in unflipped coordinates even
                # when flip_flag is set, so overlays drawn from `bbs` will not line up
                # with the flipped target bins.
                current_bbs.append((xs, ys))
                # we're in the bucket of the front_img
                if flip_flag:
                    # vertically flip where the top_center coordinates of the bounding box are.
                    top_of_car = (top_center_x.item(), 800 - top_center_y.item())
                else:
                    top_of_car = (top_center_x.item(), top_center_y.item())
                key = (round_down(top_of_car[0]), round_down(top_of_car[1]))
                if key not in class_dict:
                    print(key)
                bin_id = class_dict[key]
                bins[bin_id] = 1
                count += 1

        target.append(torch.as_tensor(bins))
        # Clamp so that a sample with num_counts or more kept boxes lands in the last
        # slot instead of raising IndexError in the middle of a batch.
        counts[min(count, num_counts - 1)] = 1
        target_counts.append(torch.as_tensor(counts))
        bbs.append(current_bbs)

    boom = torch.stack(front_imgs), torch.stack(target), torch.stack(road_imgs), bbs, torch.stack(target_counts)
    return boom

In [15]:
# Labeled data is fetched sample by sample, and the returned items are tuples of
# tensors because the number of bounding boxes differs between samples.
# extra_info is optional; it is requested here for both datasets.

# Validation images are only converted to tensors.
val_transform = transforms.ToTensor()

# Training images additionally go through a colour-jitter / grayscale / small-affine
# chain; RandomApply applies the chain as a group (with its default probability).
_train_augmentations = [
    transforms.ColorJitter(brightness=0.5, contrast=0.5, saturation=0.4, hue=(-0.5, 0.5)),
    transforms.Grayscale(3),
    transforms.RandomAffine(8),
]
train_transform = transforms.Compose([
    transforms.RandomApply(_train_augmentations),
    transforms.ToTensor(),
])

# Arguments shared by the training and validation datasets.
_dataset_kwargs = dict(
    image_folder=image_folder,
    annotation_file=annotation_csv,
    extra_info=True,
)
labeled_trainset = LabeledDataset(
    scene_index=train_labeled_scene_index,
    transform=train_transform,
    **_dataset_kwargs
)
labeled_valset = LabeledDataset(
    scene_index=val_labeled_scene_index,
    transform=val_transform,
    **_dataset_kwargs
)

# Both loaders use the same batching options and differ only in dataset and collate function.
_loader_kwargs = dict(batch_size=64, num_workers=2, shuffle=True)
train_loader = torch.utils.data.DataLoader(labeled_trainset, collate_fn=front_collate_fn, **_loader_kwargs)
val_loader = torch.utils.data.DataLoader(labeled_valset, collate_fn=val_collate_fn, **_loader_kwargs)

In [16]:
class SimpleModel(nn.Module):
    """ResNet-50 encoder followed by a shared 64-d layer and two linear heads.

    ``forward`` returns a pair ``(classification_logits, count_logits)`` with 59
    and 20 outputs per sample respectively.
    """

    def __init__(self):
        super().__init__()

        # ResNet-50 built with no arguments; its final fc layer is replaced by an
        # identity so the encoder hands its 2048 features to `compress`.
        backbone = torchvision.models.resnet50()
        backbone.fc = nn.Identity()
        self.encoder = backbone

        # Shared bottleneck feeding both heads.
        compress_layers = OrderedDict()
        compress_layers['linear0'] = nn.Linear(2048, 64)
        compress_layers['relu'] = nn.ReLU()
        self.compress = nn.Sequential(compress_layers)

        # Head producing one logit per grid-cell class.
        classification_layers = OrderedDict()
        classification_layers['linear1'] = nn.Linear(64, 59)
        self.classification = nn.Sequential(classification_layers)

        # Head producing one logit per possible object count.
        count_layers = OrderedDict()
        count_layers['count1'] = nn.Linear(64, 20)
        self.counts = nn.Sequential(count_layers)

    def forward(self, x):
        """Encode ``x`` and return ``(classification_logits, count_logits)``."""
        features = self.compress(self.encoder(x))
        class_logits = self.classification(features)
        count_logits = self.counts(features)
        return class_logits, count_logits

In [17]:
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = SimpleModel()

# for name, param in model.encoder.named_parameters():
#     if("bn" not in name):
#         param.requires_grad = False
        
# unfreeze_layers = [model.encoder.layer3, model.encoder.layer4]
# for layer in unfreeze_layers:
#     for param in layer.parameters():
#         param.requires_grad = True
        
model = model.to(device)
# Both heads are trained with sigmoid + binary cross-entropy on their raw logits.
location_criterion = nn.BCEWithLogitsLoss()
count_criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-4)
# Lowest validation loss seen so far; val() updates it and saves a checkpoint whenever
# it improves. Starting from infinity means the first epoch always checkpoints,
# whatever the scale of the loss.
best_val_loss = float("inf")

In [18]:
def train():
    """Run one training epoch over the labeled training scenes.

    A fresh ``LabeledDataset`` / ``DataLoader`` pair is built on every call, so the
    random augmentation and shuffling are redrawn each epoch. For every batch the
    model is optimised on ``15 * location_loss + count_loss``.

    Reads the module-level ``model``, ``optimizer``, ``location_criterion``,
    ``count_criterion`` and ``device``, plus ``epoch`` (the loop variable of the
    driver cell), which is only used in the progress line. Prints a progress line
    every 10 batches and the epoch averages at the end; returns ``None``.
    """
    model.train()
    labeled_trainset = LabeledDataset(image_folder=image_folder,
                                  annotation_file=annotation_csv,
                                  scene_index=train_labeled_scene_index,
                                  transform=train_transform,
                                  extra_info=True
                                 )
    train_loader = torch.utils.data.DataLoader(labeled_trainset, batch_size=64, num_workers=2, shuffle=True, collate_fn=front_collate_fn)

    train_losses = []
    loc_losses = []
    count_losses = []
    # road_img and bbs are part of every batch but are not needed for the loss.
    for i, (sample, target, road_img, bbs, target_count) in enumerate(train_loader):

        optimizer.zero_grad()

        sample = sample.to(device)
        target = target.to(device)
        target_count = target_count.to(device)

        y_hat, y_count = model(sample)

        # Targets are cast to float to match the dtype of the logits.
        loc_loss = location_criterion(y_hat, target.float())
        count_loss = count_criterion(y_count, target_count.float())
        # The location term is weighted 15x relative to the count term.
        loss = 15 * loc_loss + count_loss

        train_losses.append(loss.item())
        loc_losses.append(loc_loss.item())
        count_losses.append(count_loss.item())

        loss.backward()
        optimizer.step()

        if i % 10 == 0:
            # Percentage of batches completed (the factor is 100, not 10).
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, i * len(sample), len(train_loader.dataset),
                100. * i / len(train_loader), loss.item()))

    print("\nAverage Train Epoch Loss: ", np.mean(train_losses))
    print("Average Train Loc Epoch Loss: ", np.mean(loc_losses))
    print("Average Train Count Epoch Loss: ", np.mean(count_losses))
            
def val():
    """Evaluate the model on ``val_loader`` and checkpoint it when the loss improves.

    Computes ``15 * location_loss + count_loss`` for every validation batch without
    tracking gradients, prints the three per-batch averages, and saves the model's
    ``state_dict`` whenever the average total loss is below the module-level
    ``best_val_loss`` (which is then updated). Returns ``None``.
    """
    global best_val_loss

    model.eval()
    total_history, loc_history, count_history = [], [], []

    with torch.no_grad():
        # road_img and bbs are part of every batch but are not needed for the loss.
        for sample, target, road_img, bbs, target_count in val_loader:
            sample = sample.to(device)
            target = target.to(device)
            target_count = target_count.to(device)

            y_hat, y_count = model(sample)

            loc_loss = location_criterion(y_hat, target.float())
            count_loss = count_criterion(y_count, target_count.float())
            loss = 15 * loc_loss + count_loss

            total_history.append(loss.item())
            loc_history.append(loc_loss.item())
            count_history.append(count_loss.item())

    mean_total = np.mean(total_history)
    print("Average Validation Epoch Loss: ", mean_total)
    print("Average Validation Loc Epoch Loss: ", np.mean(loc_history))
    print("Average Validation Count Epoch Loss: ", np.mean(count_history))
    print("\n")

    # Keep only the weights of the best epoch seen so far.
    if mean_total < best_val_loss:
        best_val_loss = mean_total
        torch.save(model.state_dict(), 'best_val_loss_class_counts_flip.pt')

In [None]:
# Alternate one training pass and one validation pass per epoch.
epochs = 40
# `epoch` must remain a module-level name: train() reads it when printing progress.
for epoch in range(epochs):
    train()
    val()




Average Train Epoch Loss:  5.653563526543704
Average Train Loc Epoch Loss:  0.3287117200141603
Average Train Count Epoch Loss:  0.7228877727280963
Average Validation Epoch Loss:  6.0960118770599365
Average Validation Loc Epoch Loss:  0.3707309663295746
Average Validation Count Epoch Loss:  0.5350473895668983







Average Train Epoch Loss:  3.35056273503737
Average Train Loc Epoch Loss:  0.20161043039777063
Average Train Count Epoch Loss:  0.3264062987132506
Average Validation Epoch Loss:  5.452432334423065
Average Validation Loc Epoch Loss:  0.347235020250082
Average Validation Count Epoch Loss:  0.24390708096325397







Average Train Epoch Loss:  2.976323512467471
Average Train Loc Epoch Loss:  0.18608649921688167
Average Train Count Epoch Loss:  0.18502602184360678
Average Validation Epoch Loss:  4.799320757389069
Average Validation Loc Epoch Loss:  0.306241899728775
Average Validation Count Epoch Loss:  0.20569237880408764







Average Train Epoch Loss:  2.841462184082378
Average Train Loc Epoch Loss:  0.17874846878376874
Average Train Count Epoch Loss:  0.16023516282439232
Average Validation Epoch Loss:  4.680347561836243
Average Validation Loc Epoch Loss:  0.2985377460718155
Average Validation Count Epoch Loss:  0.2022814229130745







Average Train Epoch Loss:  2.8011733239347283
Average Train Loc Epoch Loss:  0.176446832716465
Average Train Count Epoch Loss:  0.1544708213345571
Average Validation Epoch Loss:  4.792924880981445
Average Validation Loc Epoch Loss:  0.3055608458817005
Average Validation Count Epoch Loss:  0.20951208844780922







Average Train Epoch Loss:  2.7104508768428457
Average Train Loc Epoch Loss:  0.17073033038865437
Average Train Count Epoch Loss:  0.14949592914093623
Average Validation Epoch Loss:  4.775478363037109
Average Validation Loc Epoch Loss:  0.3044836111366749
Average Validation Count Epoch Loss:  0.20822415873408318







Average Train Epoch Loss:  2.665528904307972
Average Train Loc Epoch Loss:  0.1678734631700949
Average Train Count Epoch Loss:  0.14742695844986223
Average Validation Epoch Loss:  4.61064088344574
Average Validation Loc Epoch Loss:  0.29236943274736404
Average Validation Count Epoch Loss:  0.22509931214153767







Average Train Epoch Loss:  2.5817325331948022
Average Train Loc Epoch Loss:  0.16232918575406075
Average Train Count Epoch Loss:  0.14679476110772652
Average Validation Epoch Loss:  4.608488380908966
Average Validation Loc Epoch Loss:  0.2933262772858143
Average Validation Count Epoch Loss:  0.20859413780272007







Average Train Epoch Loss:  2.5026538209481672
Average Train Loc Epoch Loss:  0.15721542963927443
Average Train Count Epoch Loss:  0.1444223648445173
Average Validation Epoch Loss:  4.335243225097656
Average Validation Loc Epoch Loss:  0.27486685290932655
Average Validation Count Epoch Loss:  0.2122402787208557







Average Train Epoch Loss:  2.4533475529063833
Average Train Loc Epoch Loss:  0.1540427908978679
Average Train Count Epoch Loss:  0.14270568401976066
Average Validation Epoch Loss:  4.200329959392548
Average Validation Loc Epoch Loss:  0.26508430019021034
Average Validation Count Epoch Loss:  0.2240653894841671







Average Train Epoch Loss:  2.3965169082988393
Average Train Loc Epoch Loss:  0.15028003552420574
Average Train Count Epoch Loss:  0.14231639727950096
Average Validation Epoch Loss:  4.2822107672691345
Average Validation Loc Epoch Loss:  0.271354952827096
Average Validation Count Epoch Loss:  0.21188639476895332







Average Train Epoch Loss:  2.326546110890128
Average Train Loc Epoch Loss:  0.1456779038364237
Average Train Count Epoch Loss:  0.1413775770501657
Average Validation Epoch Loss:  4.371231257915497
Average Validation Loc Epoch Loss:  0.2775772586464882
Average Validation Count Epoch Loss:  0.2075723558664322







Average Train Epoch Loss:  2.2680765905163507
Average Train Loc Epoch Loss:  0.1418877372687513
Average Train Count Epoch Loss:  0.13976054858754983
Average Validation Epoch Loss:  4.388655304908752
Average Validation Loc Epoch Loss:  0.277799554169178
Average Validation Count Epoch Loss:  0.2216620035469532







Average Train Epoch Loss:  2.232106859033758
Average Train Loc Epoch Loss:  0.13953970643607053
Average Train Count Epoch Loss:  0.1390112474222075
Average Validation Epoch Loss:  4.271104961633682
Average Validation Loc Epoch Loss:  0.27006446942687035
Average Validation Count Epoch Loss:  0.22013788670301437







Average Train Epoch Loss:  2.1872888071970507
Average Train Loc Epoch Loss:  0.13661059026013722
Average Train Count Epoch Loss:  0.13812994279644705
Average Validation Epoch Loss:  4.2923479080200195
Average Validation Loc Epoch Loss:  0.2700642719864845
Average Validation Count Epoch Loss:  0.2413838766515255







Average Train Epoch Loss:  2.137532724575563
Average Train Loc Epoch Loss:  0.13334406912326813
Average Train Count Epoch Loss:  0.1373716794293035
Average Validation Epoch Loss:  4.302679061889648
Average Validation Loc Epoch Loss:  0.2725216820836067
Average Validation Count Epoch Loss:  0.21485375985503197








In [None]:
# Removed random seeds. Also refresh train dataset and dataloader (with collate_fn called) every epoch. 
# Testing if this helps generalization, rather than having the same 3000-sized dataset every time. 

# 16.1 best val loss (epoch 14)
# I wonder if decreasing the number of classes would help. 
# ResNet pre-trained=True, freeze everything except last two layers, differential learning. 

# 0.31 best val loss
# 0.327 best val loss, ResNet 34 with L2 penalty (0.5)

# 0.2999 best val loss from combined classify + counts. 

# 0.286 best val loc loss, 2.79 best val count loss
# 0.275 best val loc loss, 3.23 best val count loss

# 0.288 best val loc loss, 2.82 best val count loss

# 0.34 best val loc loss, 2.73 best val count loss

# 0.265 best val loc loss, 2.708 best val count loss

# 0.268, 2.59 (hue -0.5)
# 0.260 2.45 (hue, 12 factor on loc loss)

# 0.277
# 0.272, 2.36
# 0.260, 2.45

# 0.258, 2.39
# Penalize the model for saying there isn't a car when there is. 

# Trying BCELoss for counts. 
# Added an extra 128 compression layer before bins/count prediction. 
# 0.255 best val loc loss. 

# Trying compression to 60 before bins/count prediction layer. 


#### Verifying targets are correct

In [None]:
# front_collate_fn returns five items per batch, so all five are unpacked here
# (target_counts is not used by the plotting cells). The builtin next() is used
# because DataLoader iterators do not reliably expose a .next() method.
sample, target, road_imgs, bbs, target_counts = next(iter(train_loader))

In [None]:
# Index of the sample being inspected; starts at -1 so the first increment selects sample 0.
idx = -1

In [None]:
# Advance to the next sample of the batch. NOTE: this cell is not idempotent; every re-run moves on by one.
idx += 1

In [None]:
# Move the first axis (presumably channels) last, since imshow expects height x width x channels.
plt.imshow(sample[idx].numpy().transpose(1, 2, 0))

In [None]:
def draw_box(ax, class_box):
    """Outline the 50 x 50 grid cell whose corner is ``class_box`` in green on ``ax``.

    ``class_box`` is indexed as ``(x, y)``; the outline is drawn as a closed
    polyline through the four corners. Note that this definition shadows the
    ``draw_box`` imported from ``helper`` at the top of the notebook.
    """
    left, top = class_box[0], class_box[1]
    right, bottom = left + 50, top + 50
    outline_xs = [left, left, right, right, left]
    outline_ys = [top, bottom, bottom, top, top]
    ax.plot(outline_xs, outline_ys, color="green")

In [None]:
# Scratch value for trying out torch.cat in the next cell; not used by the analysis.
boom = torch.tensor([4, 5])

In [None]:
# Scratch check: appending a one-element tensor with torch.cat (the same pattern closes the box outlines below).
torch.cat((boom, torch.tensor([6])))

In [None]:
fig, ax = plt.subplots()
ax.imshow(road_imgs[idx], cmap ='binary');
# Mark (400, 400), the origin used for the bird's-eye-view coordinates.
ax.plot(400, 400, 'x', color="red")

# target[idx] holds one entry per grid-cell class; every position equal to 1 is an
# occupied cell, and its index maps back to a cell corner via reverse_class_dict.
bin_ids = torch.nonzero(target[idx] == 1)
for bin_id in bin_ids:
    class_box = reverse_class_dict[bin_id]
    draw_box(ax, class_box)

# Overlay the kept bounding boxes; the first corner is appended again so that
# each outline is drawn closed.
for bb in bbs[idx]:
    corner_xs, corner_ys = bb[0], bb[1]
    x_ = torch.cat((corner_xs, torch.tensor([corner_xs[0]])))
    y_ = torch.cat((corner_ys, torch.tensor([corner_ys[0]])))
    ax.plot(x_, y_, color='orange')