This preliminary notebook uses some code from https://www.kaggle.com/mrinath/efficientdet-train-pytorch

In [None]:
# Try doing all the installations here
# `-I` (--ignore-installed) makes pip reinstall the package even if a copy is already present.
# NOTE(review): none of these installs pin a version, so results depend on what pip resolves at run time.
!pip install -I numpy

!pip install -I torchvision
!pip install -I torch -U   

# First, we need to install pycocotools. This library will be used for computing the evaluation metrics following the COCO metric for intersection over union.

# cython is installed first; the pycocotools install below is forced to build from source (--no-binary).
!pip install cython
# Install pycocotools, the version by default in Colab
# has a bug fixed in https://github.com/cocodataset/cocoapi/pull/354
!pip install -I 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI' --no-binary pycocotools
# NOTE(review): this listing is unrelated to the installs — looks like a leftover check of an attached input dataset.
!ls /kaggle/input/baseline-predict-pytorch

# # !pip install -I pycocotools==2.0.0


In [None]:
# !ls /kaggle/input/baseline-predict-pytorch

In [None]:
# !cp -r /kaggle/input/cococode/PythonAPI/pycocotools/ /kaggle/working

In [None]:
# !python /kaggle/input/cococode/PythonAPI/setup.py  build_ext install

In [None]:
# import pycocotools._mask as mask

In [None]:
# git clone the utility functions and evaluation functions from pytorch coco dataset

!git clone https://github.com/pytorch/vision.git
!cd vision
!git checkout v0.8.2

In [None]:
!ls vision/references

In [None]:
# Copy the torchvision detection reference scripts into the working directory.
# Later cells import `utils`, `transforms` and `engine` from these copies; the
# other two files are copied alongside them.
!cp vision/references/detection/utils.py /kaggle/working/
!cp vision/references/detection/transforms.py /kaggle/working/
!cp vision/references/detection/coco_eval.py /kaggle/working/
!cp vision/references/detection/engine.py /kaggle/working/
!cp vision/references/detection/coco_utils.py /kaggle/working/

In [None]:
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import sys
import os

# os.environ['TORCH_HOME'] = '\\kaggle\\input\\resnet'

import glob
import sklearn
import math
import random

import cv2
import albumentations as A
from albumentations.pytorch import ToTensorV2

from transformers import get_cosine_schedule_with_warmup

from PIL import Image

import warnings
warnings.filterwarnings('ignore')
from sklearn import metrics, model_selection, preprocessing
from sklearn.model_selection import GroupKFold
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(device)


In [None]:
# Load the full training annotation table. The filter that would keep only
# frames with detections is left disabled below, so every row is retained.
df = pd.read_csv("../input/tensorflow-great-barrier-reef/train.csv")
# df = df[df.annotations != '[]']
# df = df.reset_index(drop = True)
df.head(20)

In [None]:
# Tag every row with a validation fold id (0-4); -1 marks "not assigned yet".
# Rows are grouped by `sequence`, so all frames of one sequence share a fold.
df['fold'] = -1
kf = GroupKFold(n_splits = 5)
# NOTE(review): `y` is passed the video ids here; presumably GroupKFold splits on `groups` only — confirm.
for fold, (train_idx, val_idx) in enumerate(kf.split(df, y = df.video_id.tolist(), groups=df.sequence)):
    # val_idx holds positional indices but is used as labels via .loc;
    # assumes df still has its default 0..n-1 index.
    df.loc[val_idx, 'fold'] = fold

In [None]:
df.head()

In [None]:
df.fold.value_counts()

In [None]:
# Add the image path, the parsed annotations and the per-frame box count to the dataframe.
df['path'] = [
    f"../input/tensorflow-great-barrier-reef/train_images/video_{video_id}/{video_frame}.jpg"
    for video_id, video_frame in zip(df["video_id"], df["video_frame"])
]
# `annotations` comes out of the CSV as text. ast.literal_eval only accepts Python
# literals, whereas eval would execute arbitrary code found in the file.
# Values that are already parsed pass through unchanged, so re-running this cell is safe.
df['annotations'] = df['annotations'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)
df['number_boxes'] = df['annotations'].apply(len)
df.head()

In [None]:
# plot some of the images
import matplotlib.pyplot as plt
from matplotlib import patches

def get_rectangle_edges_from_pascal_bbox(bbox):
    """Turn a Pascal VOC box into the anchor/width/height triple used for drawing.

    Args:
        bbox: sequence of four numbers ``(xmin, ymin, xmax, ymax)``.

    Returns:
        ``(anchor, width, height)`` where ``anchor`` is ``(xmin, ymax)``,
        ``width`` is ``xmax - xmin`` and ``height`` is ``ymin - ymax``
        (negative whenever ``ymax > ymin``).
    """
    x_left, y_top, x_right, y_bottom = bbox

    anchor = (x_left, y_bottom)
    return anchor, x_right - x_left, y_top - y_bottom

def draw_pascal_voc_bboxes(
    plot_ax,
    bboxes,
    get_rectangle_corners_fn=get_rectangle_edges_from_pascal_bbox,
):
    """Draw each box on ``plot_ax`` as a red rectangle with a black outline.

    Args:
        plot_ax: matplotlib axes that receives the rectangle patches.
        bboxes: iterable of boxes; each one is passed to ``get_rectangle_corners_fn``.
        get_rectangle_corners_fn: callable mapping one box to
            ``(anchor, width, height)`` for ``patches.Rectangle``.
    """
    for bbox in bboxes:
        bottom_left, width, height = get_rectangle_corners_fn(bbox)

        # The black rectangle is drawn first and wider than the red one so it
        # stays visible as an outline. With equal line widths the red
        # rectangle covered it completely.
        outline = patches.Rectangle(
            bottom_left,
            width,
            height,
            linewidth=4,
            edgecolor="black",
            fill=False,
        )
        box = patches.Rectangle(
            bottom_left,
            width,
            height,
            linewidth=2,
            edgecolor="red",
            fill=False,
        )

        # Patch order matters: later patches are painted on top of earlier ones.
        plot_ax.add_patch(outline)
        plot_ax.add_patch(box)

def draw_image(
    image, bboxes=None, draw_bboxes_fn=draw_pascal_voc_bboxes, figsize=(10, 10)
):
    """Show ``image`` in a new figure, optionally with bounding boxes on top.

    Args:
        image: image data accepted by ``Axes.imshow``.
        bboxes: boxes handed to ``draw_bboxes_fn``; nothing is drawn when ``None``.
        draw_bboxes_fn: callable ``(axes, bboxes)`` that adds the boxes to the axes.
        figsize: figure size forwarded to ``plt.subplots``.
    """
    _, axes = plt.subplots(1, figsize=figsize)
    axes.imshow(image)

    has_boxes = bboxes is not None
    if has_boxes:
        draw_bboxes_fn(axes, bboxes)

    plt.show()

In [None]:
class DataAdaptor:
    """Wrapper around the annotation dataframe for loading and viewing frames.

    Rows are addressed by position (``0 .. len(self) - 1``).
    """

    # Frame size in pixels; the far corners of boxes are clipped to it.
    IMAGE_WIDTH = 1280
    IMAGE_HEIGHT = 720

    def __init__(self, df):
        """Keep a reference to ``df``; it must have ``path`` and ``annotations`` columns."""
        self.df = df

    def __len__(self):
        """Number of rows (frames) in the wrapped dataframe."""
        return len(self.df)

    def get_boxes(self, row):
        """Return the boxes of ``row`` as an ``(N, 4)`` float array.

        ``row['annotations']`` is a list of dicts with ``x``, ``y``, ``width``
        and ``height``. The result uses ``[x_min, y_min, x_max, y_max]`` and is
        shaped ``(0, 4)`` when the frame has no annotations.
        """
        # copy=True guarantees a writable array; pandas may otherwise hand back
        # a read-only view, which would break the in-place updates below.
        boxes = pd.DataFrame(
            row['annotations'], columns=['x', 'y', 'width', 'height']
        ).to_numpy(dtype=float, copy=True)

        # Change from [x_min, y_min, w, h] to [x_min, y_min, x_max, y_max],
        # clipping the far corner to the frame.
        boxes[:, 2] = np.clip(boxes[:, 0] + boxes[:, 2], 0, self.IMAGE_WIDTH)
        boxes[:, 3] = np.clip(boxes[:, 1] + boxes[:, 3], 0, self.IMAGE_HEIGHT)

        return boxes

    def get_image_bb(self, idx):
        """Load the frame at position ``idx`` together with its boxes.

        Returns:
            ``(image, bboxes, class_labels, idx)`` where ``image`` is an RGB
            array, ``bboxes`` comes from ``get_boxes`` and ``class_labels`` is
            an array of ones with one entry per box.

        Raises:
            FileNotFoundError: if the image file cannot be read.
        """
        # Take the row once so the path and the boxes always come from the same
        # record. Mixing .loc and .iloc lookups only agrees on a default index.
        row = self.df.iloc[idx]
        img_src = row['path']
        image = cv2.imread(img_src)
        if image is None:
            # cv2.imread signals an unreadable file by returning None.
            raise FileNotFoundError(f"Could not read image: {img_src}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        bboxes = self.get_boxes(row)
        class_labels = np.ones(len(bboxes))
        return image, bboxes, class_labels, idx

    def show_image(self, index):
        """Display the frame at ``index`` with its boxes and return the image array."""
        image, bboxes, class_labels, image_id = self.get_image_bb(index)
        print(f"image_id: {image_id}")
        draw_image(image, bboxes.tolist())
        return image

In [None]:
train_ds = DataAdaptor(df)

In [None]:
im,bb,_,_ = train_ds.get_image_bb(4005)
bb

In [None]:
img = train_ds.show_image(2016)

In [None]:
np.where(df["number_boxes"] > 2)

In [None]:
# Count the frames of each training video and plot them as a bar chart.
# NOTE(review): range(3) and the labels below hard-code video ids 0, 1 and 2.
num_seq = [len(df[df['video_id'] == i]) for i in range(3)]
labels = ["0", "1", "2"]

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(9,6))
ax.set_facecolor('aliceblue')
# zorder keeps the grid (0) behind the bars (3).
plt.grid(color="gray", linestyle="-", zorder=0)
plt.ylabel("Number of Frames", fontsize=16, fontweight="bold")
plt.xlabel("Video ID", fontsize=16, fontweight="bold")
plt.title("Length of train videos", fontsize=20, fontweight="bold")
plt.bar(labels, num_seq, color="orange", zorder=3)
plt.show()

In [None]:
# Find the largest per-frame box count and pick one frame that reaches it.
max_num = max(df.number_boxes)
# .sample() draws one row at random from the frames tied for the maximum (no seed is set here).
max_sample = df[df["number_boxes"] == max_num].sample()
max_vid_id = max_sample.video_id.values[0]
max_vid_frame = max_sample.video_frame.values[0]

# \033[1m ... \033[0m wraps the message in ANSI bold on/off codes.
print('\033[1m' + f"Maximum number of starfish in one frame: {max_num} (Video {max_vid_id}, Frame {max_vid_frame})" + '\033[0m')

In [None]:
img = train_ds.show_image(max_sample.index[0])

In [None]:
# Check number of samples without boxes
# NOTE(review): min_num is assigned but not used in this cell.
min_num = 0
min_sample = df[df["number_boxes"] == 0]
# Prints: frames without boxes, total frames, frames with at least one box.
print(len(min_sample), len(df), len(df)-len(min_sample))

# Training an Object Detection Model
Before settling on PyTorch, we trained the model using the TensorFlow and Keras libraries, but PyTorch performed better.
We used the pre-trained Faster R-CNN model and fine-tuned it to the requirements of this project.
PyTorch uses torch.utils.data.DataLoader and torch.utils.data.Dataset class to work with data. Dataset stores the samples and their corresponding labels, and DataLoader wraps an iterable around the Dataset. 
For this object detection project, we define a custom dataset class, `CotsData`, that subclasses `torch.utils.data.Dataset`. In Python, classes are used to declare, initialize, and manipulate objects, so the dataset is written as a class.
The `__getitem__` method reads the image from the path stored in the dataframe row at the given index and collects all the bounding boxes associated with that image. We then build a dict called `target`, which is passed to the model for training. This target holds the annotation metadata: the bounding-box coordinates, their corresponding labels, the `image_id`, and the area of each bounding box. The `area` entry is used during evaluation with the COCO metric to separate the metric scores between small, medium, and large boxes. If we set `iscrowd` to True for an instance, that instance is ignored during evaluation. The `__len__` method gives the size of the dataset.

In [None]:
import os
import numpy as np
import torch
import torch.utils.data
from PIL import Image

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset,DataLoader
from torch import optim
from torchvision import transforms


class CotsData(torch.utils.data.Dataset):
    """Detection dataset over the annotation dataframe.

    Each item is ``(image, target)`` where ``target`` is a dict with the keys
    ``boxes``, ``labels``, ``image_id``, ``area`` and ``iscrowd``.
    Rows are addressed by position (``0 .. len(self) - 1``).
    """

    # Frame size in pixels; the far corners of boxes are clipped to it.
    IMAGE_WIDTH = 1280
    IMAGE_HEIGHT = 720

    def __init__(self, df, transforms=None):
        """Store the dataframe and an optional ``transforms(img, target)`` callable.

        ``df`` must have ``path`` and ``annotations`` columns.
        """
        self.ds = df
        self.transforms = transforms

    def get_boxes(self, row):
        """Return the boxes of ``row`` as an ``(N, 4)`` float array.

        ``row['annotations']`` is a list of dicts with ``x``, ``y``, ``width``
        and ``height``. The result uses ``[x_min, y_min, x_max, y_max]`` and is
        shaped ``(0, 4)`` when the frame has no annotations.
        """
        # copy=True guarantees a writable array; pandas may otherwise hand back
        # a read-only view, which would break the in-place updates below.
        boxes = pd.DataFrame(
            row['annotations'], columns=['x', 'y', 'width', 'height']
        ).to_numpy(dtype=float, copy=True)

        # Change from [x_min, y_min, w, h] to [x_min, y_min, x_max, y_max],
        # clipping the far corner to the frame.
        boxes[:, 2] = np.clip(boxes[:, 0] + boxes[:, 2], 0, self.IMAGE_WIDTH)
        boxes[:, 3] = np.clip(boxes[:, 1] + boxes[:, 3], 0, self.IMAGE_HEIGHT)

        return boxes

    def __getitem__(self, idx):
        """Load the image at position ``idx`` and build its target dict."""
        # Take the row once so the image path and the boxes always come from
        # the same record. Mixing .loc and .iloc only agrees on a default index.
        row = self.ds.iloc[idx]
        img = Image.open(row['path']).convert("RGB")

        boxes = torch.as_tensor(self.get_boxes(row), dtype=torch.float32)
        # Count objects from the boxes themselves so labels/iscrowd always match
        # them, without depending on a separately computed column.
        num_objs = boxes.shape[0]

        # there is only one class
        labels = torch.ones((num_objs,), dtype=torch.int64)

        image_id = torch.tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        # suppose all instances are not crowd
        iscrowd = torch.zeros((num_objs,), dtype=torch.int64)

        target = {
            "boxes": boxes,
            "labels": labels,
            "image_id": image_id,
            "area": area,
            "iscrowd": iscrowd,
        }

        if self.transforms is not None:
            img, target = self.transforms(img, target)

        return img, target

    def __len__(self):
        """Number of rows (frames) in the dataset."""
        return len(self.ds)

In [None]:
!ls /kaggle/input/tensorflow-great-barrier-reef

In [None]:
img_npy = np.load("/kaggle/input/tensorflow-great-barrier-reef/example_test.npy")

In [None]:
testdf = pd.read_csv("/kaggle/input/tensorflow-great-barrier-reef/test.csv")

In [None]:
subdf = pd.read_csv("/kaggle/input/tensorflow-great-barrier-reef/example_sample_submission.csv")
subdf

In [None]:
!cat transforms.py

In [None]:
!ls /kaggle/input/tensorflow-great-barrier-reef/greatbarrierreef/

# Training, Validation Datasets and Data Loaders.
The next step is to create our training and validation datasets. The data loader loads the training data in batches into the model for training. For this also we will be using PyTorch’s DataLoader utility.

In [None]:
# get training and validation dataframes
def get_train_val(df, train=True, val_video_id=2, n_empty=None, random_state=1):
    """Split ``df`` by video and down-sample the frames without boxes.

    Args:
        df: dataframe with ``video_id`` and ``number_boxes`` columns.
        train: if True, use every video except ``val_video_id``; otherwise use
            only ``val_video_id``.
        val_video_id: id of the video held out for validation.
        n_empty: maximum number of box-free frames to keep. Defaults to 1000
            for the training split and 100 for the validation split.
        random_state: seed for sampling the box-free frames.

    Returns:
        All frames of the split that have at least one box, followed by the
        sampled box-free frames. The original index is kept.
    """
    if n_empty is None:
        n_empty = 1000 if train else 100

    is_val_video = df.video_id == val_video_id
    split = df[~is_val_video] if train else df[is_val_video]

    # Build the masks from the split itself, not from the full dataframe, so
    # pandas does not have to realign a longer boolean Series.
    with_boxes = split[split.number_boxes > 0]
    without_boxes = split[split.number_boxes == 0]

    # sample(replace=False) raises when n exceeds the number of rows, so cap it.
    n_keep = min(n_empty, len(without_boxes))
    sampled_empty = without_boxes.sample(n=n_keep, replace=False, random_state=random_state)

    return pd.concat([with_boxes, sampled_empty])

In [None]:
# fold_n = 1
# train_df= df[df.fold != fold_n]
# val_df  = df[df.fold == fold_n]

train_df = get_train_val(df, train=True)
val_df = get_train_val(df, train=False)

# use our dataset and defined transformations
dataset = CotsData(train_df.reset_index(drop=True), get_transform(train=True))
dataset_test = CotsData(val_df.reset_index(drop=True), get_transform(train=False))

In [None]:
# split the dataset in train and test set
torch.manual_seed(1)
# indices = torch.randperm(len(dataset)).tolist()
# dataset = torch.utils.data.Subset(dataset, indices[:-50])
# dataset_test = torch.utils.data.Subset(dataset_test, indices[-50:])

# define training and validation data loaders
data_loader = torch.utils.data.DataLoader(
    dataset, batch_size=8, shuffle=True, num_workers=4,
    collate_fn=utils.collate_fn)

data_loader_test = torch.utils.data.DataLoader(
    dataset_test, batch_size=1, shuffle=False, num_workers=4,
    collate_fn=utils.collate_fn)

In [None]:
from engine import train_one_epoch, evaluate
import utils
import transforms as T


def get_transform(train):
    transforms = []
    # converts the image, a PIL image, into a PyTorch Tensor
    transforms.append(T.ToTensor())
    if train:
        # during training, randomly flip the training images
        # and ground-truth for data augmentation
        transforms.append(T.RandomHorizontalFlip(0.5))
#         transforms.append(T.RandomPhotometricDistort())
#         transforms.append(T.RandomZoomOut())
    return T.Compose(transforms)

# Faster R-CNN
Faster R-CNN replaces the Selective Search technique with a much more efficient Region Proposal Network that proposes the candidate object regions in an image. It is widely used for object detection tasks that need near-real-time performance.
We initialize our model using torchvision’s FasterRCNN with a resnet50 backbone. 
We set pretrained as true, so the function will return a model pre-trained on COCO. Here we set the num_classes as 2, considering background as one class.
Before we start the training we can declare the number of epochs to train and also set the optimizer and learning rate scheduler.
The optimizer we are using here is SGD (Stochastic Gradient Descent). The learning rate scheduler adjusts the learning rate during the course of training to improve accuracy and speed up convergence. We use the StepLR scheduler, which decays the learning rate of each parameter group by gamma every step_size epochs. The gamma and step_size hyperparameters determine the learning rate decay. Finally, we train this model for 20 epochs.

In [None]:
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
      
def get_instance_segmentation_model(num_classes):
    """Build a Faster R-CNN (ResNet-50 FPN) detector with a fresh box head.

    Despite the function name, only the box predictor is replaced; no mask
    head is added here.

    Args:
        num_classes: number of output classes for the new box predictor.

    Returns:
        The model loaded with ``pretrained=True`` whose
        ``roi_heads.box_predictor`` is a new ``FastRCNNPredictor``.
    """
    detector = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

    # Size the replacement head from the input width of the existing classifier layer.
    existing_head = detector.roi_heads.box_predictor
    detector.roi_heads.box_predictor = FastRCNNPredictor(
        existing_head.cls_score.in_features, num_classes
    )

    return detector

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# our dataset has two classes only - background and starfish
num_classes = 2

# get the model using our helper function
model = get_instance_segmentation_model(num_classes)
# move model to the right device
model.to(device)

# construct an optimizer over the parameters that require gradients
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.05,
                            momentum=0.9, weight_decay=0.0005)

# and a learning rate scheduler which decreases the learning rate by
# 10x every 3 epochs
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                               step_size=3,
                                               gamma=0.1)

In [None]:
# NOTE(review): StepLR is imported here but not referenced in this cell;
# the scheduler object was already created in the setup cell.
from torch.optim.lr_scheduler import StepLR
num_epochs = 20

for epoch in range(num_epochs):
    # train for one epoch, printing every 10 iterations
    train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10)
    # update the learning rate
    lr_scheduler.step()
    # evaluate on the test dataset
    evaluate(model, data_loader_test, device=device)

# Save only the model weights (state_dict), once, after the last epoch.
torch.save(model.state_dict(), 'checkpoint-video2.pth')

The model is saved for inference to make our predictions. Once training is completed, we will have the `checkpoint-video2.pth` file.

# Observations During Training
During the fine-tuning process, we set the number of trainable backbone layers to 4 and trained for 30 epochs, which caused our model to perform poorly, most likely due to overfitting.

In [None]:
# PATH = 'checkpoint.pth'
# torch.save({
#             'epoch': epoch,
#             'model_state_dict': model.state_dict(),
#             'optimizer_state_dict': optimizer.state_dict(),
#             'loss': loss,
#             }, PATH)


In [None]:
import os
# Switch to the working directory, addressed relative to the current directory.
os.chdir(r'../working')
from IPython.display import FileLink
# As the cell's last expression, this displays a link to the saved checkpoint file.
FileLink(r'checkpoint-video2.pth')

In [None]:
# model_path = '/kaggle/input/savemodel/checkpoint.pth'

In [None]:
# state_dict = torch.load(model_path)
# # print(state_dict.keys())
# model.load_state_dict(state_dict)

In [None]:
# def apply_nms(orig_prediction, iou_thresh=0.3, score_thresh=0.35):
    
#     # torchvision returns the indices of the bboxes to keep
#     # function to implement non maximm suppression
#     # might also need to eliminate predictions with very low scores
#     # trim low scores first
    
#     keep = orig_prediction['scores'] >= score_thresh
    
#     scores_prediction = {}
#     scores_prediction['boxes'] = orig_prediction['boxes'][keep]
#     scores_prediction['scores'] = orig_prediction['scores'][keep]
#     scores_prediction['labels'] = orig_prediction['labels'][keep]
    
#     keep = torchvision.ops.nms(scores_prediction['boxes'], scores_prediction['scores'], iou_thresh)
    
#     final_prediction = {}
#     final_prediction['boxes'] = scores_prediction['boxes'][keep]
#     final_prediction['scores'] = scores_prediction['scores'][keep]
#     final_prediction['labels'] = scores_prediction['labels'][keep]
    
#     return final_prediction

# def return_predict_string(predictions):
#     str_p = ''
#     for i, score in enumerate(predictions['scores']):
#         box = predictions['boxes'][i].cpu()
#         str_p += f'{score} {int(np.round(box[0]))} {int(np.round(box[1]))} {int(np.round(box[2]-box[0]))} {int(np.round(box[3]-box[1]))} '
    
#     str_p = str_p.strip(' ')
#     if str_p == '':
#         str_p = '0.9 716 678 54 42'
    
#     return str_p

# def preprocess_img(img):
#     img = img/255.
#     x,y, c = img.shape
#     img = img.reshape(c,x,y)
#     return torch.from_numpy(img)

In [None]:
# # pick one image from the test set
# img, target = dataset_test[5]
# # put the model in evaluation mode
# model.eval()
# with torch.no_grad():
#     prediction = model([img.to(device=device, dtype=torch.float)])[0]
#     final_pred = apply_nms(prediction, 0.2)
    
# print('predicted #boxes: ', len(prediction['labels']))
# print('real #boxes: ', len(target['labels']))
# print('nms predict #boxes: ', len(final_pred['labels']))
# print('scores: ', prediction['scores'])
# print(return_predict_string(prediction))

# print(prediction)

In [None]:
# import greatbarrierreef
# rows=[]
# ii = 0
# env = greatbarrierreef.make_env()   # initialize the environment
# iter_test = env.iter_test()    # an iterator which loops over the test set and sample submission
# for (pixel_array, sample_prediction_df) in iter_test:
#     pixel_p = preprocess_img(pixel_array)
#     prediction = model([pixel_p.to(device, dtype=torch.float)])[0]
#     sample_prediction_df['annotations'] = anno = '0.5 0 0 100 100' #return_predict_string(apply_nms(prediction, 0.3))  # make your predictions here
#     rows.append([ii, anno])
#     env.predict(sample_prediction_df)
#     ii += 1



In [None]:
# rows
# model([pixel_p.to(device=device, dtype=torch.float)])[0]

In [None]:
# test_ds = DataAdaptor(val_df.reset_index(drop=True))

In [None]:
# img5 = test_ds.show_image(5)

In [None]:
# pixel_p*255.

In [None]:
# plt.imshow(pixel_p.reshape(720,1280,3))