## In this notebook, we will explore the dataset and convert the data into the input format required by the TensorFlow Object Detection API

References
1. https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/using_your_own_dataset.md
2. https://www.kaggle.com/khanhlvg/cots-detection-w-tensorflow-object-detection-api#Prepare-the-training-dataset

In [None]:
!pip install scikit-learn==1.0.2  we dont need this

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from ast import literal_eval
from PIL import Image, ImageDraw
from IPython import display
from tqdm.notebook import tqdm
import os

from sklearn.model_selection import StratifiedGroupKFold 




In [None]:
class CONFIG:
    """Notebook-wide settings: dataset locations and the number of CV folds."""

    # Number of folds requested from the K-fold splitter.
    n_splits = 5

    # Path of the metadata CSV (a file, despite the `_dir` suffix in the name).
    train_meta_data_dir = "../input/tensorflow-great-barrier-reef/train.csv"

    # Root folder under which the `video_<id>/<frame>.jpg` images are looked up.
    train_images_root_dir = "../input/tensorflow-great-barrier-reef/train_images"

In [None]:
# Load the metadata and parse the stringified `annotations` column into
# Python objects (literal_eval only evaluates literals, never arbitrary code).
train_df = pd.read_csv(CONFIG.train_meta_data_dir).assign(
    annotations=lambda frame: frame['annotations'].map(literal_eval)
)
train_df.head()

## Checking how many frames contain n starfish detections

In [None]:
# Number of annotation entries attached to each row.
train_df['n_cots_det'] = train_df['annotations'].map(len)
# Peek at five rows (positions 20 to 24).
train_df.iloc[20:25]

### We can see that there are some frames with a lot of detections
Let's check whether they are valid.

In [None]:
# Frequency table: how many rows have 0, 1, 2, ... annotations.
detections_per_frame = train_df['n_cots_det']
detections_per_frame.value_counts()

## Creating the filepaths for the dataset

In [None]:
def get_file_paths(dataframe, images_root_dir=None):
    """Return a copy of ``dataframe`` with a ``filepath`` column added.

    Each path is built as ``<images_root_dir>/video_<video_id>/<video_frame>.jpg``.

    The paths are built column-wise rather than through ``dataframe.iloc[i]``:
    a row Series has a single dtype, so in a frame with only numeric columns
    the integer ids would be upcast to float and formatted as ``video_0.0``.

    Args:
        dataframe: DataFrame with ``video_id`` and ``video_frame`` columns.
        images_root_dir: Folder that contains the ``video_<id>`` sub-folders.
            Defaults to ``CONFIG.train_images_root_dir`` when omitted.

    Returns:
        A new DataFrame; the input is left unmodified.
    """
    if images_root_dir is None:
        images_root_dir = CONFIG.train_images_root_dir

    filepaths = [
        os.path.join(images_root_dir, f"video_{video_id}", f"{video_frame}.jpg")
        for video_id, video_frame in zip(dataframe['video_id'], dataframe['video_frame'])
    ]

    dataframe = dataframe.copy()
    dataframe['filepath'] = filepaths
    return dataframe

# Attach the image path of every row to the metadata table.
train_df = train_df.pipe(get_file_paths)
train_df.head()

## Visualizing some of the training data


In [None]:
def plot_image_with_bbox(image_filepath, annotations):
    """Show one image with its annotation boxes outlined in red.

    Args:
        image_filepath: Path of the image file to open.
        annotations: Iterable of mappings with 'x', 'y', 'width' and 'height'
            keys; each is drawn as a rectangle from (x, y) to
            (x + width, y + height).
    """
    frame = Image.open(image_filepath)
    canvas = ImageDraw.Draw(frame)  # drawing happens directly on `frame`

    for box in annotations:
        left, top = box['x'], box['y']
        right = left + box['width']
        bottom = top + box['height']
        canvas.rectangle([left, top, right, bottom], outline='red', width=3)

    # A new, large figure per image.
    plt.figure(figsize=(20, 20))
    plt.imshow(frame)

def plot_batch_image_with_bbox(subset_df, max_plots=10):
    """Plot the images of ``subset_df`` with their bounding boxes.

    Args:
        subset_df: DataFrame with ``filepath`` and ``annotations`` columns.
            It is only read, never modified.
        max_plots: Upper bound on the number of rows plotted; the first
            ``max_plots`` rows are used. Pass ``None`` to plot every row.
    """
    # Honour the cap so a large selection does not open one 20x20 figure per row.
    rows_to_plot = subset_df if max_plots is None else subset_df.head(max_plots)

    # Iterate over the two needed columns directly; no index reset is required,
    # so the caller's frame is not mutated.
    for filepath, annotations in zip(rows_to_plot['filepath'], rows_to_plot['annotations']):
        plot_image_with_bbox(filepath, annotations)
        

In [None]:
# Inspect the rows that carry exactly 18 annotations.
has_18_detections = train_df['n_cots_det'] == 18
plot_batch_image_with_bbox(train_df[has_18_detections])

### Train validation split
For these 3 videos, it's important to split the data into train and validation sets properly: consecutive frames are very similar, so splitting them randomly would cause a serious data leak. We can also see that around 22% of the images contain COTS, so the split should keep approximately that ratio to reflect the "true" distribution. Some options are as follows:

1. Train one model per video and ensemble them, although inference might then take too long
2. Group the data into fixed-size windows to minimize the number of consecutive frames that are shared between the training and validation data
3. Split the data into groups separated by zero-detection frames and use StratifiedGroupKFold


In [None]:
# Binary flag: 1 when the row has at least one annotation, 0 otherwise.
has_detection = train_df['n_cots_det'].gt(0)
train_df['is_cot_detected'] = has_detection.astype(np.int32)
train_df['is_cot_detected'].hist()

In [None]:
def assign_group(is_detected_list: list, max_group_length: int = 2000) -> list:
    """Assign a group number to every frame, in order.

    A new group starts at frame ``i`` when either

    * frame ``i`` has no detection (0) and frame ``i - 1`` had one (1), i.e. a
      run of detections has just ended; or
    * ``max_group_length`` frames have been appended to the current group after
      its first frame and frame ``i - 1`` has no detection, so a run of
      detections is never cut in the middle by the length cap.

    Note that a length-capped group therefore holds ``max_group_length + 1``
    frames (its first frame plus ``max_group_length`` more).

    Args:
        is_detected_list: Sequence of 0/1 flags, one per frame.
        max_group_length: Length cap described above (default 2000).

    Returns:
        A list of group numbers with the same length as the input, starting
        at 0 and never decreasing. An empty input gives an empty list.
    """
    n_samples = len(is_detected_list)
    if n_samples == 0:
        # Without this guard the seed value below would yield one label for
        # zero frames.
        return []

    groupings = [0]  # the first frame always opens group 0
    group_num = 0
    length_counter = 0
    for i in range(1, n_samples):
        previous, current = is_detected_list[i - 1], is_detected_list[i]
        detection_run_ended = current == 0 and previous == 1
        length_cap_reached = length_counter == max_group_length and previous == 0

        if detection_run_ended or length_cap_reached:
            group_num += 1
            length_counter = 0
        else:
            length_counter += 1
        groupings.append(group_num)

    return groupings
   

In [None]:
# Derive the group ids from the detection flags, in row order.
detection_flags = train_df['is_cot_detected'].tolist()
train_df['group_number'] = assign_group(detection_flags)
train_df.head()

In [None]:
# Show the group id assigned to each row.
train_df.loc[:, 'group_number']

In [None]:
# Assign every row to one validation fold, stratified on `is_cot_detected`
# and grouped by `group_number`.
k_fold_spilter = StratifiedGroupKFold(n_splits=CONFIG.n_splits)

# Create the target column explicitly (-1 = not assigned yet) and look up its
# position. Writing to column position -1 would overwrite whichever column
# happens to be last (here `group_number`) and never create `k_fold`.
train_df['k_fold'] = -1
k_fold_col = train_df.columns.get_loc('k_fold')

fold_iter = k_fold_spilter.split(
    train_df['is_cot_detected'],  # placeholder X (same column as y)
    train_df['is_cot_detected'],
    groups=train_df['group_number'],
)
for k_fold_n, (train_idx, test_idx) in enumerate(fold_iter):
    # `test_idx` holds row positions, hence positional indexing.
    train_df.iloc[test_idx, k_fold_col] = k_fold_n

assert (train_df['k_fold'] >= 0).all(), "some rows were not assigned to a fold"

In [None]:
# Display the frame (pandas truncates the rendered output for long frames).
train_df

In [None]:
# Per-fold sanity check: size, number and share of rows with detections, and
# which groups the fold contains. The fold count comes from CONFIG so this
# report stays in sync with the splitter.
for i in range(CONFIG.n_splits):
    subset = train_df[train_df['k_fold'] == i].copy()
    n_frames = len(subset)
    n_detected = subset['is_cot_detected'].sum()
    print(f"Fold {i} length: ", n_frames)
    print(f"Fold {i} cot det", n_detected)
    print(f"Fold {i} cot det percentage", n_detected / n_frames)
    print("Groups")
    print(subset['group_number'].unique())
    print()

In [None]:
# Write the table to a CSV in the working directory; the index is written too
# (pandas default).
train_df.to_csv(path_or_buf="train_df.csv")