<a href="https://colab.research.google.com/github/wtaisner/tensorflow-great-barrier-reef/blob/main/data_augmentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Data augmentation

In [4]:
# Mount Google Drive so the notebook can read the raw data from, and write the
# prepared dataset to, paths under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip uninstall opencv-python-headless -y
!pip install opencv-python-headless==4.1.2.30
!pip install --upgrade albumentations

Collecting opencv-python-headless==4.1.2.30
  Downloading opencv_python_headless-4.1.2.30-cp37-cp37m-manylinux1_x86_64.whl (21.8 MB)
[K     |████████████████████████████████| 21.8 MB 1.7 MB/s 
Installing collected packages: opencv-python-headless
Successfully installed opencv-python-headless-4.1.2.30
Collecting albumentations
  Downloading albumentations-1.1.0-py3-none-any.whl (102 kB)
[K     |████████████████████████████████| 102 kB 4.4 MB/s 
Collecting qudida>=0.0.4
  Downloading qudida-0.0.4-py3-none-any.whl (3.5 kB)
Installing collected packages: qudida, albumentations
  Attempting uninstall: albumentations
    Found existing installation: albumentations 0.1.12
    Uninstalling albumentations-0.1.12:
      Successfully uninstalled albumentations-0.1.12
Successfully installed albumentations-1.1.0 qudida-0.0.4


In [None]:
import albumentations as A
import cv2
import pandas as pd
import ast
import os
from tqdm import tqdm
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import matplotlib as plt
import yaml

In [None]:
# Reset the working directory, then move into the mounted Drive root; the
# relative './MyDrive/...' paths in the following cell are resolved from here.
%cd
%cd /content/drive

In [None]:
# Create the image and label folders for the train/val/test splits under
# MyDrive/yolo/data (paths are relative to the current working directory).
# -p makes the commands safe to re-run when the folders already exist.
%mkdir -p './MyDrive/yolo/data/images/train'
%mkdir -p './MyDrive/yolo/data/images/val'
%mkdir -p './MyDrive/yolo/data/images/test'
%mkdir -p './MyDrive/yolo/data/labels/train'
%mkdir -p './MyDrive/yolo/data/labels/test'
%mkdir -p './MyDrive/yolo/data/labels/val'


In [None]:
# Leave the Drive directory and create the local folder that receives the
# original and augmented images (same location as COLAB_SAVE + '/images').
%cd
%mkdir -p '/content/augmented/images'

In [1]:
# Frame size in pixels, used to normalise bounding boxes.
WIDTH, HEIGHT = 1280, 720

# Location of the annotation CSV and the image root for each environment.
KAGGLE_PATH_ANNOTATIONS, KAGGLE_PATH_IMG_DIR = (
    '/kaggle/input/tensorflow-great-barrier-reef/train.csv',
    '/kaggle/input/tensorflow-great-barrier-reef/train_images/',
)
LOCAL_PATH_ANNOTATIONS, LOCAL_PATH_IMG_DIR = (
    'data/train.csv',
    'data/train_images/',
)
COLAB_PATH_ANNOTATIONS, COLAB_PATH_IMG_DIR = (
    '/content/drive/MyDrive/data/train.csv',
    '/content/drive/MyDrive/data/train_images/',
)

# Where the augmented images and their data.csv are written.
KAGGLE_SAVE = '/kaggle/augmented'
COLAB_SAVE = '/content/augmented'

# Split folders, relative to the environment prefix below.
TRAIN_IMG, VAL_IMG, TEST_IMG = 'data/images/train', 'data/images/val', 'data/images/test'
TRAIN_LBL, VAL_LBL, TEST_LBL = 'data/labels/train', 'data/labels/val', 'data/labels/test'

# Root under which the split folders live, per environment.
KAGGLE_PREFIX = '/kaggle/working/'
COLAB_PREFIX = '/content/drive/MyDrive/yolo/'

In [None]:
def _with_yolo_boxes(*augmentations):
    """Wrap the given augmentations in a pipeline that also transforms YOLO boxes.

    Box labels are passed to the pipeline through the 'category_ids' field.
    """
    return A.Compose(
        list(augmentations),
        bbox_params=A.BboxParams(format='yolo', label_fields=['category_ids']),
    )


# Deterministic flips (always applied).
transform_flip_h = _with_yolo_boxes(A.HorizontalFlip(p=1))
transform_flip_v = _with_yolo_boxes(A.VerticalFlip(p=1))

# Random photometric changes; each step is applied with its own probability.
transform_colors = _with_yolo_boxes(
    A.RandomBrightnessContrast(p=0.5),
    A.RGBShift(r_shift_limit=30, g_shift_limit=30, b_shift_limit=30, p=0.7),
)

# Every annotated frame is passed through each of these pipelines once.
transformations = [transform_flip_v, transform_flip_h, transform_colors]

In [None]:
def _yolo_box_from_annotation(annotation, img_width, img_height):
    """Convert one pixel-space annotation into a normalised YOLO box.

    The box is first clipped to the image so that the resulting
    [cx, cy, w, h] always describes a box lying inside [0, 1]; clamping only
    the centre/size (as was done before) still leaves boxes that cross the
    image border, which albumentations rejects.

    Args:
        annotation: dict with 'x', 'y' (top-left corner) and 'width', 'height',
            all in pixels.
        img_width: image width in pixels.
        img_height: image height in pixels.

    Returns:
        [cx, cy, w, h] normalised by the image size, or None when nothing of
        the box is left after clipping.
    """
    x_min = min(max(annotation['x'], 0), img_width)
    y_min = min(max(annotation['y'], 0), img_height)
    x_max = min(max(annotation['x'] + annotation['width'], 0), img_width)
    y_max = min(max(annotation['y'] + annotation['height'], 0), img_height)
    box_w, box_h = x_max - x_min, y_max - y_min
    if box_w <= 0 or box_h <= 0:
        return None
    return [
        (x_min + x_max) / (2 * img_width),
        (y_min + y_max) / (2 * img_height),
        box_w / img_width,
        box_h / img_height,
    ]


def create_augmented_data(annotations_file, img_dir, save_path, transformations=transformations):
    """Write every annotated frame plus its augmented variants, and a box index.

    For each row of `annotations_file` whose 'annotations' column is not '[]',
    the frame is copied to `<save_path>/images/im<N>.jpg` and then passed
    through every pipeline in `transformations`; each result is saved under the
    next running number N. The boxes of every saved image are collected and
    written to `<save_path>/data.csv` with columns 'id' and 'bboxes'.

    Args:
        annotations_file: CSV whose first column holds the video id, third
            column the frame number and 'annotations' column a Python-literal
            list of {'x', 'y', 'width', 'height'} dicts.
        img_dir: root folder containing `video_<id>/<frame>.jpg`.
        save_path: output folder; `<save_path>/images` is created if missing.
        transformations: iterable of callables accepting
            `image=`, `bboxes=`, `category_ids=` and returning a mapping with
            'image' and 'bboxes'.

    Raises:
        FileNotFoundError: if a source frame cannot be read.
    """
    img_labels = pd.read_csv(annotations_file)
    annotated = img_labels[img_labels['annotations'] != '[]']

    # cv2.imwrite fails silently when the target folder does not exist.
    out_img_dir = os.path.join(save_path, 'images')
    os.makedirs(out_img_dir, exist_ok=True)

    bboxes = {'id': [], 'bboxes': []}
    i = 0
    for idx in tqdm(range(len(annotated))):
        row = annotated.iloc[idx]
        src_path = os.path.join(
            img_dir, 'video_{}'.format(row.iloc[0]), '{}.jpg'.format(row.iloc[2])
        )
        image_bgr = cv2.imread(src_path)
        if image_bgr is None:
            # imread signals failure by returning None instead of raising.
            raise FileNotFoundError('Could not read image: {}'.format(src_path))

        # The untouched frame is saved as-is; the RGB copy feeds the transforms.
        cv2.imwrite(os.path.join(out_img_dir, 'im{}.jpg'.format(i)), image_bgr)
        image = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)

        parsed = (
            _yolo_box_from_annotation(annotation, WIDTH, HEIGHT)
            for annotation in ast.literal_eval(row['annotations'])
        )
        coords = [box for box in parsed if box is not None]
        category_ids = [0] * len(coords)  # single class

        bboxes['id'].append(i)
        bboxes['bboxes'].append(coords)
        i += 1

        for transform in transformations:
            # Best effort per transform: one failing pipeline must not discard
            # the remaining ones, and the reason is reported instead of hidden.
            try:
                transformed = transform(image=image, bboxes=coords, category_ids=category_ids)
                out_bgr = cv2.cvtColor(transformed['image'], cv2.COLOR_RGB2BGR)
            except Exception as exc:
                print('{}: {}'.format(src_path, exc))
                continue
            cv2.imwrite(os.path.join(out_img_dir, 'im{}.jpg'.format(i)), out_bgr)
            bboxes['id'].append(i)
            bboxes['bboxes'].append(transformed['bboxes'])
            i += 1

    csv = pd.DataFrame(data=bboxes)
    csv.to_csv(os.path.join(save_path, 'data.csv'))
    
    

In [None]:
# Build the augmented image set from the Drive copy of the data.
create_augmented_data(
    annotations_file=COLAB_PATH_ANNOTATIONS,
    img_dir=COLAB_PATH_IMG_DIR,
    save_path=COLAB_SAVE,
)

Dataset preparation - YOLO

In [None]:
class StarfishDataset(Dataset):
    """Dataset of (image path, YOLO label rows) read from an index CSV.

    The CSV is expected to hold the image number in its first column and a
    Python-literal list of [cx, cy, w, h] boxes in its 'bboxes' column, which
    must also be the last column. Rows whose 'bboxes' value is '[]' are ignored.

    Args:
        annotations_file: path to the index CSV.
        img_dir: folder containing the images, named `im<number>.jpg`.
    """

    def __init__(self,
                 annotations_file='/content/augmented/data.csv',
                 img_dir='/content/augmented/images'
                 ):
        self.img_labels = pd.read_csv(annotations_file)
        self.annotated = self.img_labels[self.img_labels['bboxes'] != '[]']  # get only annotated frames
        self.img_dir = img_dir

    def __len__(self):
        """Return the number of frames that have at least one box."""
        return len(self.annotated)

    def __getitem__(self, idx):
        """Return the image path and an (n_boxes, 5) array for frame `idx`.

        Each row of the array is [class_id, cx, cy, w, h] with class_id 0.
        The image itself is not loaded; only its path is returned.
        """
        # Two-axis .iloc is used on purpose: integer keys on a string-indexed
        # row Series (`row[0]`, `row[-1]`) are deprecated positional lookups.
        image_id = self.annotated.iloc[idx, 0]
        image = os.path.join(self.img_dir, 'im{}.jpg'.format(image_id))
        coords = np.array(ast.literal_eval(self.annotated.iloc[idx, -1]))
        class_ids = np.zeros((len(coords), 1), dtype=int)  # single class: 0
        boxes = np.concatenate((class_ids, coords), axis=1)
        return image, boxes

In [None]:
# Index every annotated (original and augmented) frame, using the default paths.
dataset = StarfishDataset()
n_samples = len(dataset)
print(n_samples)

In [None]:
# Seed for the split so that re-running the notebook yields the same
# train/validation/test membership.
SPLIT_SEED = 42

# NOTE(review): the training split (1500) is much smaller than the validation
# split (10000) - confirm these two values are not swapped or mistyped.
train_size = 1500
val_size = 10000
test_size = len(dataset) - train_size - val_size
if test_size < 0:
    raise ValueError(
        'Dataset has {} instances, fewer than train_size + val_size = {}'.format(
            len(dataset), train_size + val_size
        )
    )

train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
    dataset,
    (train_size, val_size, test_size),
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

print('Train dataset: {} instances, validation dataset: {}, test dataset: {}'.format(len(train_dataset), len(val_dataset), len(test_dataset)))

In [None]:
def prepare_dataset(dataset, path_img, path_lbl):
    """Export a dataset split as numbered image files and YOLO label files.

    Item number i of `dataset` is written as `<path_img>/im<i>.jpg` and
    `<path_lbl>/im<i>.txt`. Images are copied byte-for-byte rather than decoded
    and re-encoded, which avoids a second lossy JPEG compression and is faster.

    Args:
        dataset: iterable of (image_path, labels) pairs, where labels is a 2-D
            array with five columns per row: class id followed by four floats.
        path_img: output folder for the images (created if missing).
        path_lbl: output folder for the label files (created if missing).

    Raises:
        FileNotFoundError: if an image path does not exist.
    """
    import shutil  # local import: only this function needs it

    os.makedirs(path_img, exist_ok=True)
    os.makedirs(path_lbl, exist_ok=True)
    for i, (image, label) in enumerate(dataset):
        file_image = os.path.join(path_img, 'im{}.jpg'.format(i))
        file_label = os.path.join(path_lbl, 'im{}.txt'.format(i))
        shutil.copyfile(image, file_image)
        # One row per box: integer class id, then four values to 4 decimals.
        np.savetxt(file_label, label, fmt='%i %.4f %.4f %.4f %.4f')

In [None]:
# Export the training split to the YOLO folder on Drive.
prepare_dataset(
    dataset=train_dataset,
    path_img=COLAB_PREFIX + TRAIN_IMG,
    path_lbl=COLAB_PREFIX + TRAIN_LBL,
)

In [None]:
# Export the validation split to the YOLO folder on Drive.
prepare_dataset(
    dataset=val_dataset,
    path_img=COLAB_PREFIX + VAL_IMG,
    path_lbl=COLAB_PREFIX + VAL_LBL,
)

In [None]:
# Export the test split to the YOLO folder on Drive.
prepare_dataset(
    dataset=test_dataset,
    path_img=COLAB_PREFIX + TEST_IMG,
    path_lbl=COLAB_PREFIX + TEST_LBL,
)

In [None]:
# Dataset description that is later dumped to YAML: a dataset root, the image
# folder of each split relative to it, the class count and the class names.
config = dict(
    path='../data',
    train='images/train',
    val='images/val',
    test='images/test',
    nc=1,
    names=['starfish'],
)

In [None]:
# Clone the YOLOv5 repository into the yolo folder on Drive.
# NOTE(review): git clone reports an error if ./yolov5 already exists, so this
# cell is not idempotent across re-runs - confirm that is acceptable.
%cd /content/drive/MyDrive/yolo
!git clone https://github.com/ultralytics/yolov5

In [None]:
# Write the dataset description held in `config` to data_yolo.yaml in the yolo
# folder on Drive. The file is addressed by absolute path, so the preceding
# %cd does not affect where it is written.
%cd 
with open("/content/drive/MyDrive/yolo/data_yolo.yaml", "w") as file:
    # default_flow_style=False emits block-style YAML (one key per line).
    yaml.dump(config, file, default_flow_style=False)