# YOLOX on COTS dataset: train

In [None]:
# Run-level switches read by the cells below.
DEBUG = False  # True -> fewer epochs and a 100-row data sample
COLAB = True  # True -> Google Colab paths and installation steps
VER = 'vclbyolo30'  # experiment tag used in config and model folder names
if COLAB:
    WORK_DIR = '/content/drive/MyDrive/reef'
else:
    WORK_DIR = '/u01/mrorange/reef'

## Install YOLOX

In [None]:
# On Colab, WORK_DIR lives under /content/drive, so Google Drive has to be
# mounted before any path below can be used.
if COLAB:
    from google.colab import drive
    drive.mount('/content/drive')

In [None]:
# Colab only: clone YOLOX into WORK_DIR, install its requirements, install
# YOLOX itself in editable mode and add the COCO Python API.
if COLAB:
    %cd {WORK_DIR}
    !git clone https://github.com/Megvii-BaseDetection/YOLOX -q
    %cd YOLOX
    !pip install -U pip && pip install -r requirements.txt
    !pip install -v -e . 
    !pip install 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI'

In [None]:
import ast
import os
import json
import time
import random
import numpy as np
import pandas as pd
import torch
import importlib
import cv2 
from shutil import copyfile
from tqdm.notebook import tqdm
tqdm.pandas()
from sklearn.model_selection import GroupKFold
from PIL import Image
from string import Template
from IPython.display import display
import warnings
# UserWarning messages are silenced for DEBUG runs only.
# NOTE(review): non-debug runs keep the warnings - confirm this is intended.
if DEBUG:
    warnings.filterwarnings('ignore', category=UserWarning)

In [None]:
# Single source of truth for the run: data split, augmentation, optimiser and
# test-time settings. The dict is dumped to config.json in the models folder
# and its values are substituted into the generated YOLOX experiment file.
CONFIG = {
    'ver': VER,
    'bbone': 'yolox_l.pth', # yolox_s.pth yolox_nano.pth
    'width': 1280, 
    'height': 720,
    'resize': (1920, 1920), # 460 640 960 1024 (1280, 736) (1280, 1280)
    'batch_size': 2, # 8
    'workers': 4 if COLAB else 8,
    'folds': 5,
    'bal_split': None, #'train_split_balanced_v1', None
    'val_fold': 4,
    'val_video': 3, # 1, 2, 3
    'empty_sh': .2,
    'test_conf': .1, # default .01
    'nmsthre': .65, # default .65
    'basic_lr_per_img': .01 / 64, # default .01 / 64
    'random_size': (40, 61), # default (14, 26)
    'mosaic_prob': 0, # default 1.0
    'mixup_prob': 0.5, # default 1.0
    'hsv_prob': 0.5, # default 1.0
    'flip_prob': 0.5, # default 0.5
    'degrees': 0, # default 10.0
    'translate': 0.1, # default 0.1
    'mosaic_scale': (0.6, 1.4), # default (0.1, 2)
    'mixup_scale': (0.5, 1.5), # default (0.5, 1.5)
    'shear': 2.0, # default 2.0
    'enable_mixup': True, # default True
    'warmup_epochs': 2,
    'no_aug_epochs': 0,
    'epochs': 4 if DEBUG else 8,
    'seed': 2022
}
# Derived paths and constants.
# The prepared-dataset folder name encodes how the validation split is chosen:
# balanced split file, held-out video, or plain GroupKFold fold.
VER_DATA = f'fs{CONFIG["folds"]}vf{CONFIG["val_fold"]}'
DATA_PATH = f'{WORK_DIR}/data'
if CONFIG["bal_split"]:
    YDATA_PATH = f'{WORK_DIR}/data_bs_{VER_DATA}'
elif CONFIG['val_video']:
    YDATA_PATH = f'{WORK_DIR}/data_vv_{CONFIG["val_video"] - 1}'
else:
    YDATA_PATH = f'{WORK_DIR}/data_{VER_DATA}'
print('data path:', YDATA_PATH)
MDLS_PATH = f'{WORK_DIR}/models_{VER}'
TH = CONFIG['test_conf']
NMS_TH = CONFIG['nmsthre']
NUM_CLASSES = 1
COCO_CLASSES = (
  'starfish',
)
PIPELINE_CONFIG_PATH = f'cots_config_{VER}.py'
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it has no
# check-then-create race and also creates missing parent folders.
os.makedirs(YDATA_PATH, exist_ok=True)
os.makedirs(MDLS_PATH, exist_ok=True)
# Keep a copy of the run configuration next to the model artefacts.
with open(f'{MDLS_PATH}/config.json', 'w') as file:
    json.dump(CONFIG, file)

def seed_all(seed=0):
    """Seed the global ``random`` and NumPy generators.

    Also exports ``PYTHONHASHSEED`` with the same value.

    Returns:
        A separate ``np.random.RandomState`` initialised with ``seed``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    return np.random.RandomState(seed)
    
# Seed the global generators from the run configuration and start the
# wall-clock timer used by the "time elapsed" reports further down.
random_state = seed_all(CONFIG['seed'])
start_time = time.time()

## Data preprocessing

In [None]:
def get_bbox(anns):
    """Convert a list of annotation dicts into a list of value lists.

    Each dict contributes ``list(ann.values())``, i.e. its values in the
    dict's own key order.
    """
    return [list(box.values()) for box in anns]

def get_path(row):
    """Add ``image_path`` to ``row`` and return the row.

    The path is built from the module-level ``DATA_PATH`` and the row's
    ``video_id`` and ``video_frame`` values.
    """
    row['image_path'] = f'{DATA_PATH}/train_images/video_{row.video_id}/{row.video_frame}.jpg'
    return row

def df_proc(df):
    """Parse annotations and add bbox, size and image-path columns.

    ``annotations`` is parsed from its string form with ``ast.literal_eval``,
    ``bboxes`` is derived with ``get_bbox``, ``width``/``height`` are filled
    from ``CONFIG`` and ``image_path`` is added row by row via ``get_path``.
    Progress bars rely on ``tqdm.pandas()`` having been called.

    Returns:
        The processed frame (the result of the row-wise ``get_path`` pass).
    """
    df['annotations'] = df['annotations'].progress_apply(ast.literal_eval)
    df['bboxes'] = df['annotations'].progress_apply(get_bbox)
    df['width'] = CONFIG['width']
    df['height'] = CONFIG['height']
    return df.progress_apply(get_path, axis=1)

In [None]:
# Frames with at least one annotated box, read either from the balanced split
# file or from the full training csv.
source_csv = (f'{DATA_PATH}/{CONFIG["bal_split"]}.csv' if CONFIG["bal_split"]
              else f'{DATA_PATH}/train.csv')
df = pd.read_csv(source_csv)
# Counts the letter 'x' in the raw annotation string; presumably there is one
# 'x' key per box - confirm against the csv format.
df["num_bbox"] = df['annotations'].apply(lambda x: str.count(x, 'x'))
# .copy() so df_proc assigns columns on an independent frame, not on a slice.
df = df[df["num_bbox"] > 0].copy()
df = df_proc(df)

# Frames without boxes: keep a sample sized as a share (empty_sh) of the
# annotated frames.
df_ = pd.read_csv(f'{DATA_PATH}/train.csv')
df_["num_bbox"] = df_['annotations'].apply(lambda x: str.count(x, 'x'))
df_ = df_[df_["num_bbox"] == 0]
df_ = df_.sample(int(CONFIG['empty_sh'] * len(df)))
df_.reset_index(inplace=True)
df_ = df_proc(df_)

gkf = GroupKFold(n_splits=CONFIG['folds'])
if CONFIG["bal_split"]:
    df['fold'] = df['fold_id']
    df_['fold'] = -1
    for i, (train_idxs, val_idxs) in enumerate(gkf.split(df_, groups=df_['sequence'])):
        df_.loc[val_idxs, 'fold'] = i
    # DataFrame.append was removed in pandas 2.0, so use pd.concat.
    # ignore_index gives every row a unique label; the row index is later used
    # as the COCO image id, so duplicated labels would collide.
    df = pd.concat([df, df_], ignore_index=True)
else:
    df = pd.concat([df, df_])
    if DEBUG:
        df = df.sample(100)
    df = df.reset_index(drop=True)
    df['fold'] = -1
    for i, (train_idxs, val_idxs) in enumerate(gkf.split(df, groups=df['sequence'])):
        df.loc[val_idxs, 'fold'] = i
    if CONFIG['val_video']:
        # Hold out a whole video: the fold id is simply the video id.
        df['fold'] = df['video_id']
display(df.head())

In [None]:
# Boxes per (video, sequence, fold). The column is selected before summing so
# the aggregation is not attempted on the list and string columns.
df.groupby(by=['video_id', 'sequence', 'fold'])['num_bbox'].sum()

## Annotation files and image folders

In [None]:
def save_ann_json(json_ann, filename):
    """Serialise ``json_ann`` as JSON into ``filename`` (overwriting it)."""
    with open(filename, 'w') as out_file:
        json.dump(json_ann, out_file)

def dataset2coco(df, dest_path):
    """Build a COCO-style annotation dict for the rows of ``df``.

    Args:
        df: frame with ``image_id``, ``height``, ``width`` and ``bboxes``
            columns. Each box is read as ``[x, y, width, height]`` and the
            row index is used as the COCO image id.
        dest_path: not used; kept so existing callers keep working.

    Returns:
        dict with ``info``, ``licenses``, ``categories``, ``images`` and
        ``annotations`` lists.

    Annotation ids are taken from the module-level ``ann_id`` counter, which
    is advanced here so ids stay unique across successive calls. The counter
    starts at 0 when it has not been defined yet.
    """
    global ann_id
    ann_id = globals().get('ann_id', 0)
    anns_json = {
        "info": [],
        "licenses": [],
        "categories": [],
        "images": [],
        "annotations": []
    }
    info = {
        "year": "2021",
        "version": "1",
        "description": "COTS dataset - COCO format",
        "contributor": "",
        "url": "https://kaggle.com",
        "date_created": "2021-11-30T15:01:26+00:00"
    }
    anns_json["info"].append(info)
    lic = {
        "id": 1,
        "url": "",
        "name": "Unknown"
    }
    anns_json["licenses"].append(lic)
    classes = {
        "id": 1,
        "name": "starfish",
        "supercategory": "none"
    }
    anns_json["categories"].append(classes)
    for ann_row in df.itertuples():
        images = {
            "id": ann_row[0],
            "license": 1,
            "file_name": ann_row.image_id + '.jpg',
            "height": ann_row.height,
            "width": ann_row.width,
            "date_captured": "2021-11-30T15:01:26+00:00"
        }
        anns_json["images"].append(images)
        for bbox in ann_row.bboxes:
            x, y = bbox[0], bbox[1]
            # Some boxes in COTS extend past the right/bottom image border:
            # clip them against this row's own image size.
            b_width = min(bbox[2], ann_row.width - x)
            b_height = min(bbox[3], ann_row.height - y)
            image_anns = {
                "id": ann_id,
                "image_id": ann_row[0],
                "category_id": 1,
                "bbox": [x, y, b_width, b_height],
                # Area of the clipped box, so it agrees with "bbox".
                "area": b_width * b_height,
                "segmentation": [],
                "iscrowd": 0
            }
            ann_id += 1
            anns_json["annotations"].append(image_anns)
    print('COTS annotation to COCO json format done, files:', len(df))
    return anns_json

In [None]:
train_path = f'{YDATA_PATH}/train2017'
val_path = f'{YDATA_PATH}/val2017'
# With a held-out video the fold column holds video ids, so the validation
# "fold" is the zero-based video id; otherwise it is the configured fold.
val_fold = (CONFIG['val_video'] - 1) if CONFIG['val_video'] else CONFIG['val_fold']

# The dataset is only materialised when neither image folder exists yet.
if (not os.path.exists(train_path)) and (not os.path.exists(val_path)):
    # os.makedirs instead of `!mkdir -p`: no shell involved, so paths with
    # spaces are safe and the cell does not depend on IPython magics.
    for folder in (train_path, val_path, f'{YDATA_PATH}/annotations'):
        os.makedirs(folder, exist_ok=True)

    for i, row in tqdm(df.iterrows(), total=len(df)):
        target_dir = val_path if row.fold == val_fold else train_path
        copyfile(
            f'{row.image_path}',
            f'{target_dir}/{row.image_id}.jpg'
        )
    # Shared annotation-id counter: dataset2coco advances it, so train and
    # val annotations get distinct ids.
    ann_id = 0
    train_anns_json = dataset2coco(
        df[df.fold != val_fold],
        train_path
    )
    val_anns_json = dataset2coco(
        df[df.fold == val_fold],
        val_path
    )
    save_ann_json(
        train_anns_json,
        f'{YDATA_PATH}/annotations/train.json'
    )
    save_ann_json(
        val_anns_json,
        f'{YDATA_PATH}/annotations/val.json'
    )

# Report how many files each image folder now holds.
print('train files:', len(os.listdir(train_path)))
print('val files:', len(os.listdir(val_path)))

# Wall-clock time since start_time was set.
elapsed_time = time.time() - start_time
print(f'time elapsed: {elapsed_time // 60:.0f} min {elapsed_time % 60:.0f} sec')

## Config

In [None]:
# Source text of the YOLOX experiment file. Every $name placeholder is filled
# in later through string.Template.substitute; the result is written to
# PIPELINE_CONFIG_PATH.
config_file_template = '''
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) Megvii, Inc. and its affiliates.

import os
from yolox.exp import Exp as MyExp

class Exp(MyExp):
    def __init__(self):
        super(Exp, self).__init__()
        # ---------------- model config ---------------- #
        self.num_classes = $num_classes
        self.depth = $depth
        self.width = $width
        self.act = 'silu'

        # ---------------- dataloader config ---------------- #
        # set worker to 4 for shorter dataloader init time
        self.data_num_workers = $num_workers
        self.input_size = $input_size  # (height, width)
        # Actual multiscale ranges: [640-5*32, 640+5*32].
        # To disable multiscale training, set the
        # self.multiscale_range to 0.
        # self.multiscale_range = 5
        # You can uncomment this line to specify a multiscale range
        self.random_size = $random_size
        self.data_dir = "$data_dir"
        self.train_ann = "train.json"
        self.val_ann = "val.json"

        # --------------- transform config ----------------- #
        self.mosaic_prob = $mosaic_prob
        self.mixup_prob = $mixup_prob
        self.hsv_prob = $hsv_prob
        self.flip_prob = $flip_prob
        self.degrees = $degrees
        self.translate = $translate
        self.mosaic_scale = $mosaic_scale
        self.mixup_scale = $mixup_scale
        self.shear = $shear
        self.enable_mixup = $enable_mixup

        # --------------  training config --------------------- #
        self.warmup_epochs = $warmup_epochs
        self.max_epoch = $max_epoch
        self.warmup_lr = 0
        self.basic_lr_per_img = $basic_lr_per_img
        self.scheduler = "yoloxwarmcos"
        self.no_aug_epochs = $no_aug_epochs
        self.min_lr_ratio = 0.05
        self.ema = True

        self.weight_decay = 5e-4
        self.momentum = 0.9
        self.print_interval = 100
        self.eval_interval = 1
        self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0]

        # -----------------  testing config ------------------ #
        self.test_size = $test_size
        self.test_conf = $test_conf
        self.nmsthre = $nmsthre

'''
print(config_file_template)

In [None]:
 # Work from the YOLOX checkout: the experiment file and the ./yolox/... paths
 # used below are relative to it.
 %cd {WORK_DIR}/YOLOX

# (depth, width) multipliers for each supported pretrained checkpoint.
BACKBONE_SCALES = {
    'yolox_s.pth': (.33, .50),
    'yolox_m.pth': (.67, .75),
    'yolox_l.pth': (1, 1),
}
# Fail here with a clear message; previously an unknown backbone only printed
# a note and the cell then crashed with a NameError on `depth`.
if CONFIG['bbone'] not in BACKBONE_SCALES:
    raise ValueError(
        f"ERROR backbone: {CONFIG['bbone']!r} is not one of "
        f"{sorted(BACKBONE_SCALES)}"
    )
depth, width = BACKBONE_SCALES[CONFIG['bbone']]


# CONFIG entries whose key is also the placeholder name in the template.
same_name_keys = (
    'test_conf', 'nmsthre', 'basic_lr_per_img', 'random_size',
    'mosaic_prob', 'mixup_prob', 'hsv_prob', 'flip_prob', 'degrees',
    'translate', 'mosaic_scale', 'mixup_scale', 'shear', 'enable_mixup',
    'warmup_epochs', 'no_aug_epochs',
)
template_values = {key: CONFIG[key] for key in same_name_keys}
# Placeholders that are renamed or computed.
largest_side = max(CONFIG['resize'])
template_values.update(
    num_classes=NUM_CLASSES,
    depth=depth,
    width=width,
    max_epoch=CONFIG['epochs'],
    data_dir=YDATA_PATH,
    num_workers=CONFIG['workers'],
    input_size=CONFIG['resize'],
    test_size=(largest_side, largest_side),
)
pipeline = Template(config_file_template).substitute(template_values)
# Write the rendered experiment file into the current directory.
with open(PIPELINE_CONFIG_PATH, 'w') as file:
    file.write(pipeline)

# Show the rendered experiment file for a visual check.
!cat cots_config_{VER}.py

In [None]:
# Overwrite YOLOX's bundled class lists with the single "starfish" class.
voc_cls = '''
VOC_CLASSES = (
  "starfish",
)
'''
coco_cls = '''
COCO_CLASSES = (
  "starfish",
)
'''
for class_file, class_source in (
    ('./yolox/data/datasets/voc_classes.py', voc_cls),
    ('./yolox/data/datasets/coco_classes.py', coco_cls),
):
    with open(class_file, 'w') as f:
        f.write(class_source)

# Show the class file that was just written.
!more ./yolox/data/datasets/coco_classes.py

## Train

In [None]:
# Download the pretrained checkpoint named in CONFIG['bbone'] unless a file
# with that name is already in the current directory.
if not os.path.exists(CONFIG['bbone']):
    !wget https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/{CONFIG['bbone']}
# Copy the training script next to the experiment file; the next cell runs it
# from here.
!cp ./tools/train.py ./

In [None]:
# Run training with the generated experiment file (-f), the batch size from
# CONFIG (-b) and the pretrained checkpoint (-c).
# NOTE(review): -d, --fp16 and -o are interpreted by YOLOX's train.py -
# confirm their meaning against the installed YOLOX version.
!python train.py \
    -f cots_config_{VER}.py \
    -d 1 \
    -b {CONFIG['batch_size']} \
    --fp16 \
    -o \
    -c {CONFIG['bbone']}

In [None]:
# Copy the best and last checkpoints, the training log and the experiment
# file into the models folder for this version.
!cp {WORK_DIR}/YOLOX/YOLOX_outputs/cots_config_{VER}/best_ckpt.pth {WORK_DIR}/models_{VER}
!cp {WORK_DIR}/YOLOX/YOLOX_outputs/cots_config_{VER}/last_epoch_ckpt.pth {WORK_DIR}/models_{VER}
!cp {WORK_DIR}/YOLOX/YOLOX_outputs/cots_config_{VER}/train_log.txt {WORK_DIR}/models_{VER}
!cp cots_config_{VER}.py {WORK_DIR}/models_{VER}

# Wall-clock time since start_time was set.
elapsed_time = time.time() - start_time
print(f'time elapsed: {elapsed_time // 60:.0f} min {elapsed_time % 60:.0f} sec')

## Inference

In [None]:
from yolox.utils import postprocess
from yolox.data.data_augment import ValTransform

# Import the generated experiment module and instantiate its Exp class.
cots_config = f'cots_config_{VER}'
current_exp = importlib.import_module(cots_config)
exp = current_exp.Exp()
test_size = [int(side * 1) for side in CONFIG['resize']]

# Build the network on the GPU in eval mode, then load the best checkpoint.
model = exp.get_model()
model.cuda()
model.eval()
ckpt_file = f'{WORK_DIR}/models_{VER}/best_ckpt.pth'
ckpt = torch.load(ckpt_file, map_location='cpu')
model.load_state_dict(ckpt["model"])

In [None]:
def yolox_infer(img, model, test_size, th, nms_th, num_classes):
    """Run ``model`` on one image and return its detections.

    The image goes through ``ValTransform`` at ``test_size``, is moved to the
    GPU and passed to the model; ``postprocess`` then applies the confidence
    threshold ``th`` and class-agnostic NMS with ``nms_th``.

    Returns:
        ``(bboxes, bbclasses, scores)``: columns 0-3 of the post-processed
        output divided by the resize ratio, column 6, and the product of
        columns 4 and 5. Three empty tensors when nothing was detected.
    """
    transform = ValTransform(legacy=False)
    net_input, _ = transform(img, None, test_size)
    batch = torch.from_numpy(net_input).unsqueeze(0).float().cuda()
    with torch.no_grad():
        raw = model(batch)
        detections = postprocess(
            raw, num_classes, th,
            nms_th, class_agnostic=True)[0]
    if detections is None:
        return torch.Tensor([]), torch.Tensor([]), torch.Tensor([])
    detections = detections.cpu()
    # Undo the resize applied by the transform: the smaller of the two
    # per-axis ratios is used for both axes.
    ratio = min(test_size[0] / img.shape[0], test_size[1] / img.shape[1])
    bboxes = detections[:, 0 : 4]
    bboxes /= ratio
    return bboxes, detections[:, 6], detections[:, 4] * detections[:, 5]

In [None]:
def draw_yolox_preds(img, bboxes, scores, bbclasses, th, classes_dict):
    """Draw every detection scoring at least ``th`` onto ``img``.

    Each kept box gets a green rectangle and a "<class>:<score>%" label just
    above its top-left corner. ``img`` is modified in place and returned.
    """
    green = (0, 255, 0)
    for idx, box in enumerate(bboxes):
        cls_id = int(bbclasses[idx])
        score = scores[idx]
        if not (score < th):
            left, top = int(box[0]), int(box[1])
            right, bottom = int(box[2]), int(box[3])
            cv2.rectangle(img, (left, top), (right, bottom), green, 2)
            label = '{}:{:.1f}%'.format(classes_dict[cls_id], score * 100)
            cv2.putText(
                img,
                label,
                (left, top - 3),
                cv2.FONT_HERSHEY_PLAIN,
                1.4,
                green,
                thickness=2)
    return img

In [None]:
# List the validation folder once instead of on every iteration. Slicing
# means a folder with fewer than 16 images shows what is available instead
# of raising IndexError.
val_files = os.listdir(f'{YDATA_PATH}/val2017')
for id_test, img_test in enumerate(val_files[10:16], start=10):
    TEST_IMAGE_PATH = f'{YDATA_PATH}/val2017/{img_test}'
    img = cv2.imread(TEST_IMAGE_PATH)

    bboxes, bbclasses, scores = yolox_infer(
        img, model, test_size,
        TH, NMS_TH, NUM_CLASSES)
    out_image = draw_yolox_preds(
        img,
        bboxes,
        scores,
        bbclasses,
        TH,
        COCO_CLASSES)
    # OpenCV images are BGR; convert to RGB before handing them to PIL.
    out_image = cv2.cvtColor(out_image, cv2.COLOR_BGR2RGB)
    display(Image.fromarray(out_image))

## Val score

In [None]:
from typing import List
from torchvision.ops import box_iou

In [None]:
def calculate_score(
    preds: List[torch.Tensor],
    gts: List[torch.Tensor],
    iou_th: float
) -> tuple:
    """Compute F2, precision and recall at one IoU threshold.

    Args:
        preds: per-image predicted boxes, passed to ``box_iou`` as-is.
        gts: per-image ground-truth boxes, aligned with ``preds``.
        iou_th: a ground-truth box counts as found when its best IoU with
            any prediction is at least this value.

    Returns:
        ``(score, precision, recall)`` where
        ``score = 5*TP / (5*TP + 4*FN + FP)``. A value is ``np.nan`` when
        its denominator is zero.
    """
    num_tp = 0
    num_fp = 0
    num_fn = 0
    for p, gt in zip(preds, gts):
        if len(p) and len(gt):
            # Best IoU achieved for each ground-truth box (max over preds).
            best_iou_per_gt = box_iou(p, gt).max(0)[0]
            tp = len(torch.where(best_iou_per_gt >= iou_th)[0])
            fn = len(torch.where(best_iou_per_gt < iou_th)[0])
            # One prediction can be the best match of several ground truths,
            # which makes tp exceed len(p); a false-positive count must not
            # go negative.
            fp = max(len(p) - tp, 0)
            num_tp += tp
            num_fp += fp
            num_fn += fn
        elif len(p) == 0 and len(gt):
            num_fn += len(gt)
        elif len(p) and len(gt) == 0:
            num_fp += len(p)

    f2_denominator = 5 * num_tp + 4 * num_fn + num_fp
    score = 5 * num_tp / f2_denominator if f2_denominator != 0 else np.nan
    if (num_tp + num_fn) != 0:
        recall = num_tp / (num_tp + num_fn)
    else:
        recall = np.nan
    if (num_tp + num_fp) != 0:
        precision = num_tp / (num_tp + num_fp)
    else:
        precision = np.nan
    return score, precision, recall

def evaluate_f2(th, test_size):
    """Score the global ``model`` on every row of the global ``df_val``.

    Detections are produced with the module-level ``TH``/``NMS_TH`` and then
    filtered to those scoring above ``th``. Per image, F2 is averaged over
    the IoU thresholds in ``np.arange(.3, .85, .05)``, and precision/recall
    are recorded at IoU .5 and .3. All per-image values are combined with
    ``np.nanmean``.

    Args:
        th: score threshold applied to the detections.
        test_size: input size passed on to ``yolox_infer``.

    Returns:
        dict with the threshold, the mean F2 score and the mean precision and
        recall at IoU .5 and .3.
    """
    scores = []
    prec05 = []
    rec05 = []
    prec03 = []
    rec03 = []
    iou_ths = np.arange(.3, .85, .05)
    for i, row in tqdm(df_val.iterrows(), total=len(df_val), desc=f'th {th}'):
        img_path = row.image_path
        img = cv2.imread(img_path)
        bboxes, bbclasses, scores_ = yolox_infer(
            img, model, test_size,
            TH, NMS_TH, NUM_CLASSES)
        # Ground truth is stored as x, y, width, height; convert to corners.
        gts =  torch.Tensor([
            [x[0], x[1], x[0] + x[2], x[1] + x[3]]
            for x in row.bboxes])
        bboxes = bboxes[scores_ > th] if scores_.tolist() else bboxes
        # Build the single-image batches once; every calculate_score call
        # below uses the same two tensors.
        pred_batch = bboxes.int().unsqueeze(0)
        gt_batch = gts.unsqueeze(0)
        score = [calculate_score(pred_batch, gt_batch, iou_th)[0]
                 for iou_th in iou_ths]
        scores.append(np.nanmean(score))
        # One call per IoU level yields both precision and recall.
        _, precision05, recall05 = calculate_score(pred_batch, gt_batch, .5)
        _, precision03, recall03 = calculate_score(pred_batch, gt_batch, .3)
        prec05.append(precision05)
        prec03.append(precision03)
        rec05.append(recall05)
        rec03.append(recall03)
    print(f'threshold {th} |',
          f'F2 score: {np.nanmean(scores):.3f} |',
          f'precision at .5: {np.nanmean(prec05):.3f}',
          f'precision at .3: {np.nanmean(prec03):.3f} |',
          f'recall at .5: {np.nanmean(rec05):.3f}',
          f'recall at .3: {np.nanmean(rec03):.3f}')
    val_results = {
        'threshold': th,
        'F2 score': np.nanmean(scores),
        'precision_at_p5': np.nanmean(prec05),
        'precision_at_p3': np.nanmean(prec03),
        'recall_at_p5': np.nanmean(rec05),
        'recall_at_p3': np.nanmean(rec03)
    }
    return val_results

In [None]:
# Evaluate the currently loaded checkpoint at several score thresholds and
# store one result file per threshold in the models folder.
MULT = 1
test_size_mult = [int(MULT * side) for side in CONFIG['resize']]

df_val = df[df.fold == val_fold]
for th in (.1, .3, .5):
    val_results = evaluate_f2(th=th, test_size=test_size_mult)
    with open(f'{MDLS_PATH}/best_val_results_th_{th}.json', 'w') as file:
        json.dump(val_results, file)

In [None]:
# Rebuild the network, load the last-epoch checkpoint and repeat the
# threshold sweep, writing the results under a "last_" prefix.
ckpt_file = f'{WORK_DIR}/models_{VER}/last_epoch_ckpt.pth'
model = exp.get_model()
model.cuda()
model.eval()
ckpt = torch.load(ckpt_file, map_location='cpu')
model.load_state_dict(ckpt["model"])

df_val = df[df.fold == val_fold]
for th in (.1, .3, .5):
    val_results = evaluate_f2(th=th, test_size=test_size_mult)
    with open(f'{MDLS_PATH}/last_val_results_th_{th}.json', 'w') as file:
        json.dump(val_results, file)