In [None]:
# Install detectron2 from the head of its GitHub repository (no tag or commit is pinned).
!python -m pip install 'git+https://github.com/facebookresearch/detectron2.git'

In [None]:
import detectron2
from detectron2.engine import DefaultTrainer
from detectron2.config import get_cfg
from detectron2 import model_zoo
from detectron2.data import DatasetCatalog, MetadataCatalog, detection_utils
from detectron2.data.datasets import register_coco_instances
from detectron2.data.transforms import ResizeTransform
from detectron2.structures import BoxMode
from detectron2.evaluation import COCOEvaluator
import json
import csv
import numpy as np
import torch
from torch.utils.data import random_split
from torchvision.datasets import MNIST
import cv2
import os
from google.colab import files

In [None]:
# Report the installed torch release: the first two dot-separated fields of
# the version string, and whatever follows its last '+' (the whole string
# when there is no '+').
TORCH_VERSION = ".".join(torch.__version__.split(".", 2)[:2])
CUDA_VERSION = torch.__version__.rpartition("+")[2]
print("torch: ", TORCH_VERSION, "; cuda: ", CUDA_VERSION)

torch:  2.0 ; cuda:  cu118


In [None]:
def process_bbox(bbox, factor=0.25):
  """Parse a bounding-box string and scale its coordinates.

  Args:
    bbox: Text such as ``'[x, y, w, h]'``: one enclosing character on each
      side with comma-separated numbers in between.
    factor: Multiplier applied to every number. The default of 0.25 keeps the
      original behaviour of this function.

  Returns:
    A list of ints, one per number, each multiplied by ``factor`` and then
    truncated toward zero by ``int``.

  Raises:
    ValueError: If a comma-separated piece cannot be parsed as a float.
  """
  # Drop the first and last character (the brackets). Splitting on a bare
  # comma accepts both '1, 2' and '1,2', because float() ignores surrounding
  # whitespace.
  inner = bbox.strip()[1:-1]
  return [int(float(piece) * factor) for piece in inner.split(',')]


In [None]:
# Encountered some bugs when using ResizeTransform directly in modeling, so I switched to this approach: resize the pictures in a standalone function and save the resized pictures in the workspace
def resize_images(csv_path='train.csv', out_dir='resized', new_h=256, new_w=256):
  """Write a downscaled copy of every distinct image listed in the CSV.

  For each distinct ``image_id`` in ``csv_path`` the source image is located
  with ``get_image_path``, resized to ``new_h`` x ``new_w`` and saved as
  ``<out_dir>/<image_id>.jpg``.

  Args:
    csv_path: CSV file with an ``image_id`` column.
    out_dir: Directory that receives the resized images; created if missing.
    new_h: Height of the resized images.
    new_w: Width of the resized images.

  Returns:
    The number of images written.

  Raises:
    FileNotFoundError: If a source image cannot be read.
    IOError: If a resized image cannot be written.
  """
  # cv2.imwrite reports failure only through its return value, so without
  # this a missing output directory would drop every image without an error.
  os.makedirs(out_dir, exist_ok=True)

  written = 0
  seen = set()
  with open(csv_path, 'r', newline='') as f:
    for row in csv.DictReader(f):
      image_id = row['image_id']
      # The CSV may hold several rows per image; resize each image once.
      if image_id in seen:
        continue
      seen.add(image_id)

      src_path = get_image_path(image_id)
      image = cv2.imread(src_path)
      if image is None:  # cv2.imread returns None instead of raising
        raise FileNotFoundError(f'could not read image: {src_path}')

      # Take the source size from the image itself rather than assuming
      # 1024x1024, so the transform always matches its input.
      src_h, src_w = image.shape[:2]
      resize = ResizeTransform(h=src_h, w=src_w, new_h=new_h, new_w=new_w)
      resized = resize.apply_image(image)

      dst_path = os.path.join(out_dir, f'{image_id}.jpg')
      if not cv2.imwrite(dst_path, resized):
        raise IOError(f'could not write image: {dst_path}')
      written += 1

  return written

In [None]:
def get_image_path(image_id):
  """Return the relative path of the original training image for ``image_id``."""
  return 'train/{}.jpg'.format(image_id)

def scale(magnitude, origin, new):
  """Rescale ``magnitude`` by the ratio ``new / origin``.

  The product is passed through ``int``, which truncates toward zero.
  """
  ratio = new / origin
  scaled = magnitude * ratio
  return int(scaled)

def generate_annotations2(include_image=True):
  """Build one dataset record per image from ``train.csv``.

  Every CSV row carries one bounding box. Rows are grouped by ``image_id`` so
  that each image yields a single record holding all of its boxes. Boxes are
  converted with ``process_bbox`` and the image size with ``scale`` (1024 ->
  256), and ``file_name`` points into the ``resized/`` directory.

  Args:
    include_image: When true (the default, which keeps the original
      behaviour) the pixels returned by ``cv2.imread`` are stored under
      ``'image'``. NOTE(review): detectron2's default dataset mapper appears
      to load pixels from ``'file_name'`` itself, so passing False should
      save a lot of memory; confirm before changing the default.

  Returns:
    A list of dicts in first-seen image order with the keys ``file_name``,
    ``image_id``, ``height``, ``width``, ``annotations`` and, when requested,
    ``image``.
  """
  filepath = 'train.csv'
  records = {}  # image_id -> record; dicts preserve insertion order

  with open(filepath, 'r', newline='') as f:
    for row in csv.DictReader(f):
      image_id = row['image_id']
      annotation = {
          'bbox': process_bbox(row['bbox']),
          'bbox_mode': BoxMode.XYWH_ABS,
          'category_id': 0,
          'category': 'wheat',
          'image_id': image_id,
      }

      # Look the image up by id instead of comparing with the previous row
      # only, so boxes are merged even when an image's rows are not adjacent.
      record = records.get(image_id)
      if record is not None:
        record['annotations'].append(annotation)
        continue

      # Double quotes on the outside: reusing the same quote character inside
      # an f-string is a SyntaxError before Python 3.12.
      image_path = f"resized/{image_id}.jpg"

      record = {
          'file_name': image_path,
          'image_id': image_id,
          'height': scale(float(row['height']), 1024, 256),
          'width': scale(float(row['width']), 1024, 256),
      }
      if include_image:
        # cv2.imread yields None when the file cannot be read.
        record['image'] = cv2.imread(image_path)
      record['annotations'] = [annotation]
      records[image_id] = record

  return list(records.values())


In [None]:
def register_data(train_fraction=0.8, seed=42):
  """Register the full dataset and a train/validation split with detectron2.

  Registers ``'training_dataset2'`` (backed by ``generate_annotations2``),
  splits its records at random and registers the two parts as
  ``'train_data'`` and ``'tune_data'``, each with the single class ``wheat``.

  Args:
    train_fraction: Share of the records that goes to the training split.
    seed: Seed for the random split, so re-runs produce the same split.

  Returns:
    A tuple ``(train_data_name, val_data_name)`` with the registered names.
  """
  # Both catalogs are emptied first, which removes every existing
  # registration; presumably this is what lets the cell be re-run.
  DatasetCatalog.clear()
  MetadataCatalog.clear()
  DatasetCatalog.register('training_dataset2', generate_annotations2)
  MetadataCatalog.get('training_dataset2').set(thing_classes=['wheat'])

  overall = DatasetCatalog.get('training_dataset2')

  # random_split needs integer lengths that add up to len(overall); the plain
  # product train_fraction * len(overall) is a float.
  train_len = int(train_fraction * len(overall))
  val_len = len(overall) - train_len
  generator = torch.Generator().manual_seed(seed)
  train_subset, val_subset = torch.utils.data.random_split(
      overall, [train_len, val_len], generator=generator)

  # Hand detectron2 plain lists of record dicts rather than Subset objects.
  train_data = [overall[i] for i in train_subset.indices]
  val_data = [overall[i] for i in val_subset.indices]

  train_data_name = 'train_data'
  # Original author's note: DefaultTrainer seemed not to accept a name
  # starting with 'v', so 'val_data' became 'tune_data'.
  # NOTE(review): that cause is unverified.
  val_data_name = 'tune_data'

  DatasetCatalog.register(train_data_name, lambda: train_data)
  MetadataCatalog.get(train_data_name).set(thing_classes=['wheat'])

  DatasetCatalog.register(val_data_name, lambda: val_data)
  MetadataCatalog.get(val_data_name).set(thing_classes=['wheat'])

  return train_data_name, val_data_name

In [None]:
# This custom trainer exists so that training can be evaluated on the validation data
class MyTrainer(DefaultTrainer):
  """DefaultTrainer subclass that supplies a COCO-style evaluator."""

  @classmethod
  def build_evaluator(cls, cfg, dataset_name='tune_data'):
    """Return a ``COCOEvaluator`` for ``dataset_name``.

    Args:
      cfg: The training config. Kept for interface compatibility; it is no
        longer forwarded to the evaluator.
      dataset_name: Registered dataset to evaluate on.

    Returns:
      A non-distributed ``COCOEvaluator`` that writes its files to ``'out'``.
    """
    # Passing cfg as the second positional argument (the ``tasks`` slot) is
    # the deprecated calling convention. With ``tasks`` left unset the
    # evaluator infers the tasks from the predictions.
    return COCOEvaluator(dataset_name, distributed=False, output_dir='out')

In [None]:
def train_and_save_model(train_name='train_data', val_name='tune_data', save_path='model.pth'):
  """Fine-tune a COCO-pretrained Faster R-CNN and download its weights.

  Args:
    train_name: Registered dataset used for training.
    val_name: Registered dataset used for periodic evaluation.
    save_path: File that receives the trained model weights.
  """
  config_name = "COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml"

  cfg = get_cfg()
  # Faster R-CNN was chosen because the training data has no ground-truth masks.
  cfg.merge_from_file(model_zoo.get_config_file(config_name))
  cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(config_name)

  # NOTE(review): the stock detectron2 config does not appear to define
  # SOLVER.OPTIMIZER / ADAM / ADAM_BETAS / ADAM_EPSILON, and
  # DefaultTrainer.build_optimizer appears to always build SGD, so these four
  # settings look inert. Confirm, and override build_optimizer in the trainer
  # if Adam is really wanted.
  cfg.SOLVER.OPTIMIZER = "Adam"  # tried with SGD originally, wasn't satisfying
  cfg.SOLVER.ADAM = True
  cfg.SOLVER.ADAM_BETAS = (0.9, 0.999)
  cfg.SOLVER.ADAM_EPSILON = 1e-08
  cfg.SOLVER.BASE_LR = 0.001
  cfg.MODEL.ROI_HEADS.NUM_CLASSES = 1  # only one class: wheat
  cfg.SOLVER.MAX_ITER = 675
  cfg.SOLVER.IMS_PER_BATCH = 4  # tried with 2 initially, was too slow
  cfg.TEST.EVAL_PERIOD = 300
  # The dataset names are parameters: the variables the original code read
  # here (train_data_name / val_data_name) were never defined in this scope.
  cfg.DATASETS.TRAIN = (train_name,)
  cfg.DATASETS.TEST = (val_name,)

  # The trainer writes checkpoints and metrics here, so it must exist first.
  os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)

  trainer = MyTrainer(cfg)
  trainer.resume_or_load(resume=False)

  trainer.train()

  # Save the network weights for later use. trainer.state_dict() describes
  # the trainer's own progress rather than the model, so the model's
  # state_dict is saved instead.
  torch.save(trainer.model.state_dict(), save_path)
  files.download(save_path)

# Training takes a long time and produces very long logs, so only the values from the last log line are pasted here: fast_rcnn/cls_accuracy: 0.8876953125, total_loss: 0.9084931015968323

In [None]:
resize_images()
!zip -r resized.zip resized
from google.colab import files
files.download('resized.zip') # save the resized images for later use

register_data()
train_data()