<a href="https://colab.research.google.com/github/tleyden/FruitPunch_AI_Bootcamp/blob/main/FruitPunch_Capstone_Project_YOLO_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Pip installs

In [None]:
!apt install -y jq
!pip install pycocotools
!python -m pip install pyyaml==5.1
!pip install wandb -qU


Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
The following additional packages will be installed:
  libjq1 libonig4
The following NEW packages will be installed:
  jq libjq1 libonig4
0 upgraded, 3 newly installed, 0 to remove and 20 not upgraded.
Need to get 276 kB of archives.
After this operation, 930 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libonig4 amd64 6.7.0-1 [119 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libjq1 amd64 1.5+dfsg-2 [111 kB]
Get:3 http://archive.ubuntu.com/ubuntu bionic/universe amd64 jq amd64 1.5+dfsg-2 [45.6 kB]
Fetched 276 kB in 0s (829 kB/s)
Selecting previously unselected package libonig4:amd64.
(Reading database ... 124016 files and directories currently installed.)
Preparing to unpack .../libo

## Imports

In [43]:
import os
from matplotlib import pyplot as plt
import cv2

from pycocotools.coco import COCO
import skimage.io as io
from pathlib import Path

import sys, distutils.core
from google.colab.patches import cv2_imshow
import json
import shutil
import yaml

## Weights and biases setup

In [48]:
import wandb
wandb.login() 

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

## Mount google drive

In [None]:
# Mount google drive
from google.colab import drive
drive.mount("/content/drive")
     

Mounted at /content/drive


## Download zips from google bucket

In [None]:
from google.colab import auth
auth.authenticate_user()

if not os.path.exists("Labeled data-20211126T095740Z-001.zip"):
    !gsutil cp "gs://fruitpunch-ai-tleyden/Labeled data-20211126T095740Z-001.zip" .
if not os.path.exists("Labeled data-20211126T095740Z-002.zip"):
    !gsutil cp "gs://fruitpunch-ai-tleyden/Labeled data-20211126T095740Z-002.zip" .


Copying gs://fruitpunch-ai-tleyden/Labeled data-20211126T095740Z-001.zip...
| [1 files][  2.0 GiB/  2.0 GiB]   79.9 MiB/s                                   
Operation completed over 1 objects/2.0 GiB.                                      
Copying gs://fruitpunch-ai-tleyden/Labeled data-20211126T095740Z-002.zip...
/ [1 files][809.6 MiB/809.6 MiB]   83.7 MiB/s                                   
Operation completed over 1 objects/809.6 MiB.                                    


In [None]:
if not os.path.exists("labeled_data"):
    !unzip -q "Labeled data-20211126T095740Z-001.zip"
    !unzip -q "Labeled data-20211126T095740Z-002.zip"

## Re-arrange directories to match expected structure

In [20]:
if not os.path.exists("labeled_data"):
    os.makedirs("labeled_data")
    !mv "Labeled data" labeled_data
if not os.path.exists("labeled_data/images/test"):
    !mkdir labeled_data/images/test 
    !mv labeled_data/images/*.PNG labeled_data/images/test 

Already have labeled_data dir


## Set some global variables

In [53]:
DATA_PATH = '/content/labeled_data/'


use_single_batch = False
if use_single_batch:
    # Smaller subset of dataset for faster iteration
    LABELS_PATH = DATA_PATH + 'annotations_single_batch/'
    IMAGES_PATH = DATA_PATH + 'images_single_batch/'
else:
    LABELS_PATH = DATA_PATH + 'annotations/'
    IMAGES_PATH = DATA_PATH + 'images/'

# Get paths to IMAGE directories
TRAIN_IMAGES_PATH = IMAGES_PATH + 'train/'
TEST_IMAGES_PATH = IMAGES_PATH + 'test/'
VAL_IMAGES_PATH = IMAGES_PATH + 'val/'

TRAIN_LABELS = LABELS_PATH + 'instances_train.json'
TEST_LABELS = LABELS_PATH + 'instances_test_dataset.json'
VAL_LABELS = LABELS_PATH + 'instances_val.json'



## Create single batch annotations

Create a smaller dataset that can be used for faster iteration

In [24]:
def create_single_batch_annotations(num_instances_to_collect):
    """
    Create a copy of annotations/instances_train.json, instances_test_dataset.json and instances_val.json
    in the annotations_single_batch directory with a smaller subset of annotations.
    """
    source_annotations = ["instances_train.json", "instances_test_dataset.json", "instances_val.json"]
    for source_annotation in source_annotations:
        subset_json = {}
        with open(os.path.join(DATA_PATH, "annotations", source_annotation), "r") as f:
            instances_json = json.loads(f.read())
        subset_json["licenses"] = instances_json["licenses"]
        subset_json["info"] = instances_json["info"]
        subset_json["categories"] = instances_json["categories"]
        subset_images = []
        subset_annotations = []
        collected_image_ids = []
        
        for annotation_json in instances_json["annotations"]:
            if len(subset_annotations) >= num_instances_to_collect:
                # We have collected enough
                break
            subset_annotations.append(annotation_json)
            collected_image_ids.append(annotation_json["image_id"])
            
        for image_json in instances_json["images"]:
            if image_json["id"] not in collected_image_ids:
                continue
            subset_images.append(image_json)

            # Copy image to target image dir
            if "test" in source_annotation:
                target_img_dir = "test"
            elif "train" in source_annotation:
                target_img_dir = "train"
            elif "val" in source_annotation:
                target_img_dir = "val"

            file_name = image_json["file_name"]
            source_img = os.path.join(DATA_PATH, "images", target_img_dir, file_name)
            target_img = os.path.join(DATA_PATH, "images_single_batch", target_img_dir, file_name)
            if not os.path.exists(target_img):
                os.link(source_img, target_img)


        print(f"For {source_annotation} collected {len(subset_annotations)} instances")
        subset_json["images"] = subset_images
        subset_json["annotations"] = subset_annotations

        # Write json to target dir
        source_annotation_full_path = os.path.join(DATA_PATH, "annotations_single_batch", source_annotation)
        with open(source_annotation_full_path, "w") as f:
            json.dump(subset_json, f)
        print(f"Wrote {source_annotation_full_path}")




In [25]:
if use_single_batch:
    if not os.path.exists(os.path.join(DATA_PATH, "annotations_single_batch")):
        os.makedirs(os.path.join(DATA_PATH, "annotations_single_batch"))
    if not os.path.exists(os.path.join(DATA_PATH, "images_single_batch")):
        os.makedirs(os.path.join(DATA_PATH, "images_single_batch"))
        os.makedirs(os.path.join(DATA_PATH, "images_single_batch", "train"))
        os.makedirs(os.path.join(DATA_PATH, "images_single_batch", "test"))
        os.makedirs(os.path.join(DATA_PATH, "images_single_batch", "val"))

create_single_batch_annotations(64)

For instances_train.json collected 64 instances
Wrote /content/labeled_data/annotations_single_batch/instances_train.json
For instances_test_dataset.json collected 64 instances
Wrote /content/labeled_data/annotations_single_batch/instances_test_dataset.json
For instances_val.json collected 64 instances
Wrote /content/labeled_data/annotations_single_batch/instances_val.json


## Convert annotations from coco -> yolo format

In [None]:
!git clone https://github.com/ultralytics/JSON2YOLO.git  # clone repo
%pip install -qr JSON2YOLO/requirements.txt # install dependencies

Cloning into 'JSON2YOLO'...
remote: Enumerating objects: 277, done.[K
remote: Counting objects: 100% (113/113), done.[K
remote: Compressing objects: 100% (31/31), done.[K
remote: Total 277 (delta 102), reused 84 (delta 82), pack-reused 164[K
Receiving objects: 100% (277/277), 74.65 KiB | 1.96 MiB/s, done.
Resolving deltas: 100% (177/177), done.
/content/JSON2YOLO


In [54]:
from general_json2yolo import convert_coco_json

if use_single_batch:
    convert_coco_json('/content/labeled_data/annotations_single_batch')
else:
    convert_coco_json('/content/labeled_data/annotations')

Annotations /content/labeled_data/annotations/instances_test_dataset.json: 100%|██████████| 1900/1900 [00:00<00:00, 4920.98it/s]
Annotations /content/labeled_data/annotations/instances_train.json: 100%|██████████| 13111/13111 [00:01<00:00, 7081.87it/s]
Annotations /content/labeled_data/annotations/instances_val.json: 100%|██████████| 3278/3278 [00:00<00:00, 7147.11it/s]


## Copy images to yolo dir

One confusing thing here is that it actually puts the images into the labels directory to keep them in a single directory (as expected by yolov5).  A TODO item is to rename this directory to be clearer

In [55]:
if use_single_batch:
    imgs_source_dir = os.path.join("labeled_data", "images_single_batch")
else:
    imgs_source_dir = os.path.join("labeled_data", "images")

imgs_train = os.path.join(imgs_source_dir, "train")
imgs_test = os.path.join(imgs_source_dir, "test")
imgs_val = os.path.join(imgs_source_dir, "val")

imgs_target_dir = os.path.join("new_dir", "labels")
imgs_target_train =  os.path.join(imgs_target_dir, "train")
imgs_target_test =  os.path.join(imgs_target_dir, "test_dataset")
imgs_target_val =  os.path.join(imgs_target_dir, "val")

!cp {imgs_train}/*.PNG {imgs_target_train}/
!cp {imgs_test}/*.PNG {imgs_target_test}/
!cp {imgs_val}/*.PNG {imgs_target_val}/


## Create yolo data yaml file

In [56]:
yolo_data_yaml = {
    "nc": 1,
    "names": ["Human"],
    "train": ["/content/new_dir/labels/train"],
    "test": ["/content/new_dir/labels/test_dataset"],
    "val": ["/content/new_dir/labels/val"]
}

with open('data.yaml', 'w') as file:
    yaml.dump(yolo_data_yaml, file)

## Install yolo

In [57]:
if not os.path.exists("yolov5"):
    !git clone https://github.com/ultralytics/yolov5  # clone repo
    %pip install -qr yolov5/requirements.txt # install dependencies


## Train yolo

In [None]:
!python yolov5/train.py --img 640 --batch 24 --epochs 300 --data data.yaml --weights yolov5l.pt --cache


[34m[1mwandb[0m: Currently logged in as: [33mtleyden[0m ([33meyepi[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mtrain: [0mweights=yolov5l.pt, cfg=, data=data.yaml, hyp=yolov5/data/hyps/hyp.scratch-low.yaml, epochs=300, batch_size=24, imgsz=640, rect=False, resume=False, nosave=False, noval=False, noautoanchor=False, noplots=False, evolve=None, bucket=, cache=ram, image_weights=False, device=, multi_scale=False, single_cls=False, optimizer=SGD, sync_bn=False, workers=8, project=yolov5/runs/train, name=exp, exist_ok=False, quad=False, cos_lr=False, label_smoothing=0.0, patience=100, freeze=[0], save_period=-1, seed=0, local_rank=-1, entity=None, upload_dataset=False, bbox_interval=-1, artifact_alias=latest
[34m[1mgithub: [0mup to date with https://github.com/ultralytics/yolov5 ✅
YOLOv5 🚀 v7.0-34-g1ae9194 Python-3.8.16 torch-1.13.0+cu116 CUDA:0 (Tesla T4, 15110MiB)

[34m[1mhyperparameters: [0mlr0=0.01, lrf=0.01, momentum=0.937, weight_decay=0.0005, warm

## Eval yolo

### Generate bbox predictions on test set

In [None]:
!python yolov5/detect.py --device 0 --weights traun_fruitpunch_poachers_best.pt --source /content/new_dir/labels/test_dataset


### Calculate mAP score on test set

In [None]:
!python yolov5/val.py --device 0 --weights traun_fruitpunch_poachers_best.pt --data data.yaml

### Display a few test predictions next to actual labels


In [None]:
def superimpose_yolo_boxes(image_path, yolo_labelfile_path, classnames):
    """
    Return an image object with the yolo bounding boxes superimposed.
    
    :param classnames: a list of classnames.  The yolo labels will have the index, 
                       and indexing into classnames will give the actual classname 
                       (eg, "forklift")
    
    
    Steps:
    
    1. Load the image from the image path
    2. Load the yolo labels
    3. For each yolo label, draw a bounding box
    4. Return the image
    """
    
    image = cv2.imread(image_path)
    height, width, channels = image.shape
    line_thickness = round(0.002 * (image.shape[0] + image.shape[1]) / 2) + 1 
    
    with open(yolo_labelfile_path, "r") as yolo_labelfile:
        for line in yolo_labelfile:
            fields = line.split()
            class_label = classnames[int(fields[0])]
            
            normalized_x_center = float(fields[1])
            normalized_y_center = float(fields[2])
            normalized_w = float(fields[3])
            normalized_h = float(fields[4])
            
            x_center = normalized_x_center*width
            y_center = normalized_y_center*height
            w = normalized_w*width
            h = normalized_h*height                        
            
            xmin = round(x_center - w/2)  # Left
            ymin = round(y_center - h/2)  # Top
            xmax = round(x_center + w/2)  # Right
            ymax = round(y_center + h/2)  # Bottom  
                        
            color = [random.randint(0, 255) for _ in range(3)]

            top_left = (xmin, ymin)
            bottom_right = (xmax, ymax)
            cv2.rectangle(image, top_left, bottom_right, color, thickness=line_thickness, lineType=cv2.LINE_AA)
            
            # Shift the label down a few pixels if the bounding box is at the top of the image
            top_left_with_offset = top_left
            if ymin <= 20:
                top_left_with_offset = (xmin, ymin + 20)
            cv2.putText(image, str(class_label), top_left_with_offset, 0, line_thickness / 3, [225, 255, 255], thickness=line_thickness, lineType=cv2.LINE_AA)
            
        RGB_im = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        return RGB_im

In [None]:
import random
test_predictions_path = "yolov5/runs/detect/exp4"

# Get a few sample predicted images
all_images = os.listdir(test_predictions_path)
sampled_prediction_images = random.sample(all_images, 3)
print(sampled_prediction_images)
help(random.sample)

# Find the corresponding test images (non-superimposed bboxes) and their corresponding labels files
test_images_path = os.path.join(CONTENT_ROOT, "new_dir", "labels", "test_dataset")
unlabeled_test_images = []
label_files_for_test_images = []
for img in sampled_prediction_images:
    img_full_path = os.path.join(test_images_path, img)
    filename_no_extension = os.path.splitext(img)[0]
    yolo_label_file_full_path = os.path.join(test_images_path, f"{filename_no_extension}.txt")
    print(f"img_full_path: {img_full_path} yolo_label_file_full_path: {yolo_label_file_full_path}")
    unlabeled_test_images.append(img_full_path)
    label_files_for_test_images.append(yolo_label_file_full_path)
    

# Superimpose yolo labels on test images
# Plot them side by side with ground truth labeled test images
_, axs = plt.subplots(nrows=len(sampled_prediction_images), ncols=2, figsize=(20, 20))
for sampled_prediction_image, unlabeled_test_image, label_files_for_test_image, ax in zip(sampled_prediction_images, unlabeled_test_images, label_files_for_test_images, axs):
    print(f"Displaying image: {sampled_prediction_image}, {unlabeled_test_image}, {label_files_for_test_image} on ax: {ax}")
    superimposed_img = superimpose_yolo_boxes(
        image_path=unlabeled_test_image, 
        yolo_labelfile_path=label_files_for_test_image, 
        classnames=["Human"]
    )
    # print(f"superimposed_img: {superimposed_img}")
    sampled_prediction_image_full_path = os.path.join(test_predictions_path, sampled_prediction_image)
    sampled_prediction_image_cv2 = cv2.imread(sampled_prediction_image_full_path)
    ax[0].imshow(sampled_prediction_image_cv2)
    ax[1].imshow(superimposed_img)