# IRP - YOLOv8 Training Guide

In this notebook, we present the different steps for training an Ultralytics YOLOv8 model.\
See the [YOLO docs](https://docs.ultralytics.com/) for more information.

# Setup

Install Ultralytics with pip, check the software and hardware environment, and import the required packages.

In [1]:
%pip install ultralytics
import ultralytics
ultralytics.checks()

Ultralytics YOLOv8.0.157 🚀 Python-3.10.10 torch-2.0.0 CUDA:0 (Tesla P100-PCIE-16GB, 16281MiB)
Setup complete ✅ (2 CPUs, 15.6 GB RAM, 4958.9/8062.4 GB disk)


In [2]:
import os

from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import shutil
from glob import glob

from ultralytics import YOLO

import yaml

# 1. Data Preprocessing

The dataset is first split into a training set and a validation set. The files needed for training (the per-split image and label folders, the image lists and the dataset YAML files) are then created.

In [3]:
# Root directory of the source dataset. The cells below read the images from its
# `images/` sub-folder and the label files from its `labels/` sub-folder.
DATA_DIR = "../IRP_dataset/YOLO_train"

# Output directory. The per-split image and label folders, the image-list files
# and the dataset YAML files are all written here.
WORKING_DIR = "/kaggle/working"

In [4]:
# Names of the object classes present in the training dataset.
# The list is written as-is to the `names` entry of the dataset YAML, and its length to `nc`.
class_list = ['bird', 'aircraft']

In [5]:
# Collect the path of every image in the training set.
# Each image contains one type of object, identified by the prefix of its file name.
files_dir = glob(os.path.join(DATA_DIR, "images/*.jpg"))

# os.path.basename (rather than splitting on "/") keeps this correct whatever
# path separator glob returns on the current platform.
file_names = [os.path.basename(file) for file in files_dir]

# One label per image, derived from the file-name prefix:
#   bird*             -> 0
#   aircraft* / jet*  -> 1
#   anything else     -> 2
labels = [
    0 if name.startswith("bird")
    else 1 if name.startswith(("aircraft", "jet"))
    else 2
    for name in file_names
]

In [6]:
# Build a table with one row per image: its path and the label derived from its file name
train_df = pd.DataFrame(dict(image_path=files_dir, labels=labels))

# Preview the first 10 rows
train_df.head(10)

Unnamed: 0,image_path,labels
0,../IRP_dataset/YOLO_train/images/jet2_03976.jpg,1
1,../IRP_dataset/YOLO_train/images/aircraft3_002...,1
2,../IRP_dataset/YOLO_train/images/jet1_02585.jpg,1
3,../IRP_dataset/YOLO_train/images/aircraft7_054...,1
4,../IRP_dataset/YOLO_train/images/jet1_03480.jpg,1
5,../IRP_dataset/YOLO_train/images/bird9_00248.jpg,0
6,../IRP_dataset/YOLO_train/images/aircraft4_003...,1
7,../IRP_dataset/YOLO_train/images/aircraft4_003...,1
8,../IRP_dataset/YOLO_train/images/aircraft4_004...,1
9,../IRP_dataset/YOLO_train/images/aircraft12_00...,1


In [7]:
# Split the dataset into 5 folds, stratified on the labels so that every fold
# keeps the same class proportions.
skf = StratifiedKFold(n_splits=5)
train_df['fold'] = -1  # placeholder, overwritten below for every row

# Only the labels drive a stratified split, so a dummy array stands in for the features.
targets = train_df.labels.tolist()
for fold, (_, val_idx) in enumerate(skf.split(np.zeros(len(targets)), targets)):
    # Every row is in the validation part of exactly one fold: record that fold number.
    train_df.loc[val_idx, 'fold'] = fold

train_df.head()

Unnamed: 0,image_path,labels,fold
0,../IRP_dataset/YOLO_train/images/jet2_03976.jpg,1,0
1,../IRP_dataset/YOLO_train/images/aircraft3_002...,1,0
2,../IRP_dataset/YOLO_train/images/jet1_02585.jpg,1,0
3,../IRP_dataset/YOLO_train/images/aircraft7_054...,1,0
4,../IRP_dataset/YOLO_train/images/jet1_03480.jpg,1,0


In [8]:
def _copy_split(files, split, label_dir):
    """Copy images and their label files into the `split` folders of WORKING_DIR.

    Args:
        files: Paths of the images to copy.
        split: Name of the destination sub-folder ("train" or "val").
        label_dir: Directory holding the source `<image name>.txt` label files.
    """
    image_dst = os.path.join(WORKING_DIR, "images", split)
    label_dst = os.path.join(WORKING_DIR, "labels", split)

    for file in tqdm(files):  # tqdm shows the progress of the copy
        shutil.copy(file, image_dst)

        # Strip the directory and only the final extension, so that file names
        # containing extra dots still map to the right label file.
        label_name = os.path.splitext(os.path.basename(file))[0] + '.txt'
        label_src = os.path.join(label_dir, label_name)

        if os.path.exists(label_src):
            shutil.copy(label_src, label_dst)
        else:
            # No label file for this image: create an empty one so that every
            # copied image has a matching label file.
            with open(os.path.join(label_dst, label_name), "w") as f:
                f.write("")


def split_data(train_df, fold):
    """Copy the images and labels of one train/validation split into WORKING_DIR.

    Rows of `train_df` whose `fold` value equals `fold` form the validation set;
    all other rows form the training set. Images go to `images/train` and
    `images/val`, label files to `labels/train` and `labels/val`. Source label
    files are read from the `labels` sub-folder of DATA_DIR.

    Args:
        train_df: DataFrame with an `image_path` column and a `fold` column.
        fold: Fold number to hold out as the validation set.

    Note:
        Files already present in the destination folders are not removed, so
        calling this again with a different `fold` adds to the existing content.
    """
    # Retrieve the training and validation file locations
    val_files = list(train_df[train_df.fold == fold].image_path.unique())
    train_files = list(train_df[train_df.fold != fold].image_path.unique())

    # Create all the needed directories before copying anything
    for kind in ("labels", "images"):
        for split in ("train", "val"):
            os.makedirs(os.path.join(WORKING_DIR, kind, split), exist_ok=True)
    label_dir = os.path.join(DATA_DIR, "labels")

    # Copy all the files
    _copy_split(train_files, "train", label_dir)
    _copy_split(val_files, "val", label_dir)

In [9]:
# Materialise the split, holding out fold number 4 (the last of the folds numbered 0-4) for validation
split_data(train_df, fold=4)

  0%|          | 0/7918 [00:00<?, ?it/s]

  0%|          | 0/1979 [00:00<?, ?it/s]

In [10]:
# Create the image-list files and the dataset YAML files used for training.

# train.txt / val.txt: one image path per line, listing the images copied for each split
for split in ('train', 'val'):
    with open(os.path.join(WORKING_DIR, split + '.txt'), 'w') as f:
        for path in glob(os.path.join(WORKING_DIR, 'images/' + split + '/*')):
            f.write(path+'\n')

# source.yaml: points at the list files through their full path under WORKING_DIR
data = dict(
    train = os.path.join(WORKING_DIR, 'train.txt'),
    val   = os.path.join(WORKING_DIR, 'val.txt'),
    nc    = len(class_list),
    names = class_list)

with open(os.path.join(WORKING_DIR, 'source.yaml'), 'w') as outfile:
    yaml.dump(data, outfile, default_flow_style=False)

# source2.yaml: same content, but the list files are referenced by bare file name
data = dict(
    train = 'train.txt',
    val   = 'val.txt',
    nc    = len(class_list),
    names = class_list)

with open(os.path.join(WORKING_DIR, 'source2.yaml'), 'w') as outfile:
    yaml.dump(data, outfile, default_flow_style=False)

# Show the generated YAML; the context manager closes the file once it has been read
with open(os.path.join(WORKING_DIR, 'source2.yaml'), 'r') as f:
    print('\nyaml contents:')
    print(f.read())
    print("\n\n")


yaml contents:
names:
- bird
- aircraft
nc: 2
train: train.txt
val: val.txt






# 2. Train

In [11]:
# Turn off WandB monitoring for the training run by setting its opt-out environment variable
os.environ.update(WANDB_DISABLED='true')

In [12]:
# Train on the custom dataset, starting from the yolov8x.pt weights.
dataset_yaml = os.path.join(WORKING_DIR, "source.yaml")
train_options = dict(epochs=30, batch=16, imgsz=640, optimizer="SGD")

model = YOLO('yolov8x.pt')
model.train(data=dataset_yaml, **train_options)

Downloading https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8x.pt to 'yolov8x.pt'...
100%|██████████| 131M/131M [00:01<00:00, 73.0MB/s]
Ultralytics YOLOv8.0.157 🚀 Python-3.10.10 torch-2.0.0 CUDA:0 (Tesla P100-PCIE-16GB, 16281MiB)
[34m[1mengine/trainer: [0mtask=detect, mode=train, model=yolov8x.pt, data=/kaggle/working/source.yaml, epochs=30, patience=50, batch=16, imgsz=640, save=True, save_period=-1, cache=False, device=None, workers=8, project=None, name=None, exist_ok=False, pretrained=True, optimizer=SGD, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hybrid=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, show=False, save_txt=False, save_conf=False, save_crop=False, show_labels=True, show_conf=True, vid_stride=1,