# Setup

Remember to install CUDA dependencies

In [1]:
import glob
import os
import shutil
import re
import random
# Fixed seed: seeds the global `random` generator here and is reused by the
# dataset-splitting cell so the train/val shuffle is repeatable.
seed = 420
random.seed(seed)

from ultralytics import YOLO, settings
# Resolve the project root. The notebook is expected to run from
# <root>/notebooks (all other paths in this notebook are written as '../...'),
# so drop that trailing directory. os.path is used instead of replacing the
# literal '\\notebooks' substring, which only matched Windows separators and
# could also match in the middle of an unrelated path.
_cwd = os.getcwd()
root_dir = os.path.dirname(_cwd) if os.path.basename(_cwd) == 'notebooks' else _cwd

# Point ultralytics at this project's dataset and run-output directories.
settings.update({'datasets_dir': f'{root_dir}/data/kaggle', 'runs_dir': f'{root_dir}/yolo/runs'})

from roboflow import Roboflow

# Sorting Kaggle dataset into train/val

Source: https://www.kaggle.com/datasets/hamidl/yoloqrlabeled?resource=download

In [None]:
def _to_single_class(label_text):
    """Return label text with the class index of every annotation set to 0.

    Each non-blank line is treated as one annotation whose first
    whitespace-separated field is the class index; the remaining fields are
    kept as they are. Blank lines are dropped, so an empty label file yields
    an empty string.
    """
    rows = []
    for row in label_text.splitlines():
        fields = row.split()
        if fields:
            fields[0] = '0'
            rows.append(' '.join(fields))
    return ''.join(f'{row}\n' for row in rows)


# Build the Kaggle train/val dataset once; skipped when ../data/kaggle already exists.
if not os.path.exists('../data/kaggle/'):
    # Collect (image, label) pairs. The label path is derived from the image
    # file name (same stem under labels/) rather than pairing two independent
    # glob results by position, and the listing is sorted so the seeded
    # shuffle below gives the same split on every machine.
    pairs = []
    unlabeled = 0
    for i in range(1, 4):
        dataset_dir = f'../data/YOLO-QR-datasets/Dataset {i}'
        for image_path in sorted(glob.glob(f'{dataset_dir}/images/*.jpg')):
            stem = os.path.splitext(os.path.basename(image_path))[0]
            label_path = f'{dataset_dir}/labels/{stem}.txt'
            if os.path.exists(label_path):
                pairs.append((image_path, label_path))
            else:
                unlabeled += 1

    total = len(pairs)
    if total == 0:
        # Fail before creating any directory, otherwise the guard above would
        # skip this cell on every later run.
        raise FileNotFoundError('No labelled images found under ../data/YOLO-QR-datasets/')
    if unlabeled:
        print(f'Skipped {unlabeled} image(s) without a label file')

    train = int(total * 0.9)  # floored
    val = total - train       # everything that is not train goes to val
    print(f'Train : {train}')
    print(f'Val   : {val}')

    print([image for image, _ in pairs[:2]])
    print([label for _, label in pairs[:2]])

    # shuffled indexes
    indexes = list(range(total))
    random.Random(seed).shuffle(indexes)
    print(indexes[:2])

    # create required directories
    for kind in ('images', 'labels'):
        for split in ('train', 'val'):
            os.makedirs(f'../data/kaggle/{kind}/{split}', exist_ok=True)

    # copy images and labels to required directories with train/val splits
    for i, index in enumerate(indexes):
        split = 'train' if i < train else 'val'
        image_path, label_path = pairs[index]

        # copy image file as-is
        shutil.copyfile(image_path, f'../data/kaggle/images/{split}/{i}.jpg')

        # copy label file but change the class index of every annotation to 0
        with open(label_path, 'r') as f:
            label_text = f.read()
        with open(f'../data/kaggle/labels/{split}/{i}.txt', 'w') as f:
            f.write(_to_single_class(label_text))

    # create yaml file for training YOLO on
    # NOTE(review): 'test' is listed but no images/test directory is created
    # for this dataset.
    with open('../data/kaggle/data.yaml', 'w') as f:
        f.write(f'path: {root_dir}/data/kaggle/\n')
        f.write('train: images/train\n')
        f.write('val: images/val\n')
        f.write('test: images/test\n')
        f.write('\n')
        f.write('nc: 1\n')
        f.write('names: [\'Data Matrix\']\n')

# Sorting MAN dataset into train/val/test (fine-tuning dataset)

Source: https://app.roboflow.com/rs-xldmw/data-matrix-codes/

In [None]:
# Build the MAN fine-tuning dataset once; skipped when ../data/MAN already exists.
if not os.path.exists('../data/MAN/'):
    # Download first: if the key file is missing or the download fails, no
    # ../data/MAN directory has been created yet, so re-running the cell
    # retries instead of being skipped by the guard above.
    with open('../roboflow_api_key.txt', 'r') as f:
        api_key = f.readline().strip()

    rf = Roboflow(api_key=api_key)
    project = rf.workspace('rs-xldmw').project('data-matrix-codes')
    version = project.version(3)
    dataset = version.download('yolov11')
    # Use the folder reported by roboflow for this download instead of
    # rebuilding its name by hand (the previous f-string also nested the same
    # quote character, which is a syntax error before Python 3.12).
    dl_folder = dataset.location

    # sort files according to train/val/test splits
    # (the download uses 'valid' for the split that is stored here as 'val')
    split_names = {'train': 'train', 'valid': 'val', 'test': 'test'}
    for src_split, dst_split in split_names.items():
        for kind, ext in (('images', 'jpg'), ('labels', 'txt')):
            dst_dir = f'../data/MAN/{kind}/{dst_split}/'
            os.makedirs(dst_dir, exist_ok=True)
            for file in glob.glob(f'{glob.escape(dl_folder)}/{src_split}/{kind}/*.{ext}'):
                shutil.move(file, dst_dir)

    # delete downloaded roboflow dataset
    shutil.rmtree(dl_folder)

    # create yaml file for training YOLO on
    with open('../data/MAN/data.yaml', 'w') as f:
        f.write(f'path: {root_dir}/data/MAN/\n')
        f.write('train: images/train\n')
        f.write('val: images/val\n')
        f.write('test: images/test\n')
        f.write('\n')
        f.write('nc: 1\n')
        f.write('names: [\'Data Matrix\']\n')

# Training

## From Scratch

YOLO11 model trained from scratch on the Kaggle QR code dataset.

Took ~4.6 hours on an RTX 2070.

In [4]:
# Build a new detection model from the model YAML definition (no checkpoint is loaded here).
scratch_arch = 'yolo11n.yaml'
model = YOLO(scratch_arch, task='detect')

In [None]:
# Training options for the from-scratch run on the Kaggle dataset.
scratch_train_args = {
    'data': '../data/kaggle/data.yaml',  # dataset definition written by the Kaggle sorting cell
    'epochs': 100,
    'imgsz': 640,
    'single_cls': True,                  # treat the dataset as a single class
    'patience': 10,                      # stop early after this many epochs without improvement
    'pretrained': False,                 # start without pre-trained weights
    'plots': True,                       # write training plots
    # settings chosen to work around GPU memory problems
    'workers': 0,                        # data-loading workers (0 lowers memory use, slower training)
    'batch': 8,                          # batch size (reduced from the default of 16)
}
model.train(**scratch_train_args)

In [7]:
# Move the best checkpoint of the finished run into the models directory.
# Run folders are numbered by ultralytics (train, train2, ...), so prefer the
# checkpoint path recorded on the trainer of this model; fall back to the
# first-run default path when the model was not trained in this session.
best_weights = getattr(getattr(model, 'trainer', None), 'best', None)
if best_weights is None:
    best_weights = '../yolo/runs/detect/train/weights/best.pt'
os.makedirs('../yolo/models', exist_ok=True)  # shutil.move does not create the target directory
shutil.move(str(best_weights), '../yolo/models/kaggle_scratch.pt')
del model # drop the reference so the model can be released

## Fine-Tuned Kaggle

In [15]:
# Load the checkpoint produced by the from-scratch Kaggle run as the starting point for fine-tuning.
kaggle_weights = '../yolo/models/kaggle_scratch.pt'
model = YOLO(kaggle_weights, task='detect')

In [None]:
# Training options for fine-tuning the Kaggle-trained model on the MAN dataset.
kaggle_finetune_args = {
    'data': '../data/MAN/data.yaml',     # dataset definition written by the MAN sorting cell
    'epochs': 100,
    'imgsz': 640,
    'single_cls': True,                  # treat the dataset as a single class
    'patience': 10,                      # stop early after this many epochs without improvement
    # NOTE(review): same value as the from-scratch run, but this model was
    # loaded from a .pt checkpoint - confirm the flag is intended here.
    'pretrained': False,
    'plots': True,                       # write training plots
    # settings chosen to work around GPU memory problems
    'workers': 0,                        # data-loading workers (0 lowers memory use, slower training)
    'batch': 8,                          # batch size (reduced from the default of 16)
}
model.train(**kaggle_finetune_args)

In [None]:
# Move the best checkpoint of the finished run into the models directory.
# Run folders are numbered by ultralytics (train, train2, ...), so prefer the
# checkpoint path recorded on the trainer of this model; fall back to the
# previously hard-coded path when the model was not trained in this session.
best_weights = getattr(getattr(model, 'trainer', None), 'best', None)
if best_weights is None:
    best_weights = '../yolo/runs/detect/train2/weights/best.pt'
os.makedirs('../yolo/models', exist_ok=True)  # shutil.move does not create the target directory
shutil.move(str(best_weights), '../yolo/models/kaggle_finetuned.pt')
del model # drop the reference so the model can be released

## Ultralytics Pretrained Fine-Tuned

In [8]:
# Start from the ultralytics pre-trained checkpoint (trained on the COCO dataset).
coco_weights = '../yolo/models/yolo11n.pt'
model = YOLO(coco_weights, task='detect')

In [None]:
# Training options for fine-tuning the ultralytics pre-trained model on the MAN dataset.
coco_finetune_args = {
    'data': '../data/MAN/data.yaml',     # dataset definition written by the MAN sorting cell
    'epochs': 100,
    'imgsz': 640,
    'single_cls': True,                  # treat the dataset as a single class
    'patience': 10,                      # stop early after this many epochs without improvement
    # NOTE(review): same value as the from-scratch run, but this model was
    # loaded from a .pt checkpoint - confirm the flag is intended here.
    'pretrained': False,
    'plots': True,                       # write training plots
    # settings chosen to work around GPU memory problems
    'workers': 0,                        # data-loading workers (0 lowers memory use, slower training)
    'batch': 8,                          # batch size (reduced from the default of 16)
}
model.train(**coco_finetune_args)

In [None]:
# Move the best checkpoint of the finished run into the models directory.
# Run folders are numbered by ultralytics (train, train2, ...), so prefer the
# checkpoint path recorded on the trainer of this model; fall back to the
# previously hard-coded path when the model was not trained in this session.
best_weights = getattr(getattr(model, 'trainer', None), 'best', None)
if best_weights is None:
    best_weights = '../yolo/runs/detect/train3/weights/best.pt'
os.makedirs('../yolo/models', exist_ok=True)  # shutil.move does not create the target directory
shutil.move(str(best_weights), '../yolo/models/ultralytics_finetuned.pt')
del model # drop the reference so the model can be released