# Imports

In [None]:
from sklearn.model_selection import train_test_split

import os
import shutil
from glob import glob
import yaml

# 1. Prepare data

In [None]:
"""
Source dataset structure

data_categories/

    - s_bar_double/
        - 0001.png
        - 0002.png

    - ns_non_school/
        - 0002.png
        - 0003.png
  
  
label_txt_files/
    
    - 0001.txt
    - 0002.txt
    - 0003.txt
    - 0004.txt
"""

In [None]:
# Root folder of the source images (one sub-folder per category) and the
# folder containing the YOLO-format label text files.
data_src = 'data_categories/'
label_src_dir = 'label_txt_files'

# Every entry found directly inside the source folder.
data_cats = os.listdir(data_src)

# Folders whose names start with "s_" contain school tiles; folders whose
# names start with "ns_" contain non-school tiles.
school_cats = sorted(name for name in data_cats if name.startswith('s_'))
non_school_cats = sorted(name for name in data_cats if name.startswith('ns_'))

In [None]:
# File names of images known to be corrupt; any image whose base name
# appears here is left out when the training dataset is assembled.
corrupt_list = [
    '42304318_E.png',
    '61212325.png',
    '61223303.png',
    '42201308.png',
    '41606305_N.png',
    '11302316_E.png',
    '23301340_W.png',
    '11301320_E.png',
]

## 1.1. Choose which categories to put into training dataset

In [None]:
# List every category that is present in the "data_categories" folder and
# comment out the ones that should NOT go into the training dataset.
# These lists replace the ones discovered automatically from the folder names.

# School categories (folder names starting with "s_").
school_cats = [
    's_bar_adjacent',
    's_bar_double',
    's_bar_double_1',
    's_bar_double_2',
    's_bar_double_3',
    's_bar_double_4',
    # 's_bar_double_short_obvious',
    's_bar_multiple',
    # 's_bar_single',
    's_bar_triple',
    's_group_parallel',
    's_group_parallel_1',
    's_group_parallel_2',
    's_group_parallel_3',
    's_group_parallel_4',
    's_group_parallel_5',
    's_shape_C',
    's_shape_L',
    # 's_shape_O',
    's_wings',
]

# Non-school categories (folder names starting with "ns_").
non_school_cats = [
    'ns_fences',
    'ns_fences_1',
    'ns_fences_2',
    'ns_housings',
    'ns_large_buildings',
    'ns_like_building',
    'ns_non_school',
    'ns_plain',
]

## 1.2. Create dataset train / val folders for model training

In [None]:
"""
Training dataset structure

dataset/

    - train/
        - images/
            - 0001.png
            - 0002.png
        - labels/
            - 0001.txt
            - 0002.txt
            
    - val/
        - images/
            - 0003.png
            - 0004.png
        - labels/
            - 0003.txt
            - 0004.txt

"""

In [None]:
# Root directory of the training dataset and the fraction of each category
# that is held out for validation.
dataset_root = 'dataset'
val_size = 0.20

# Build the four <split>/<kind> paths (train/val x images/labels) under the root.
train_image_folder, train_label_folder, val_image_folder, val_label_folder = (
    os.path.join(dataset_root, f'{split}/{kind}')
    for split in ('train', 'val')
    for kind in ('images', 'labels')
)

# Create the folders; ones that already exist are left untouched.
for folder in (train_image_folder, train_label_folder,
               val_image_folder, val_label_folder):
    os.makedirs(folder, exist_ok=True)

## 1.3. Copy school images from chosen categories above to training dataset dir

In [None]:
# Copy the images of every selected school category, together with their
# YOLO label files, into the train / val folders of the training dataset.

# Set lookup for the corrupt file names (built once, outside the loop).
corrupt_names = set(corrupt_list)

for building_type in school_cats:
    # NOTE(review): this pattern only matches entries one directory level
    # BELOW the category folder, whereas the documented source layout shows
    # the images directly inside it -- confirm the real layout.
    image_folder = f'{data_src}/{building_type}/*/*'

    # glob() returns paths in a filesystem-dependent order; sorting makes the
    # seeded split below reproducible across machines and runs.
    image_list = sorted(
        path for path in glob(image_folder)
        if os.path.basename(path) not in corrupt_names
    )

    # train_test_split raises on an empty list; skip the category instead.
    if not image_list:
        print(f'WARNING: {building_type} : no images found, category skipped')
        continue

    # split train/val
    train, val = train_test_split(image_list, test_size=val_size, random_state=42)

    print(f'{building_type} : {len(image_list)} = {len(train)} / {len(val)}')

    # Same copy procedure for both splits, only the destinations differ.
    for split_images, image_dst_dir, label_dst_dir in (
        (train, train_image_folder, train_label_folder),
        (val, val_image_folder, val_label_folder),
    ):
        for src_image in split_images:
            # copy image
            filename = os.path.basename(src_image)
            shutil.copy(src_image, os.path.join(image_dst_dir, filename))

            # copy the label file that shares the image's base name
            label_name = os.path.splitext(filename)[0] + '.txt'
            src_label = os.path.join(label_src_dir, label_name)
            dst_label = os.path.join(label_dst_dir, label_name)
            shutil.copy(src_label, dst_label)

## 1.4. Copy non-school images to training dataset dir

In [None]:
# Copy the images of every selected non-school category into the train / val
# folders and give each one an EMPTY label file (no objects in the image).

# Set lookup for the corrupt file names (built once, outside the loop).
corrupt_names = set(corrupt_list)

for non_school_type in non_school_cats:
    # NOTE(review): this pattern only matches entries one directory level
    # BELOW the category folder, whereas the documented source layout shows
    # the images directly inside it -- confirm the real layout.
    image_folder = f'{data_src}/{non_school_type}/*/*'

    # glob() returns paths in a filesystem-dependent order; sorting makes the
    # seeded split below reproducible across machines and runs.
    image_list = sorted(
        path for path in glob(image_folder)
        if os.path.basename(path) not in corrupt_names
    )

    # train_test_split raises on an empty list; skip the category instead.
    if not image_list:
        print(f'WARNING: {non_school_type} : no images found, category skipped')
        continue

    # split train/val
    train, val = train_test_split(image_list, test_size=val_size, random_state=42)

    print(f'{non_school_type} : {len(image_list)} = {len(train)} / {len(val)}')

    # Same procedure for both splits, only the destinations differ.
    for split_images, image_dst_dir, label_dst_dir in (
        (train, train_image_folder, train_label_folder),
        (val, val_image_folder, val_label_folder),
    ):
        for src_image in split_images:
            # copy image
            filename = os.path.basename(src_image)
            shutil.copy(src_image, os.path.join(image_dst_dir, filename))

            # create a blank label file with the image's base name
            # NOTE(review): opening with 'w' truncates an existing file, so a
            # school label with the same name would be wiped -- confirm that
            # file names are unique across categories.
            label_name = os.path.splitext(filename)[0] + '.txt'
            dst_label = os.path.join(label_dst_dir, label_name)
            with open(dst_label, 'w') as f:
                f.write('')

## 1.5. Training dataset information

In [None]:
####################   assertion   ####################
def _stems(folder):
    """Return the extension-less names of all entries in *folder*."""
    return [os.path.splitext(entry)[0] for entry in os.listdir(folder)]


train_img_ids = _stems(train_image_folder)
train_label_ids = _stems(train_label_folder)

val_img_ids = _stems(val_image_folder)
val_label_ids = _stems(val_label_folder)

# The number of image ids must equal the number of distinct ids shared
# between the images and the labels of the same split.
intersection = list(set(train_img_ids).intersection(train_label_ids))
assert len(intersection) == len(train_img_ids)

intersection = list(set(val_img_ids).intersection(val_label_ids))
assert len(intersection) == len(val_img_ids)


####################   stats   ####################
def _count_labels(label_paths):
    """Return (empty, non_empty) counts for the given label files.

    A label file with no lines is counted as non-school, any other as school.
    """
    empty = 0
    for label_path in label_paths:
        with open(label_path, 'r') as handle:
            if not handle.readlines():
                empty += 1
    return empty, len(label_paths) - empty


train_labels = glob(os.path.join(train_label_folder, '*'))
val_labels = glob(os.path.join(val_label_folder, '*'))

non_school_train, school_train = _count_labels(train_labels)
non_school_val, school_val = _count_labels(val_labels)

train_total = len(os.listdir(train_image_folder))
val_total = len(os.listdir(val_image_folder))


####################   print   ####################
print("Total images in train :", train_total)
print("Total images in val :", val_total, '\n')

print('Train:')
print('school :', school_train)
print('non_school :', non_school_train, '\n')

print('Val:')
print('school :', school_val)
print('non_school :', non_school_val)

# 2. **YOLOv5**

## 2.1. Augmentations

In [None]:
# Load the YOLOv5 hyperparameter file, override a few augmentation settings
# and write the result back to the SAME file (the stock file is overwritten).
yolov5_folder = "yolov5"

# Absolute path: hyp_dir is later passed to train.py after `%cd yolov5`,
# where a path relative to the notebook directory would no longer resolve.
hyp_dir = os.path.abspath(
    os.path.join(yolov5_folder, "data/hyps/hyp.scratch-med.yaml")
)

with open(hyp_dir, "r") as stream:
    config = yaml.safe_load(stream)

# Augmentation overrides; every other hyperparameter keeps its loaded value.
config.update(
    mosaic=0.0,
    scale=0.0,
    flipud=0.5,
    degrees=0.5,
    mixup=0.0,
)

# write yaml file
with open(hyp_dir, 'w') as outfile:
    yaml.dump(config, outfile, default_flow_style=True)

## 2.2. Create yaml file

In [None]:
# Alias for the training dataset root used by the YOLOv5 cells below.
DATASET_FOLDER = dataset_root
# Show the contents of the dataset root (displayed as the cell output).
os.listdir(DATASET_FOLDER)

In [None]:
# Build the dataset description file (data.yaml) consumed by YOLOv5.
# All paths are made absolute: train.py / val.py are launched after
# `%cd yolov5`, where paths relative to the notebook directory would
# no longer point at the dataset.
dataset_abs = os.path.abspath(DATASET_FOLDER)

data_yaml = dict(
    train=os.path.join(dataset_abs, 'train'),
    val=os.path.join(dataset_abs, 'val'),

    # single class: school
    nc=1,
    names=['school'],
)

yaml_dir = os.path.join(dataset_abs, 'data.yaml')

# write data.yaml file
with open(yaml_dir, 'w') as outfile:
    yaml.dump(data_yaml, outfile, default_flow_style=True)

## 2.3. Training

In [None]:
# Values passed to train.py as --img, --batch and --epochs.
IMG_SIZE = 256
BATCH_SIZE = 150
EPOCHS = 50

# Directory passed to train.py / val.py as --project; the weight files are
# later collected from inside it.
project_folder = '/home/username/Desktop/school_detection'

In [None]:
# Launch YOLOv5 training from inside the yolov5 folder, using the image size,
# batch size, epoch count, data yaml, project folder and hyperparameter file
# defined in the cells above.
# NOTE: {yaml_dir} and {hyp_dir} are interpolated as-is; because the working
# directory is yolov5/ at that point, relative paths are resolved from there.
%cd yolov5

!python train.py --img {IMG_SIZE} \
                 --batch {BATCH_SIZE} \
                 --epochs {EPOCHS} \
                 --data {yaml_dir} \
                 --weights yolov5l.pt \
                 --save-period 1 \
                 --project {project_folder} \
                 --hyp {hyp_dir}

# step back out of the yolov5 folder
%cd ..

## 2.4. Evaluation

In [None]:
# Weight file evaluated by val.py (--weights).
WEIGHT_FILE = 'my_weights_dir/best.pt'

# Values passed to val.py as --img and --batch-size.
IMG_SIZE = 256
BATCH_SIZE = 150

# Values passed to val.py as --conf-thres and --iou-thres.
CONF_THRESH = 0.5
IOU_THRESH = 0.2

In [None]:
# Run YOLOv5 val.py from inside the yolov5 folder on the "val" task, using the
# weight file, thresholds, image size and batch size defined in the cell above;
# outputs go to {project_folder} under the run name "infer_outputs".
# NOTE: {yaml_dir} and {WEIGHT_FILE} are interpolated as-is; because the working
# directory is yolov5/ at that point, relative paths are resolved from there.
%cd yolov5

!python val.py --img {IMG_SIZE} \
                 --batch-size {BATCH_SIZE} \
                 --conf-thres {CONF_THRESH} \
                 --data {yaml_dir} \
                 --weights {WEIGHT_FILE} \
                 --iou-thres {IOU_THRESH} \
                 --task val \
                 --project {project_folder} \
                 --save-txt \
                 --save-conf \
                 --name infer_outputs

# step back out of the yolov5 folder
%cd ..

## 2.5. Move training output files

In [None]:
# Collect the weight files of a training run:
#   1. move everything from <project>/<run>/weights to <project>/all_trained_weights
#   2. copy best.pt and last.pt from there to <project>/best_last_weights

# Name of the run folder inside project_folder that holds the weights.
# NOTE(review): presumably later training runs get a different folder name
# (e.g. exp2, exp3) -- adjust run_name to the run you want to collect.
run_name = 'exp'
weights_folder = os.path.join(project_folder, run_name, 'weights')

dest_folder = os.path.join(project_folder, 'all_trained_weights')
os.makedirs(dest_folder, exist_ok=True)

# move weight files from the run's weights folder to a separate folder
for item in os.listdir(weights_folder):
    shutil.move(os.path.join(weights_folder, item),
                os.path.join(dest_folder, item))

# copy best.pt and last.pt weight files to their own folder
weights = ['best.pt', 'last.pt']

best_last_folder = os.path.join(project_folder, 'best_last_weights')
os.makedirs(best_last_folder, exist_ok=True)

for w in weights:
    shutil.copy(os.path.join(dest_folder, w),
                os.path.join(best_last_folder, w))

- **best.pt** and **last.pt** can be found in the **"best_last_weights"** folder inside the project folder (the path stored in the `best_last_folder` variable).