In [1]:
import os
import shutil

import imagesize
import numpy as np
import pandas as pd
import yaml
from ultralytics import YOLO

In [2]:
# Class id -> display name used to decode the label files; a name's position
# in the list below is its class id.
class_names = dict(enumerate([
    'Person',
    'Car',
    'Truck',
    'UAV',
    'Aircraft',
    'Ship',
]))

In [3]:
def inspect_dataset(
        target_dataset_root,  # ../datasets/new_dataset
        target_dataset_slice,  # train,test,val
):
    """Load every label row of one dataset split into a per-object DataFrame.

    Each ``.txt`` file in ``<root>/labels/<slice>`` is read line by line and
    paired with ``<root>/images/<slice>/<stem>.jpg``, whose pixel size is
    obtained with ``imagesize.get``.

    Args:
        target_dataset_root: Dataset root containing ``images/`` and ``labels/``.
        target_dataset_slice: Name of the split sub-folder (e.g. ``'val'``).

    Returns:
        pandas.DataFrame with one row per label line and the columns
        ``new_class_id, xcn, ycn, wn, hn, img, img_w, img_h, image_path,
        label_path, class_name``. ``class_name`` is looked up in the
        notebook-level ``class_names`` mapping.

    Notes:
        * Every non-blank label line must hold exactly five whitespace-separated
          fields (class id followed by four box values).
        * ``img`` is cast to ``int64``, so label file stems must be integers.
        * Only ``.jpg`` images are looked up.
    """
    columns = ['new_class_id', 'xcn', 'ycn', 'wn', 'hn', 'img', 'img_w', 'img_h', 'image_path', 'label_path']
    dtypes = {'new_class_id': 'int32', 'xcn': 'float32', 'ycn': 'float32', 'wn': 'float32', 'hn': 'float32',
              'img': 'int64', 'img_w': 'float32', 'img_h': 'float32', 'image_path': 'string',
              'label_path': 'string'}

    target_labels_dir = os.path.join(target_dataset_root, 'labels', target_dataset_slice)
    target_images_dir = os.path.join(target_dataset_root, 'images', target_dataset_slice)

    info = []  # one list per instance: [cls_id, x, y, w, h, img, img_w, img_h, image_path, label_path]

    # Iterate over all files in the dataset labels folder
    for filename in os.listdir(target_labels_dir):
        if not filename.endswith('.txt'):
            continue

        img_name = os.path.splitext(filename)[0]
        label_path = os.path.join(target_labels_dir, filename)
        img_path = os.path.join(target_images_dir, img_name + '.jpg')

        with open(label_path, "r") as f:
            # Blank lines carry no instance; keeping them would yield short rows
            # that break the DataFrame construction below.
            instances = [line.split() for line in f if line.strip()]

        # A label file without instances contributes no rows, so its image is
        # never opened (same as before).
        if not instances:
            continue

        # The image size is the same for every instance of the file: read it once
        # per image instead of once per label line.
        img_w, img_h = imagesize.get(img_path)
        for fields in instances:
            info.append(fields + [img_name, img_w, img_h, img_path, label_path])

    df = pd.DataFrame(info, columns=columns)
    df = df.astype(dtypes)
    df['class_name'] = df['new_class_id'].map(class_names)
    return df

In [4]:
# Build the per-object table for the validation split.
df_val = inspect_dataset(
    target_dataset_root='/Users/johnny/Projects/datasets/custom_dataset_v2/',
    target_dataset_slice='val',
)

In [5]:
# Show the per-object table built from the 'val' labels (one row per label line).
df_val

Unnamed: 0,new_class_id,xcn,ycn,wn,hn,img,img_w,img_h,image_path,label_path,class_name
0,0,0.104688,0.515258,0.112500,0.152582,11303,640.0,426.0,/Users/johnny/Projects/datasets/custom_dataset...,/Users/johnny/Projects/datasets/custom_dataset...,Person
1,0,0.741406,0.524648,0.098438,0.171362,11303,640.0,426.0,/Users/johnny/Projects/datasets/custom_dataset...,/Users/johnny/Projects/datasets/custom_dataset...,Person
2,0,0.459375,0.311033,0.106250,0.265258,11303,640.0,426.0,/Users/johnny/Projects/datasets/custom_dataset...,/Users/johnny/Projects/datasets/custom_dataset...,Person
3,1,0.113008,0.516769,0.038066,0.033807,3644,1624.0,1200.0,/Users/johnny/Projects/datasets/custom_dataset...,/Users/johnny/Projects/datasets/custom_dataset...,Car
4,3,0.178125,0.514583,0.142188,0.129167,5235,1280.0,720.0,/Users/johnny/Projects/datasets/custom_dataset...,/Users/johnny/Projects/datasets/custom_dataset...,UAV
...,...,...,...,...,...,...,...,...,...,...,...
95016,1,0.406055,0.520774,0.016188,0.020995,4132,1624.0,1200.0,/Users/johnny/Projects/datasets/custom_dataset...,/Users/johnny/Projects/datasets/custom_dataset...,Car
95017,1,0.440119,0.525339,0.023608,0.026472,4132,1624.0,1200.0,/Users/johnny/Projects/datasets/custom_dataset...,/Users/johnny/Projects/datasets/custom_dataset...,Car
95018,0,0.455295,0.534467,0.012141,0.048380,4132,1624.0,1200.0,/Users/johnny/Projects/datasets/custom_dataset...,/Users/johnny/Projects/datasets/custom_dataset...,Person
95019,1,0.558832,0.561395,0.087012,0.082156,4132,1624.0,1200.0,/Users/johnny/Projects/datasets/custom_dataset...,/Users/johnny/Projects/datasets/custom_dataset...,Car


In [6]:
# Rows are objects; distinct 'img' values are images.
print(f"The number of objects is {df_val.shape[0]}")
print(f"The number of images is {df_val['img'].nunique()}")

The number of objects is 95021
The number of images is 13205


In [7]:
# Area thresholds are squared side lengths; with right=False each bucket is the
# left-closed interval [edge_i, edge_i+1).
bin_labels = ['Tiny', 'Small', 'Medium', 'Large']
bin_edges = [0, 16**2, 32**2, 96**2, float('inf')]

# Box width/height scaled by the image size (pixels, assuming wn/hn are
# normalised to the image dimensions).
bbox_w_scaled = df_val['wn'] * df_val['img_w']
bbox_h_scaled = df_val['hn'] * df_val['img_h']
df_val['bbox_area'] = bbox_w_scaled * bbox_h_scaled

df_val['bbox_size_category'] = pd.cut(
    df_val['bbox_area'],
    bins=bin_edges,
    labels=bin_labels,
    right=False,
)

In [8]:
# Re-display the table, now including 'bbox_area' and 'bbox_size_category'.
df_val

Unnamed: 0,new_class_id,xcn,ycn,wn,hn,img,img_w,img_h,image_path,label_path,class_name,bbox_area,bbox_size_category
0,0,0.104688,0.515258,0.112500,0.152582,11303,640.0,426.0,/Users/johnny/Projects/datasets/custom_dataset...,/Users/johnny/Projects/datasets/custom_dataset...,Person,4679.995117,Medium
1,0,0.741406,0.524648,0.098438,0.171362,11303,640.0,426.0,/Users/johnny/Projects/datasets/custom_dataset...,/Users/johnny/Projects/datasets/custom_dataset...,Person,4599.013672,Medium
2,0,0.459375,0.311033,0.106250,0.265258,11303,640.0,426.0,/Users/johnny/Projects/datasets/custom_dataset...,/Users/johnny/Projects/datasets/custom_dataset...,Person,7683.994141,Medium
3,1,0.113008,0.516769,0.038066,0.033807,3644,1624.0,1200.0,/Users/johnny/Projects/datasets/custom_dataset...,/Users/johnny/Projects/datasets/custom_dataset...,Car,2507.913330,Medium
4,3,0.178125,0.514583,0.142188,0.129167,5235,1280.0,720.0,/Users/johnny/Projects/datasets/custom_dataset...,/Users/johnny/Projects/datasets/custom_dataset...,UAV,16926.000000,Large
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95016,1,0.406055,0.520774,0.016188,0.020995,4132,1624.0,1200.0,/Users/johnny/Projects/datasets/custom_dataset...,/Users/johnny/Projects/datasets/custom_dataset...,Car,662.350647,Small
95017,1,0.440119,0.525339,0.023608,0.026472,4132,1624.0,1200.0,/Users/johnny/Projects/datasets/custom_dataset...,/Users/johnny/Projects/datasets/custom_dataset...,Car,1217.913086,Medium
95018,0,0.455295,0.534467,0.012141,0.048380,4132,1624.0,1200.0,/Users/johnny/Projects/datasets/custom_dataset...,/Users/johnny/Projects/datasets/custom_dataset...,Person,1144.717529,Medium
95019,1,0.558832,0.561395,0.087012,0.082156,4132,1624.0,1200.0,/Users/johnny/Projects/datasets/custom_dataset...,/Users/johnny/Projects/datasets/custom_dataset...,Car,13930.977539,Large


In [9]:
# Number of objects in each bounding-box size bucket over the whole 'val' table.
df_val['bbox_size_category'].value_counts()

bbox_size_category
Medium    35772
Small     21698
Large     20803
Tiny      16748
Name: count, dtype: int64

# 1) Filter by Image Size (≥ 1920x1080)

In [10]:
# Keep only objects whose image is at least 1920 wide and 1080 high.
is_wide_enough = df_val['img_w'].ge(1920)
is_tall_enough = df_val['img_h'].ge(1080)
df_size = df_val[is_wide_enough & is_tall_enough]

In [11]:
# Objects that belong to images of at least 1920x1080.
df_size

Unnamed: 0,new_class_id,xcn,ycn,wn,hn,img,img_w,img_h,image_path,label_path,class_name,bbox_area,bbox_size_category
23,3,0.398438,0.616204,0.012500,0.008333,7422,1920.0,1080.0,/Users/johnny/Projects/datasets/custom_dataset...,/Users/johnny/Projects/datasets/custom_dataset...,UAV,216.000000,Tiny
82,3,0.360156,0.598148,0.011979,0.005556,7344,1920.0,1080.0,/Users/johnny/Projects/datasets/custom_dataset...,/Users/johnny/Projects/datasets/custom_dataset...,UAV,138.000000,Tiny
83,1,0.486807,0.891343,0.008396,0.014769,7344,1920.0,1080.0,/Users/johnny/Projects/datasets/custom_dataset...,/Users/johnny/Projects/datasets/custom_dataset...,Car,257.113983,Small
84,0,0.380125,0.944889,0.008495,0.026176,7344,1920.0,1080.0,/Users/johnny/Projects/datasets/custom_dataset...,/Users/johnny/Projects/datasets/custom_dataset...,Person,461.083679,Small
85,0,0.916901,0.984667,0.007786,0.030657,7344,1920.0,1080.0,/Users/johnny/Projects/datasets/custom_dataset...,/Users/johnny/Projects/datasets/custom_dataset...,Person,494.994507,Small
...,...,...,...,...,...,...,...,...,...,...,...,...,...
94989,0,0.687198,0.270741,0.008251,0.018334,10004,3840.0,2160.0,/Users/johnny/Projects/datasets/custom_dataset...,/Users/johnny/Projects/datasets/custom_dataset...,Person,1254.673096,Medium
94990,0,0.699027,0.306811,0.025659,0.022881,10004,3840.0,2160.0,/Users/johnny/Projects/datasets/custom_dataset...,/Users/johnny/Projects/datasets/custom_dataset...,Person,4869.652832,Medium
94991,0,0.737579,0.260000,0.008491,0.016296,10004,3840.0,2160.0,/Users/johnny/Projects/datasets/custom_dataset...,/Users/johnny/Projects/datasets/custom_dataset...,Person,1147.660278,Medium
94992,0,0.324881,0.365857,0.023200,0.031715,10004,3840.0,2160.0,/Users/johnny/Projects/datasets/custom_dataset...,/Users/johnny/Projects/datasets/custom_dataset...,Person,6102.750000,Medium


In [12]:
# Rows are objects; distinct 'img' values are images.
print(f"The number of objects is {df_size.shape[0]}")
print(f"The number of images is {df_size['img'].nunique()}")

The number of objects is 23156
The number of images is 3154


In [13]:
# Size-bucket distribution after the image-size filter.
df_size['bbox_size_category'].value_counts()

bbox_size_category
Small     7458
Medium    7212
Tiny      6449
Large     2037
Name: count, dtype: int64

In [14]:
# Per-class object counts after the image-size filter.
df_size['class_name'].value_counts()

class_name
Person      15897
Car          4584
UAV          2206
Ship          321
Truck         147
Aircraft        1
Name: count, dtype: int64

# 2) Filter by maximum number of objects present (≤ 100)

In [15]:
# Rows per image == objects per image; keep the images holding at most 100.
object_counts = df_size['img'].value_counts()
selected_images = object_counts.loc[object_counts.le(100)].index
keep_row = df_size['img'].isin(selected_images)
df_filtered = df_size.loc[keep_row]

In [16]:
# Objects remaining after dropping images with more than 100 objects.
df_filtered

Unnamed: 0,new_class_id,xcn,ycn,wn,hn,img,img_w,img_h,image_path,label_path,class_name,bbox_area,bbox_size_category
23,3,0.398438,0.616204,0.012500,0.008333,7422,1920.0,1080.0,/Users/johnny/Projects/datasets/custom_dataset...,/Users/johnny/Projects/datasets/custom_dataset...,UAV,216.000000,Tiny
82,3,0.360156,0.598148,0.011979,0.005556,7344,1920.0,1080.0,/Users/johnny/Projects/datasets/custom_dataset...,/Users/johnny/Projects/datasets/custom_dataset...,UAV,138.000000,Tiny
83,1,0.486807,0.891343,0.008396,0.014769,7344,1920.0,1080.0,/Users/johnny/Projects/datasets/custom_dataset...,/Users/johnny/Projects/datasets/custom_dataset...,Car,257.113983,Small
84,0,0.380125,0.944889,0.008495,0.026176,7344,1920.0,1080.0,/Users/johnny/Projects/datasets/custom_dataset...,/Users/johnny/Projects/datasets/custom_dataset...,Person,461.083679,Small
85,0,0.916901,0.984667,0.007786,0.030657,7344,1920.0,1080.0,/Users/johnny/Projects/datasets/custom_dataset...,/Users/johnny/Projects/datasets/custom_dataset...,Person,494.994507,Small
...,...,...,...,...,...,...,...,...,...,...,...,...,...
94989,0,0.687198,0.270741,0.008251,0.018334,10004,3840.0,2160.0,/Users/johnny/Projects/datasets/custom_dataset...,/Users/johnny/Projects/datasets/custom_dataset...,Person,1254.673096,Medium
94990,0,0.699027,0.306811,0.025659,0.022881,10004,3840.0,2160.0,/Users/johnny/Projects/datasets/custom_dataset...,/Users/johnny/Projects/datasets/custom_dataset...,Person,4869.652832,Medium
94991,0,0.737579,0.260000,0.008491,0.016296,10004,3840.0,2160.0,/Users/johnny/Projects/datasets/custom_dataset...,/Users/johnny/Projects/datasets/custom_dataset...,Person,1147.660278,Medium
94992,0,0.324881,0.365857,0.023200,0.031715,10004,3840.0,2160.0,/Users/johnny/Projects/datasets/custom_dataset...,/Users/johnny/Projects/datasets/custom_dataset...,Person,6102.750000,Medium


In [17]:
# Rows are objects; distinct 'img' values are images.
print(f"The number of objects is {df_filtered.shape[0]}")
print(f"The number of images is {df_filtered['img'].nunique()}")

The number of objects is 19455
The number of images is 3133


In [18]:
# Size-bucket distribution of the final selection.
df_filtered['bbox_size_category'].value_counts()

bbox_size_category
Medium    6689
Small     6556
Tiny      4190
Large     2020
Name: count, dtype: int64

In [19]:
# Export the selected images and their label files as a stand-alone dataset
# folder (images/ + labels/ + data.yaml). os, shutil and yaml come from the
# notebook's top import cell.
base_dir = '/Users/johnny/Projects/datasets/Client_Validation_Set'

subfolders = ['train', 'val', 'test']
folders = ['images', 'labels']
for folder in folders:
    for subfolder in subfolders:
        os.makedirs(os.path.join(base_dir, folder, subfolder), exist_ok=True)

# df_filtered holds one row per object, so the same image/label pair appears
# once per object; de-duplicate so every file is copied exactly once.
val_images_dir = os.path.join(base_dir, 'images', 'val')
val_labels_dir = os.path.join(base_dir, 'labels', 'val')
files_to_copy = df_filtered[['image_path', 'label_path']].drop_duplicates()
for image_path, label_path in files_to_copy.itertuples(index=False, name=None):
    shutil.copy(image_path, val_images_dir)
    shutil.copy(label_path, val_labels_dir)

yaml_data = {
    'path': '../small-fast-detector/inference_tools/Evaluation/datasets/Client_Validation_Set',  # PUT HERE THE PATH
    'train': 'images/train',
    'val': 'images/val',
    'test': '',
    # The label files are copied unchanged, so the id -> name table must be the
    # one this notebook uses to decode them (class_names: 0 = Person, 1 = Car, ...).
    # The previous hard-coded table (0 = uav, 1 = airplane, ...) named every class wrongly.
    'names': dict(class_names),
}
with open(os.path.join(base_dir, 'data.yaml'), 'w') as file:
    yaml.dump(yaml_data, file, default_flow_style=False)

print("Dataset organized and YAML file created.")

Dataset organized and YAML file created.
