# Statistics of training dataset

In [2]:
import os
import cv2
import numpy as np
import pandas as pd
import yaml

# YAML config whose 'train', 'val' and 'weights' entries are read by the cells below.
config_fp = '/data/wangjiazhi/projs/yolov8_0512/configs/face_qrcode2.yaml'
with open(config_fp, 'r') as f:
    # NOTE(review): FullLoader can build more than plain YAML scalars/lists/dicts.
    # yaml.safe_load is presumably sufficient for this config — confirm before switching.
    config = yaml.load(f, Loader=yaml.FullLoader)
config  # bare expression: the notebook echoes the parsed config as the cell output

{'train': ['/mnt/pai-storage-12/data/Facedetect_data/yolo-face_data/widerface/train',
  '/mnt/pai-storage-12/data/Facedetect_data/yolo-face_data/MAFA/train',
  '/mnt/pai-storage-12/data/Facedetect_data/yolo-face_data/Celeba/train',
  '/mnt/pai-storage-12/data/Facedetect_data/yolo-face_data/shangchao_data/train',
  '/mnt/pai-storage-12/data/Facedetect_data/yolo-face_data/Uface/train',
  '/mnt/pai-storage-12/data/Facedetect_data/yolo-face_data/Uface_register/train',
  '/mnt/pai-storage-12/data/Facedetect_data/yolo-face_data/hand/train',
  '/mnt/pai-storage-12/data/Facedetect_data/yolo-face_data/background/train',
  '/mnt/pai-storage-12/data/Facedetect_data/yolo-face_data/animals/train',
  '/mnt/pai-storage-12/data/Facedetect_data/yolo-face_data/PandaEmoji/train',
  '/mnt/pai-storage-12/data/qrcode_data/qrcode/public_1/train',
  '/mnt/pai-storage-12/data/qrcode_data/qrcode/public_2/train',
  '/mnt/pai-storage-12/data/qrcode_data/qrcode/synthetise/train',
  '/mnt/pai-storage-12/data/qrcode

In [3]:
# config['train'] and config['weights'] are parallel lists: the i-th weight
# belongs to the i-th training directory. zip() silently drops unmatched
# entries, so check the lengths before pairing them up.
if len(config['train']) != len(config['weights']):
    raise ValueError(f"{len(config['train'])} != {len(config['weights'])}")
train_dir_2_weights = dict(zip(config['train'], config['weights']))
train_dir_2_weights  # bare expression: echo the mapping as the cell output

{'/mnt/pai-storage-12/data/Facedetect_data/yolo-face_data/widerface/train': 5,
 '/mnt/pai-storage-12/data/Facedetect_data/yolo-face_data/MAFA/train': 0.5,
 '/mnt/pai-storage-12/data/Facedetect_data/yolo-face_data/Celeba/train': 0.5,
 '/mnt/pai-storage-12/data/Facedetect_data/yolo-face_data/shangchao_data/train': 0.2,
 '/mnt/pai-storage-12/data/Facedetect_data/yolo-face_data/Uface/train': 2,
 '/mnt/pai-storage-12/data/Facedetect_data/yolo-face_data/Uface_register/train': 2,
 '/mnt/pai-storage-12/data/Facedetect_data/yolo-face_data/hand/train': 3,
 '/mnt/pai-storage-12/data/Facedetect_data/yolo-face_data/background/train': 1,
 '/mnt/pai-storage-12/data/Facedetect_data/yolo-face_data/animals/train': 2,
 '/mnt/pai-storage-12/data/Facedetect_data/yolo-face_data/PandaEmoji/train': 2,
 '/mnt/pai-storage-12/data/qrcode_data/qrcode/public_1/train': 1,
 '/mnt/pai-storage-12/data/qrcode_data/qrcode/public_2/train': 1,
 '/mnt/pai-storage-12/data/qrcode_data/qrcode/synthetise/train': 0.1,
 '/mnt/pa

In [4]:
# Dataset directories and per-directory weights as listed in the config.
train_dirs = config['train']
val_dirs = config['val']
train_weights = config['weights']
# Every training directory needs exactly one weight. Raise explicitly instead
# of asserting so the check still runs when Python is started with -O.
if len(train_dirs) != len(train_weights):
    raise ValueError(f"{len(train_dirs)} != {len(train_weights)}")

import os
from pathlib import Path

def _label_class(line):
    """Return 0 (face) or 1 (qrcode) for a label line, or None if it is neither.

    The class id is the first whitespace-separated field. It is compared
    numerically so that both "0" and "0.0" count as class 0, while ids such as
    "10" or a line that starts with a coordinate ("0.45 ...") are rejected.
    """
    fields = line.split()
    try:
        class_value = float(fields[0])
    except (IndexError, ValueError):
        return None
    if class_value == 0:
        return 0
    if class_value == 1:
        return 1
    return None


def analyze_directory(directory):
    """Count images, label files and bounding boxes under one dataset directory.

    The directory is expected to contain an ``images`` and a ``labels``
    sub-directory; both are walked recursively. Files with an unexpected
    extension are reported on stdout and excluded from the counts.

    Args:
        directory: Root of the dataset split (the parent of ``images``/``labels``).

    Returns:
        dict with the keys:
            ``img_count``: files under ``images`` ending in .jpg/.jpeg/.png.
            ``txt_count``: ``.txt`` files under ``labels``.
            ``empty_label_count``: label files without any non-blank line,
                i.e. negative samples (images with no target).
            ``face_bbox_count``: label lines whose class id is 0.
            ``qrcode_bbox_count``: label lines whose class id is 1.
    """
    img_root = os.path.join(directory, 'images')
    label_root = os.path.join(directory, 'labels')

    ### count image files
    # Only these extensions count as images; anything else is reported below.
    image_extensions = {'.jpg', '.png', '.jpeg'}
    image_count = 0
    total_count = 0
    other_extensions = set()

    for _, _, files in os.walk(img_root):
        for file in files:
            total_count += 1
            ext = os.path.splitext(file)[1].lower()
            if ext in image_extensions:
                image_count += 1
            else:
                other_extensions.add(ext)

    if total_count != image_count:
        # Report the number of stray files, not the number of distinct extensions.
        print(f"Total files: {total_count}, Image files: {image_count}, Other files: {total_count - image_count}")
        print(f"Other extensions: {other_extensions}")

    ### count txt files
    txt_count = 0
    total_count = 0
    other_extensions = set()
    empty_label_count = 0  # negative samples: label file present but no target
    face_bbox_count = 0
    qrcode_bbox_count = 0
    for root, _, files in os.walk(label_root):
        for file in files:
            total_count += 1
            ext = os.path.splitext(file)[1].lower()
            if ext != '.txt':
                # Do not try to parse stray (possibly binary) files as labels.
                other_extensions.add(ext)
                continue
            txt_count += 1

            with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
                # Drop blank lines so a whitespace-only file counts as empty.
                lines = [line.strip() for line in f if line.strip()]
            if not lines:
                empty_label_count += 1
                continue

            for line in lines:
                class_id = _label_class(line)
                if class_id == 0:
                    face_bbox_count += 1
                elif class_id == 1:
                    qrcode_bbox_count += 1
                else:
                    print(f"Invalid line: {line}")

    if total_count != txt_count:
        print(f"Total files: {total_count}, Txt files: {txt_count}, Other files: {total_count - txt_count}")
        print(f"Other extensions: {other_extensions}")

    return {
        'img_count': image_count,
        'txt_count': txt_count,
        'empty_label_count': empty_label_count,
        'face_bbox_count': face_bbox_count,
        'qrcode_bbox_count': qrcode_bbox_count
    }


# Per-directory statistics keyed by training directory path; later cells
# aggregate over this dict.
results = {}
for train_dir in train_dirs:
    print(train_dir)
    stats = analyze_directory(train_dir)
    # Image files minus label files — presumably the number of images that have
    # no label file at all (assumes every label file pairs with one image; the
    # value goes negative if there are more label files than images).
    stats['none_label_count'] = stats['img_count'] - stats['txt_count']
    print(stats)
    results[train_dir] = stats

/mnt/pai-storage-12/data/Facedetect_data/yolo-face_data/widerface/train
{'img_count': 12682, 'txt_count': 12682, 'empty_label_count': 0, 'face_bbox_count': 81039, 'qrcode_bbox_count': 0, 'none_label_count': 0}
/mnt/pai-storage-12/data/Facedetect_data/yolo-face_data/MAFA/train
{'img_count': 17789, 'txt_count': 17789, 'empty_label_count': 0, 'face_bbox_count': 27599, 'qrcode_bbox_count': 0, 'none_label_count': 0}
/mnt/pai-storage-12/data/Facedetect_data/yolo-face_data/Celeba/train
{'img_count': 39095, 'txt_count': 39095, 'empty_label_count': 0, 'face_bbox_count': 39095, 'qrcode_bbox_count': 0, 'none_label_count': 0}
/mnt/pai-storage-12/data/Facedetect_data/yolo-face_data/shangchao_data/train
{'img_count': 94737, 'txt_count': 94737, 'empty_label_count': 0, 'face_bbox_count': 179581, 'qrcode_bbox_count': 0, 'none_label_count': 0}
/mnt/pai-storage-12/data/Facedetect_data/yolo-face_data/Uface/train
{'img_count': 50656, 'txt_count': 50656, 'empty_label_count': 0, 'face_bbox_count': 50656, 'qr

In [5]:
# Grand totals across every training directory.
total_img_count = sum(dir_stats['img_count'] for dir_stats in results.values())
total_txt_count = sum(dir_stats['txt_count'] for dir_stats in results.values())
print(f"total_img_count: {total_img_count}, total_txt_count: {total_txt_count}")

total_img_count: 361589, total_txt_count: 344118


## Bbox count
### without weight

In [None]:
# Aggregate image / bbox counts over all training directories, unweighted.
face_img_count = 0
qrcode_img_count = 0
face_box_count = 0
qrcode_box_count = 0
background_img_count = 0

for train_dir, stats in results.items():
    is_face_dir = 'Facedetect' in train_dir
    is_qrcode_dir = 'qrcode' in train_dir
    # A directory must belong to exactly one dataset family: matching both
    # markers or neither is an error.
    if is_face_dir == is_qrcode_dir:
        raise ValueError(f"Unexpected train_dir: {train_dir}")

    if is_face_dir:
        face_img_count += stats['img_count']
    else:
        qrcode_img_count += stats['img_count']

    face_box_count += stats['face_bbox_count']
    qrcode_box_count += stats['qrcode_bbox_count']
    # Background images = images without a label file + images with an empty label file.
    background_img_count += stats['none_label_count'] + stats['empty_label_count']

print(f"face_img_count: {face_img_count}, qrcode_img_count: {qrcode_img_count}, face_box_count: {face_box_count}, qrcode_box_count: {qrcode_box_count}, background_img_count: {background_img_count}")
total_bbox_count = face_box_count + qrcode_box_count
if total_bbox_count:
    print(f"Face bbox ratio: {face_box_count / total_bbox_count:.2%}, Qrcode bbox ratio: {qrcode_box_count / total_bbox_count:.2%}")
else:
    # Avoid ZeroDivisionError when no boxes were counted at all.
    print("No bounding boxes found; bbox ratios are undefined.")

### with weight

In [5]:
# Hand-written per-directory weight tables.
# NOTE(review): presumably alternative weighting schemes kept for comparison.
# The weighted bbox-count cell below reads `train_dir_2_weights` (built from the
# config), not these two dicts — confirm which table is meant to be used.
train_dir_2_weights_0 = {
    '/mnt/pai-storage-12/data/Facedetect_data/yolo-face_data/widerface/train': 2,
    '/mnt/pai-storage-12/data/Facedetect_data/yolo-face_data/MAFA/train': 1,
    '/mnt/pai-storage-12/data/Facedetect_data/yolo-face_data/Celeba/train': 1,
    '/mnt/pai-storage-12/data/Facedetect_data/yolo-face_data/shangchao_data/train': 1,
    '/mnt/pai-storage-12/data/Facedetect_data/yolo-face_data/Uface/train': 5,
    '/mnt/pai-storage-12/data/Facedetect_data/yolo-face_data/Uface_register/train': 2,
    '/mnt/pai-storage-12/data/Facedetect_data/yolo-face_data/hand/train': 3,
    '/mnt/pai-storage-12/data/Facedetect_data/yolo-face_data/background/train': 0.5,
    '/mnt/pai-storage-12/data/Facedetect_data/yolo-face_data/animals/train': 2,  # 5778
    '/mnt/pai-storage-12/data/qrcode_data/qrcode/public_1/train': 1,
    '/mnt/pai-storage-12/data/qrcode_data/qrcode/public_2/train': 1,
    '/mnt/pai-storage-12/data/qrcode_data/qrcode/synthetise/train': 0.2,
    '/mnt/pai-storage-12/data/qrcode_data/qrcode/250508/train': 1,
    '/mnt/pai-storage-12/data/qrcode_data/qrcode/250512/train': 1,
    '/mnt/pai-storage-12/data/Facedetect_data/yolo-face_data/PandaEmoji/train': 1,
}
# NOTE(review): this table has no entries for the qrcode/250508 and
# qrcode/250512 directories that appear in train_dir_2_weights_0.
train_dir_2_weights_old = {
    '/mnt/pai-storage-12/data/Facedetect_data/yolo-face_data/widerface/train': 8.0,
    '/mnt/pai-storage-12/data/Facedetect_data/yolo-face_data/MAFA/train': 0.56,
    '/mnt/pai-storage-12/data/Facedetect_data/yolo-face_data/Celeba/train': 0.25,
    '/mnt/pai-storage-12/data/Facedetect_data/yolo-face_data/shangchao_data/train': 0.2,
    '/mnt/pai-storage-12/data/Facedetect_data/yolo-face_data/Uface/train': 1.0,
    '/mnt/pai-storage-12/data/Facedetect_data/yolo-face_data/Uface_register/train': 0.28,
    '/mnt/pai-storage-12/data/Facedetect_data/yolo-face_data/hand/train': 3.0,
    '/mnt/pai-storage-12/data/Facedetect_data/yolo-face_data/background/train': 1.0,
    '/mnt/pai-storage-12/data/Facedetect_data/yolo-face_data/animals/train': 2.0,
    '/mnt/pai-storage-12/data/qrcode_data/qrcode/public_1/train': 2.0,
    '/mnt/pai-storage-12/data/qrcode_data/qrcode/public_2/train': 2.0,
    '/mnt/pai-storage-12/data/qrcode_data/qrcode/synthetise/train': 0.37,
    '/mnt/pai-storage-12/data/Facedetect_data/yolo-face_data/PandaEmoji/train': 2.0,
}

In [None]:
# Aggregate image / bbox counts over all training directories, with every
# directory's counts scaled by its weight from train_dir_2_weights.
face_img_count = 0
qrcode_img_count = 0
face_box_count = 0
qrcode_box_count = 0
background_img_count = 0

for train_dir, stats in results.items():
    weight = train_dir_2_weights[train_dir]
    is_face_dir = 'Facedetect' in train_dir
    is_qrcode_dir = 'qrcode' in train_dir
    # A directory must belong to exactly one dataset family: matching both
    # markers or neither is an error.
    if is_face_dir == is_qrcode_dir:
        raise ValueError(f"Unexpected train_dir: {train_dir}")

    if is_face_dir:
        face_img_count += stats['img_count'] * weight
    else:
        qrcode_img_count += stats['img_count'] * weight

    face_box_count += stats['face_bbox_count'] * weight
    qrcode_box_count += stats['qrcode_bbox_count'] * weight
    # Background images = images without a label file + images with an empty label file.
    background_img_count += (stats['none_label_count'] + stats['empty_label_count']) * weight

print(f"face_img_count: {face_img_count}, qrcode_img_count: {qrcode_img_count}, face_box_count: {face_box_count}, qrcode_box_count: {qrcode_box_count}, background_img_count: {background_img_count}")
total_bbox_count = face_box_count + qrcode_box_count
if total_bbox_count:
    print(f"Face bbox ratio: {face_box_count / total_bbox_count:.2%}, Qrcode bbox ratio: {qrcode_box_count / total_bbox_count:.2%}")
else:
    # Avoid ZeroDivisionError when no (weighted) boxes were counted at all.
    print("No bounding boxes found; bbox ratios are undefined.")

# Visualize

In [None]:
# for each img_dir, sample 10 images and display them
import matplotlib.pyplot as plt
import os
import random

def get_bbox(img_path):
    """Load the label boxes of an image and convert them to pixel corners.

    The label file is located by swapping the last ``images`` directory in the
    path for ``labels`` and the file extension for ``.txt``. Each non-blank
    label line must read ``class_id center_x center_y width height`` with the
    four coordinates normalised to the image size.

    Args:
        img_path: Path to the image file.

    Returns:
        dict mapping ``(x1, y1, x2, y2)`` pixel corners (clamped to the image
        bounds) to the integer class id. Empty when the label file is missing
        or has no boxes, or when the image cannot be read.

    Raises:
        ValueError: If a non-blank label line does not have exactly five
            fields or contains a non-numeric field.
    """
    stem = os.path.splitext(img_path)[0]
    # Swap only the last '/images/' directory component so an 'images'
    # substring in a file name or a parent directory is left untouched.
    head, found, tail = stem.rpartition(f"{os.sep}images{os.sep}")
    if found:
        label_path = f"{head}{os.sep}labels{os.sep}{tail}.txt"
    else:
        # No such component (e.g. a relative path): fall back to a plain
        # substring replacement.
        label_path = stem.replace('images', 'labels') + '.txt'
    if not os.path.exists(label_path):
        return {}

    with open(label_path, 'r') as f:
        # Skip blank lines; they carry no box.
        rows = [line.split() for line in f if line.strip()]
    if not rows:
        return {}

    # Read the image once (not once per label line) to get its size.
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread returns None for unreadable files; there is no size to scale by.
        return {}
    img_h, img_w = img.shape[:2]

    # each row is like "0 0.4524590163934426 0.4192139737991266 0.2754098360655738 0.23580786026200873"
    # class_id center_x center_y width height
    bboxes = {}  # {(x1, y1, x2, y2): class_id}
    for row in rows:
        if len(row) != 5:
            raise ValueError(f"Expected 5 fields in {label_path}, got {len(row)}: {' '.join(row)}")
        x_center, y_center, width, height = map(float, row[1:])
        x1 = max(0, (x_center - width / 2) * img_w)
        y1 = max(0, (y_center - height / 2) * img_h)
        x2 = min(img_w, (x_center + width / 2) * img_w)
        y2 = min(img_h, (y_center + height / 2) * img_h)
        bboxes[(x1, y1, x2, y2)] = int(row[0])
    return bboxes

# Colour tuples in RGB order: sample_images draws the boxes after converting
# the image from BGR to RGB.
red = (255, 0, 0)
green = (0, 255, 0)
blue = (0, 0, 255)
# Box colour per class id.
color_map = {0: green, 1: blue}  # 0 is face, 1 is qrcode
def sample_images(image_paths, display_rows=2, display_cols=5):
    """Draw a random sample of images, with their label boxes, on a grid.

    Up to ``display_rows * display_cols`` paths are sampled without
    replacement. When fewer paths are available, the remaining grid cells are
    left blank. A cell whose image or labels cannot be processed shows an
    error message instead of aborting the whole figure.

    Args:
        image_paths: Sequence of image file paths to sample from.
        display_rows: Number of grid rows.
        display_cols: Number of grid columns.

    Returns:
        The matplotlib figure holding the grid.
    """
    cell_count = display_rows * display_cols
    # random.sample raises ValueError when asked for more items than exist.
    sample_image_paths = random.sample(image_paths, min(cell_count, len(image_paths)))

    # squeeze=False keeps `axes` a 2-D array even for a 1x1 grid, so flatten() always works.
    fig, axes = plt.subplots(display_rows, display_cols,
                             figsize=(display_cols*3, display_rows*3), squeeze=False)
    axes = axes.flatten()

    for ax, img_path in zip(axes, sample_image_paths):
        img_path = img_path.strip()
        try:
            # Inside the try so one malformed label file only affects its own cell.
            bboxes = get_bbox(img_path)
            img = cv2.imread(img_path)
            if img is None:
                ax.text(0.5, 0.5, f"Failed to load\n{os.path.basename(img_path)}",
                        ha='center', va='center', color='red')
            else:
                # Convert BGR to RGB for proper display, then draw the boxes
                # before the single imshow call.
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                for bbox, class_id in bboxes.items():
                    # Classes missing from color_map are drawn in red rather than raising KeyError.
                    color = color_map.get(class_id, red)
                    cv2.rectangle(img, (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3])), color, 3)
                ax.imshow(img)
        except Exception as e:
            # Best-effort preview: report the problem in the cell and carry on.
            ax.text(0.5, 0.5, f"Error: {str(e)}", ha='center', va='center', color='red')
        ax.axis('off')

    # Blank out the cells that received no image.
    for ax in axes[len(sample_image_paths):]:
        ax.axis('off')

    return fig


# One preview grid per image directory, titled with the directory path.
# NOTE(review): `dt_val` is not defined in the visible part of this notebook —
# presumably a dict mapping an image directory to its list of image paths; confirm.
for img_dir, img_paths in dt_val.items():
    fig = sample_images(img_paths)
    fig.suptitle(img_dir, fontsize=10)
    fig.subplots_adjust(top=0.9, wspace=0.1, hspace=0.1)