In [3]:
import xml.etree.ElementTree as ET
import os

def convert(size, box):
    """Convert a pixel-space box into normalised centre/size form.

    Args:
        size: ``(width, height)`` of the image.
        box: ``(xmin, xmax, ymin, ymax)`` of the box, in the same units as
            ``size``.

    Returns:
        ``(x_center, y_center, box_width, box_height)``; the horizontal values
        are multiplied by ``1 / width`` and the vertical ones by
        ``1 / height``.
    """
    scale_x = 1. / size[0]
    scale_y = 1. / size[1]
    left, right, top, bottom = box[0], box[1], box[2], box[3]
    center_x = (left + right) / 2.0
    center_y = (top + bottom) / 2.0
    return (
        center_x * scale_x,
        center_y * scale_y,
        (right - left) * scale_x,
        (bottom - top) * scale_y,
    )

def convert_annotation(input_folder, output_folder, image_id):
    """Convert one XML annotation file into a YOLO-style label file.

    Reads ``<input_folder>/<image_id>.xml`` and writes
    ``<output_folder>/<image_id>.txt`` with one ``class x y w h`` line (six
    decimals) per usable ``object`` element. Class ids are looked up in the
    module-level ``yolo_classes`` mapping; objects whose name is missing or
    not in that mapping, or whose bounding box is incomplete, are skipped
    with a printed warning.

    The output file is created (possibly empty) whenever the XML parses, even
    if the ``size`` information is missing. If the XML cannot be parsed the
    exception propagates and no output file is created.

    Args:
        input_folder: Directory containing the XML file.
        output_folder: Existing directory that receives the ``.txt`` file.
        image_id: File name without extension.
    """
    xml_path = os.path.join(input_folder, '%s.xml' % image_id)
    txt_path = os.path.join(output_folder, '%s.txt' % image_id)

    # Parsing from the path lets ElementTree open the file in binary mode,
    # honour the encoding declared in the XML itself and close the handle,
    # instead of decoding with the platform default encoding.
    root = ET.parse(xml_path).getroot()

    # The context manager guarantees the label file is flushed and closed on
    # every exit path, including the early returns below.
    with open(txt_path, 'w') as out_file:
        size = root.find('size')
        if size is None:
            print(f"Warning: 'size' element not found in {image_id}.xml")
            return

        w_elem = size.find('width')
        h_elem = size.find('height')
        if w_elem is None or h_elem is None:
            print(f"Warning: 'width' or 'height' element not found in {image_id}.xml")
            return

        w = int(w_elem.text)
        h = int(h_elem.text)
        if w <= 0 or h <= 0:
            # A zero dimension would otherwise raise ZeroDivisionError in convert().
            print(f"Warning: non-positive image size {w}x{h} in {image_id}.xml")
            return

        for obj in root.iter('object'):
            name_elem = obj.find('name')
            if name_elem is None or name_elem.text not in yolo_classes:
                print(f"Warning: 'name' element not found or not recognized in {image_id}.xml")
                continue

            cls = int(yolo_classes[name_elem.text])

            xmlbox = obj.find('bndbox')
            if xmlbox is None:
                print(f"Warning: 'bndbox' element not found in {image_id}.xml for object {cls}.")
                continue

            xmin_elem = xmlbox.find('xmin')
            ymin_elem = xmlbox.find('ymin')
            xmax_elem = xmlbox.find('xmax')
            ymax_elem = xmlbox.find('ymax')

            if xmin_elem is None or ymin_elem is None or xmax_elem is None or ymax_elem is None:
                print(f"Warning: One or more bounding box coordinates not found in {image_id}.xml for object {cls}.")
                continue

            # convert() expects the box as (xmin, xmax, ymin, ymax).
            b = (float(xmin_elem.text), float(xmax_elem.text), float(ymin_elem.text), float(ymax_elem.text))
            bb = convert((w, h), b)
            out_file.write(f"{cls} {' '.join([f'{val:.6f}' for val in bb])}\n")
def process_xml_folder(input_folder, output_folder):
    """Convert every ``.xml`` file found under ``input_folder``.

    The folder is walked recursively and ``convert_annotation`` is called once
    per XML file. All label files are written directly into ``output_folder``,
    which is created if it does not exist.

    Args:
        input_folder: Root directory to search for XML annotations.
        output_folder: Directory that receives the generated ``.txt`` files.
    """
    os.makedirs(output_folder, exist_ok=True)

    for current_dir, _dirs, files in os.walk(input_folder):
        for file in files:
            if file.endswith(".xml"):
                image_id = os.path.splitext(file)[0]
                # Use the directory the file was actually found in: os.walk
                # descends into subfolders, and passing the top-level folder
                # made files in subfolders fail with FileNotFoundError.
                convert_annotation(current_dir, output_folder, image_id)

# Driver: convert the Japan annotations to YOLO label files.
# NOTE: yolo_classes is assigned at module level here and is read as a global
# by convert_annotation, so it must exist before any conversion runs.
if __name__ == "__main__":
    input_folder = "D:\\桌面\\GRDDC2020_train\\train\\Japan\\annotations\\xmls"  # folder containing the input XML files
    output_folder = "D:\\桌面\\GRDDC2020_train\\train\\Japan\\images"  # folder that receives the output .txt files

    # Mapping from annotation class name to the new YOLO class id.
    # Commented-out entries are classes that are currently not exported.
    yolo_classes = {
        # 'D01': 1,
        # 'D00': 1,
        # 'D10': 0,
        # 'D11': 1,
        'D20': 3,
        'D40': 4,
        # 'D43': 5,
        # 'D44': 6,
        # 'D50': 7
    }

    process_xml_folder(input_folder, output_folder)




In [8]:
import os
import shutil

def merge_yolo_labels(dataset1_path, dataset2_path, class_to_merge):
    """Append the labels of one class from dataset 2 to dataset 1, in place.

    ``dataset2_path`` is walked recursively. For every ``.txt`` file that has
    a file of the same name directly inside ``dataset1_path``, each line whose
    first field equals ``class_to_merge`` is appended to the dataset-1 file.
    Label files without a counterpart in dataset 1 are ignored, and
    ``classes.txt`` is skipped because it holds class names, not labels.

    Note: the merge is not idempotent; running it twice appends the same
    lines twice.

    Args:
        dataset1_path: Directory whose label files are extended.
        dataset2_path: Directory tree supplying the extra labels.
        class_to_merge: Integer class id to copy over.
    """
    for root, _dirs, files in os.walk(dataset2_path):
        for file in files:
            if not file.endswith('.txt') or file == 'classes.txt':
                continue

            label_file_path2 = os.path.join(root, file)
            image_name = os.path.splitext(file)[0]
            label_file_path1 = os.path.join(dataset1_path, f'{image_name}.txt')

            # Only merge into label files that already exist in dataset 1.
            if not os.path.exists(label_file_path1):
                continue

            # Collect the dataset-2 lines that belong to the requested class.
            merged_lines = []
            with open(label_file_path2, 'r') as f2:
                for line2 in f2:
                    fields = line2.split()
                    # Blank lines have no class id; indexing them used to
                    # raise IndexError.
                    if fields and int(fields[0]) == class_to_merge:
                        # Normalise the line ending so that a final line
                        # without a newline is still written as its own line.
                        merged_lines.append(line2.rstrip('\n') + '\n')

            if not merged_lines:
                continue

            with open(label_file_path1, 'r') as f1:
                existing = f1.read()

            # Without this, a dataset-1 file lacking a trailing newline would
            # have the first merged label glued onto its last line.
            if existing and not existing.endswith('\n'):
                existing += '\n'

            with open(label_file_path1, 'w') as f1:
                f1.write(existing)
                f1.writelines(merged_lines)

# Example usage: label files in dataset1_path are rewritten in place.
dataset1_path = 'D:\\桌面\\a\\26'
dataset2_path = "D:\\桌面\\a\\noA1_1_35_qiang"
class_to_merge = 7  # class id to merge from dataset 2 into dataset 1

merge_yolo_labels(dataset1_path, dataset2_path, class_to_merge)


In [1]:
import os
from collections import defaultdict

labels_dir = "d:\\桌面\\a\\B"

# Per-class instance counter; unseen classes start at zero.
class_counts = defaultdict(int)

# Scan every label file in the folder.
for filename in os.listdir(labels_dir):
    # classes.txt lists class names rather than annotations, so skip it.
    if not filename.endswith(".txt") or filename == 'classes.txt':
        continue
    with open(os.path.join(labels_dir, filename), "r") as file:
        for line in file:
            fields = line.split()
            # Blank lines (e.g. a trailing empty line) carry no label;
            # indexing them used to raise IndexError.
            if not fields:
                continue
            # Expected layout: "<class_id> <x_center> <y_center> <width> <height>".
            class_id = int(fields[0])
            class_counts[class_id] += 1

# Print the number of instances per class.
for class_id, count in class_counts.items():
    print(f"Class {class_id}: {count} instances")
# Counts recorded from an earlier run, kept for reference:
# Class 0: 95 instances
# Class 1: 451 instances
# Class 6: 205 instances
# Class 7: 72 instances
# Class 4: 66 instances
# Class 5: 90 instances
# Class 3: 111 instances
# Class 2: 59 instances

Class 1: 202 instances
Class 3: 145 instances
Class 5: 41 instances
Class 4: 161 instances
Class 6: 79 instances
Class 2: 12 instances
Class 0: 10 instances
Class 7: 5 instances


In [48]:
import os
from collections import defaultdict

labels_dir = "D:\\桌面\\GRDDC2020_train\\train\\Czech\\images"

# Per-class instance counter; unseen classes start at zero.
class_counts = defaultdict(int)

# Scan every label file in the folder.
for filename in os.listdir(labels_dir):
    # classes.txt lists class names rather than annotations, so skip it.
    if not filename.endswith(".txt") or filename == 'classes.txt':
        continue
    with open(os.path.join(labels_dir, filename), "r") as file:
        for line in file:
            fields = line.split()
            # Blank lines carry no label; indexing them used to raise IndexError.
            if not fields:
                continue
            # Expected layout: "<class_id> <x_center> <y_center> <width> <height>".
            class_id = int(fields[0])
            class_counts[class_id] += 1

# Print the number of instances per class.
for class_id, count in class_counts.items():
    print(f"Class {class_id}: {count} instances")

Class 1: 988 instances
Class 4: 197 instances
Class 0: 399 instances
Class 3: 161 instances


In [2]:
import os
from collections import defaultdict

labels_dir = "D:\\桌面\\jp1"

# Per-class instance counter; unseen classes start at zero.
class_counts = defaultdict(int)

# Scan every label file in the folder.
for filename in os.listdir(labels_dir):
    # classes.txt lists class names rather than annotations, so skip it.
    if not filename.endswith(".txt") or filename == 'classes.txt':
        continue
    with open(os.path.join(labels_dir, filename), "r") as file:
        for line in file:
            fields = line.split()
            # Blank lines carry no label; indexing them used to raise IndexError.
            if not fields:
                continue
            # Expected layout: "<class_id> <x_center> <y_center> <width> <height>".
            class_id = int(fields[0])
            class_counts[class_id] += 1

# Print the number of instances per class.
for class_id, count in class_counts.items():
    print(f"Class {class_id}: {count} instances")

Class 1: 314 instances
Class 6: 188 instances
Class 7: 98 instances
Class 4: 113 instances
Class 5: 93 instances
Class 2: 108 instances
Class 0: 90 instances
Class 3: 71 instances


In [4]:
import os

def clean_yolo_dataset(dataset_path):
    """Delete every ``.jpg`` that has no ``.txt`` file of the same name beside it.

    The directory tree under ``dataset_path`` is walked recursively. Each
    removed image path is printed before the file is deleted.

    Args:
        dataset_path: Root directory of the dataset to clean.
    """
    for folder, _subdirs, names in os.walk(dataset_path):
        for name in names:
            # Only .jpg images are candidates for removal.
            if not name.endswith(".jpg"):
                continue

            image_path = os.path.join(folder, name)
            stem, _ext = os.path.splitext(image_path)

            # An image with a matching label file is kept.
            if os.path.exists(stem + ".txt"):
                continue

            print(f"删除文件: {image_path}")
            os.remove(image_path)

# Driver: remove unlabelled .jpg images from the Japan image folder.
if __name__ == "__main__":
    # Path of the YOLO dataset to clean (files are deleted in place).
    dataset_path = "D:\\桌面\\GRDDC2020_train\\train\\Japan\\images"
    clean_yolo_dataset(dataset_path)

删除文件: D:\桌面\GRDDC2020_train\train\Japan\images\Japan_000001.jpg
删除文件: D:\桌面\GRDDC2020_train\train\Japan\images\Japan_000002.jpg
删除文件: D:\桌面\GRDDC2020_train\train\Japan\images\Japan_000005.jpg
删除文件: D:\桌面\GRDDC2020_train\train\Japan\images\Japan_000008.jpg
删除文件: D:\桌面\GRDDC2020_train\train\Japan\images\Japan_000010.jpg
删除文件: D:\桌面\GRDDC2020_train\train\Japan\images\Japan_000014.jpg
删除文件: D:\桌面\GRDDC2020_train\train\Japan\images\Japan_000018.jpg
删除文件: D:\桌面\GRDDC2020_train\train\Japan\images\Japan_000022.jpg
删除文件: D:\桌面\GRDDC2020_train\train\Japan\images\Japan_000026.jpg
删除文件: D:\桌面\GRDDC2020_train\train\Japan\images\Japan_000027.jpg
删除文件: D:\桌面\GRDDC2020_train\train\Japan\images\Japan_000028.jpg
删除文件: D:\桌面\GRDDC2020_train\train\Japan\images\Japan_000029.jpg
删除文件: D:\桌面\GRDDC2020_train\train\Japan\images\Japan_000030.jpg
删除文件: D:\桌面\GRDDC2020_train\train\Japan\images\Japan_000033.jpg
删除文件: D:\桌面\GRDDC2020_train\train\Japan\images\Japan_000035.jpg
删除文件: D:\桌面\GRDDC2020_train\train\Japan\

In [5]:
 import os
from collections import defaultdict

labels_dir = "D:\\桌面\\GRDDC2020_train\\train\\Japan\\images"

# Per-class instance counter; unseen classes start at zero.
class_counts = defaultdict(int)

# Scan every label file in the folder.
for filename in os.listdir(labels_dir):
    # classes.txt lists class names rather than annotations, so skip it.
    if not filename.endswith(".txt") or filename == 'classes.txt':
        continue
    with open(os.path.join(labels_dir, filename), "r") as file:
        for line in file:
            fields = line.split()
            # Blank lines carry no label; indexing them used to raise IndexError.
            if not fields:
                continue
            # Expected layout: "<class_id> <x_center> <y_center> <width> <height>".
            class_id = int(fields[0])
            class_counts[class_id] += 1

# Print the number of instances per class.
for class_id, count in class_counts.items():
    print(f"Class {class_id}: {count} instances")

Class 3: 6199 instances
Class 4: 2243 instances


In [3]:
import os
from collections import defaultdict

labels_dir = "D:\\桌面\\GRai\\3500"

# Per-class instance counter; unseen classes start at zero.
class_counts = defaultdict(int)

# Scan every label file in the folder.
for filename in os.listdir(labels_dir):
    # classes.txt lists class names rather than annotations, so skip it.
    if not filename.endswith(".txt") or filename == 'classes.txt':
        continue
    with open(os.path.join(labels_dir, filename), "r") as file:
        for line in file:
            fields = line.split()
            # Blank lines carry no label; indexing them used to raise IndexError.
            if not fields:
                continue
            # Expected layout: "<class_id> <x_center> <y_center> <width> <height>".
            class_id = int(fields[0])
            class_counts[class_id] += 1

# Print the number of instances per class.
for class_id, count in class_counts.items():
    print(f"Class {class_id}: {count} instances")

Class 5: 205 instances
Class 0: 532 instances
Class 1: 3607 instances
Class 6: 1751 instances
Class 2: 193 instances
Class 3: 1905 instances
Class 4: 1012 instances
Class 7: 272 instances


In [7]:
import os


def clean_yolo_dataset(dataset_path):
    """Remove ``.jpg`` images that lack a same-named ``.txt`` label file.

    Walks ``dataset_path`` recursively, printing and then deleting each image
    whose label file is missing from the same folder.

    Args:
        dataset_path: Root directory of the dataset to clean.
    """
    for folder, _subdirs, names in os.walk(dataset_path):
        # Gather the .jpg files of this folder first, then test each one.
        jpg_paths = [os.path.join(folder, name) for name in names if name.endswith(".jpg")]
        for jpg_file_path in jpg_paths:
            label_path = os.path.splitext(jpg_file_path)[0] + ".txt"
            if not os.path.exists(label_path):
                print(f"删除文件: {jpg_file_path}")
                os.remove(jpg_file_path)


# Driver: remove unlabelled .jpg images from the copied jsai_data folder.
if __name__ == "__main__":
    # Path of the YOLO dataset to clean (files are deleted in place).
    dataset_path = "D:\\桌面\\jsai_data - 副本"
    clean_yolo_dataset(dataset_path)


删除文件: D:\桌面\jsai_data - 副本\10.jpg
删除文件: D:\桌面\jsai_data - 副本\100.jpg
删除文件: D:\桌面\jsai_data - 副本\101.jpg
删除文件: D:\桌面\jsai_data - 副本\102.jpg
删除文件: D:\桌面\jsai_data - 副本\103.jpg
删除文件: D:\桌面\jsai_data - 副本\104.jpg
删除文件: D:\桌面\jsai_data - 副本\105.jpg
删除文件: D:\桌面\jsai_data - 副本\106.jpg
删除文件: D:\桌面\jsai_data - 副本\107.jpg
删除文件: D:\桌面\jsai_data - 副本\108.jpg
删除文件: D:\桌面\jsai_data - 副本\109.jpg
删除文件: D:\桌面\jsai_data - 副本\11.jpg
删除文件: D:\桌面\jsai_data - 副本\111.jpg
删除文件: D:\桌面\jsai_data - 副本\112.jpg
删除文件: D:\桌面\jsai_data - 副本\113.jpg
删除文件: D:\桌面\jsai_data - 副本\114.jpg
删除文件: D:\桌面\jsai_data - 副本\115.jpg
删除文件: D:\桌面\jsai_data - 副本\116.jpg
删除文件: D:\桌面\jsai_data - 副本\118.jpg
删除文件: D:\桌面\jsai_data - 副本\119.jpg
删除文件: D:\桌面\jsai_data - 副本\12.jpg
删除文件: D:\桌面\jsai_data - 副本\120.jpg
删除文件: D:\桌面\jsai_data - 副本\121.jpg
删除文件: D:\桌面\jsai_data - 副本\122.jpg
删除文件: D:\桌面\jsai_data - 副本\123.jpg
删除文件: D:\桌面\jsai_data - 副本\124.jpg
删除文件: D:\桌面\jsai_data - 副本\125.jpg
删除文件: D:\桌面\jsai_data - 副本\126.jpg
删除文件: D:\桌面\jsai_data -

In [2]:
import os
from collections import defaultdict

labels_dir = "D:\\桌面\\a\\res_re_qiang1"

# Per-class instance counter; unseen classes start at zero.
class_counts = defaultdict(int)

# Scan every label file in the folder.
for filename in os.listdir(labels_dir):
    # classes.txt lists class names rather than annotations; parsing it as a
    # label file would fail in int(), so skip it as the other count cells do.
    if not filename.endswith(".txt") or filename == 'classes.txt':
        continue
    with open(os.path.join(labels_dir, filename), "r") as file:
        for line in file:
            fields = line.split()
            # Blank lines carry no label; indexing them used to raise IndexError.
            if not fields:
                continue
            # Expected layout: "<class_id> <x_center> <y_center> <width> <height>".
            class_id = int(fields[0])
            class_counts[class_id] += 1

# Print the number of instances per class.
for class_id, count in class_counts.items():
    print(f"Class {class_id}: {count} instances")

Class 1: 609 instances
Class 6: 182 instances
Class 0: 80 instances
Class 4: 5 instances
Class 5: 52 instances
Class 2: 55 instances
Class 3: 18 instances


In [8]:
import os
from collections import defaultdict

labels_dir = "D:\\桌面\\a\\res_re_qiang1"

# Per-class instance counter; unseen classes start at zero.
class_counts = defaultdict(int)

# Scan every label file in the folder.
for filename in os.listdir(labels_dir):
    # classes.txt lists class names rather than annotations; parsing it as a
    # label file would fail in int(), so skip it as the other count cells do.
    if not filename.endswith(".txt") or filename == 'classes.txt':
        continue
    with open(os.path.join(labels_dir, filename), "r") as file:
        for line in file:
            fields = line.split()
            # Blank lines carry no label; indexing them used to raise IndexError.
            if not fields:
                continue
            # Expected layout: "<class_id> <x_center> <y_center> <width> <height>".
            class_id = int(fields[0])
            class_counts[class_id] += 1

# Print the number of instances per class.
for class_id, count in class_counts.items():
    print(f"Class {class_id}: {count} instances")

Class 1: 609 instances
Class 6: 182 instances
Class 0: 80 instances
Class 4: 5 instances
Class 5: 52 instances
Class 2: 55 instances
Class 3: 18 instances


In [4]:
# Install albumentations from the Tsinghua PyPI mirror. The index URL must not
# contain spaces, and %pip installs into the environment of the running kernel.
%pip install albumentations -i https://pypi.tuna.tsinghua.edu.cn/simple

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting albumentations
  Using cached https://pypi.tuna.tsinghua.edu.cn/packages/9b/f6/c486cedb4f75147232f32ec4c97026714cfef7c7e247a1f0427bc5489f66/albumentations-1.3.1-py3-none-any.whl (125 kB)
Collecting opencv-python-headless>=4.1.1
  Using cached https://pypi.tuna.tsinghua.edu.cn/packages/e3/10/31b27a7473043eb5317f698ede00e7e129b2de378903bfe0bb4d785a7baf/opencv_python_headless-4.8.1.78-cp37-abi3-win_amd64.whl (38.0 MB)
Collecting scikit-image>=0.16.1
  Using cached https://pypi.tuna.tsinghua.edu.cn/packages/d7/d1/a4c715ad640c9eb0daaa77c4ce561b06e086bec44cbc79083e3548b00b76/scikit_image-0.21.0-cp38-cp38-win_amd64.whl (22.7 MB)
Collecting qudida>=0.0.4
  Using cached https://pypi.tuna.tsinghua.edu.cn/packages/f0/a1/a5f4bebaa31d109003909809d88aeb0d4b201463a9ea29308d9e4f9e7655/qudida-0.0.4-py3-none-any.whl (3.5 kB)
Collecting imageio>=2.27
  Using cached https://pypi.tuna.tsinghua.edu.cn/packages/9b/82/473e452d3f21a9cd7e79

ERROR: Could not install packages due to an OSError: [WinError 5] 拒绝访问。: 'D:\\miniconda3\\envs\\pytorch\\Lib\\site-packages\\cv2\\cv2.pyd'
Consider using the `--user` option or check the permissions.



# 过采样 (Oversampling)

In [1]:
import os
import random
from shutil import copyfile
from collections import defaultdict

# 定义数据集目录和子集目录
data_dir = "D:/yolo/yolov5/过采样_new1/train"
train_dir = '过采样11/train'
val_dir = '过采样11/val'
test_dir = '过采样11/test'

# 创建子集目录
os.makedirs(train_dir, exist_ok=True)
os.makedirs(val_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)

# 定义标签文件后缀
label_suffix = '.txt'

# 创建一个字典来存储每个类别的图像文件列表
class_image_dict = defaultdict(list)

# 遍历数据集目录，读取标签并按类别分组
for file_name in os.listdir(data_dir):
    if file_name.endswith('.jpg'):
        image_file = os.path.join(data_dir, file_name)
        label_file = os.path.join(data_dir, file_name.replace('.jpg', label_suffix))

        # 读取标签内容
        with open(label_file, 'r') as label_content:
            label_data = label_content.read().strip().split()  # 假设标签文件是以空格分隔的数据
            if label_data:
                class_id = int(label_data[0])  # 假设类别ID是标签文件的第一个元素
                class_image_dict[class_id].append((image_file, label_file))

# 计算每个类别的最小样本数
min_samples = min(len(images) for images in class_image_dict.values())

# 过采样以平衡标签比例
balanced_data = []
for class_id, images in class_image_dict.items():
    oversampled_images = random.sample(images, min_samples)  # 使用random.sample函数来进行无放回的抽样
    balanced_data.extend(oversampled_images)

# 随机打乱图像文件列表
random.shuffle(balanced_data)

# 定义训练集、验证集和测试集的比例
train_ratio = 1
val_ratio = 0
test_ratio = 0

# 根据比例计算数据集大小
total_samples = len(balanced_data)
train_size = int(total_samples * train_ratio)
val_size = int(total_samples * val_ratio)

# 划分数据集
train_data = balanced_data[:train_size]
val_data = balanced_data[train_size:train_size + val_size]
test_data = balanced_data[train_size + val_size:]


# 复制图像和标签文件到各个子集目录
def copy_images_and_labels(data_list, subset_dir):
    """Copy each (image, label) pair of ``data_list`` into ``subset_dir``.

    Files keep their base name; an existing file of the same name in
    ``subset_dir`` is overwritten.

    Args:
        data_list: Iterable of ``(image_file, label_file)`` path pairs.
        subset_dir: Existing destination directory.
    """
    def _destination(path):
        # Same file name, relocated into the subset directory.
        return os.path.join(subset_dir, os.path.basename(path))

    for image_file, label_file in data_list:
        image_target = _destination(image_file)
        label_target = _destination(label_file)
        copyfile(image_file, image_target)
        copyfile(label_file, label_target)


# 复制到各个子集目录
copy_images_and_labels(train_data, train_dir)
copy_images_and_labels(val_data, val_dir)
copy_images_and_labels(test_data, test_dir)
# 计算训练集中每个类别的样本数量
train_class_counts = defaultdict(int)
for _, label_file in train_data:
    with open(label_file, 'r') as label_content:
        label_data = label_content.read().strip().split()
        if label_data:
            class_id = int(label_data[0])
            train_class_counts[class_id] += 1

# 打印每个类别的训练样本数量
for class_id, count in train_class_counts.items():
    print(f"Class {class_id}: {count} instances in the training set")


Class 7: 6 instances in the training set
Class 6: 6 instances in the training set
Class 5: 6 instances in the training set
Class 2: 6 instances in the training set
Class 3: 6 instances in the training set
Class 1: 6 instances in the training set
Class 0: 6 instances in the training set
Class 4: 6 instances in the training set


In [2]:
import os
import random
from shutil import copyfile
from collections import defaultdict

# 定义数据集目录和子集目录
data_dir = "D:/yolo/yolov5/过采样_new1/train"
train_dir = '过采样111/train'
val_dir = '过采样111/val'
test_dir = '过采样111/test'

# 创建子集目录
os.makedirs(train_dir, exist_ok=True)
os.makedirs(val_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)

# 定义标签文件后缀
label_suffix = '.txt'

# 创建一个字典来存储每个类别的图像文件列表
class_image_dict = defaultdict(list)

# 遍历数据集目录，读取标签并按类别分组
for file_name in os.listdir(data_dir):
    if file_name.endswith('.jpg'):
        image_file = os.path.join(data_dir, file_name)
        label_file = os.path.join(data_dir, file_name.replace('.jpg', label_suffix))

        # 读取标签内容
        with open(label_file, 'r') as label_content:
            label_data = label_content.read().strip().split()  # 假设标签文件是以空格分隔的数据
            if label_data:
                class_id = int(label_data[0])  # 假设类别ID是标签文件的第一个元素
                class_image_dict[class_id].append((image_file, label_file))

# 查找具有最多样本的类别
max_samples = max(len(images) for images in class_image_dict.values())

# 过采样以平衡标签比例
balanced_data = []
for class_id, images in class_image_dict.items():
    oversampled_images = random.choices(images, k=max_samples)  # 使用random.choices函数来进行有放回的抽样
    balanced_data.extend(oversampled_images)

# 随机打乱图像文件列表
random.shuffle(balanced_data)

# 定义训练集、验证集和测试集的比例
train_ratio = 10
val_ratio = 0
test_ratio = 0

# 根据比例计算数据集大小
total_samples = len(balanced_data)
train_size = int(total_samples * train_ratio)
val_size = int(total_samples * val_ratio)

# 划分数据集
train_data = balanced_data[:train_size]
val_data = balanced_data[train_size:train_size + val_size]
test_data = balanced_data[train_size + val_size:]


# 复制图像和标签文件到各个子集目录
def copy_images_and_labels(data_list, subset_dir):
    """Copy each (image, label) pair of ``data_list`` into ``subset_dir``.

    The first occurrence of a pair keeps its base name. When the same image
    appears again in ``data_list`` (as happens after sampling with
    replacement), the repeat is written as ``<stem>_dup<N><ext>`` for both the
    image and its label. Previously repeats were copied to the same file name
    and overwrote each other, so oversampled items never reached the disk.

    Args:
        data_list: Iterable of ``(image_file, label_file)`` path pairs.
        subset_dir: Existing destination directory.
    """
    times_seen = {}
    for image_file, label_file in data_list:
        repeat = times_seen.get(image_file, 0)
        times_seen[image_file] = repeat + 1
        # Image and label share the suffix so they still pair up by name.
        suffix = f"_dup{repeat}" if repeat else ""
        for source in (image_file, label_file):
            stem, ext = os.path.splitext(os.path.basename(source))
            copyfile(source, os.path.join(subset_dir, f"{stem}{suffix}{ext}"))


# 复制到各个子集目录
copy_images_and_labels(train_data, train_dir)
copy_images_and_labels(val_data, val_dir)
copy_images_and_labels(test_data, test_dir)
train_class_counts = defaultdict(int)
for _, label_file in train_data:
    with open(label_file, 'r') as label_content:
        label_data = label_content.read().strip().split()
        if label_data:
            class_id = int(label_data[0])
            train_class_counts[class_id] += 1

# 打印每个类别的训练样本数量
for class_id, count in train_class_counts.items():
    print(f"Class {class_id}: {count} instances in the training set")

Class 7: 1054 instances in the training set
Class 4: 1054 instances in the training set
Class 3: 1054 instances in the training set
Class 0: 1054 instances in the training set
Class 5: 1054 instances in the training set
Class 2: 1054 instances in the training set
Class 6: 1054 instances in the training set
Class 1: 1054 instances in the training set


In [6]:
import os
from collections import defaultdict

# 定义YOLO标签文件的存储路径
labels_dir = "D:/yolo/yolov5/ai_data/train"

# 初始化一个字典来存储各类别的计数
class_counts = defaultdict(int)

# 遍历标签文件
for filename in os.listdir(labels_dir):
    if filename.endswith(".txt"):
        with open(os.path.join(labels_dir, filename), "r") as file:
            lines = file.readlines()
            for line in lines:
                # 标签文件通常是 "<class_id> <x_center> <y_center> <width> <height>" 的格式
                # 你可能需要根据你的数据集的具体格式来解析
                class_id = int(line.split()[0])
                class_counts[class_id] += 1

# 打印各类别的计数
for class_id, count in class_counts.items():
    print(f"Class {class_id}: {count} instances")


Class 5: 70 instances
Class 0: 142 instances
Class 1: 1325 instances
Class 6: 950 instances
Class 2: 45 instances
Class 3: 54 instances
Class 4: 66 instances
Class 7: 20 instances


In [9]:
import os
import random
from shutil import copyfile
from collections import defaultdict

# 定义数据集目录和子集目录
data_dir = "D:/yolo/jsai_data"
train_dir = '过采样1/train'
val_dir = '过采样1/val'
test_dir = '过采样1/test'

# 创建子集目录
os.makedirs(train_dir, exist_ok=True)
os.makedirs(val_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)

# 定义标签文件后缀
label_suffix = '.txt'

# 创建一个字典来存储每个类别的图像文件列表
class_image_dict = defaultdict(list)

# 遍历数据集目录，读取标签并按类别分组
for file_name in os.listdir(data_dir):
    if file_name.endswith('.jpg'):
        image_file = os.path.join(data_dir, file_name)
        label_file = os.path.join(data_dir, file_name.replace('.jpg', label_suffix))

        # 读取标签内容
        with open(label_file, 'r') as label_content:
            label_data = label_content.read().strip().split()  # 假设标签文件是以空格分隔的数据
            if label_data:
                class_id = int(label_data[0])  # 假设类别ID是标签文件的第一个元素
                class_image_dict[class_id].append((image_file, label_file))
print(class_image_dict)
# 查找具有最多样本的类别
max_samples = max(len(images) for images in class_image_dict.values())

# 过采样以平衡标签比例
balanced_data = []
for class_id, images in class_image_dict.items():
    oversampled_images = random.choices(images, k=max_samples)  # 使用random.choices函数来进行有放回的抽样
    balanced_data.extend(oversampled_images)

# 随机打乱图像文件列表
random.shuffle(balanced_data)

# 定义训练集、验证集和测试集的比例
train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1

# 根据比例计算数据集大小
total_samples = len(balanced_data)
train_size = int(total_samples * train_ratio)
val_size = int(total_samples * val_ratio)

# 划分数据集
train_data = balanced_data[:train_size]
val_data = balanced_data[train_size:train_size + val_size]
test_data = balanced_data[train_size + val_size:]


# 复制图像和标签文件到各个子集目录
def copy_images_and_labels(data_list, subset_dir):
    """Copy each (image, label) pair of ``data_list`` into ``subset_dir``.

    The first occurrence of a pair keeps its base name. When the same image
    appears again in ``data_list`` (as happens after sampling with
    replacement), the repeat is written as ``<stem>_dup<N><ext>`` for both the
    image and its label. Previously repeats were copied to the same file name
    and overwrote each other, so oversampled items never reached the disk.

    Args:
        data_list: Iterable of ``(image_file, label_file)`` path pairs.
        subset_dir: Existing destination directory.
    """
    times_seen = {}
    for image_file, label_file in data_list:
        repeat = times_seen.get(image_file, 0)
        times_seen[image_file] = repeat + 1
        # Image and label share the suffix so they still pair up by name.
        suffix = f"_dup{repeat}" if repeat else ""
        for source in (image_file, label_file):
            stem, ext = os.path.splitext(os.path.basename(source))
            copyfile(source, os.path.join(subset_dir, f"{stem}{suffix}{ext}"))


# 复制到各个子集目录
copy_images_and_labels(train_data, train_dir)
copy_images_and_labels(val_data, val_dir)
copy_images_and_labels(test_data, test_dir)


defaultdict(<class 'list'>, {})


ValueError: max() arg is an empty sequence

2、要确保每张图像只出现在一个数据集

In [2]:
import os
import random
from shutil import copyfile

# 定义数据集目录和子集目录
data_dir = "D:/yolo/jsai_data"
train_dir = '过采样_new1/train'
val_dir = '过采样_new1/val'
test_dir = '过采样_new1/test'

# 创建子集目录
os.makedirs(train_dir, exist_ok=True)
os.makedirs(val_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)

# 定义标签文件后缀
label_suffix = '.txt'

# 创建图像和标签文件列表
image_files = []
label_files = []

# 遍历数据集目录，读取图像和标签文件，并按顺序存储
for file_name in os.listdir(data_dir):
    if file_name.endswith('.jpg'):
        image_file = os.path.join(data_dir, file_name)
        label_file = os.path.join(data_dir, file_name.replace('.jpg', label_suffix))

        # 添加图像和标签文件到列表中
        image_files.append(image_file)
        label_files.append(label_file)

# 将数据集列表随机打乱
combined_data = list(zip(image_files, label_files))
random.shuffle(combined_data)
image_files, label_files = zip(*combined_data)

# 定义训练集、验证集和测试集的比例
train_ratio = 0.8
val_ratio = 0.2
test_ratio = 0

# 计算数据集大小
total_samples = len(image_files)
train_size = int(total_samples * train_ratio)
val_size = int(total_samples * val_ratio)

# 划分数据集
train_images = image_files[:train_size]
train_labels = label_files[:train_size]
val_images = image_files[train_size:train_size + val_size]
val_labels = label_files[train_size:train_size + val_size]
test_images = image_files[train_size + val_size:]
test_labels = label_files[train_size + val_size:]


# 复制图像和标签文件到各个子集目录
def copy_images_and_labels(images, labels, subset_dir):
    """Copy paired image and label files into ``subset_dir``.

    ``images`` and ``labels`` are walked in parallel; pairing stops at the
    shorter of the two. Files keep their base name, and an existing file of
    the same name in ``subset_dir`` is overwritten.

    Args:
        images: Sequence of image file paths.
        labels: Sequence of label file paths, aligned with ``images``.
        subset_dir: Existing destination directory.
    """
    def _destination(path):
        # Same file name, relocated into the subset directory.
        return os.path.join(subset_dir, os.path.basename(path))

    for image_file, label_file in zip(images, labels):
        image_target = _destination(image_file)
        label_target = _destination(label_file)
        copyfile(image_file, image_target)
        copyfile(label_file, label_target)


# 复制到各个子集目录
copy_images_and_labels(train_images, train_labels, train_dir)
copy_images_and_labels(val_images, val_labels, val_dir)
copy_images_and_labels(test_images, test_labels, test_dir)
