# Data prep using a new dataset
Since the last 2 datasets I used were way too small, I needed to look for a bigger one to get the accuracy up. <br> For this I found a dataset called DeepFashion, which contains over 270k images of clothing. <br>The only problem with this dataset is that it is from 2016, meaning that the structure of the data is not directly usable with current CNN models. <br> For this reason I took on the challenge of transforming all of the data so that I can use it with YOLOv11.<br>
dataset used: https://mmlab.ie.cuhk.edu.hk/projects/DeepFashion.html

I first import the needed libraries

In [None]:
import os
import shutil
import random
import time

I make sure the paths to all of the needed files are stored in different strings for later use

In [None]:

# Locations of the annotation files that the loading cells below read
category_path = "./input_files/list_category_cloth.txt"  # category name -> category type
categorise_path = "./input_files/list_category_img.txt"  # image name -> category label
bounding_box_path = "./input_files/list_bbox.txt"  # image name -> x1 y1 x2 y2
list_eval_partition_path = "./input_files/list_eval_partition.txt"  # image name -> partition


I then create the new file structure where the images will be copied to

In [None]:
# Root folder of the generated YOLO dataset
output_dir = "yolo_dataset"
os.makedirs(output_dir, exist_ok=True)

# Each partition gets an images/ folder and a labels/ folder;
# exist_ok keeps the cell safe to run more than once.
for partition in ("train", "val", "test"):
    for subfolder in ("images", "labels"):
        os.makedirs(os.path.join(output_dir, partition, subfolder), exist_ok=True)


I make sure the directory exists to prevent the code from failing 

In [None]:

# Helper function to ensure directory exists for images
def ensure_directory_exists(image_name, partition="train"):
    """Create and return the image directory for ``image_name`` inside a split.

    The directory part of ``image_name`` is recreated below
    ``<output_dir>/<partition>/images`` so that images keep their original
    sub-folder layout.

    Args:
        image_name: Relative image path; only its directory part is used.
        partition: Name of the split folder to create the directory in.
            Defaults to "train", the folder this helper used before the
            split became selectable, so existing one-argument calls behave
            as before.

    Returns:
        The path of the directory, which exists when the function returns.
    """
    # Create a subdirectory based on the image folder structure
    folder_structure = os.path.dirname(image_name)
    dest_dir = os.path.join(output_dir, partition, "images", folder_structure)
    os.makedirs(dest_dir, exist_ok=True)
    return dest_dir


I load the category list and map each category name to its category type

In [None]:

# Build the lookup: category name -> integer category type
category_mapping = {}
with open(category_path, "r") as f:
    for raw_line in f:
        record = raw_line.strip()
        # Blank lines and the column-header row carry no data
        if not record or raw_line.lower().startswith("category_name"):
            continue
        fields = record.split(maxsplit=1)
        # Rows without exactly two fields are ignored silently
        if len(fields) != 2:
            continue
        category_name, category_type = fields
        try:
            category_mapping[category_name] = int(category_type)
        except ValueError:
            print(f"Invalid category type for line: {record}")


I load the evaluation partition file, which records the dataset split each image belongs to

In [None]:

# Build the lookup: image name -> evaluation status (the split it belongs to)
eval_partition = {}
with open(list_eval_partition_path, "r") as f:
    for raw_line in f:
        record = raw_line.strip()
        # Blank lines and the column-header row carry no data
        if not record or raw_line.lower().startswith("image_name"):
            continue
        fields = record.split(maxsplit=1)
        # Only rows with both an image name and a status are kept
        if len(fields) == 2:
            image_name, status = fields
            eval_partition[image_name] = status


I load all of the bounding boxes from the document and store them in a dictionary keyed by image name

In [None]:
# Build the lookup: image name -> (x1, y1, x2, y2) as integers
bounding_boxes = {}
with open(bounding_box_path, "r") as f:
    for raw_line in f:
        record = raw_line.strip()
        # Blank lines and the column-header row carry no data
        if not record or raw_line.lower().startswith("image_name"):
            continue
        fields = record.split()
        # A valid row is the image name followed by four coordinates
        if len(fields) != 5:
            print(f"Skipping invalid line in bounding box file: {record}")
            continue
        image_name = fields[0]
        try:
            bounding_boxes[image_name] = tuple(int(value) for value in fields[1:])
        except ValueError:
            print(f"Invalid bounding box for line: {record}")


In [None]:

# Build the lookup: image name -> integer category label
image_categories = {}
with open(categorise_path, "r") as f:
    for raw_line in f:
        record = raw_line.strip()
        # Blank lines and the column-header row carry no data
        if not record or raw_line.lower().startswith("image_name"):
            continue
        fields = record.split(maxsplit=1)
        if len(fields) != 2:
            print(f"Skipping invalid line in image category file: {record}")
            continue
        image_name, category_label = fields
        try:
            image_categories[image_name] = int(category_label)
        except ValueError:
            print(f"Invalid category label for line: {record}")


I convert the bounding box and category of an image into the annotation format used by YOLO

In [None]:

# YOLO Annotation Conversion
def convert_to_yolo_format(image_name, bbox, category_label, img_width, img_height):
    """Format one corner-style box as a YOLO annotation line.

    Args:
        image_name: Accepted for the caller's convenience; not used here.
        bbox: Four values ``(x1, y1, x2, y2)`` in pixels.
        category_label: Value written as the first field of the line.
        img_width: Image width used to normalise the x values.
        img_height: Image height used to normalise the y values.

    Returns:
        ``"<label> <x_center> <y_center> <width> <height>"`` where the four
        numbers are divided by the image size and printed with six decimals.
    """
    left, top, right, bottom = bbox
    # Centre point first, then box size, each scaled by the image dimensions
    normalised = (
        (left + right) / 2.0 / img_width,
        (top + bottom) / 2.0 / img_height,
        (right - left) / img_width,
        (bottom - top) / img_height,
    )
    return f"{category_label} " + " ".join(f"{value:.6f}" for value in normalised)


I process all of the data and copy it into the correct folders. <br> For each image I look up its bounding box, its category, and the dataset partition it belongs to.

In [None]:
# Prepare YOLO dataset
def _read_jpeg_size(path):
    """Return ``(width, height)`` read from a JPEG file header, or ``None``.

    Only the marker segments at the start of the file are walked, so the
    pixel data is never decoded. ``None`` is returned when the file is not a
    JPEG or ends before a frame header is found.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    # Start-of-frame markers hold the image size; 0xC4, 0xC8 and 0xCC fall in
    # the same numeric range but are not frame headers.
    frame_markers = set(range(0xC0, 0xD0)) - {0xC4, 0xC8, 0xCC}
    with open(path, "rb") as image_file:
        if image_file.read(2) != b"\xff\xd8":  # every JPEG starts with SOI
            return None
        while True:
            byte = image_file.read(1)
            if not byte:
                return None
            if byte != b"\xff":
                continue
            marker = image_file.read(1)
            while marker == b"\xff":  # 0xFF may be repeated as padding
                marker = image_file.read(1)
            if not marker:
                return None
            code = marker[0]
            if code in (0x00, 0x01) or 0xD0 <= code <= 0xD8:
                continue  # markers without a length field
            if code in (0xD9, 0xDA):
                return None  # image data or end reached without a frame header
            length_bytes = image_file.read(2)
            if len(length_bytes) != 2:
                return None
            segment_length = int.from_bytes(length_bytes, "big")
            if code in frame_markers:
                # Frame header payload: precision (1), height (2), width (2)
                frame = image_file.read(5)
                if len(frame) != 5:
                    return None
                height = int.from_bytes(frame[1:3], "big")
                width = int.from_bytes(frame[3:5], "big")
                if width <= 0 or height <= 0:
                    return None
                return width, height
            if segment_length < 2:
                return None
            # The length field counts itself, so skip the remaining payload
            image_file.seek(segment_length - 2, os.SEEK_CUR)


total_processed = 0

for image_name, partition in eval_partition.items():
    if image_name not in bounding_boxes or image_name not in image_categories:
        print(f"Skipping {image_name}: Missing bounding box or category.")
        continue

    bbox = bounding_boxes[image_name]
    # YOLO class ids are zero-based. The category labels are assumed to be
    # 1-based positions in the category list (as described by the DeepFashion
    # README - verify against your copy of the annotation files).
    category_label = image_categories[image_name] - 1
    if category_label < 0:
        print(f"Skipping {image_name}: Invalid category label.")
        continue

    src_image_path = image_name  # Adjust if the path is different

    # The box must be normalised by the real size of this image; a fixed
    # size would produce wrong coordinates for every other resolution.
    try:
        image_size = _read_jpeg_size(src_image_path)
    except FileNotFoundError:
        print(f"Image file not found: {src_image_path}")
        continue
    if image_size is None:
        print(f"Skipping {image_name}: Could not read image dimensions.")
        continue
    img_width, img_height = image_size

    yolo_annotation = convert_to_yolo_format(image_name, bbox, category_label, img_width, img_height)

    # Keep the original sub-folder under both images/ and labels/ of the
    # image's own partition. Different source folders reuse the same file
    # names, and YOLO pairs an image with the label that has the same relative
    # path and stem, so both trees must mirror each other.
    relative_dir = os.path.dirname(image_name)
    file_name = os.path.basename(image_name)
    dest_image_dir = os.path.join(output_dir, partition, "images", relative_dir)
    dest_label_dir = os.path.join(output_dir, partition, "labels", relative_dir)
    os.makedirs(dest_image_dir, exist_ok=True)
    os.makedirs(dest_label_dir, exist_ok=True)

    # An existing copy (e.g. from an earlier run) is overwritten: renaming it
    # would leave an image without a matching label file.
    dest_image_path = os.path.join(dest_image_dir, file_name)
    try:
        shutil.copy(src_image_path, dest_image_path)
    except FileNotFoundError:
        print(f"Image file not found: {src_image_path}")
        continue

    # Save annotation
    label_file = os.path.join(dest_label_dir, os.path.splitext(file_name)[0] + ".txt")
    with open(label_file, "w") as f:
        f.write(yolo_annotation + "\n")

    total_processed += 1

print(f"Total processed images: {total_processed}")
print("YOLO dataset preparation complete.")

I create the data.yaml that can be used later for training

In [None]:
# Create the data.yaml for YOLOv11 training
# YOLO expects unique, zero-based, contiguous class ids. The category type in
# the second column is not a class id (the notebook stores it as a "type" and
# it may repeat), so each category is numbered by its position among the data
# rows instead.
name_entries = []
with open(category_path, "r") as f:
    for line in f:
        record = line.strip()
        if not record or line.lower().startswith("category_name"):
            continue
        parts = record.split(maxsplit=1)
        # Rows without two fields (e.g. a leading row-count line) are not
        # categories; unpacking them would raise ValueError.
        if len(parts) != 2:
            continue
        category_name, category_type = parts
        # A single quote inside a single-quoted YAML string is written twice
        quoted_name = category_name.replace("'", "''")
        name_entries.append(f"  {len(name_entries)}: '{quoted_name}'\n")

# The test split is listed as well because its folders are part of the
# generated dataset layout.
data_yaml = f"""
train: {os.path.join(output_dir, 'train', 'images')}
val: {os.path.join(output_dir, 'val', 'images')}
test: {os.path.join(output_dir, 'test', 'images')}

names:
"""
data_yaml += "".join(name_entries)

with open('data.yaml', 'w') as f:
    f.write(data_yaml)

print("Data.yaml for YOLOv11 created successfully.")
