# In [None]:
import os
import pandas as pd

# Directory that is scanned for image files.
IMAGES_DIR = "images"
# Directory searched for a "<image base name>.txt" label file per image.
LABELS_DIR = "labels"
# Path of the CSV file the resulting table is written to.
OUTPUT_CSV = "dataset.csv"

def remove_file_if_exists(path):
    """Delete the file at *path*, doing nothing if it does not exist.

    Args:
        path: Filesystem path of the file to delete.

    Raises:
        OSError: If the path exists but cannot be removed (for example it
            is a directory, or permissions forbid the deletion).
    """
    # EAFP: attempt the removal directly instead of checking first. A
    # separate existence check leaves a window in which the file can
    # disappear, making os.remove() raise FileNotFoundError anyway.
    try:
        os.remove(path)
    except FileNotFoundError:
        pass

# Build a table with one row per labelled image and write it to OUTPUT_CSV.
#
# For every image in IMAGES_DIR the matching "<base name>.txt" file in
# LABELS_DIR is read. Images without a label file are deleted from disk.
# Each row records the image file name, a tumor flag (1 when any label line
# has class_id "1") and the bounding boxes of those class-1 lines.

# File-name suffixes (compared case-insensitively) treated as images.
_IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff")


def _parse_tumor_bboxes(lines, label_file):
    """Extract the tumor flag and class-1 bounding boxes from label lines.

    Args:
        lines: Stripped, non-empty lines of one label file. A valid line has
            five whitespace-separated fields: class_id, x_center, y_center,
            width, height.
        label_file: Name of the label file; used only in warning messages.

    Returns:
        A ``(tumor, bboxes)`` tuple. ``tumor`` is 1 when at least one
        five-field line has class_id "1", otherwise 0. ``bboxes`` contains
        ``[x_center, y_center, width, height]`` (floats) for every class-1
        line whose coordinates could be parsed.
    """
    tumor = 0
    bboxes = []
    for line in lines:
        parts = line.split()
        if len(parts) != 5:
            # Report malformed lines for every file, not only for files
            # that also happen to contain a tumor line.
            print(f"Warning: invalid line in '{label_file}': {line}")
            continue

        class_id, *coords = parts
        if class_id != "1":
            # Boxes of any other class are deliberately ignored.
            continue

        # A class-1 line marks the image as tumor even if its coordinates
        # turn out to be unparseable below.
        tumor = 1
        try:
            bboxes.append([float(value) for value in coords])
        except ValueError:
            print(f"Warning: could not parse bbox coords in '{label_file}': {line}")
    return tumor, bboxes


# Safety guard: without the labels directory every image would look
# unlabelled and the loop below would delete the entire image set.
if not os.path.isdir(LABELS_DIR):
    raise FileNotFoundError(
        f"Labels directory '{LABELS_DIR}' not found; refusing to remove images."
    )

data = []

# os.listdir() returns entries in arbitrary order; sort them so the CSV rows
# come out in the same order on every run and platform.
for image_file in sorted(os.listdir(IMAGES_DIR)):
    # Skip anything that does not look like an image.
    if not image_file.lower().endswith(_IMAGE_EXTENSIONS):
        continue

    base_name = os.path.splitext(image_file)[0]
    label_file = base_name + ".txt"
    label_path = os.path.join(LABELS_DIR, label_file)
    image_path = os.path.join(IMAGES_DIR, image_file)

    # If label file doesn't exist => remove image & skip
    if not os.path.exists(label_path):
        print(f"Removing '{image_file}' (no matching label).")
        remove_file_if_exists(image_path)
        continue

    # Read label file, dropping blank lines.
    with open(label_path, "r") as f:
        lines = [line.strip() for line in f if line.strip()]

    # An empty label file yields tumor=0 and bboxes=[].
    tumor, bboxes = _parse_tumor_bboxes(lines, label_file)

    data.append({
        "image": image_file,
        "tumor": tumor,
        "bboxes": bboxes
    })

df = pd.DataFrame(data, columns=["image", "tumor", "bboxes"])
df.to_csv(OUTPUT_CSV, index=False)
print(f"\nGenerated '{OUTPUT_CSV}' with columns [image, tumor, bboxes].")
print(" - tumor=1 => only bounding boxes where class_id=1")
print(" - tumor=0 => empty bboxes (and ignoring class_id=0 lines).")
