## Combine Datasets

In [1]:
import os
import shutil
from pathlib import Path
import yaml

In [2]:
# Combine datasets from synthetic-medical-arrows and arrow-detection

# Define paths
source_dataset1 = Path("/mmfs1/gscratch/krishna/xckevin/datasets/synthetic-medical-arrows")
source_dataset2 = Path("/gscratch/krishna/xckevin/arrow-detection")
output_dataset = Path("/mmfs1/gscratch/krishna/xckevin/datasets/combined-dataset")

# Create output directory structure
output_dataset.mkdir(parents=True, exist_ok=True)
for split in ["train", "valid", "test"]:
    (output_dataset / split / "images").mkdir(parents=True, exist_ok=True)
    (output_dataset / split / "labels").mkdir(parents=True, exist_ok=True)

print(f"Created output directory structure at {output_dataset}")

# Function to copy images and labels from a source dataset
def copy_dataset_split(source_path, output_path, split_name):
    """Copy images and labels from source split to output split"""
    source_images = source_path / split_name / "images"
    source_labels = source_path / split_name / "labels"
    
    if source_images.exists():
        for img_file in source_images.glob("*"):
            if img_file.is_file():
                dest = output_path / split_name / "images" / img_file.name
                shutil.copy2(img_file, dest)
        print(f"Copied {split_name} images from {source_images}")
    
    if source_labels.exists():
        for label_file in source_labels.glob("*.txt"):
            dest = output_path / split_name / "labels" / label_file.name
            shutil.copy2(label_file, dest)
        print(f"Copied {split_name} labels from {source_labels}")

# Copy datasets from both sources
print("\n--- Copying dataset 1: synthetic-medical-arrows ---")
for split in ["train", "valid", "test"]:
    copy_dataset_split(source_dataset1, output_dataset, split)

print("\n--- Copying dataset 2: arrow-detection ---")
for split in ["train", "valid", "test"]:
    copy_dataset_split(source_dataset2, output_dataset, split)

# Count total samples
total_samples = {
    "train": len(list((output_dataset / "train" / "images").glob("*"))),
    "valid": len(list((output_dataset / "valid" / "images").glob("*"))),
    "test": len(list((output_dataset / "test" / "images").glob("*"))),
}

print(f"\nCombined dataset statistics:")
print(f"  Train: {total_samples['train']} images")
print(f"  Valid: {total_samples['valid']} images")
print(f"  Test: {total_samples['test']} images")

# Create data.yaml
data_yaml = {
    "path": str(output_dataset),
    "train": "train/images",
    "val": "valid/images",
    "test": "test/images",
    "nc": 1,
    "names": ["arrow"],
    "kpt_shape": [2, 3],
    "flip_idx": [0, 1],
}

yaml_path = output_dataset / "data.yaml"
with open(yaml_path, 'w') as f:
    yaml.dump(data_yaml, f, default_flow_style=False, sort_keys=False)

print(f"\nCreated data.yaml at {yaml_path}")

Created output directory structure at /mmfs1/gscratch/krishna/xckevin/datasets/combined-dataset

--- Copying dataset 1: synthetic-medical-arrows ---
Copied train images from /mmfs1/gscratch/krishna/xckevin/datasets/synthetic-medical-arrows/train/images
Copied train labels from /mmfs1/gscratch/krishna/xckevin/datasets/synthetic-medical-arrows/train/labels
Copied valid images from /mmfs1/gscratch/krishna/xckevin/datasets/synthetic-medical-arrows/valid/images
Copied valid labels from /mmfs1/gscratch/krishna/xckevin/datasets/synthetic-medical-arrows/valid/labels
Copied test images from /mmfs1/gscratch/krishna/xckevin/datasets/synthetic-medical-arrows/test/images
Copied test labels from /mmfs1/gscratch/krishna/xckevin/datasets/synthetic-medical-arrows/test/labels

--- Copying dataset 2: arrow-detection ---

Combined dataset statistics:
  Train: 6321 images
  Valid: 790 images
  Test: 791 images

Created data.yaml at /mmfs1/gscratch/krishna/xckevin/datasets/combined-dataset/data.yaml
