In [1]:
import os
import pandas as pd
import random
import math
import shutil
from pathlib import Path

# Seed the module-level random generator so that the random.shuffle calls made
# later in this file produce the same ordering on every run, provided the
# files are listed in the same order.
random.seed(42)

def _copy_split_files(files, destination_folder, set_name, prompt_lookup, records, seen_names):
    """Copy the files of one split that have a prompt and record them.

    Args:
        files (list[Path]): Source image files assigned to this split.
        destination_folder (Path): Flat folder the files are copied into.
        set_name (str): Split label stored with each record ('train', 'valid' or 'test').
        prompt_lookup (dict): Maps lowercased filename -> prompt text.
        records (list[dict]): Output list; one dict per copied file is appended.
        seen_names (set[str]): Lowercased filenames already copied during this run,
            shared across classes and splits to detect collisions.
    """
    for f in files:
        key = f.name.lower()

        # Look up the prompt BEFORE copying so that files which end up skipped
        # are not left behind in the split folders without a CSV row.
        text_prompt = prompt_lookup.get(key)
        if not text_prompt:
            print(f"Warning: Prompt not found for '{f.name}' in template file. Skipping this file.")
            continue

        # All classes share one flat destination folder per split, so two files
        # with the same name would overwrite each other and could put the same
        # path into two different splits.
        if key in seen_names:
            print(f"Warning: Duplicate filename '{f.name}' was already copied. Skipping this file.")
            continue
        seen_names.add(key)

        # Copy file to the root of the destination folder (e.g., train/, valid/)
        dest_path = destination_folder / f.name
        shutil.copy(str(f), str(dest_path))
        records.append({'filepath': str(dest_path), 'prompt': text_prompt, 'set': set_name})


def split_and_copy_dataset(base_dir, output_dir, template_file):
    """
    Scans a directory with class-based subfolders, splits the image files
    randomly into training, validation, and testing sets, copies the files
    to a new directory structure, and saves the results to CSV files with
    prompts from a template file. This operation is case-insensitive for filenames.

    Each class folder is split independently (70% train, 20% valid, remainder
    test, computed over all image files found in the folder). Only files that
    have a non-empty prompt in the template are copied and listed in the CSVs.
    Problems are reported with a printed message and an early return; nothing
    is raised for a missing template, missing columns or a bad base directory.

    Args:
        base_dir (str): The path to the main directory containing class subfolders.
        output_dir (str): The directory where the new folder structure and CSV files will be saved.
        template_file (str): The path to the template CSV file with prompts.

    Returns:
        None
    """
    # --- 0. Process Template File ---
    try:
        template_df = pd.read_csv(template_file)
    except FileNotFoundError:
        print(f"Error: Template file not found at '{template_file}'")
        return

    # Ensure required columns exist
    if 'image_filename' not in template_df.columns or 'text_prompt' not in template_df.columns:
        print("Error: Template file must contain 'image_filename' and 'text_prompt' columns.")
        return

    # Empty cells are read as NaN, which is truthy; drop them so that a blank
    # prompt counts as "not found" instead of being written out as NaN.
    usable_rows = template_df.dropna(subset=['image_filename', 'text_prompt'])

    # Fast lookup keyed by lowercased filename for case-insensitive matching.
    # astype(str) keeps this working when filenames were parsed as numbers.
    prompt_lookup = dict(zip(
        usable_rows['image_filename'].astype(str).str.lower(),
        usable_rows['text_prompt'],
    ))
    print("Successfully loaded template file and created case-insensitive prompt lookup.")

    # --- 1. Configuration ---
    train_ratio = 0.7
    valid_ratio = 0.2
    allowed_extensions = {'.jpg', '.jpeg', '.png', '.gif'}

    # --- 2. Validate Input and Setup Output Directories ---
    output_path = Path(output_dir)
    train_path = output_path / 'train'
    valid_path = output_path / 'valid'
    test_path = output_path / 'test'

    # Validate the source before touching the filesystem so a wrong path does
    # not leave empty output folders behind.
    base_path = Path(base_dir)
    if not base_path.is_dir():
        print(f"Error: Provided base directory '{base_dir}' does not exist or is not a directory.")
        return

    print(f"Creating output directories at: {output_path.resolve()}")
    for split_path in (train_path, valid_path, test_path):
        split_path.mkdir(parents=True, exist_ok=True)

    # --- 3. Data Collection and Processing ---
    all_files_data = []
    seen_names = set()
    print(f"Scanning source directory: {base_dir}")

    # If the output lives inside base_dir, the split folders must not be
    # mistaken for class folders (e.g. on a second run).
    reserved = {train_path.resolve(), valid_path.resolve(), test_path.resolve()}

    # iterdir() yields entries in arbitrary order; sorting makes the seeded
    # shuffle reproducible across machines and filesystems.
    subfolders = sorted(
        folder for folder in base_path.iterdir()
        if folder.is_dir() and folder.resolve() not in reserved
    )
    print(f"Found {len(subfolders)} class folders. Starting split and copy process...")

    for folder in subfolders:
        class_name = folder.name

        class_files = sorted(
            file for file in folder.iterdir()
            if file.is_file() and file.suffix.lower() in allowed_extensions
        )

        if not class_files:
            print(f"Warning: No images found in folder '{class_name}'. Skipping.")
            continue

        random.shuffle(class_files)

        num_files = len(class_files)
        num_train = math.floor(num_files * train_ratio)
        num_valid = math.floor(num_files * valid_ratio)

        # Whatever is left after train and valid goes to test.
        splits = (
            (class_files[:num_train], train_path, 'train'),
            (class_files[num_train:num_train + num_valid], valid_path, 'valid'),
            (class_files[num_train + num_valid:], test_path, 'test'),
        )
        for split_files, split_path, set_name in splits:
            _copy_split_files(
                split_files, split_path, set_name,
                prompt_lookup, all_files_data, seen_names,
            )

    if not all_files_data:
        print("Error: No image files with matching prompts were found to process. Halting.")
        return

    # --- 4. CSV Creation ---
    df = pd.DataFrame(all_files_data)

    train_df = df[df['set'] == 'train'][['filepath', 'prompt']]
    valid_df = df[df['set'] == 'valid'][['filepath', 'prompt']]
    test_df = df[df['set'] == 'test'][['filepath', 'prompt']]

    train_csv_path = output_path / 'train.csv'
    valid_csv_path = output_path / 'valid.csv'
    test_csv_path = output_path / 'test.csv'

    train_df.to_csv(train_csv_path, index=False)
    valid_df.to_csv(valid_csv_path, index=False)
    test_df.to_csv(test_csv_path, index=False)

    print("\n--- Split Summary ---")
    print(f"Total images processed: {len(df)}")
    print(f"Training images: {len(train_df)}")
    print(f"Validation images: {len(valid_df)}")
    print(f"Testing images: {len(test_df)}")
    print("--------------------")
    print(f"Successfully created new dataset and CSV files at: {output_path.resolve()}")
    print(f" - {train_csv_path.name}, {valid_csv_path.name}, {test_csv_path.name}")
    print(" - Folders: train/, valid/, test/")


if __name__ == '__main__':
    # --- IMPORTANT: adjust the three paths below before running ---

    # Original directory that contains the image subfolders.
    IMAGE_DIRECTORY = "C:/Users/yehte/Downloads/Ye Htet/Projects/TikTok/Annotation/fine-tune/crop-faces"

    # Destination for the new dataset structure and the CSV files.
    OUTPUT_DIRECTORY = "C:/Users/yehte/Downloads/Ye Htet/Projects/TikTok/Annotation/fine-tune"

    # Template CSV providing the image_filename and text_prompt columns.
    TEMPLATE_FILE = 'C:/Users/yehte/Downloads/Ye Htet/Projects/TikTok/Annotation/Annotation_Template_Fine_Tune_Consistent_2025-06-30.csv'

    # Kick off the split, copy and CSV export.
    split_and_copy_dataset(
        base_dir=IMAGE_DIRECTORY,
        output_dir=OUTPUT_DIRECTORY,
        template_file=TEMPLATE_FILE,
    )


Successfully loaded template file and created case-insensitive prompt lookup.
Creating output directories at: C:\Users\yehte\Downloads\Ye Htet\Projects\TikTok\Annotation\fine-tune
Scanning source directory: C:/Users/yehte/Downloads/Ye Htet/Projects/TikTok/Annotation/fine-tune/crop-faces
Found 20 class folders. Starting split and copy process...

--- Split Summary ---
Total images processed: 1000
Training images: 700
Validation images: 200
Testing images: 100
--------------------
Successfully created new dataset and CSV files at: C:\Users\yehte\Downloads\Ye Htet\Projects\TikTok\Annotation\fine-tune
 - train.csv, valid.csv, test.csv
 - Folders: train/, valid/, test/
