# Move data files to directories based on splits from CSV

Note: This must be run from the directory where the CheXpert images are saved.
* First copy over the split CSVs (`project_train.csv`, `project_test.csv`, etc.) created in part 1.

In [3]:
import os
import shutil
import pandas as pd
from pathlib import Path

from tqdm import tqdm

# ------------------------------
# USER SETTINGS
# ------------------------------
# Base folder that the "Path" values in the split CSVs are joined onto.
IMAGE_ROOT = "."
# Folder that receives one sub-directory per split.
OUTPUT_ROOT = "CheXpert_reduced_dataset_split_v2"
# Split name -> CSV file listing that split's images.
# To also process a validation split, add: val="project_val.csv"
CSV_SPLITS = dict(
    train="project_train.csv",
    test="project_test.csv",
)
# ------------------------------

def ensure_dir(path):
    """Create directory *path* (and any missing parents); no error if it already exists."""
    os.makedirs(name=path, exist_ok=True)

def copy_images(df, split_name, output_csv_name):
    """
    Copy images listed in df into OUTPUT_ROOT/split_name/ and
    create a new CSV with updated filenames.

    Each image is renamed to the last three components of its original
    path joined with underscores, so files end up flat inside the split
    directory.

    Args:
        df: DataFrame with a "Path" column; each value is joined onto
            IMAGE_ROOT to locate the source image. All other columns are
            carried over unchanged.
        split_name: Sub-directory of OUTPUT_ROOT to copy the images into.
        output_csv_name: Destination of the CSV with the rewritten "Path"
            column (written without the index).

    Returns:
        A new DataFrame with a fresh 0..n-1 index, identical to df except
        that "Path" holds the new flattened filename. Rows whose source
        file is missing are still included; a warning is printed for them.

    Raises:
        KeyError: if df has no "Path" column.
    """
    split_dir = os.path.join(OUTPUT_ROOT, split_name)
    ensure_dir(split_dir)

    # Only the "Path" column is needed while copying; iterating it directly
    # avoids building a Series per row with iterrows().
    new_filenames = []
    for rel_path in tqdm(df["Path"], total=len(df), desc=f"Copying {split_name} images"):
        src = os.path.join(IMAGE_ROOT, rel_path)
        # Flatten the trailing three path components into a single filename
        new_filename = "_".join(Path(rel_path).parts[-3:])
        dst = os.path.join(split_dir, new_filename)

        if os.path.exists(src):
            # Copy the original file (with metadata) to the new destination
            shutil.copy2(src, dst)
        else:
            print(f"[WARNING] Missing file: {src}")

        new_filenames.append(new_filename)

    # Build the result from df itself rather than from a list of row dicts:
    # this keeps column names and dtypes intact, and an empty df still yields
    # a CSV with a proper header row. reset_index returns a new frame, so the
    # caller's df is not modified.
    new_df = df.reset_index(drop=True)
    new_df["Path"] = new_filenames
    new_df.to_csv(output_csv_name, index=False)
    print(f"[INFO] Saved updated CSV to {output_csv_name}")
    return new_df

        

def main():
    """Copy the images of every split in CSV_SPLITS and write its updated CSV."""
    # The output root must exist before any split directory is created in it
    ensure_dir(OUTPUT_ROOT)

    for split_name in CSV_SPLITS:
        print(f"\n=== Processing {split_name} ===")

        split_df = pd.read_csv(CSV_SPLITS[split_name])

        print(f"Copying {len(split_df)} images...")
        updated_csv = f"final_project_updated_names_{split_name}.csv"
        copy_images(split_df, split_name, updated_csv)

    print("\nDone! Images copied into train/test directories.")

if __name__ == "__main__":
    main()



=== Processing train ===
Copying 49500 images...


Copying train images: 100%|██████████| 49500/49500 [01:22<00:00, 598.06it/s]


[INFO] Saved updated CSV to final_project_updated_names_train.csv

=== Processing test ===
Copying 5500 images...


Copying test images: 100%|██████████| 5500/5500 [00:07<00:00, 720.28it/s]

[INFO] Saved updated CSV to final_project_updated_names_test.csv

Done! Images copied into train/test directories.



