# Move data files to directories based on splits from CSV

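Note: Run this notebook from the directory where the CheXpert images are saved; the image paths in the CSVs are resolved relative to it (`IMAGE_ROOT = "."`).
* First copy over the split CSVs created in part 1 (e.g. `0_project_train.csv`, `1_project_train_moco.csv`, etc.).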

In [1]:
import os
import shutil
import pandas as pd
from pathlib import Path

from tqdm import tqdm

# ------------------------------
# USER SETTINGS
# ------------------------------
# Base folder that the relative image paths in the CSVs are joined onto.
IMAGE_ROOT = "."
# Folder that receives one sub-directory per split.
OUTPUT_ROOT = "CheXpert_reduced_dataset_split_v3"

# Split name -> CSV file listing the images that belong to that split.
CSV_SPLITS = dict(
    train="1_project_train_moco.csv",
    test="1_project_test_moco.csv",
)
# ------------------------------

def ensure_dir(path):
    """Create directory `path` (and any missing parents); no error if it already exists."""
    os.makedirs(name=path, exist_ok=True)

def copy_images(df, split_name, output_csv_name, cp_images=True, OUTPUT_ROOT=OUTPUT_ROOT, IMAGE_ROOT=IMAGE_ROOT):
    """
    Copy the images listed in df into OUTPUT_ROOT/split_name/ and write a new
    CSV whose "Path" column holds the flattened filenames.

    Each destination filename is the last three components of the original
    "Path" value joined with underscores, so that images coming from different
    sub-directories can live side by side in one flat split directory.

    Args:
        df: DataFrame with a "Path" column of image paths relative to IMAGE_ROOT.
        split_name: Name of the sub-directory created under OUTPUT_ROOT.
        output_csv_name: File the updated CSV is written to.
        cp_images: If False, no image files are copied; the split directory
            and the updated CSV are still created.
        OUTPUT_ROOT: Directory that holds the per-split sub-directories.
        IMAGE_ROOT: Directory the "Path" values are joined onto.

    Returns:
        A new DataFrame with the same columns as df, a fresh 0..n-1 index and
        "Path" replaced by the new filenames. df itself is not modified.
        Rows whose source file is missing are still kept (a warning is printed).

    Raises:
        KeyError: If df has no "Path" column.
    """
    split_dir = os.path.join(OUTPUT_ROOT, split_name)
    ensure_dir(split_dir)

    new_filenames = []
    first_source = {}  # new filename -> first source path that produced it

    for rel_path in tqdm(df["Path"], total=len(df), desc=f"Copying {split_name} images"):
        # To create the destination, need to rename the file
        new_filename = "_".join(Path(rel_path).parts[-3:])
        new_filenames.append(new_filename)

        # Flattening can map two different sources onto one name; the later
        # copy would silently overwrite the earlier one, so flag it.
        previous = first_source.setdefault(new_filename, rel_path)
        if previous != rel_path:
            print(f"[WARNING] Filename collision: {rel_path} and {previous} both map to {new_filename}")

        if not cp_images:
            continue

        src = os.path.join(IMAGE_ROOT, rel_path)
        if os.path.exists(src):
            # Copy the original file to the new destination
            shutil.copy2(src, os.path.join(split_dir, new_filename))
        else:
            print(f"[WARNING] Missing file: {src}")

    # Replace the column on a re-indexed copy instead of rebuilding the frame
    # from per-row dicts: this keeps the header and column dtypes intact even
    # when df is empty (a frame built from an empty list has no columns).
    new_df = df.reset_index(drop=True)
    new_df["Path"] = new_filenames

    new_df.to_csv(output_csv_name, index=False)
    print(f"[INFO] Saved updated CSV to {output_csv_name}")
    return new_df

        

def create_split_dirs(pre_fn, post_fn, cp_images=True, OUTPUT_ROOT=OUTPUT_ROOT, CSV_SPLITS=CSV_SPLITS):
    """
    Process every split in CSV_SPLITS: read its CSV, hand it to copy_images,
    and write a renamed CSV for it.

    The updated CSV for each split is written to
    "{pre_fn}final_project_updated_names_{split_name}_{post_fn}.csv".

    Args:
        pre_fn: Prefix for the output CSV filenames.
        post_fn: Suffix (before ".csv") for the output CSV filenames.
        cp_images: Forwarded to copy_images; if False no image files are copied.
        OUTPUT_ROOT: Directory that receives one sub-directory per split.
        CSV_SPLITS: Mapping of split name -> path of the CSV listing its images.

    Note:
        The image source root is the module-level IMAGE_ROOT, read at call time.
    """
    # Make the output root directory
    ensure_dir(OUTPUT_ROOT)

    for split_name, csv_file in CSV_SPLITS.items():
        print(f"\n=== Processing {split_name} ===")

        df = pd.read_csv(csv_file)

        print(f"Copying {len(df)} images...")
        output_csv_name = f"{pre_fn}final_project_updated_names_{split_name}_{post_fn}.csv"
        copy_images(df, split_name, output_csv_name, cp_images=cp_images, OUTPUT_ROOT=OUTPUT_ROOT, IMAGE_ROOT=IMAGE_ROOT)

    if cp_images:
        # Name the splits that were actually processed (e.g. train/val/test)
        # rather than assuming a fixed train/test pair.
        split_list = "/".join(CSV_SPLITS)
        print(f"\nDone! Images copied into {split_list} directories.")

# Invoke for split 1 (uses the default OUTPUT_ROOT / CSV_SPLITS above).
# cp_images=False: no image files are copied, only the renamed CSVs are written.
create_split_dirs(pre_fn="1_", post_fn="moco", cp_images=False)


=== Processing train ===
Copying 49500 images...


Copying train images: 100%|██████████| 49500/49500 [00:03<00:00, 16350.86it/s]


[INFO] Saved updated CSV to 1_final_project_updated_names_train_moco.csv

=== Processing test ===
Copying 5500 images...


Copying test images: 100%|██████████| 5500/5500 [00:00<00:00, 16714.98it/s]

[INFO] Saved updated CSV to 1_final_project_updated_names_test_moco.csv





In [2]:
# Invoke for split 2 (the `linear_cls` CSVs), reusing the current OUTPUT_ROOT.
# cp_images=False: no image files are copied, only the renamed CSVs are written.
CSV_SPLITS = dict(
    train="2_project_train_linear_cls.csv",
    test="2_project_test_linear_cls.csv",
)

create_split_dirs(
    pre_fn="2_",
    post_fn="linear",
    cp_images=False,
    OUTPUT_ROOT=OUTPUT_ROOT,
    CSV_SPLITS=CSV_SPLITS,
)


=== Processing train ===
Copying 16642 images...


Copying train images: 100%|██████████| 16642/16642 [00:00<00:00, 16824.25it/s]


[INFO] Saved updated CSV to 2_final_project_updated_names_train_linear.csv

=== Processing test ===
Copying 3405 images...


Copying test images: 100%|██████████| 3405/3405 [00:00<00:00, 16001.57it/s]

[INFO] Saved updated CSV to 2_final_project_updated_names_test_linear.csv





---

In [3]:
# ------------------------------
# USER SETTINGS
# ------------------------------
# Folder that receives the train/val/test sub-directories for this run.
OUTPUT_ROOT = "CheXpert_reduced_dataset_split_multiclass"

# Split name -> CSV file listing the images that belong to that split.
CSV_SPLITS = dict(
    train="3_project_train_transfer.csv",
    val="3_project_val_transfer.csv",
    test="3_project_test_transfer.csv",
)
# ------------------------------

# cp_images=False: no image files are copied, only the renamed CSVs are written.
create_split_dirs(
    pre_fn="3_",
    post_fn="transfer",
    cp_images=False,
    OUTPUT_ROOT=OUTPUT_ROOT,
    CSV_SPLITS=CSV_SPLITS,
)


=== Processing train ===
Copying 17527 images...


Copying train images: 100%|██████████| 17527/17527 [00:01<00:00, 15132.71it/s]


[INFO] Saved updated CSV to 3_final_project_updated_names_train_transfer.csv

=== Processing val ===
Copying 2504 images...


Copying val images: 100%|██████████| 2504/2504 [00:00<00:00, 16533.15it/s]


[INFO] Saved updated CSV to 3_final_project_updated_names_val_transfer.csv

=== Processing test ===
Copying 5008 images...


Copying test images: 100%|██████████| 5008/5008 [00:00<00:00, 15329.55it/s]

[INFO] Saved updated CSV to 3_final_project_updated_names_test_transfer.csv





In [8]:
# ------------------------------
# USER SETTINGS
# ------------------------------
# Folder that receives the train/val/test sub-directories for this run.
OUTPUT_ROOT = "CheXpert_reduced_dataset_split_transfer_binary"

# Split name -> CSV file listing the images that belong to that split.
CSV_SPLITS = dict(
    train="0_project_train.csv",
    val="0_project_val.csv",
    test="0_project_test.csv",
)
# ------------------------------

# cp_images=True: the image files are actually copied into the split directories.
create_split_dirs(
    pre_fn="0_",
    post_fn="transfer_binary",
    cp_images=True,
    OUTPUT_ROOT=OUTPUT_ROOT,
    CSV_SPLITS=CSV_SPLITS,
)


=== Processing train ===
Copying 7732 images...


Copying train images:   0%|          | 0/7732 [00:00<?, ?it/s]

Copying train images: 100%|██████████| 7732/7732 [00:11<00:00, 669.16it/s]


[INFO] Saved updated CSV to 0_final_project_updated_names_train_transfer_binary.csv

=== Processing val ===
Copying 1105 images...


Copying val images: 100%|██████████| 1105/1105 [00:01<00:00, 775.84it/s]


[INFO] Saved updated CSV to 0_final_project_updated_names_val_transfer_binary.csv

=== Processing test ===
Copying 2210 images...


Copying test images: 100%|██████████| 2210/2210 [00:04<00:00, 521.87it/s]

[INFO] Saved updated CSV to 0_final_project_updated_names_test_transfer_binary.csv

Done! Images copied into train/test directories.



