In [34]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [35]:
from pathlib import Path
import shutil

import pandas as pd
from sklearn.model_selection import train_test_split

from mra_midas_skin_cancer_ml.utils.process_metadata import (
    create_lesion_key,
    import_metadata,
    process_target,
    sort_metadata,
    get_data_dir,
    export_metadata,
)

from mra_midas_skin_cancer_ml.utils.validate_data import (
    check_split_ratios,
    count_files_in_image_folders,
)

In [36]:
def process_metadata_for_img():
    """Build one de-duplicated metadata frame per image distance.

    The metadata is loaded with ``import_metadata`` and passed through
    ``process_target``; rows whose ``midas_path_binary`` equals ``"missing"``
    are dropped. For each of the distances ``"1ft"``, ``"6in"`` and
    ``"dscope"`` the matching rows are sorted, given a ``lesion_key``, reduced
    to the columns needed for image splitting, and de-duplicated so that each
    ``lesion_key`` appears once.

    Returns:
        dict: distance label -> DataFrame with the columns ``lesion_key``,
        ``midas_record_id``, ``midas_file_name`` and ``midas_path_binary``.

    Side effects:
        Prints one ``"<dist> Unique: <bool>"`` line per distance.
    """

    keep_columns = [
        "lesion_key",
        "midas_record_id",
        "midas_file_name",
        "midas_path_binary",
    ]

    labelled_df = process_target(import_metadata())
    labelled_df = labelled_df[labelled_df["midas_path_binary"] != "missing"]

    frames_by_distance = {}

    for dist in ("1ft", "6in", "dscope"):
        at_distance = labelled_df[labelled_df["midas_distance"] == dist]

        # sort_metadata orders rows by patient_id (asc), lesion (asc) and
        # control (desc) according to the original author's note; the key is
        # added afterwards so that order is what drop_duplicates sees.
        keyed = create_lesion_key(sort_metadata(at_distance))

        # keep="last" retains the final row of each lesion in that order,
        # intended to be the non-control record.
        unique_lesions = keyed[keep_columns].drop_duplicates(
            subset="lesion_key", keep="last"
        )

        print(f"{dist} Unique: {unique_lesions['lesion_key'].is_unique}")

        frames_by_distance[dist] = unique_lesions

    return frames_by_distance


# Build the per-distance, lesion-de-duplicated metadata frames.
dist_dict = process_metadata_for_img()

1ft Unique: True
6in Unique: True
dscope Unique: True


In [37]:
def train_test_split_by_lesion(dist_dict, test_size=0.2, random_state=42):
    """Assign every row of each distance frame to a train/val/test split.

    Each frame is split twice with ``train_test_split``, stratified on
    ``midas_path_binary`` both times: first into train and a holdout of
    fraction ``test_size``, then the holdout is halved into val and test.
    ``test_size`` is therefore the combined val + test fraction.

    Args:
        dist_dict: Mapping of distance label -> DataFrame containing a
            ``midas_path_binary`` column.
        test_size: Fraction of rows held out of train (val and test together).
        random_state: Seed passed to both ``train_test_split`` calls.

    Returns:
        dict: distance label -> DataFrame with the original columns plus a
        ``split`` column (``"train"``, ``"val"`` or ``"test"``), rows ordered
        train, val, test with a fresh integer index.

    Side effects:
        Exports each result to ``<data_dir>/output/<dist>_split_image_data.xlsx``
        via ``export_metadata`` and reports the ratios via
        ``check_split_ratios``.
    """

    target_col = "midas_path_binary"
    output_dir = get_data_dir() / "output"
    result_dict = {}

    for dist, subset_df in dist_dict.items():
        features = subset_df.drop(columns=[target_col])
        target = subset_df[target_col]

        # Stage 1: carve the holdout (val + test) off the training rows.
        train_X, holdout_X, train_y, holdout_y = train_test_split(
            features,
            target,
            test_size=test_size,
            random_state=random_state,
            stratify=target,
        )

        # Stage 2: halve the holdout into validation and test.
        val_X, test_X, val_y, test_y = train_test_split(
            holdout_X,
            holdout_y,
            test_size=0.5,
            random_state=random_state,
            stratify=holdout_y,
        )

        # Re-attach the target and tag each part with its split name; assign
        # returns a copy, so the split outputs themselves are left untouched.
        tagged_parts = [
            part_X.assign(**{target_col: part_y, "split": split_name})
            for split_name, part_X, part_y in (
                ("train", train_X, train_y),
                ("val", val_X, val_y),
                ("test", test_X, test_y),
            )
        ]

        combined = pd.concat(tagged_parts, ignore_index=True)
        result_dict[dist] = combined

        export_metadata(combined, output_dir / f"{dist}_split_image_data.xlsx")

    check_split_ratios(result_dict)

    return result_dict


# Split every distance frame using the default holdout fraction (0.2) and seed (42).
result_dict = train_test_split_by_lesion(dist_dict)


1ft
Split raw counts:
split
train    820
val      103
test     103
Name: count, dtype: int64

Split proportions:
split
train    0.79922
val      0.10039
test     0.10039
Name: proportion, dtype: float64

Target proportions by split:
split  midas_path_binary
test   benign               0.514563
       malignant            0.485437
train  benign               0.507317
       malignant            0.492683
val    benign               0.504854
       malignant            0.495146
Name: proportion, dtype: float64

6in
Split raw counts:
split
train    825
test     104
val      103
Name: count, dtype: int64

Split proportions:
split
train    0.799419
test     0.100775
val      0.099806
Name: proportion, dtype: float64

Target proportions by split:
split  midas_path_binary
test   benign               0.509615
       malignant            0.490385
train  benign               0.505455
       malignant            0.494545
val    benign               0.504854
       malignant            0.495146
Na

In [None]:
def create_image_folders(result_dict, raw_images_dir, output_root_dir):
    """Copy each row's image(s) into ``<output_root_dir>/<dist>/<split>/<label>/``.

    A raw image belongs to a row when its file stem (name without the final
    extension) equals the stem of the row's ``midas_file_name``; the extension
    itself is ignored, so ``foo.jpg`` in the metadata still finds ``foo.jpeg``
    on disk. Matching the whole stem, rather than a name prefix, keeps
    ``img_1`` from also picking up ``img_10.jpg`` and copying another
    lesion's image into the wrong split/label folder.

    Args:
        result_dict: Mapping of distance label -> DataFrame with the columns
            ``midas_file_name``, ``midas_path_binary`` and ``split``.
        raw_images_dir: Directory holding the source images (str or Path).
        output_root_dir: Root under which the folder tree is created
            (str or Path).

    Side effects:
        Creates ``train``/``val``/``test`` x ``benign``/``malignant`` folders
        for every distance, copies the matched files with ``shutil.copy2``,
        prints a line for every row with no matching file, and finally calls
        ``count_files_in_image_folders`` on ``output_root_dir``.
    """

    raw_images_dir = Path(raw_images_dir)
    output_root_dir = Path(output_root_dir)

    # Index the raw images once by stem instead of globbing per row: exact
    # stem lookup avoids prefix collisions, is immune to glob metacharacters
    # in file names, and scans the directory a single time. A missing
    # directory yields an empty index, so every row is reported as not found.
    files_by_stem = {}
    if raw_images_dir.is_dir():
        for candidate in sorted(raw_images_dir.iterdir()):
            if candidate.is_file():
                files_by_stem.setdefault(candidate.stem, []).append(candidate)

    for dist, df in result_dict.items():
        dist_dir = output_root_dir / dist

        # Create folder structure
        for split in ["train", "val", "test"]:
            for label in ["benign", "malignant"]:
                (dist_dir / split / label).mkdir(parents=True, exist_ok=True)

        # Copy images
        for row in df.itertuples(index=False):
            base_name = Path(row.midas_file_name).stem  # remove extension

            matched_files = files_by_stem.get(base_name, [])

            if not matched_files:
                print(f"No files found starting with: {base_name}")
                continue

            dst_dir = dist_dir / row.split / row.midas_path_binary
            for src_path in matched_files:
                shutil.copy2(src_path, dst_dir / src_path.name)

    # Validation: count files
    count_files_in_image_folders(output_root_dir)

# Copy the raw images from <data_dir>/input/raw_images into the
# <data_dir>/output/split_images/<dist>/<split>/<label>/ folder tree.
create_image_folders(
    result_dict, 
    get_data_dir() / "input" / "raw_images", 
    get_data_dir() / "output" / "split_images"
)