In [1]:
%load_ext autoreload
%autoreload 2

# Import Libraries

In [15]:
from pathlib import Path
import shutil

import pandas as pd
from sklearn.model_selection import train_test_split

from mra_midas_skin_cancer_ml.utils.process_metadata import (
    create_lesion_key,
    dedupe_metadata,
    drop_na_target_img,
    import_metadata,
    process_target,
    get_data_dir,
    export_metadata,
)

from mra_midas_skin_cancer_ml.utils.validate_data import (
    check_split_ratios,
    count_files_in_image_folders,
)

# Split Data Based On Image Distance

In [12]:
def process_metadata_for_img():
    """Return image metadata limited to key pairs kept by deduplication.

    The metadata is passed through ``import_metadata`` ->
    ``process_target`` -> ``drop_na_target_img``. That frame is then
    handed to ``dedupe_metadata`` and to ``create_lesion_key``, and the
    keyed frame is inner-joined against the deduplicated one on
    ``lesion_key`` and ``midas_path_binary``.
    """
    join_keys = ["lesion_key", "midas_path_binary"]

    cleaned_df = drop_na_target_img(process_target(import_metadata()))
    deduped_df = dedupe_metadata(cleaned_df)
    keyed_df = create_lesion_key(cleaned_df)

    # Inner join: only rows whose key pair also exists in the deduplicated
    # frame survive.
    return keyed_df.merge(deduped_df[join_keys], on=join_keys, how="inner")


def split_metadata_dist(meta_df=None, distances=("1ft", "6in", "dscope")):
    """Split image metadata into one frame per image distance.

    Parameters
    ----------
    meta_df : pandas.DataFrame, optional
        Metadata holding a ``midas_distance`` column plus the columns kept
        below. When omitted, it is built with ``process_metadata_for_img()``.
    distances : iterable of str, optional
        ``midas_distance`` values to extract, in output order. Defaults to
        ``("1ft", "6in", "dscope")``.

    Returns
    -------
    dict
        Maps each distance to a frame with one row per
        (``lesion_key``, ``midas_path_binary``) pair, restricted to the
        kept columns.
    """
    if meta_df is None:
        meta_df = process_metadata_for_img()

    cols = [
        "lesion_key",
        "midas_record_id",
        "midas_file_name",
        "matched_file",
        "midas_path_binary",
    ]

    dist_dict = {}
    total_unique_lesions = set()

    for dist in distances:
        subset_df = meta_df.loc[meta_df["midas_distance"] == dist, cols]

        # Keep a single image per lesion/target pair (first occurrence wins).
        subset_df = subset_df.drop_duplicates(
            subset=["lesion_key", "midas_path_binary"]
        )

        lesion_keys = subset_df["lesion_key"]
        print(f"{dist} is unique: {lesion_keys.is_unique}")
        unique_count = lesion_keys.nunique()
        print(f"{dist} unique lesions: {unique_count} \n")

        # Track the union of lesions seen across all distances.
        total_unique_lesions.update(lesion_keys.tolist())

        dist_dict[dist] = subset_df

    print(f"Total unique lesions: {len(total_unique_lesions)}\n")

    return dist_dict


# Build the per-distance metadata frames (a dict keyed by distance label).
dist_dict = split_metadata_dist()

Total missing files: 12
No file found: s-prd-502892079.jpg
No file found: s-prd-539536718.jpg
No file found: s-prd-539536620.jpg
No file found: s-prd-709811242.jpeg
No file found: s-prd-656881902.jpg
No file found: s-prd-656882615.jpg
No file found: s-prd-656882465.jpg
No file found: s-prd-675941199.jpeg
No file found: s-prd-692721767.jpeg
No file found: s-prd-722591153.jpeg
No file found: s-prd-722591152.jpeg
No file found: s-prd-798621909.jpg

Is unique: True
Unique count: 1035 

1ft is unique: True
1ft unique lesions: 1021 

6in is unique: True
6in unique lesions: 1028 

dscope is unique: True
dscope unique lesions: 1028 

Total unique lesions: 1035



# Split Each Image Set Into Train/Val/Test

In [None]:
def split_train_test_by_lesion(dist_dict, test_size=0.2, random_state=42):
    """
    Split dataframes for each image distance into train/val/test sets by lesion_key.

    The lesion keys from every distance are pooled and split once, so a
    given lesion receives the same split label in every distance frame.

    Parameters
    ----------
    dist_dict : dict
        Maps a distance label to a DataFrame containing a ``lesion_key``
        column.
    test_size : float, default 0.2
        Fraction of lesion keys held out of training. The held-out keys
        are then divided in half between validation and test.
    random_state : int, default 42
        Seed passed to both ``train_test_split`` calls.

    Returns
    -------
    dict
        Maps each distance to a copy of its frame with an added ``split``
        column. Each frame is also exported to
        ``<data_dir>/output/<dist>_split_image_data.xlsx``.

    Notes
    -----
    A later cell in this notebook defines another function with the same
    name; whichever cell is executed last is the one in effect.
    """
    # Pool the unique lesion keys across every distance.
    all_keys = pd.concat(
        [df["lesion_key"] for df in dist_dict.values()]
    ).unique()

    # First carve off the held-out keys, then halve them into val and test.
    train_keys, val_test_keys = train_test_split(
        all_keys, test_size=test_size, random_state=random_state
    )
    val_keys, _ = train_test_split(
        val_test_keys, test_size=0.5, random_state=random_state
    )

    # Hash lookup instead of scanning a NumPy array for every row.
    split_by_key = dict.fromkeys(train_keys, "train")
    split_by_key.update(dict.fromkeys(val_keys, "val"))

    def assign_split(lesion_key):
        # Anything that is neither train nor val is labelled test.
        return split_by_key.get(lesion_key, "test")

    result_dict = {}
    data_dir = get_data_dir()

    for dist, subset_df in dist_dict.items():
        df = subset_df.copy()
        df["split"] = df["lesion_key"].apply(assign_split)
        result_dict[dist] = df

        export_metadata(
            df, data_dir / "output" / f"{dist}_split_image_data.xlsx"
        )

    check_split_ratios(result_dict)
    return result_dict


# Apply the lesion-key split to every distance frame; each result gains a "split" column.
result_dict = split_train_test_by_lesion(dist_dict)


1ft
Split raw counts
split
test     815
val      104
train    102
Name: count, dtype: int64
total 1021

Split proportions:
split
test     0.798237
val      0.101861
train    0.099902
Name: proportion, dtype: float64

Target proportions by split:
split  midas_path_binary
test   benign               0.512883
       malignant            0.487117
train  malignant            0.529412
       benign               0.470588
val    benign               0.500000
       malignant            0.500000
Name: proportion, dtype: float64

6in
Split raw counts
split
test     821
val      104
train    103
Name: count, dtype: int64
total 1028

Split proportions:
split
test     0.798638
val      0.101167
train    0.100195
Name: proportion, dtype: float64

Target proportions by split:
split  midas_path_binary
test   benign               0.511571
       malignant            0.488429
train  malignant            0.524272
       benign               0.475728
val    benign               0.500000
       malignant

In [4]:
def split_train_test_by_lesion(dist_dict, test_size=0.2, random_state=42):
    """Stratified train/val/test split, performed separately per distance.

    For each frame in ``dist_dict`` the rows are split on
    ``midas_path_binary``: ``test_size`` of them are held out, and the
    held-out rows are halved into validation and test. The three parts are
    re-assembled into one frame with a ``split`` column, exported to
    ``<data_dir>/output/<dist>_split_image_data.xlsx``, and returned in a
    dict keyed by distance.
    """
    target_col = "midas_path_binary"
    output_dir = get_data_dir() / "output"
    result_dict = {}

    for dist, subset_df in dist_dict.items():
        features = subset_df.drop(columns=[target_col])
        target = subset_df[target_col]

        feat_train, feat_holdout, tgt_train, tgt_holdout = train_test_split(
            features,
            target,
            test_size=test_size,
            random_state=random_state,
            stratify=target,
        )
        feat_val, feat_test, tgt_val, tgt_test = train_test_split(
            feat_holdout,
            tgt_holdout,
            test_size=0.5,
            random_state=random_state,
            stratify=tgt_holdout,
        )

        # Re-attach the target and tag each part with its split name.
        labelled_parts = [
            part.assign(**{target_col: part_target, "split": split_name})
            for split_name, part, part_target in (
                ("train", feat_train, tgt_train),
                ("val", feat_val, tgt_val),
                ("test", feat_test, tgt_test),
            )
        ]

        combined_df = pd.concat(labelled_parts, ignore_index=True)
        result_dict[dist] = combined_df

        export_metadata(
            combined_df,
            output_dir / f"{dist}_split_image_data.xlsx",
        )

    check_split_ratios(result_dict)

    return result_dict


# Run the per-distance stratified split; each resulting frame carries a "split" column.
result_dict = split_train_test_by_lesion(dist_dict)


1ft
Split raw counts:
split
train    816
test     103
val      102
Name: count, dtype: int64

Split proportions:
split
train    0.799216
test     0.100881
val      0.099902
Name: proportion, dtype: float64

Target proportions by split:
split  midas_path_binary
test   benign               0.504854
       malignant            0.495146
train  benign               0.507353
       malignant            0.492647
val    benign               0.509804
       malignant            0.490196
Name: proportion, dtype: float64

6in
Split raw counts:
split
train    822
val      103
test     103
Name: count, dtype: int64

Split proportions:
split
train    0.799611
val      0.100195
test     0.100195
Name: proportion, dtype: float64

Target proportions by split:
split  midas_path_binary
test   benign               0.504854
       malignant            0.495146
train  benign               0.507299
       malignant            0.492701
val    benign               0.504854
       malignant            0.495146

In [None]:
def create_image_folders(result_dict, raw_images_dir, output_root_dir):
    """
    Copy raw images into ``<output_root_dir>/<dist>/<split>/<label>/``.

    File names from the metadata are matched case-insensitively against the
    files in ``raw_images_dir``, trying ``.jpg``, ``.jpeg``, ``_cropped.jpg``
    and ``_cropped.jpeg`` in that order. Rows with no match are reported
    and skipped. Finishes by calling ``count_files_in_image_folders``.
    """
    source_dir = Path(raw_images_dir)
    dest_root = Path(output_root_dir)

    # Lower-cased file name -> real path, so lookups ignore case.
    files_by_lower_name = {}
    for entry in source_dir.iterdir():
        if entry.is_file():
            files_by_lower_name[entry.name.lower()] = entry

    # Tried in order; the first suffix that matches wins.
    suffixes = (".jpg", ".jpeg", "_cropped.jpg", "_cropped.jpeg")

    for dist, df in result_dict.items():
        dist_dir = dest_root / dist

        for split_name in ("train", "val", "test"):
            for label_name in ("benign", "malignant"):
                target_dir = dist_dir / split_name / label_name
                target_dir.mkdir(parents=True, exist_ok=True)

        for row in df.itertuples(index=False):
            lowered_name = row.midas_file_name.lower()
            label = row.midas_path_binary
            split = row.split

            # Base name without its extension.
            stem = Path(lowered_name).stem

            source_path = next(
                (
                    files_by_lower_name[stem + suffix]
                    for suffix in suffixes
                    if stem + suffix in files_by_lower_name
                ),
                None,
            )

            if source_path is None:
                print(f"No file found for: {row.midas_file_name}")
                continue

            shutil.copy2(
                source_path, dist_dir / split / label / source_path.name
            )

    count_files_in_image_folders(dest_root)


# Copy the raw images into output/split_images/<dist>/<split>/<label>/ following result_dict.
create_image_folders(
    result_dict,
    get_data_dir() / "input" / "raw_images",
    get_data_dir() / "output" / "split_images",
)


=== 1ft ===
train / benign   : 414
train / malignant: 402
val   / benign   : 52
val   / malignant: 50
test  / benign   : 52
test  / malignant: 51

=== 6in ===
train / benign   : 417
train / malignant: 405
val   / benign   : 52
val   / malignant: 51
test  / benign   : 52
test  / malignant: 51

=== dscope ===
train / benign   : 415
train / malignant: 407
val   / benign   : 52
val   / malignant: 51
test  / benign   : 52
test  / malignant: 51
