In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
import glob

In [2]:
# Paths y nombres
DATA_PATH = "winner_data/"
METADATA_PATH = os.path.join(DATA_PATH, "metadata/")

IMAGES_PATH = os.path.join(DATA_PATH, "images/")
SYNTHETIC_DATA_PATH = os.path.join("winner_data/")

ORIGINAL_TRAIN_METADATA_CSV_PATH = os.path.join(METADATA_PATH, "train-metadata.csv") # DF original con ISIC_ID y MALIGNANT
FOLDS_CSV_PATH = os.path.join(METADATA_PATH, "folds.csv") # DF con ISIC_ID original y FOLD para entrenamiento

ORIGINAL_IMAGES_PATH = os.path.join(IMAGES_PATH, "original/")
SYNTHETIC_IMAGES_PATH = os.path.join(IMAGES_PATH, "synthetic/")

# Imagenes originales y sinteticas con distinta extension
ORIGINAL_IMAGE_EXTENSION = ".jpg"
SYNTHETIC_IMAGE_EXTENSION = ".png"

# Nombres para guardar csv procesados
OUTPUT_TRAIN_ORIGINAL_DF_FILENAME = "train_original.csv"
OUTPUT_VAL_DF_FILENAME = "val.csv"
OUTPUT_TEST_DF_FILENAME = "test.csv"
OUTPUT_SYNTHETIC_DF_FILENAME = "synthetic.csv"

OUTPUT_TRAIN_ORIGINAL_DF_PATH = os.path.join(METADATA_PATH, OUTPUT_TRAIN_ORIGINAL_DF_FILENAME)
OUTPUT_VAL_DF_PATH = os.path.join(METADATA_PATH, OUTPUT_VAL_DF_FILENAME)
OUTPUT_TEST_DF_PATH = os.path.join(METADATA_PATH, OUTPUT_TEST_DF_FILENAME)
OUTPUT_SYNTHETIC_DF_PATH = os.path.join(METADATA_PATH, OUTPUT_SYNTHETIC_DF_FILENAME)

In [3]:
print(OUTPUT_TRAIN_ORIGINAL_DF_PATH, OUTPUT_VAL_DF_PATH, OUTPUT_TEST_DF_PATH)

winner_data/metadata/train_original.csv winner_data/metadata/val.csv winner_data/metadata/test.csv


## 1. Procesamiento datos originales

In [4]:
# Leer csv con datos originales y target
original_train_metadata_df = pd.read_csv(ORIGINAL_TRAIN_METADATA_CSV_PATH)
original_train_metadata_df = original_train_metadata_df.rename(columns={"malignant": "target"})
original_train_metadata_df["target"] = original_train_metadata_df["target"].astype(int)
original_train_metadata_df = original_train_metadata_df[["isic_id", "target"]]

  original_train_metadata_df = pd.read_csv(ORIGINAL_TRAIN_METADATA_CSV_PATH)


In [56]:
original_train_metadata_df.head(5)

Unnamed: 0,isic_id,target
0,ISIC_0015670,0
1,ISIC_0015845,0
2,ISIC_0015864,0
3,ISIC_0015902,0
4,ISIC_0024200,0


In [12]:
original_train_metadata_df["target"].value_counts(1) * 100

target
0    99.902009
1     0.097991
Name: proportion, dtype: float64

In [10]:
len(original_train_metadata_df)

401059

In [57]:
# Agregar columna image_path. Caso imagenes originales
original_train_metadata_df["image_path"] = original_train_metadata_df["isic_id"].apply(
    lambda x: os.path.join(ORIGINAL_IMAGES_PATH, x + ORIGINAL_IMAGE_EXTENSION)
)

In [58]:
original_train_metadata_df.head(5)

Unnamed: 0,isic_id,target,image_path
0,ISIC_0015670,0,winner_data/images/original/ISIC_0015670.jpg
1,ISIC_0015845,0,winner_data/images/original/ISIC_0015845.jpg
2,ISIC_0015864,0,winner_data/images/original/ISIC_0015864.jpg
3,ISIC_0015902,0,winner_data/images/original/ISIC_0015902.jpg
4,ISIC_0024200,0,winner_data/images/original/ISIC_0024200.jpg


In [59]:
# Definición de tamaño conjunto de entrenamiento, validación y test
TRAIN_TEST_SPLIT_SIZE = 0.8
VAL_SPLIT_SIZE = 0.15
SEED = 2025

In [60]:
# Crear datasets/ids de entrenamiento, validación y test
temp_df, test_df = train_test_split(
    original_train_metadata_df,
    train_size=TRAIN_TEST_SPLIT_SIZE,
    random_state=SEED,
    stratify=original_train_metadata_df["target"]
)

train_df, val_df = train_test_split(
    temp_df,
    train_size=(1 - VAL_SPLIT_SIZE),
    random_state=SEED,
    stratify=temp_df["target"]
)

In [61]:
print(f"N train imagenes originales: {train_df.size}, % : {np.round(train_df.size / original_train_metadata_df.size, 2)}")
print(f"N val: {val_df.size}, % : {np.round(val_df.size / original_train_metadata_df.size, 2)}")
print(f"N test: {test_df.size}, % : {np.round(test_df.size / original_train_metadata_df.size, 2)}")

N train imagenes originales: 818157, % : 0.68
N val: 144384, % : 0.12
N test: 240636, % : 0.2


In [62]:
# Guardar csv split originales
train_df.to_csv(OUTPUT_TRAIN_ORIGINAL_DF_PATH, index=False)
val_df.to_csv(OUTPUT_VAL_DF_PATH, index=False)
test_df.to_csv(OUTPUT_TEST_DF_PATH, index=False)

## 2. Procesamiento datos sinteticos

In [63]:
# N Folds
unique_sorted_folds = [i for i in range(0, 5)]
unique_sorted_folds

[0, 1, 2, 3, 4]

In [64]:
synthetic_images_data = []

# Iterar por cada fold, por cada carpeta hr, lr y añadir path de imagenes sinteticas a df
images_type = ["/lr", "/hr"] # Tipos de imagen low res y high res
for fold in unique_sorted_folds:
    for image_type in images_type:
        # Subdirectorio hr o lr donde estan las imagenes
        synthetic_images_subdir = os.path.join(SYNTHETIC_IMAGES_PATH, str(fold) + image_type)
        pattern = "*" + SYNTHETIC_IMAGE_EXTENSION # Patron para seleccionar imagenes
        found_files = glob.glob(os.path.join(synthetic_images_subdir, pattern)) # NO PRINTEAR O CRASH

        # Guardar fila
        for synthetic_image_path in found_files:
            synthetic_images_data.append({
                "target": 1,
                "image_path": synthetic_image_path
            })

In [65]:
# Crear dataframe metadata sintetica
synthetic_df = pd.DataFrame(synthetic_images_data)

In [66]:
synthetic_df.head(5)

Unnamed: 0,target,image_path
0,1,winner_data/images/synthetic/0/lr\00047c6d-ca1...
1,1,winner_data/images/synthetic/0/lr\00808b91-e14...
2,1,winner_data/images/synthetic/0/lr\008dcaf9-71b...
3,1,winner_data/images/synthetic/0/lr\009d9c71-b0d...
4,1,winner_data/images/synthetic/0/lr\00b846be-652...


In [67]:
# Revisar paths correctas
"""
for idx, row in synthetic_df.iterrows():
    file_path = row["image_path"]
    if os.path.isfile(file_path):
        continue
    else:
        print("Error")
        break
"""

'\nfor idx, row in synthetic_df.iterrows():\n    file_path = row["image_path"]\n    if os.path.isfile(file_path):\n        continue\n    else:\n        print("Error")\n        break\n'

In [68]:
synthetic_df.to_csv(OUTPUT_SYNTHETIC_DF_PATH, index=False)