In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset paths configured in this
# notebook all live under that mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import requests
from tqdm.auto import tqdm

# Setup

In [3]:
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas/library deprecation and
# copy-semantics warnings — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [48]:
# Locations on the mounted Drive: the dataset root, the metadata CSV inside it,
# and the folder that receives the downloaded image files.
# NOTE(review): the folder is spelled "fitzpatric17k" while the CSV is
# "fitzpatrick17k.csv" — confirm this matches the real Drive folder name.
BASE_DIR = "/content/drive/MyDrive/DATABase/fitzpatric17k"
CSV_PATH = os.path.join(BASE_DIR, "fitzpatrick17k.csv")
OUTPUT_DIR = os.path.join(BASE_DIR, "fitzpatrick_images")
# Create the image folder up front; exist_ok makes re-running this cell safe.
os.makedirs(OUTPUT_DIR, exist_ok=True)

## Filter and split the metadata

In [49]:
# Load the metadata CSV found at CSV_PATH into a DataFrame.
df = pd.read_csv(CSV_PATH)

In [50]:
# Keep only rows whose label mentions "melanoma" or "nevus" (case-insensitive).
# The label column is lower-cased once and reused for both tests.
# na=False makes a missing label count as "no match" instead of putting NaN
# into the row mask; regex=False is enough because both needles are plain text.
label_lower = df['label'].str.lower()
is_melanoma = label_lower.str.contains("melanoma", regex=False, na=False)
is_nevus = label_lower.str.contains("nevus", regex=False, na=False)
df = df[is_melanoma | is_nevus].copy()

In [51]:
# Report how many rows remain after the melanoma/nevus filter.
remaining_rows = len(df)
print(remaining_rows, "lines after filtration")

975 lines after filtration


In [52]:
# Distinct fitzpatrick_scale values present after filtering, in ascending order.
# NOTE(review): the displayed result includes -1, presumably a "no rating"
# marker — confirm against the dataset documentation.
scales = sorted(df['fitzpatrick_scale'].unique())
scales

[np.int64(-1),
 np.int64(1),
 np.int64(2),
 np.int64(3),
 np.int64(4),
 np.int64(5),
 np.int64(6)]

In [53]:
# Derive the binary target: 1 when the label mentions "melanoma", 0 when it
# mentions "nevus" (melanoma wins if both appear). Rows matching neither, or
# with a missing label, are dropped.
# Built from boolean masks so no NaN placeholder is needed; the earlier form
# referenced np.nan although numpy is not imported in this notebook.
label_lower = df['label'].str.lower()
is_melanoma = label_lower.str.contains("melanoma", regex=False, na=False)
is_nevus = label_lower.str.contains("nevus", regex=False, na=False)
is_labelled = is_melanoma | is_nevus

df = df.loc[is_labelled].copy()
# Assign positionally (to_numpy) so the result does not depend on index alignment.
df['benign_malignant'] = is_melanoma[is_labelled].to_numpy().astype(int)

In [54]:
# Expose the md5 hash under the name image_id; it is used later as the image
# file name and exported in the split table.
df = df.rename(columns={'md5hash': 'image_id'})

In [55]:
# Preview the first rows, including the image_id and benign_malignant columns.
df.head()

Unnamed: 0,image_id,fitzpatrick_scale,fitzpatrick_centaur,label,nine_partition_label,three_partition_label,qc,url,url_alphanum,benign_malignant
13,b87804452f60aa162a6d29c0f66a2466,2,1,melanoma,malignant melanoma,malignant,,https://www.dermaamin.com/site/images/clinical...,httpwwwdermaamincomsiteimagesclinicalpicLlmmlm...,1
18,4c3f795cf8eb72b946f9bd2642cf23c1,6,5,melanoma,malignant melanoma,malignant,,https://www.dermaamin.com/site/images/clinical...,httpwwwdermaamincomsiteimagesclinicalpicmmelan...,1
71,bb18c8800c62e37bd21641ad30aa3982,3,2,nevocytic nevus,benign melanocyte,benign,,https://www.dermaamin.com/site/images/clinical...,httpwwwdermaamincomsiteimagesclinicalpicnnevoc...,0
75,81c007db8cc1bc57cb08bcc00dda653b,2,1,superficial spreading melanoma ssm,malignant melanoma,malignant,,https://www.dermaamin.com/site/images/clinical...,httpwwwdermaamincomsiteimagesclinicalpicssuper...,1
85,d98b961ecb24b31d672ca92d42fa644f,1,1,epidermal nevus,benign epidermal,benign,,https://www.dermaamin.com/site/images/clinical...,httpwwwdermaamincomsiteimagesclinicalpiceepide...,0


In [56]:
# Split 80/20 into train/test separately for every fitzpatrick_scale value.
# Training rows all get split == 'train'; test rows get split == f"test{scale}",
# so each scale has its own test split.
train_list = []
test_list = []
for scale in df['fitzpatrick_scale'].unique():
    # Shuffle this scale's rows with a fixed seed so the cut is reproducible.
    shuffled = (
        df.loc[df['fitzpatrick_scale'] == scale]
        .sample(frac=1, random_state=42)
        .reset_index(drop=True)
    )
    cut = int(0.8 * len(shuffled))
    train_list.append(shuffled.iloc[:cut].assign(split='train'))
    test_list.append(shuffled.iloc[cut:].assign(split=f"test{scale}"))

train_df = pd.concat(train_list, ignore_index=True)
test_df = pd.concat(test_list, ignore_index=True)
# Training rows first, then all per-scale test rows, under a fresh 0..n-1 index.
combined_df = pd.concat([train_df, test_df], ignore_index=True)

## Downloading the images

In [57]:
def download_image(url, save_path, timeout=10):
    """Fetch ``url`` over HTTP and store the response body at ``save_path``.

    The body is first written to ``save_path + ".part"`` and then moved into
    place with ``os.replace``, so ``save_path`` only appears once the whole
    body has been written. An empty response body is treated as a failure
    rather than being saved as a zero-byte file.

    Args:
        url: Address of the image to download.
        save_path: Destination file path; an existing file is replaced on success.
        timeout: Value passed to ``requests.get`` as ``timeout`` (default 10).

    Returns:
        True if the file was written; False if any ``Exception`` occurred
        (the error is printed, not raised).
    """
    # Browser-like User-Agent sent with the request.
    headers = {
        "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) "
                       "Chrome/98.0.4758.102 Safari/537.36")
    }
    tmp_path = f"{save_path}.part"
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        if not response.content:
            raise ValueError("empty response body")
        with open(tmp_path, "wb") as f:
            f.write(response.content)
        # Publish under the final name only after the write has completed.
        os.replace(tmp_path, save_path)
        return True
    except Exception as e:
        print(f"Download error {url}: {e}")
        # Best-effort cleanup of a partially written temporary file.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        return False

Download every image listed in `combined_df`, then drop the rows whose image could not be fetched.

In [58]:
# Download the image for every row of combined_df into OUTPUT_DIR as
# <image_id>.jpg. Rows with a missing/non-http URL or a failed download are
# collected in failed_indices and removed from combined_df afterwards.
failed_indices = []

for idx, row in tqdm(combined_df.iterrows(), total=len(combined_df), desc="Downloading images"):
    url = row['url']
    if pd.isna(url) or not str(url).startswith("http"):
        print(f"Skipping row {row['image_id']} due to missing or invalid URL")
        failed_indices.append(idx)
        continue
    save_path = os.path.join(OUTPUT_DIR, f"{row['image_id']}.jpg")
    # Reuse a non-empty file left by an earlier run, so re-running this cell
    # does not fetch the whole dataset again.
    if os.path.isfile(save_path) and os.path.getsize(save_path) > 0:
        continue
    if not download_image(url, save_path):
        failed_indices.append(idx)

if failed_indices:
    # Rebind instead of mutating in place; the index is renumbered 0..n-1.
    combined_df = combined_df.drop(index=failed_indices).reset_index(drop=True)

Downloading images:   0%|          | 0/975 [00:00<?, ?it/s]

Skipping row 9d994f1278a9788612fa2af179328c31 due to missing or invalid URL
Download error nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?
Skipping row 480fd80167d7865ce9aafe14aba4ec9d due to missing or invalid URL
Download error nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?
Skipping row 87ae4c6a6e5d03d360fdd19b3a2d8092 due to missing or invalid URL
Download error nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?
Skipping row b6cf008da895fb856a76c033fddf6ec5 due to missing or invalid URL
Download error nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?
Skipping row 1f44599103695a126e5bb6496ba52cb7 due to missing or invalid URL
Download error nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?
Skipping row d653edb6aa702f095a9b00550738a821 due to missing or invalid URL
Download error nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?


In [60]:
# Write the split assignment (one row per remaining image) next to the source CSV.
split_csv_path = os.path.join(BASE_DIR, "fitzpatrick17k_with_splits.csv")
export_columns = ['image_id', 'fitzpatrick_scale', 'benign_malignant', 'split']
combined_df[export_columns].to_csv(split_csv_path, index=False)

In [61]:
# Index labels (as they were in combined_df before the drop/reset) of the rows
# that were skipped or whose download failed.
failed_indices

[249, 249, 651, 651, 764, 764, 774, 774, 896, 896, 911, 911]