In [20]:
import os
import pandas as pd
import requests
from PIL import Image
from io import BytesIO
from sklearn.cluster import KMeans
import numpy as np
import cv2
import matplotlib.pyplot as plt

# Read the dataset, discarding rows without an image URL and renumbering the index
df = (
    pd.read_csv("fitzpatrick17k.csv")
    .dropna(subset=['url'])
    .reset_index(drop=True)
)

# Keep the rows whose label is "psoriasis" (compared case-insensitively)
is_psoriasis = df['label'].str.lower().eq('psoriasis')
psoriasis_df = df[is_psoriasis]

# Folder that receives the downloaded psoriasis images
os.makedirs("images/psoriasis", exist_ok=True)


In [16]:
# Number of rows per diagnosis label (most frequent first) and the distinct labels
label_column = df['label']
label_counts = label_column.value_counts()
unique_labels = label_column.unique()
print(label_counts)


label
psoriasis                      653
squamous cell carcinoma        581
lichen planus                  491
basal cell carcinoma           468
allergic contact dermatitis    430
                              ... 
port wine stain                 59
erythema elevatum diutinum      55
xanthomas                       53
pustular psoriasis              53
pilomatricoma                   53
Name: count, Length: 114, dtype: int64


In [27]:
# # Downloading all of the Psoriasis images from the url field

# def download_image(url, save_path):
#     headers = {'User-Agent': 'Mozilla/5.0'}
#     try:
#         response = requests.get(url, headers=headers, timeout=10)
#         if response.status_code == 200:
#             try:
#                 image = Image.open(BytesIO(response.content)).convert("RGB")
#                 image.save(save_path)
#                 return True
#             except Exception as e:
#                 print(f"Invalid image at {url} | {e}")
#         else:
#             print(f"Bad status ({response.status_code}) for {url}")
#     except Exception as e:
#         print(f"Failed to download {url} | {e}")
#     return False


# # Download all Psoriasis images
# for idx, row in psoriasis_df.iterrows():
#     img_name = f"images/psoriasis/psoriasis_{idx}.jpg"
#     if not os.path.exists(img_name):
#         download_image(row['url'], img_name)


In [38]:
# Count the regular files in the clustered-overlay directory.
# That directory is created by the clustering loop cell, so it must have run first.
psoriasis_dir = "images/psoriasis_clustered"
num_files = sum(
    os.path.isfile(os.path.join(psoriasis_dir, entry))
    for entry in os.listdir(psoriasis_dir)
)
# Report the directory that was actually counted; the message used to name
# 'images/psoriasis' even though the count came from psoriasis_dir.
print(f"Number of files in '{psoriasis_dir}': {num_files}")

Number of files in 'images/psoriasis': 1959


In [34]:
# This method will make the sizing of the images the same while maintaining the aspect ratio
def resize_with_padding(image, target_size=(256, 256)):
    """Scale ``image`` to fit inside ``target_size`` and pad the remainder with black.

    Args:
        image: Array whose first two axes are (height, width), such as the
            result of ``cv2.imread``.
        target_size: ``(height, width)`` of the returned canvas.

    Returns:
        The resized image, centred on a black canvas of exactly ``target_size``.

    Raises:
        ValueError: If ``image`` is ``None`` or has a zero-sized height/width.
    """
    if image is None:
        raise ValueError("image is None (was the file read successfully?)")

    target_h, target_w = target_size
    h, w = image.shape[:2]
    if h == 0 or w == 0:
        raise ValueError(f"image has an empty dimension: {h}x{w}")

    # Use the smaller ratio so the whole image fits inside the canvas.
    scale = min(target_h / h, target_w / w)

    # int() truncation can yield 0 for very elongated images, which cv2.resize
    # rejects; clamp each side to the range [1, target].
    new_h = min(target_h, max(1, int(h * scale)))
    new_w = min(target_w, max(1, int(w * scale)))

    # cv2.resize takes the size as (width, height).
    resized = cv2.resize(image, (new_w, new_h))

    # Split the leftover space evenly; the odd pixel, if any, goes to bottom/right.
    top = (target_h - new_h) // 2
    bottom = target_h - new_h - top
    left = (target_w - new_w) // 2
    right = target_w - new_w - left

    padded = cv2.copyMakeBorder(resized, top, bottom, left, right, cv2.BORDER_CONSTANT, value=[0, 0, 0])
    return padded


In [35]:
# Performs K-means clustering on the pixels of the image at the given path
def segment_image_kmeans(image_path, k=3, target_size=(256, 256)):
    """Load an image, letterbox it to ``target_size`` and cluster its pixels.

    Args:
        image_path: Path of the image file to read with ``cv2.imread``.
        k: Number of K-means clusters.
        target_size: ``(height, width)`` passed to ``resize_with_padding``.

    Returns:
        A tuple ``(image, clustered)``: the resized and padded image, and a 2-D
        array with the image's height and width holding each pixel's cluster index.

    Raises:
        OSError: If the file cannot be read as an image.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread returns None instead of raising when the file is missing
        # or cannot be decoded.
        raise OSError(f"Could not read image (missing or undecodable file): {image_path}")

    image = resize_with_padding(image, target_size)
    height, width = image.shape[:2]

    # One row per pixel, one column per colour channel.
    # NOTE: the black padding pixels added by resize_with_padding are clustered too.
    img_flat = image.reshape((-1, 3))

    # Fixed random_state keeps the cluster assignment reproducible between runs.
    kmeans = KMeans(n_clusters=k, random_state=42).fit(img_flat)
    clustered = kmeans.labels_.reshape((height, width))

    return image, clustered


In [36]:
# Saves a figure showing the image, one cluster's mask, and the mask blended onto the image
def show_cluster_overlay(image, clustered_mask, cluster_id, save_path):
    """Write a three-panel figure (original, mask, overlay) for one cluster.

    Args:
        image: Image in BGR channel order (it is converted to RGB for display).
        clustered_mask: 2-D array of per-pixel cluster indices for ``image``.
        cluster_id: Cluster index to highlight.
        save_path: Path where the figure is saved.

    Returns:
        None. The figure is written to ``save_path`` and then closed.
    """
    # White where the pixel belongs to the selected cluster, black elsewhere.
    mask = (clustered_mask == cluster_id).astype(np.uint8) * 255
    mask_color = cv2.cvtColor(mask, cv2.COLOR_GRAY2BGR)
    # Blend 70% image with 30% mask so the selected region appears brighter.
    overlay = cv2.addWeighted(image, 0.7, mask_color, 0.3, 0)

    panels = (
        ("Original", cv2.cvtColor(image, cv2.COLOR_BGR2RGB), None),
        ("Mask", mask, 'gray'),
        ("Overlay", cv2.cvtColor(overlay, cv2.COLOR_BGR2RGB), None),
    )

    fig, axes = plt.subplots(1, 3, figsize=(10, 4))
    try:
        for ax, (title, picture, cmap) in zip(axes, panels):
            ax.set_title(title)
            ax.imshow(picture, cmap=cmap)
            ax.axis("off")

        fig.tight_layout()
        fig.savefig(save_path)
    finally:
        # Always release the figure, even if drawing or saving fails, so figures
        # do not accumulate when this is called in a long loop.
        plt.close(fig)

In [37]:
input_dir = "images/psoriasis"
output_dir = "images/psoriasis_clustered"
# Single source of truth for the clustering call and the overlay loop below.
num_clusters = 3
os.makedirs(output_dir, exist_ok=True)

# Sorted so the processing order is reproducible (os.listdir order is arbitrary).
for filename in sorted(os.listdir(input_dir)):
    if not filename.lower().endswith(('.jpg', '.jpeg', '.png')):
        continue

    img_path = os.path.join(input_dir, filename)
    image, clustered = segment_image_kmeans(img_path, k=num_clusters)

    # splitext strips the extension whatever its length; slicing off four
    # characters left a trailing dot for ".jpeg" files.
    stem = os.path.splitext(filename)[0]

    # Save one overlay per cluster to manually inspect which one captures lesions best
    for cluster_id in range(num_clusters):
        out_path = os.path.join(output_dir, f"{stem}_cluster{cluster_id}.png")
        show_cluster_overlay(image, clustered, cluster_id, out_path)