In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Set style
sns.set(style="whitegrid")

# File path
csv_path = "data/metadata/unified_labels_with_stratified_splits.csv"
output_dir = "data/metadata/"
os.makedirs(output_dir, exist_ok=True)

# Read CSV (handle large file efficiently)
df = pd.read_csv(csv_path)

# Preview
df.head()


Unnamed: 0,image,diagnosis,unified_label,source_csv,split1,split2,split3,split4,split5
0,ISIC_0034321,NV,nevus,ISIC2018_Task3_Validation_GroundTruth.csv,train,,,val,
1,ISIC_0034322,NV,nevus,ISIC2018_Task3_Validation_GroundTruth.csv,train,,,,
2,ISIC_0034323,BCC,bcc,ISIC2018_Task3_Validation_GroundTruth.csv,train,train,,,
3,ISIC_0034324,NV,nevus,ISIC2018_Task3_Validation_GroundTruth.csv,train,,,test,
4,ISIC_0034325,NV,nevus,ISIC2018_Task3_Validation_GroundTruth.csv,train,train,,train,


In [2]:
# 1. Diagnosis distribution (global)
plt.figure(figsize=(10,6))
order = df['unified_label'].value_counts().index
sns.barplot(
    x=df['unified_label'].value_counts().index,
    y=df['unified_label'].value_counts().values,
    palette="tab10"
)
plt.title('Diagnosis Distribution (Global)')
plt.xlabel('Diagnosis (unified_label)')
plt.ylabel('Number of Images')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig(os.path.join(output_dir, 'diagnosis_distribution.png'))
plt.close()



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(


In [3]:
# 2. Source dataset distribution (global)
plt.figure(figsize=(10,6))
source_counts = df['source_csv'].value_counts()
sns.barplot(
    x=source_counts.index,
    y=source_counts.values,
    palette="tab10"
)
plt.title('Source Dataset Distribution (Global)')
plt.xlabel('Source Dataset')
plt.ylabel('Number of Images')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig(os.path.join(output_dir, 'source_distribution.png'))
plt.close()



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(


In [4]:
# 3. Split size comparison (train/val/test across splits)
split_names = [f'split{i}' for i in range(1,6)]
groups = ['train', 'val', 'test']
split_sizes = {g: [] for g in groups}

for split in split_names:
    for g in groups:
        split_sizes[g].append((df[split] == g).sum())

plt.figure(figsize=(8,6))
for g in groups:
    plt.plot(split_names, split_sizes[g], marker='o', label=g)
plt.title('Split Size Comparison (train/val/test)')
plt.xlabel('Split')
plt.ylabel('Number of Images')
plt.legend()
plt.tight_layout()
plt.savefig(os.path.join(output_dir, 'split_size_comparison.png'))
plt.close()


In [5]:
# 4. Stacked class distribution per split (train/val/test for each split)
from collections import defaultdict

split_class_counts = defaultdict(lambda: defaultdict(int))
for split in split_names:
    for g in groups:
        mask = df[split] == g
        counts = df.loc[mask, 'unified_label'].value_counts()
        for label, count in counts.items():
            split_class_counts[f'{split}_{g}'][label] = count

# Prepare DataFrame for plotting
plot_df = pd.DataFrame(split_class_counts).fillna(0).astype(int)
plot_df = plot_df.T  # rows: split_group, columns: class
plot_df.index.name = 'split_group'

# Plot
fig, ax = plt.subplots(figsize=(14,7))
plot_df.plot(kind='bar', stacked=True, ax=ax, colormap='tab10')
plt.title('Stacked Class Distribution per Split')
plt.xlabel('Split and Group')
plt.ylabel('Number of Images')
plt.xticks(rotation=45, ha='right')
plt.legend(title='Diagnosis', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.savefig(os.path.join(output_dir, 'class_distribution_all_splits.png'))
plt.close()


In [6]:
# 5. Stacked source distribution per split (train/val/test for each split)
split_source_counts = defaultdict(lambda: defaultdict(int))
for split in split_names:
    for g in groups:
        mask = df[split] == g
        counts = df.loc[mask, 'source_csv'].value_counts()
        for source, count in counts.items():
            split_source_counts[f'{split}_{g}'][source] = count

plot_df_source = pd.DataFrame(split_source_counts).fillna(0).astype(int)
plot_df_source = plot_df_source.T
plot_df_source.index.name = 'split_group'

fig, ax = plt.subplots(figsize=(14,7))
plot_df_source.plot(kind='bar', stacked=True, ax=ax, colormap='tab20')
plt.title('Stacked Source Distribution per Split')
plt.xlabel('Split and Group')
plt.ylabel('Number of Images')
plt.xticks(rotation=45, ha='right')
plt.legend(title='Source', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.savefig(os.path.join(output_dir, 'source_distribution_all_splits.png'))
plt.close()


In [7]:
# 6. Heatmap: Class presence across all splits and groups
heatmap_data = []
labels = sorted(df['unified_label'].unique())
columns = []
for split in split_names:
    for g in groups:
        columns.append(f'{split}_{g}')
        mask = df[split] == g
        counts = df.loc[mask, 'unified_label'].value_counts()
        heatmap_data.append([counts.get(label, 0) for label in labels])

heatmap_df = pd.DataFrame(heatmap_data, columns=labels, index=columns).T
plt.figure(figsize=(14,6))
sns.heatmap(heatmap_df, annot=True, fmt='d', cmap='Blues', cbar_kws={'label': 'Image Count'})
plt.title('Class Presence Heatmap Across All Splits and Groups')
plt.xlabel('Split and Group')
plt.ylabel('Diagnosis (unified_label)')
plt.tight_layout()
plt.savefig(os.path.join(output_dir, 'class_presence_heatmap.png'))
plt.close()


In [8]:
# 7. Box plot of class frequencies (global)
class_counts = df['unified_label'].value_counts()
plt.figure(figsize=(8,6))
sns.boxplot(y=class_counts.values, color='skyblue')
plt.title('Box Plot of Class Frequencies (Global)')
plt.ylabel('Number of Images per Class')
plt.tight_layout()
plt.savefig(os.path.join(output_dir, 'class_frequency_boxplot.png'))
plt.close()
