# Exploratory Data Analysis - HAM10000 Dataset

Comprehensive analysis of the HAM10000 skin lesion dataset for skin cancer classification research.

## Contents
1. Dataset Overview
2. Class Distribution Analysis
3. Sample Images from Each Class
4. Image Property Analysis
5. Color Distribution Analysis
6. Patient and Lesion Analysis
7. Data Quality Assessment
8. Feature Visualization (t-SNE)
9. Statistical Summary Table
10. Key Findings Summary

In [None]:
import os
import sys
sys.path.append('..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from pathlib import Path
from collections import Counter
from tqdm.notebook import tqdm
import cv2
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')

# Plot appearance shared by every figure in this notebook
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
    'axes.titlesize': 14,
    'savefig.dpi': 300,
})

# Every figure and table produced below is written into this directory
OUTPUT_DIR = '../results/eda'
os.makedirs(OUTPUT_DIR, exist_ok=True)

print('Setup complete!')

## 1. Dataset Overview

In [None]:
# Configuration - Update these paths
DATA_DIR = '../data/HAM10000'
CSV_PATH = f'{DATA_DIR}/HAM10000_metadata.csv'

# Read the metadata table (one row per image)
df = pd.read_csv(CSV_PATH)

# Quick structural overview: size, column names and inferred dtypes
n_rows = len(df)
column_names = df.columns.tolist()
print(f'Total samples: {n_rows}')
print(f'\nColumns: {column_names}')
print(f'\nData types:\n{df.dtypes}')
df.head(10)

In [None]:
# Class information: diagnosis code -> display name, clinical category, plot colour
_CLASS_ROWS = [
    # (code, full name, type, colour)
    ('akiec', 'Actinic Keratoses', 'Pre-cancerous', '#e74c3c'),
    ('bcc', 'Basal Cell Carcinoma', 'Malignant', '#9b59b6'),
    ('bkl', 'Benign Keratosis', 'Benign', '#3498db'),
    ('df', 'Dermatofibroma', 'Benign', '#2ecc71'),
    ('mel', 'Melanoma', 'Malignant', '#e91e63'),
    ('nv', 'Melanocytic Nevi', 'Benign', '#00bcd4'),
    ('vasc', 'Vascular Lesions', 'Benign', '#ff9800'),
]
CLASS_INFO = {
    code: {'name': name, 'type': kind, 'color': color}
    for code, name, kind, color in _CLASS_ROWS
}

# Display class information as a small lookup table
class_df = pd.DataFrame(
    [(code, info['name'], info['type']) for code, info in CLASS_INFO.items()],
    columns=['Abbreviation', 'Full Name', 'Type'],
)
display(class_df)

## 2. Class Distribution Analysis

In [None]:
# Class distribution: absolute counts and relative share, most frequent first
class_counts = df['dx'].value_counts()
class_percentages = df['dx'].value_counts(normalize=True) * 100

# Look up the metadata of each class once, in the same order as the counts
class_meta = [CLASS_INFO[code] for code in class_counts.index]

# Create summary table
distribution_df = pd.DataFrame({
    'Class': class_counts.index,
    'Full Name': [meta['name'] for meta in class_meta],
    'Count': class_counts.values,
    'Percentage': class_percentages.values,
    'Type': [meta['type'] for meta in class_meta],
})

print('Class Distribution:')
display(distribution_df)

# Imbalance ratio: size of the largest class relative to the smallest one
largest_class, smallest_class = class_counts.max(), class_counts.min()
imbalance_ratio = largest_class / smallest_class
print(f'\nImbalance Ratio (max/min): {imbalance_ratio:.2f}')

In [None]:
# Visualization: class distribution as a bar chart (left) and a pie chart (right)
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
bar_ax, pie_ax = axes

# One colour per class, in the same order as the counts
colors = [CLASS_INFO[c]['color'] for c in class_counts.index]

# Bar chart
bars = bar_ax.bar(class_counts.index, class_counts.values, color=colors, edgecolor='black')
bar_ax.set_xlabel('Lesion Type')
bar_ax.set_ylabel('Number of Images')
bar_ax.set_title('Class Distribution in HAM10000 Dataset')

# Write the count just above the top of every bar
for bar, count in zip(bars, class_counts.values):
    x_center = bar.get_x() + bar.get_width()/2
    y_above = bar.get_height() + 50
    bar_ax.text(x_center, y_above, f'{count}',
                ha='center', va='bottom', fontweight='bold')

# Pie chart with every wedge slightly pulled out
wedge_offsets = [0.05]*len(class_counts)
pie_ax.pie(class_counts.values, labels=class_counts.index, autopct='%1.1f%%',
           colors=colors, explode=wedge_offsets, shadow=True)
pie_ax.set_title('Class Distribution (Percentage)')

plt.tight_layout()
plt.savefig(f'{OUTPUT_DIR}/class_distribution.png', dpi=300, bbox_inches='tight')
plt.savefig(f'{OUTPUT_DIR}/class_distribution.pdf', bbox_inches='tight')
plt.show()

In [None]:
# Benign vs Malignant Distribution
def _lesion_type(dx_code):
    """Return the clinical category recorded in CLASS_INFO for a diagnosis code."""
    return CLASS_INFO[dx_code]['type']

df['lesion_type'] = df['dx'].map(_lesion_type)
type_counts = df['lesion_type'].value_counts()
total_images = len(df)

fig, ax = plt.subplots(figsize=(8, 6))
type_colors = {'Benign': '#2ecc71', 'Malignant': '#e74c3c', 'Pre-cancerous': '#f39c12'}
colors = [type_colors[t] for t in type_counts.index]

bars = ax.bar(type_counts.index, type_counts.values, color=colors, edgecolor='black')
ax.set_ylabel('Number of Images')
ax.set_title('Benign vs Malignant vs Pre-cancerous Lesions')

# Annotate each bar with its count and its share of all metadata rows
for bar, count in zip(bars, type_counts.values):
    x_center = bar.get_x() + bar.get_width()/2
    caption = f'{count} ({count/total_images*100:.1f}%)'
    ax.text(x_center, bar.get_height() + 50, caption,
            ha='center', va='bottom', fontweight='bold')

plt.savefig(f'{OUTPUT_DIR}/benign_malignant_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

## 3. Sample Images from Each Class

In [None]:
def load_image(image_id, data_dir):
    """Load an image by ID as an RGB array, probing several possible locations.

    The file ``<image_id>.jpg`` is looked up, in order, in ``data_dir`` itself
    and in its ``HAM10000_images_part_1`` and ``HAM10000_images_part_2``
    sub-directories; the first existing path wins.

    Args:
        image_id: Image identifier without file extension.
        data_dir: Root directory that is searched for the image.

    Returns:
        A ``numpy`` array of shape ``(height, width, 3)`` holding the RGB
        pixels, or ``None`` if the file exists in none of the locations.
    """
    filename = f'{image_id}.jpg'
    candidate_paths = (
        os.path.join(data_dir, filename),
        os.path.join(data_dir, 'HAM10000_images_part_1', filename),
        os.path.join(data_dir, 'HAM10000_images_part_2', filename),
    )
    for path in candidate_paths:
        if os.path.exists(path):
            # The context manager releases the file handle immediately instead
            # of leaving it open until garbage collection, which matters when
            # looping over thousands of images.
            with Image.open(path) as pil_image:
                # Callers index three colour channels, so normalise
                # grayscale/palette/RGBA files to plain RGB.
                return np.array(pil_image.convert('RGB'))
    return None

# Display sample images: one column per class, up to three random images each
N_SAMPLES_PER_CLASS = 3
class_names = list(CLASS_INFO)
# squeeze=False keeps `axes` two-dimensional whatever the number of classes
fig, axes = plt.subplots(N_SAMPLES_PER_CLASS, len(class_names),
                         figsize=(3 * len(class_names), 3 * N_SAMPLES_PER_CLASS),
                         squeeze=False)

# Hide ticks and frames on every cell up front, so cells that stay empty
# (class with fewer than three images, or image file not found) do not
# show up as blank coordinate systems in the saved figure.
for cell in axes.ravel():
    cell.axis('off')

for col, class_name in enumerate(class_names):
    class_rows = df[df['dx'] == class_name]
    # Fixed seed so the same images are shown on every run
    class_samples = class_rows.sample(n=min(N_SAMPLES_PER_CLASS, len(class_rows)), random_state=42)

    # Column header: class code plus full name
    axes[0, col].set_title(f'{class_name.upper()}\n{CLASS_INFO[class_name]["name"]}',
                           fontsize=10, fontweight='bold')

    for row, image_id in enumerate(class_samples['image_id']):
        img = load_image(image_id, DATA_DIR)
        if img is not None:
            axes[row, col].imshow(img)

plt.suptitle('Sample Images from Each Class (3 samples per class)', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig(f'{OUTPUT_DIR}/sample_images_grid.png', dpi=300, bbox_inches='tight')
plt.show()

## 4. Image Property Analysis

In [None]:
# Analyze image dimensions and properties
# Columns of the per-image statistics table. Declared up front so that
# `stats_df` has the expected columns even if no image could be loaded.
STATS_COLUMNS = ['image_id', 'dx', 'height', 'width', 'aspect_ratio',
                 'mean_pixel', 'std_pixel', 'r_mean', 'g_mean', 'b_mean']
image_stats = []
missing_ids = []  # image_ids for which load_image() found no file

print('Analyzing image properties...')
for image_id, dx in tqdm(zip(df['image_id'], df['dx']), total=len(df)):
    img = load_image(image_id, DATA_DIR)
    if img is None:
        # Record the miss instead of dropping the row silently
        missing_ids.append(image_id)
        continue
    if img.ndim == 2:
        # Single-channel image: replicate it so the per-channel statistics
        # below are defined (overall mean/std are unaffected by this).
        img = np.repeat(img[:, :, np.newaxis], 3, axis=2)

    h, w = img.shape[:2]
    image_stats.append({
        'image_id': image_id,
        'dx': dx,
        'height': h,
        'width': w,
        'aspect_ratio': w/h,
        # Statistics over all pixels and channels together
        'mean_pixel': img.mean(),
        'std_pixel': img.std(),
        # Mean of each colour channel separately
        'r_mean': img[:, :, 0].mean(),
        'g_mean': img[:, :, 1].mean(),
        'b_mean': img[:, :, 2].mean(),
    })

stats_df = pd.DataFrame(image_stats, columns=STATS_COLUMNS)

if missing_ids:
    print(f'\nWarning: {len(missing_ids)} of {len(df)} images were not found '
          f'under {DATA_DIR} and are excluded from the image statistics.')

print(f'\nImage Statistics Summary:')
if stats_df.empty:
    print('No images could be loaded - check DATA_DIR.')
else:
    print(stats_df[['height', 'width', 'mean_pixel', 'std_pixel']].describe())

In [None]:
# Image dimension distribution: height, width and aspect ratio side by side
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# (stats_df column, bar colour, x-axis label, panel title, format of the mean in the legend)
panels = [
    ('height', 'steelblue', 'Height (pixels)', 'Image Height Distribution', '.0f'),
    ('width', 'forestgreen', 'Width (pixels)', 'Image Width Distribution', '.0f'),
    ('aspect_ratio', 'coral', 'Aspect Ratio (W/H)', 'Aspect Ratio Distribution', '.2f'),
]

for panel_ax, (column, bar_color, x_label, panel_title, mean_format) in zip(axes, panels):
    values = stats_df[column]
    mean_value = values.mean()

    panel_ax.hist(values, bins=30, color=bar_color, edgecolor='black', alpha=0.7)
    # Dashed red line marks the mean of the plotted quantity
    panel_ax.axvline(mean_value, color='red', linestyle='--',
                     label=f'Mean: {mean_value:{mean_format}}')
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel('Frequency')
    panel_ax.set_title(panel_title)
    panel_ax.legend()

plt.tight_layout()
plt.savefig(f'{OUTPUT_DIR}/image_dimensions.png', dpi=300, bbox_inches='tight')
plt.show()

## 5. Color Distribution Analysis

In [None]:
# Color distribution per class: overlaid histograms of the per-image channel means
fig, axes = plt.subplots(2, 4, figsize=(16, 8))
axes = axes.flatten()

# (stats_df column, histogram colour, legend label) for each colour channel
channels = (
    ('r_mean', 'red', 'R'),
    ('g_mean', 'green', 'G'),
    ('b_mean', 'blue', 'B'),
)

# zip() stops after the last class, leaving the final panel untouched
for panel_ax, class_name in zip(axes, CLASS_INFO):
    class_stats = stats_df[stats_df['dx'] == class_name]

    for column, channel_color, channel_label in channels:
        panel_ax.hist(class_stats[column], bins=30, alpha=0.6,
                      color=channel_color, label=channel_label)
    panel_ax.set_title(f'{class_name.upper()} - RGB Distribution')
    panel_ax.set_xlabel('Mean Pixel Value')
    panel_ax.legend()

# Remove unused subplot (2x4 grid, seven classes)
axes[-1].axis('off')

plt.tight_layout()
plt.savefig(f'{OUTPUT_DIR}/color_distribution_per_class.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Box plot of mean pixel values per class
fig, ax = plt.subplots(figsize=(12, 6))

# patch_artist=True draws the boxes as filled patches so they can be coloured
stats_df.boxplot(column='mean_pixel', by='dx', ax=ax, patch_artist=True)

# One box is drawn per class that actually occurs in stats_df, in sorted label
# order (pandas groups with sort=True by default). Derive the colours from that
# same list so a class with no analysed images cannot shift the colour mapping.
plotted_classes = sorted(stats_df['dx'].dropna().unique())
colors = [CLASS_INFO[c]['color'] for c in plotted_classes]

for patch, color in zip(ax.patches, colors):
    patch.set_facecolor(color)
    patch.set_alpha(0.7)

ax.set_xlabel('Lesion Class')
ax.set_ylabel('Mean Pixel Intensity')
ax.set_title('Mean Pixel Intensity Distribution by Class')
# Drop the automatic "Boxplot grouped by dx" super-title added by pandas
plt.suptitle('')

plt.savefig(f'{OUTPUT_DIR}/pixel_intensity_boxplot.png', dpi=300, bbox_inches='tight')
plt.show()

## 6. Patient and Lesion Analysis

In [None]:
# Analyze patient demographics if available
if 'age' in df.columns:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    hist_ax, box_ax = axes

    # Left panel: overall age histogram
    df['age'].hist(bins=30, ax=hist_ax, color='steelblue', edgecolor='black')
    hist_ax.set_xlabel('Age')
    hist_ax.set_ylabel('Frequency')
    hist_ax.set_title('Age Distribution of Patients')

    # Right panel: age broken down by diagnosis class
    df.boxplot(column='age', by='dx', ax=box_ax)
    box_ax.set_xlabel('Lesion Class')
    box_ax.set_ylabel('Age')
    box_ax.set_title('Age Distribution by Lesion Type')
    # Drop the automatic super-title added by the grouped boxplot
    plt.suptitle('')

    plt.tight_layout()
    plt.savefig(f'{OUTPUT_DIR}/age_distribution.png', dpi=300, bbox_inches='tight')
    plt.show()

# Frequency tables for the categorical columns, when the metadata has them
categorical_summaries = (
    ('sex', 'Sex Distribution:'),
    ('localization', 'Localization Distribution:'),
)
for column, heading in categorical_summaries:
    if column in df.columns:
        print(f'\n{heading}')
        print(df[column].value_counts())

## 7. Data Quality Assessment

In [None]:
# Check for duplicates: several images may share the same lesion_id
if 'lesion_id' in df.columns:
    duplicate_lesions = df['lesion_id'].value_counts()
    duplicates = duplicate_lesions[duplicate_lesions > 1]
    print(f'Number of lesions with multiple images: {len(duplicates)}')
    # Every image beyond the first one of a lesion counts as a duplicate
    print(f'Total duplicate images: {duplicates.sum() - len(duplicates)}')

    # Unique lesions vs. total images for every class
    print('\nDuplicates per class:')
    for class_name in CLASS_INFO:
        # Local name chosen so the `class_df` table built in the class
        # information cell is not overwritten by this loop.
        class_rows = df[df['dx'] == class_name]
        unique_lesions = class_rows['lesion_id'].nunique()
        total_images = len(class_rows)
        print(f'  {class_name}: {unique_lesions} unique lesions, {total_images} images')

# Missing values per metadata column
print('\nMissing Values:')
print(df.isnull().sum())

In [None]:
# Detect potential quality issues (blurry images)
def calculate_laplacian_variance(image):
    """Calculate the variance of the Laplacian as a blur metric.

    Args:
        image: Image array, either 2-D (already grayscale) or with a trailing
            colour-channel axis in RGB order.

    Returns:
        Variance of the Laplacian of the grayscale image; a higher value
        means more high-frequency detail, i.e. a sharper image.
    """
    if image.ndim == 2:
        # Already single-channel: cvtColor would reject it, so use it as is
        gray = image
    else:
        gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
    # CV_64F keeps negative responses that an unsigned output would clip
    return cv2.Laplacian(gray, cv2.CV_64F).var()

print('Calculating blur metrics...')

# Blur is scored on a reproducible random subset of at most 1000 images
n_blur_samples = min(1000, len(df))
blur_subset = df.sample(n=n_blur_samples, random_state=42)

blur_scores = []
for idx, row in tqdm(blur_subset.iterrows(), total=n_blur_samples):
    img = load_image(row['image_id'], DATA_DIR)
    if img is None:
        # Image file not found: nothing to score
        continue
    blur_scores.append({
        'image_id': row['image_id'],
        'dx': row['dx'],
        'blur_score': calculate_laplacian_variance(img),
    })

blur_df = pd.DataFrame(blur_scores)
print(f'\nBlur Score Statistics (higher = sharper):')
print(blur_df['blur_score'].describe())

# Identify potentially blurry images (bottom 5%)
threshold = blur_df['blur_score'].quantile(0.05)
is_blurry = blur_df['blur_score'] < threshold
blurry_images = blur_df[is_blurry]
print(f'\nPotentially blurry images (bottom 5%): {len(blurry_images)}')

## 8. Feature Visualization (t-SNE)

In [None]:
# Extract simple raw-pixel features for visualization
print('Extracting features for t-SNE visualization...')

# Reproducible random subset of at most 2000 images
sample_df = df.sample(n=min(2000, len(df)), random_state=42)

features, labels = [], []
for idx, row in tqdm(sample_df.iterrows(), total=len(sample_df)):
    img = load_image(row['image_id'], DATA_DIR)
    if img is None:
        # Image file not found: skip it, keeping features and labels aligned
        continue
    # Downscale to 64x64 and flatten into one feature vector per image
    img_resized = cv2.resize(img, (64, 64))
    features.append(img_resized.flatten())
    labels.append(row['dx'])

features = np.array(features)
labels = np.array(labels)

print(f'Feature matrix shape: {features.shape}')

In [None]:
# PCA for dimensionality reduction before t-SNE
print('Running PCA...')
n_samples = len(features)
# PCA cannot extract more components than min(n_samples, n_features), so clamp
# the target of 50 for small samples. A fixed seed makes the projection
# reproducible when scikit-learn picks its randomized SVD solver.
pca = PCA(n_components=min(50, n_samples, features.shape[1]), random_state=42)
features_pca = pca.fit_transform(features)
print(f'Explained variance ratio: {pca.explained_variance_ratio_.sum():.2%}')

# t-SNE
print('Running t-SNE...')
# The iteration count is left at the library default (1000, the value that was
# passed explicitly before): the keyword was renamed from `n_iter` to
# `max_iter` in newer scikit-learn releases, so omitting it works on both.
# Perplexity must be smaller than the number of samples.
tsne = TSNE(n_components=2, random_state=42, perplexity=min(30, max(1, n_samples - 1)))
features_tsne = tsne.fit_transform(features_pca)

In [None]:
# Visualize t-SNE: one scatter series per class, coloured as in CLASS_INFO
fig, ax = plt.subplots(figsize=(12, 10))

for class_name, info in CLASS_INFO.items():
    # Embedded points belonging to the current class
    class_points = features_tsne[labels == class_name]
    ax.scatter(class_points[:, 0], class_points[:, 1],
               c=info['color'], label=class_name.upper(),
               alpha=0.6, s=20)

ax.set_xlabel('t-SNE Component 1')
ax.set_ylabel('t-SNE Component 2')
ax.set_title('t-SNE Visualization of Skin Lesion Images')
# Legend is anchored outside the plot area, to the right of the axes
ax.legend(title='Class', bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.savefig(f'{OUTPUT_DIR}/tsne_visualization.png', dpi=300, bbox_inches='tight')
plt.show()

## 9. Statistical Summary Table

In [None]:
# Create comprehensive summary table (one row per class)
summary_data = []
# Number of images that were actually loaded and analysed
n_analyzed = len(stats_df)

for class_name, info in CLASS_INFO.items():
    class_stats = stats_df[stats_df['dx'] == class_name]
    n_class = len(class_stats)
    # 'Count' is taken from stats_df, so the share must use the same
    # population; dividing by len(df) would understate every class whenever
    # some image files are missing and the column would not add up to 100%.
    share = n_class / n_analyzed * 100 if n_analyzed else 0.0

    summary_data.append({
        'Class': class_name.upper(),
        'Full Name': info['name'],
        'Type': info['type'],
        'Count': n_class,
        'Percentage': f'{share:.1f}%',
        # Averages of the per-image statistics within the class
        'Mean Pixel': f'{class_stats["mean_pixel"].mean():.1f}',
        'Std Pixel': f'{class_stats["std_pixel"].mean():.1f}',
        'Avg Height': f'{class_stats["height"].mean():.0f}',
        'Avg Width': f'{class_stats["width"].mean():.0f}'
    })

summary_df = pd.DataFrame(summary_data)
print('Dataset Summary Table:')
display(summary_df)

# Save to CSV
summary_df.to_csv(f'{OUTPUT_DIR}/dataset_summary.csv', index=False)
print(f'\nSummary saved to {OUTPUT_DIR}/dataset_summary.csv')

In [None]:
# Generate LaTeX table
latex_table = summary_df[['Class', 'Full Name', 'Count', 'Percentage', 'Type']].to_latex(
    index=False,
    caption='HAM10000 Dataset Statistics',
    label='tab:dataset_stats',
    # Text columns left-aligned, numeric columns right-aligned
    column_format='llrrl',
    # The Percentage cells contain '%', which starts a comment in LaTeX.
    # Request escaping explicitly because the library default for this
    # option is not the same across pandas versions.
    escape=True,
)

# Explicit encoding so the written file does not depend on the platform default
with open(f'{OUTPUT_DIR}/dataset_stats_table.tex', 'w', encoding='utf-8') as f:
    f.write(latex_table)

print('LaTeX table saved!')
print(latex_table)

## 10. Key Findings Summary

### Dataset Characteristics:
- **Total Images**: 10,015 dermoscopic images
- **Classes**: 7 skin lesion types
- **Severe Class Imbalance**: The NV class dominates (~67% of images), while DF is the smallest class (~1%)

### Clinical Implications:
- Malignant lesions (MEL, BCC) constitute ~16% of the dataset
- Class imbalance requires careful handling (weighted sampling, focal loss)

### Recommendations for Training:
1. Use weighted random sampling or class weights
2. Apply data augmentation to minority classes
3. Consider focal loss for handling imbalance
4. Use stratified splits for train/val/test