In [None]:
import sys
sys.path.insert(0, '..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
from pathlib import Path
from tqdm import tqdm

# Set style
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

# Reproducibility: the notebook draws random samples with DataFrame.sample()
# without an explicit random_state, which falls back to NumPy's global RNG.
# Seeding it here makes "Restart Kernel -> Run All" produce the same figures.
SEED = 42
np.random.seed(SEED)

# Paths (relative to the notebook's working directory)
DATA_DIR = Path('../data')
RAW_DIR = DATA_DIR / 'raw' / 'aptos'

print(f"Data directory: {DATA_DIR}")
print(f"Raw data: {RAW_DIR}")

## 2. Load Dataset

In [None]:
# Load training labels
train_csv = RAW_DIR / 'train.csv'

# Fail fast with an actionable message: every later cell needs `df`, so
# silently skipping the load would only surface later as a confusing NameError.
if not train_csv.exists():
    raise FileNotFoundError(
        f"Training labels not found: {train_csv}. "
        "Please download the APTOS dataset first."
    )

df = pd.read_csv(train_csv)
print(f"Loaded {len(df)} samples")
print(f"\nColumns: {df.columns.tolist()}")

# Must be the last top-level expression of the cell to be rendered by Jupyter;
# an expression nested inside an `if` block is evaluated but never displayed.
df.head()

In [None]:
# Human-readable label for each DR grade; the list position is the grade value.
CLASS_NAMES = [
    'No DR (Grade 0)',
    'Mild NPDR (Grade 1)',
    'Moderate NPDR (Grade 2)',
    'Severe NPDR (Grade 3)',
    'Proliferative DR (Grade 4)',
]

# Attach the label matching each row's integer grade as a new column.
df['class_name'] = [CLASS_NAMES[grade] for grade in df['diagnosis']]

## 3. Class Distribution Analysis

In [None]:
# Class distribution (only grades that actually occur in the data)
class_counts = df['diagnosis'].value_counts().sort_index()

# For plotting, align the counts with every known grade so that a grade that is
# absent from the CSV gets a zero entry instead of shifting bars/labels or
# causing a shape mismatch against the fixed list of class names.
n_classes = len(CLASS_NAMES)
grades = list(range(n_classes))
plot_counts = class_counts.reindex(grades, fill_value=0)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Bar plot
colors = sns.color_palette('husl', n_classes)
bars = axes[0].bar(grades, plot_counts.values, color=colors)
axes[0].set_xticks(grades)
axes[0].set_xticklabels([f'Grade {g}' for g in grades])
axes[0].set_xlabel('DR Severity Grade')
axes[0].set_ylabel('Number of Images')
axes[0].set_title('Class Distribution in APTOS 2019 Dataset')

# Add count labels just above each bar
for bar, count in zip(bars, plot_counts.values):
    axes[0].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 20,
                 str(count), ha='center', va='bottom', fontsize=10)

# Pie chart
axes[1].pie(plot_counts.values, labels=CLASS_NAMES, autopct='%1.1f%%',
            colors=colors, startangle=90)
axes[1].set_title('Class Distribution (Percentage)')

plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists.
fig_dir = Path('../notebooks/figures')
fig_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(fig_dir / 'class_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

# Print statistics
print("\nClass Distribution:")
for name, count in zip(CLASS_NAMES, plot_counts.values):
    pct = count / len(df) * 100
    print(f"  {name}: {count} ({pct:.1f}%)")

# Ratio is taken over the grades that are present, so the minimum is never zero.
print(f"\nImbalance ratio (max/min): {class_counts.max() / class_counts.min():.1f}x")

## 4. Sample Images per Class

In [None]:
# Display sample images from each class
images_dir = RAW_DIR / 'train_images'

if images_dir.exists():
    n_grades, n_per_grade = 5, 4
    fig, axes = plt.subplots(n_grades, n_per_grade, figsize=(16, 20))

    # Hide every axis up front: grid cells that end up without an image
    # (small class, missing or unreadable file) then stay blank instead of
    # showing empty frames with ticks.
    for ax in axes.ravel():
        ax.axis('off')

    for grade in range(n_grades):
        grade_df = df[df['diagnosis'] == grade]
        samples = grade_df.sample(min(n_per_grade, len(grade_df)))

        for j, (_, row) in enumerate(samples.iterrows()):
            # Images may be stored as .png or .jpg; try .png first.
            img_path = images_dir / f"{row['id_code']}.png"
            if not img_path.exists():
                img_path = images_dir / f"{row['id_code']}.jpg"
            if not img_path.exists():
                continue

            img = cv2.imread(str(img_path))
            if img is None:
                # cv2.imread signals an undecodable file by returning None.
                continue

            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            img = cv2.resize(img, (400, 400))

            axes[grade, j].imshow(img)
            axes[grade, j].set_title(f"Grade {grade}: {row['id_code']}")

    plt.suptitle('Sample Fundus Images by DR Grade', fontsize=16, y=1.02)
    plt.tight_layout()

    # savefig does not create missing directories, so make sure the target exists.
    fig_dir = Path('../notebooks/figures')
    fig_dir.mkdir(parents=True, exist_ok=True)
    plt.savefig(fig_dir / 'sample_images_by_class.png', dpi=150, bbox_inches='tight')
    plt.show()
else:
    print(f"Images directory not found: {images_dir}")
    print("Please download the APTOS dataset first.")

## 5. Ben Graham Preprocessing Visualization

In [None]:
from src.utils.ben_graham import BenGrahamPreprocessor

# Side length used both for the preprocessor output and for resizing the
# original, so the two images can be compared pixel by pixel.
output_size = 512
preprocessor = BenGrahamPreprocessor(output_size=output_size)

# Get sample images
if images_dir.exists():
    n_grades = 5
    fig, axes = plt.subplots(n_grades, 3, figsize=(15, 25))

    # Hide every axis up front so rows skipped below stay blank instead of
    # showing empty frames with ticks.
    for ax in axes.ravel():
        ax.axis('off')

    for grade in range(n_grades):
        grade_df = df[df['diagnosis'] == grade]
        if grade_df.empty:
            # .sample(1) raises on an empty frame; nothing to show for this grade.
            continue

        sample = grade_df.sample(1).iloc[0]
        img_path = images_dir / f"{sample['id_code']}.png"
        if not img_path.exists():
            img_path = images_dir / f"{sample['id_code']}.jpg"
        if not img_path.exists():
            continue

        original_bgr = cv2.imread(str(img_path))
        if original_bgr is None:
            # cv2.imread signals an undecodable file by returning None.
            continue

        # Original, converted to RGB for matplotlib
        original = cv2.cvtColor(original_bgr, cv2.COLOR_BGR2RGB)
        original_resized = cv2.resize(original, (output_size, output_size))

        # Ben Graham processed. The preprocessor is fed the BGR image as read
        # by OpenCV and its result is converted BGR -> RGB for display.
        # NOTE(review): assumes the preprocessor returns a uint8 image of
        # shape (output_size, output_size, 3) - confirm against
        # src.utils.ben_graham; cv2.absdiff below requires matching shapes.
        processed = preprocessor(original_bgr)
        processed = cv2.cvtColor(processed, cv2.COLOR_BGR2RGB)

        # Display
        axes[grade, 0].imshow(original_resized)
        axes[grade, 0].set_title(f'Grade {grade}: Original')

        axes[grade, 1].imshow(processed)
        axes[grade, 1].set_title(f'Grade {grade}: Ben Graham Processed')

        # Difference (enhanced view of changes)
        diff = cv2.absdiff(original_resized, processed)
        axes[grade, 2].imshow(diff)
        axes[grade, 2].set_title(f'Grade {grade}: Difference')

    plt.suptitle('Ben Graham Preprocessing Effect by DR Grade', fontsize=16, y=1.01)
    plt.tight_layout()

    # savefig does not create missing directories, so make sure the target exists.
    fig_dir = Path('../notebooks/figures')
    fig_dir.mkdir(parents=True, exist_ok=True)
    plt.savefig(fig_dir / 'ben_graham_preprocessing.png', dpi=150, bbox_inches='tight')
    plt.show()

## 6. Image Quality Analysis

In [None]:
def compute_image_stats(img_path):
    """Summarise a single image file for quality assessment.

    Returns None when OpenCV cannot decode the file. Otherwise returns a dict
    with the variance of the Laplacian of the grayscale image (a sharpness
    proxy), the mean grayscale intensity, the pixel height and width, and the
    width/height aspect ratio.
    """
    bgr = cv2.imread(str(img_path))
    if bgr is None:
        return None

    n_rows, n_cols = bgr.shape[:2]

    # Both the blur score and the brightness are measured on the grayscale image.
    gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
    sharpness = cv2.Laplacian(gray, cv2.CV_64F).var()
    brightness = np.mean(gray)

    return dict(
        laplacian_var=sharpness,
        mean_brightness=brightness,
        height=n_rows,
        width=n_cols,
        aspect_ratio=n_cols / n_rows,
    )

# Compute stats for a sample of images
if images_dir.exists():
    sample_df = df.sample(min(500, len(df)))

    stats_list = []
    for _, row in tqdm(sample_df.iterrows(), total=len(sample_df), desc="Analyzing images"):
        # Images may be stored as .png or .jpg; try .png first.
        img_path = images_dir / f"{row['id_code']}.png"
        if not img_path.exists():
            img_path = images_dir / f"{row['id_code']}.jpg"

        if img_path.exists():
            stats = compute_image_stats(img_path)
            if stats:
                stats['id_code'] = row['id_code']
                stats['diagnosis'] = row['diagnosis']
                stats_list.append(stats)

    stats_df = pd.DataFrame(stats_list)

    if stats_df.empty:
        # Without any rows the frame has no columns and every plot below
        # would fail with a KeyError.
        print(f"No readable images found in {images_dir}; skipping quality plots.")
    else:
        # Visualize
        fig, axes = plt.subplots(2, 2, figsize=(14, 10))

        # Blur score by class
        sns.boxplot(data=stats_df, x='diagnosis', y='laplacian_var', ax=axes[0, 0], palette='husl')
        axes[0, 0].set_xlabel('DR Grade')
        axes[0, 0].set_ylabel('Laplacian Variance (Sharpness)')
        axes[0, 0].set_title('Image Sharpness by DR Grade')

        # Brightness by class
        sns.boxplot(data=stats_df, x='diagnosis', y='mean_brightness', ax=axes[0, 1], palette='husl')
        axes[0, 1].set_xlabel('DR Grade')
        axes[0, 1].set_ylabel('Mean Brightness')
        axes[0, 1].set_title('Image Brightness by DR Grade')

        # Image dimensions, one scatter series per grade.
        # 'husl' is a seaborn palette name, not a registered matplotlib
        # colormap, so it cannot be passed as cmap=; take explicit colors
        # from the seaborn palette instead (this also yields a legend).
        grade_colors = sns.color_palette('husl', 5)
        for grade, color in enumerate(grade_colors):
            grade_stats = stats_df[stats_df['diagnosis'] == grade]
            axes[1, 0].scatter(grade_stats['width'], grade_stats['height'],
                               color=color, alpha=0.5, label=f'Grade {grade}')
        axes[1, 0].legend(title='DR Grade')
        axes[1, 0].set_xlabel('Width')
        axes[1, 0].set_ylabel('Height')
        axes[1, 0].set_title('Image Dimensions')

        # Aspect ratio distribution
        stats_df['aspect_ratio'].hist(bins=30, ax=axes[1, 1], color='steelblue', edgecolor='black')
        axes[1, 1].set_xlabel('Aspect Ratio (Width/Height)')
        axes[1, 1].set_ylabel('Count')
        axes[1, 1].set_title('Aspect Ratio Distribution')

        plt.tight_layout()

        # savefig does not create missing directories, so make sure the target exists.
        fig_dir = Path('../notebooks/figures')
        fig_dir.mkdir(parents=True, exist_ok=True)
        plt.savefig(fig_dir / 'image_quality_analysis.png', dpi=150, bbox_inches='tight')
        plt.show()

## 7. Summary Statistics

In [None]:
# Header banner
divider = "=" * 50
print(divider)
print("APTOS 2019 Dataset Summary")
print(divider)

n_total = len(df)
print(f"Total samples: {n_total}")
print("Number of classes: 5")

# Per-grade sample count and share of the whole dataset
print("\nClass Distribution:")
for grade, label in enumerate(CLASS_NAMES):
    n_grade = int((df['diagnosis'] == grade).sum())
    share = n_grade / n_total * 100
    print(f"  Grade {grade} ({label}): {n_grade} ({share:.1f}%)")

# Ratio between the most and least frequent grade
imbalance = class_counts.max() / class_counts.min()
print(f"\nClass Imbalance: {imbalance:.1f}x (max/min)")

print("\nRecommendations:")
for tip in (
    "Use weighted sampling or class weights",
    "Apply strong augmentation for minority classes",
    "Consider threshold optimization for regression",
):
    print(f"  - {tip}")