# Phase 5: Exploratory Data Analysis (EDA)

Publication-quality visualizations for the NHANES Diabetes Prediction project.

## Key Questions
1. How do features differ across the no-diabetes, prediabetes, and diabetes groups?
2. What are the strongest correlations with our targets?
3. Are there interesting interactions between features?
4. How does the population differ across survey years?

---

In [None]:
# Standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Add the project's src/ directory (sibling of the working directory) to the import path
import sys
sys.path.insert(0, str(Path.cwd().parent / 'src'))

# Project visualization module
from visualization import (
    set_publication_style,
    plot_target_distribution,
    plot_cohort_flow,
    plot_feature_by_status,
    plot_feature_panel,
    plot_correlation_heatmap,
    plot_top_correlations,
    plot_scatter_by_status,
    plot_interaction_grid,
    plot_prevalence_by_year,
    plot_feature_by_year,
    plot_pca,
    plot_risk_factors,
    calculate_effect_sizes,
    DIABETES_COLORS,
    DIABETES_LABELS
)

# Apply the shared publication styling before any figure is created
set_publication_style()

# Input data and output figures both live under the working directory's parent
DATA_DIR = Path.cwd().parent.joinpath('data')
FIGURES_DIR = DATA_DIR.parent.joinpath('reports', 'figures')
# Create the output folder (and any missing parents); a no-op when it exists
FIGURES_DIR.mkdir(exist_ok=True, parents=True)

print(f"Figures will be saved to: {FIGURES_DIR}")

## 1. Load Data

In [None]:
# Every processed artifact is read from DATA_DIR/processed
processed_dir = DATA_DIR / 'processed'

# Full engineered table (all features plus the target column)
df = pd.read_parquet(processed_dir / 'features_engineered.parquet')
print(f"Dataset shape: {df.shape}")

# Modeling matrices and target, loaded for reference by later sections
X_with_labs = pd.read_parquet(processed_dir / 'X_with_labs.parquet')
X_without_labs = pd.read_parquet(processed_dir / 'X_without_labs.parquet')
y = pd.read_parquet(processed_dir / 'y_with_labs.parquet')

# Header (preceded by a blank line) followed by one shape line per dataset
shape_report = [
    f"\nModeling datasets:",
    f"  X_with_labs: {X_with_labs.shape}",
    f"  X_without_labs: {X_without_labs.shape}",
    f"  y: {y.shape}",
]
print("\n".join(shape_report))

In [None]:
# Tabulate the class counts in code order, then report unlabeled rows separately
target_col = df['DIABETES_STATUS']
class_counts = target_col.value_counts().sort_index()
n_missing = target_col.isna().sum()

print("Target Distribution:")
print(class_counts)
print(f"\nMissing: {n_missing}")

In [None]:
# Derive a readable survey-cycle label from SDDSRVYR when it is not already present
if 'SURVEY_YEAR' not in df.columns:
    # SDDSRVYR: Survey cycle (8 = 2013-2014, 9 = 2015-2016, 10 = 2017-2018).
    # Only cycles 9 and 10 are mapped; any other code ends up as NaN.
    year_map = {9: '2015-2016', 10: '2017-2018'}
    df['SURVEY_YEAR'] = df['SDDSRVYR'].map(year_map)

    # value_counts() below drops NaN, so rows without a label would otherwise
    # vanish silently from every later group-by-year summary. Surface them here.
    n_unlabeled = int(df['SURVEY_YEAR'].isna().sum())
    if n_unlabeled:
        print(
            f"WARNING: {n_unlabeled:,} rows have no SURVEY_YEAR label "
            f"(SDDSRVYR missing or not in {sorted(year_map)})"
        )

    print("Survey year distribution:")
    print(df['SURVEY_YEAR'].value_counts())

---

## 2. Population Overview

### 2.1 Cohort Flow Diagram

In [None]:
# Successive inclusion steps for the flow diagram. Each later step records how
# many participants remain ('n'), how many were dropped and why.
cohort_steps = [
    dict(label='NHANES 2015-2018\nTotal Participants', n=19225),
    dict(label='Adults (Age ≥ 18)', n=11878, excluded=7347, reason='Age < 18'),
    dict(label='Non-Pregnant', n=11723, excluded=155, reason='Pregnant'),
    dict(label='Valid Diabetes Status', n=11698, excluded=25, reason='Missing Target'),
]

cohort_flow_path = FIGURES_DIR / 'cohort_flow.png'
fig = plot_cohort_flow(
    cohort_steps,
    save_path=cohort_flow_path,
    figsize=(12, 10),
    title='Study Population Selection',
)
plt.show()

### 2.2 Target Distribution

In [None]:
# Keep only rows whose diabetes status is known; copy so later edits do not
# touch the original frame
has_status = df['DIABETES_STATUS'].notna()
df_valid = df.loc[has_status].copy()
print(f"Valid samples for analysis: {len(df_valid):,}")

# Class-balance figure for the labeled rows
target_plot_path = FIGURES_DIR / 'target_distribution_final.png'
fig = plot_target_distribution(
    df_valid['DIABETES_STATUS'],
    save_path=target_plot_path,
    figsize=(10, 5),
    title='Diabetes Status Distribution in NHANES 2015-2018',
)
plt.show()

### 2.3 Demographics by Diabetes Status

In [None]:
# Per-status mean, standard deviation and non-null count for a few columns.
# NOTE(review): RIAGENDR looks like a coded categorical; confirm that its
# mean/std are meaningful before quoting them.
demo_vars = ['RIDAGEYR', 'RIAGENDR', 'BMXBMI', 'BMXWAIST']

by_status = df_valid.groupby('DIABETES_STATUS')
summary = by_status[demo_vars].agg(['mean', 'std', 'count'])

# Swap the numeric status codes in the index for their display labels
summary.index = [DIABETES_LABELS[int(code)] for code in summary.index]

# One rounded table per variable, each preceded by a blank line and its name
for column in demo_vars:
    print(f"\n{column}:")
    print(summary[column].round(2))

In [None]:
# Side-by-side violin plots of age and BMI, one violin per diabetes status
status_codes = [0, 1, 2]
status_palette = [DIABETES_COLORS[code] for code in status_codes]
status_names = [DIABETES_LABELS[code] for code in status_codes]

# (column, y-axis label, panel title) for the left and right panels
violin_panels = [
    ('RIDAGEYR', 'Age (years)', 'Age Distribution by Diabetes Status'),
    ('BMXBMI', 'BMI (kg/m²)', 'BMI Distribution by Diabetes Status'),
]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

for ax, (column, ylabel, panel_title) in zip(axes, violin_panels):
    sns.violinplot(
        data=df_valid,
        x='DIABETES_STATUS',
        y=column,
        palette=status_palette,
        ax=ax,
        inner='quartile',
    )
    # NOTE(review): the tick labels assume the violins appear in 0, 1, 2
    # order - confirm against the plotted categories.
    ax.set_xticklabels(status_names)
    ax.set_xlabel('')
    ax.set_ylabel(ylabel)
    ax.set_title(panel_title, fontweight='bold')

plt.tight_layout()
fig.savefig(FIGURES_DIR / 'demographics_by_status.png', dpi=300, bbox_inches='tight')
plt.show()

---

## 3. Feature Differences by Diabetes Status

### 3.1 Key Continuous Features

In [None]:
# Feature columns grouped by category; the panel cells below look up their
# column lists here by category name.
key_features = {
    'Anthropometric': [
        'BMXBMI',
        'BMXWAIST',
        'WAIST_HEIGHT_RATIO',
    ],
    'Blood Pressure': [
        'AVG_SYS_BP',
        'AVG_DIA_BP',
        'PULSE_PRESSURE',
        'MAP',
    ],
    'Weight History': [
        'WEIGHT_CHANGE_10YR',
        'WEIGHT_CHANGE_25',
        'WEIGHT_FROM_MAX',
    ],
    'Dietary': [
        'DR1TKCAL',
        'DR1TSUGR',
        'CARB_FIBER_RATIO',
        'SAT_FAT_PCT',
    ],
    'Laboratory': [
        'LBXGH',
        'ACR_RATIO',
        'TG_HDL_RATIO',
        'NON_HDL_CHOL',
    ],
    'Mental Health': [
        'PHQ9_SCORE',
    ],
    'Sleep': [
        'SLD012',
        'WAKE_TIME_DIFF',
    ],
}

In [None]:
# One panel per anthropometric column, split by diabetes status
anthro_cols = key_features['Anthropometric']
anthro_panel_path = FIGURES_DIR / 'features_anthropometric.png'

fig = plot_feature_panel(
    df_valid,
    anthro_cols,
    save_path=anthro_panel_path,
    suptitle='Anthropometric Measures by Diabetes Status',
    figsize_per_plot=(4.5, 4),
    ncols=3,
    target='DIABETES_STATUS',
)
plt.show()

In [None]:
# One panel per blood-pressure column, split by diabetes status
bp_cols = key_features['Blood Pressure']
bp_panel_path = FIGURES_DIR / 'features_blood_pressure.png'

fig = plot_feature_panel(
    df_valid,
    bp_cols,
    save_path=bp_panel_path,
    suptitle='Blood Pressure Measures by Diabetes Status',
    figsize_per_plot=(4, 4),
    ncols=4,
    target='DIABETES_STATUS',
)
plt.show()

In [None]:
# Laboratory-derived features panel.
# The list is taken from key_features['Laboratory'] rather than retyped, so the
# two cannot drift apart. LBXGH (HbA1c) is left out of this panel; it is
# plotted on its own in the deep-dive section below.
lab_features = [col for col in key_features['Laboratory'] if col != 'LBXGH']

fig = plot_feature_panel(
    df_valid,
    lab_features,
    target='DIABETES_STATUS',
    ncols=3,
    figsize_per_plot=(4.5, 4),
    suptitle='Laboratory Derived Features by Diabetes Status',
    save_path=FIGURES_DIR / 'features_laboratory.png'
)
plt.show()

In [None]:
# One panel per dietary column, split by diabetes status
dietary_cols = key_features['Dietary']
dietary_panel_path = FIGURES_DIR / 'features_dietary.png'

fig = plot_feature_panel(
    df_valid,
    dietary_cols,
    save_path=dietary_panel_path,
    suptitle='Dietary Measures by Diabetes Status',
    figsize_per_plot=(4, 4),
    ncols=4,
    target='DIABETES_STATUS',
)
plt.show()

### 3.2 Individual Feature Deep Dives

In [None]:
# KDE of HbA1c (the primary diagnostic marker) for each diabetes status
hba1c_path = FIGURES_DIR / 'hba1c_kde_by_status.png'
fig = plot_feature_by_status(
    df_valid,
    'LBXGH',
    save_path=hba1c_path,
    figsize=(10, 6),
    xlabel='HbA1c (%)',
    title='HbA1c Distribution by Diabetes Status',
    plot_type='kde',
    target='DIABETES_STATUS',
)

# Overlay the diagnostic thresholds on whichever axes is current after the helper returns
threshold_lines = [
    (5.7, 'orange', 'Prediabetes threshold (5.7%)'),
    (6.5, 'red', 'Diabetes threshold (6.5%)'),
]
ax = plt.gca()
for cutoff, line_color, legend_text in threshold_lines:
    ax.axvline(x=cutoff, color=line_color, linestyle='--', label=legend_text)
ax.legend()

# Write to the same path again so the file on disk includes the threshold lines
plt.savefig(hba1c_path, dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Waist-to-height ratio (key obesity metric): one violin per diabetes status
whr_path = FIGURES_DIR / 'waist_height_ratio_by_status.png'
fig = plot_feature_by_status(
    df_valid,
    'WAIST_HEIGHT_RATIO',
    save_path=whr_path,
    figsize=(10, 6),
    xlabel='Waist-to-Height Ratio',
    title='Waist-to-Height Ratio by Diabetes Status',
    plot_type='violin',
    target='DIABETES_STATUS',
)

# Mark the 0.5 clinical threshold on whichever axes is current after the helper returns
ax = plt.gca()
ax.axhline(y=0.5, color='red', linestyle='--', linewidth=2)
ax.text(2.2, 0.52, 'Elevated risk threshold (0.5)', color='red', fontsize=10)

# Write to the same path again so the file on disk includes the annotation
plt.savefig(whr_path, dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# PHQ-9 depression score: one violin per diabetes status
phq9_path = FIGURES_DIR / 'phq9_by_status.png'
fig = plot_feature_by_status(
    df_valid,
    'PHQ9_SCORE',
    save_path=phq9_path,
    figsize=(10, 6),
    xlabel='PHQ-9 Score',
    title='PHQ-9 Depression Score by Diabetes Status',
    plot_type='violin',
    target='DIABETES_STATUS',
)
plt.show()

---

## 4. Correlation Analysis

### 4.1 Top Features Correlated with Diabetes Status

In [None]:
# Choose the columns to correlate with diabetes status.
# Start from the modeling features and drop:
#   * the *_MISSING indicator flags, for a cleaner analysis;
#   * SEQN, matching the PCA cell, which excludes it from the same matrix as
#     an identifier. If SEQN is not a column this is a no-op.
core_features = [
    c for c in X_with_labs.columns
    if not c.endswith('_MISSING') and c != 'SEQN'
]
print(f"Analyzing {len(core_features)} core features")

In [None]:
# Working copy holding the core features plus the target, labeled rows only
analysis_columns = [*core_features, 'DIABETES_STATUS']
analysis_df = df_valid.loc[:, analysis_columns].copy()

# Rank features by Spearman correlation with the target; the helper returns
# the figure together with the correlations, which later cells reuse
top_corr_path = FIGURES_DIR / 'top_correlations.png'
fig, correlations = plot_top_correlations(
    analysis_df,
    save_path=top_corr_path,
    title='Top 25 Features Correlated with Diabetes Status',
    figsize=(10, 12),
    method='spearman',
    n_top=25,
    target='DIABETES_STATUS',
)
plt.show()

In [None]:
# Ten largest and ten smallest correlation values, each from its own sort
top_pos = correlations.sort_values(ascending=False).head(10)
top_neg = correlations.sort_values(ascending=True).head(10)

# Print both rankings in the same "  feature: value" layout
rankings = (
    ("Top 10 POSITIVE correlations with diabetes:", top_pos),
    ("\nTop 10 NEGATIVE correlations with diabetes:", top_neg),
)
for heading, ranked in rankings:
    print(heading)
    for feat, corr in ranked.items():
        print(f"  {feat}: {corr:.3f}")

### 4.2 Correlation Heatmap by Feature Category

In [None]:
# Candidate columns for the heatmap: a readable subset of key features plus
# the target, listed category by category
heatmap_candidates = (
    ['RIDAGEYR']                                              # Demographics
    + ['BMXBMI', 'BMXWAIST', 'WAIST_HEIGHT_RATIO']            # Anthropometric
    + ['AVG_SYS_BP', 'AVG_DIA_BP', 'PULSE_PRESSURE']          # Blood pressure
    + ['WEIGHT_CHANGE_10YR', 'WEIGHT_FROM_MAX']               # Weight history
    + ['DR1TKCAL', 'CARB_FIBER_RATIO', 'SAT_FAT_PCT']         # Dietary
    + ['ACR_RATIO', 'TG_HDL_RATIO', 'NON_HDL_CHOL']           # Laboratory
    + ['PHQ9_SCORE', 'SLD012']                                # Sleep / mental health
    + ['DIABETES_STATUS']                                     # Target
)

# Silently skip any candidate that is not a column of df_valid
heatmap_features = [name for name in heatmap_candidates if name in df_valid.columns]

heatmap_path = FIGURES_DIR / 'correlation_heatmap.png'
fig = plot_correlation_heatmap(
    df_valid,
    features=heatmap_features,
    save_path=heatmap_path,
    cluster=True,
    title='Feature Correlation Matrix (Key Features)',
    figsize=(14, 12),
    method='spearman',
)
plt.show()

---

## 5. Feature Interactions

### 5.1 BMI × Age Interaction

In [None]:
# BMI against age, colored by diabetes status, using a 50% sample and
# regression overlays
bmi_age_path = FIGURES_DIR / 'scatter_bmi_age.png'
fig = plot_scatter_by_status(
    df_valid,
    x='RIDAGEYR',
    y='BMXBMI',
    save_path=bmi_age_path,
    add_regression=True,
    sample_frac=0.5,
    alpha=0.4,
    figsize=(12, 8),
    title='BMI vs Age by Diabetes Status',
    target='DIABETES_STATUS',
)
plt.show()

### 5.2 Metabolic Interactions

In [None]:
# Waist-height ratio against TG/HDL ratio (both insulin resistance markers)
metabolic_path = FIGURES_DIR / 'scatter_metabolic.png'
fig = plot_scatter_by_status(
    df_valid,
    x='WAIST_HEIGHT_RATIO',
    y='TG_HDL_RATIO',
    save_path=metabolic_path,
    sample_frac=0.5,
    alpha=0.5,
    figsize=(12, 8),
    title='Insulin Resistance Markers: Waist-Height vs TG/HDL Ratio',
    target='DIABETES_STATUS',
)

# Clinical threshold guides, drawn on whichever axes is current after the helper returns
ax = plt.gca()
ax.axvline(x=0.5, color='gray', linestyle='--', alpha=0.5)
ax.axhline(y=3.0, color='gray', linestyle='--', alpha=0.5)

# Label each guide near the upper / right edge of the current axis limits
y_top = ax.get_ylim()[1]
ax.text(0.51, y_top*0.95, 'Elevated\nwaist-height', fontsize=9, color='gray')
x_right = ax.get_xlim()[1]
ax.text(x_right*0.85, 3.2, 'Elevated TG/HDL', fontsize=9, color='gray')

# Write to the same path again so the file on disk includes the guides
plt.savefig(metabolic_path, dpi=300, bbox_inches='tight')
plt.show()

### 5.3 Multi-Feature Interaction Grid

In [None]:
# Grid of interactions between four key features, on a 30% sample
interaction_features = [
    'RIDAGEYR',
    'BMXBMI',
    'WAIST_HEIGHT_RATIO',
    'AVG_SYS_BP',
]
interaction_grid_path = FIGURES_DIR / 'feature_interactions.png'

fig = plot_interaction_grid(
    df_valid,
    interaction_features,
    save_path=interaction_grid_path,
    sample_frac=0.3,
    figsize_per_plot=(3.5, 3.5),
    target='DIABETES_STATUS',
)
plt.show()

---

## 6. Temporal Analysis

### 6.1 Prevalence by Survey Year

In [None]:
# Diabetes status prevalence for each survey cycle
prevalence_plot_path = FIGURES_DIR / 'prevalence_by_year.png'
fig = plot_prevalence_by_year(
    df_valid,
    save_path=prevalence_plot_path,
    title='Diabetes Prevalence by Survey Year (NHANES 2015-2018)',
    figsize=(10, 6),
    target='DIABETES_STATUS',
    year_col='SURVEY_YEAR',
)
plt.show()

In [None]:
# Exact class shares (in percent) within each survey cycle
status_by_year = df_valid.groupby('SURVEY_YEAR')['DIABETES_STATUS']
class_shares = status_by_year.apply(
    lambda statuses: statuses.value_counts(normalize=True) * 100
)

# Pivot the status codes into columns, then show their display labels instead
prevalence = class_shares.unstack()
prevalence.columns = [DIABETES_LABELS[int(code)] for code in prevalence.columns]

print("Prevalence by Survey Year (%):")
print(prevalence.round(2))

### 6.2 Feature Distributions by Year

In [None]:
# BMI distribution for each survey cycle
bmi_year_path = FIGURES_DIR / 'bmi_by_year.png'
fig = plot_feature_by_year(
    df_valid,
    save_path=bmi_year_path,
    title='BMI Distribution by Survey Year',
    figsize=(8, 5),
    year_col='SURVEY_YEAR',
    feature='BMXBMI',
)
plt.show()

In [None]:
# Age distribution for each survey cycle (a check on sampling consistency)
age_year_path = FIGURES_DIR / 'age_by_year.png'
fig = plot_feature_by_year(
    df_valid,
    save_path=age_year_path,
    title='Age Distribution by Survey Year',
    figsize=(8, 5),
    year_col='SURVEY_YEAR',
    feature='RIDAGEYR',
)
plt.show()

---

## 7. Dimensionality Reduction

### 7.1 PCA Analysis

In [None]:
# PCA inputs: every modeling column except the SEQN identifier and the
# *_MISSING indicator flags
pca_features = [
    col for col in X_with_labs.columns
    if col != 'SEQN' and not col.endswith('_MISSING')
]

# Attach the target by position (.values ignores the index).
# NOTE(review): assumes y rows are in the same order as X_with_labs - confirm.
pca_df = X_with_labs.loc[:, pca_features].copy()
pca_df['DIABETES_STATUS'] = y['DIABETES_STATUS'].values

print(f"Features for PCA: {len(pca_features)}")

In [None]:
# Fit and plot a 10-component PCA; the helper returns the figure and a results
# object that the next cell reads 'explained_variance_ratio' from
pca_plot_path = FIGURES_DIR / 'pca_analysis.png'
fig, pca_results = plot_pca(
    pca_df,
    features=pca_features,
    save_path=pca_plot_path,
    figsize=(14, 5),
    n_components=10,
    target='DIABETES_STATUS',
)
plt.show()

In [None]:
# Per-component and running-total explained variance for the first ten components
print("Variance Explained by Principal Components:")
leading_ratios = pca_results['explained_variance_ratio'][:10]
cumulative_pct = 0
for component, ratio in enumerate(leading_ratios, start=1):
    cumulative_pct += ratio * 100
    print(f"  PC{component}: {ratio*100:.1f}% (cumulative: {cumulative_pct:.1f}%)")

---

## 8. Risk Factor Summary

### 8.1 Effect Sizes (Cohen's d)

In [None]:
# Candidate columns for the No Diabetes (0) vs Diabetes (2) effect-size comparison
effect_candidates = (
    ['RIDAGEYR', 'BMXBMI', 'BMXWAIST', 'WAIST_HEIGHT_RATIO']
    + ['AVG_SYS_BP', 'AVG_DIA_BP', 'PULSE_PRESSURE', 'MAP']
    + ['WEIGHT_CHANGE_10YR', 'WEIGHT_CHANGE_25']
    + ['DR1TKCAL', 'CARB_FIBER_RATIO', 'SAT_FAT_PCT']
    + ['ACR_RATIO', 'TG_HDL_RATIO', 'NON_HDL_CHOL']
    + ['PHQ9_SCORE', 'SLD012', 'PAD680']
)

# Silently skip any candidate that is not a column of df_valid
effect_features = [name for name in effect_candidates if name in df_valid.columns]

status_pair = (0, 2)  # No Diabetes vs Diabetes
effect_df = calculate_effect_sizes(
    df_valid,
    effect_features,
    comparison=status_pair,
    target='DIABETES_STATUS',
)

print(f"Calculated effect sizes for {len(effect_df)} features")
# Final expression of the cell: the notebook displays the table, largest effect first
effect_df.sort_values('effect_size', ascending=False)

In [None]:
# Forest plot of the effect sizes computed in the previous cell
forest_path = FIGURES_DIR / 'risk_factors_forest.png'
fig = plot_risk_factors(
    effect_df,
    save_path=forest_path,
    figsize=(10, 10),
    title='Risk Factors for Diabetes\n(Effect Size: No Diabetes vs Diabetes)',
)
plt.show()

---

## 9. Summary Dashboard

In [None]:
# Summary dashboard: one 3x3 figure combining the main EDA views.
# Needs `df_valid` (with SURVEY_YEAR) and the `correlations` series produced by
# the top-correlations cell, so run those cells first.
fig = plt.figure(figsize=(16, 12))

# Grid layout
gs = fig.add_gridspec(3, 3, hspace=0.3, wspace=0.3)
status_codes = [0, 1, 2]

# 1. Target distribution (top left)
ax1 = fig.add_subplot(gs[0, 0])
counts = df_valid['DIABETES_STATUS'].value_counts().sort_index()
bars = ax1.bar(
    [DIABETES_LABELS[int(i)] for i in counts.index],
    counts.values,
    color=[DIABETES_COLORS[int(i)] for i in counts.index],
    edgecolor='white',
)
ax1.set_ylabel('Count')
ax1.set_title('Target Distribution', fontweight='bold')
for bar, count in zip(bars, counts.values):
    # Count label centered just above each bar
    ax1.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 50, f'{count:,}',
             ha='center', fontsize=9)

# 2-3. Age (top middle) and BMI (top right) histograms, overlaid per status.
# Both columns go through dropna() so missing values never reach hist().
hist_panels = [
    (gs[0, 1], 'RIDAGEYR', 'Age (years)', 'Age Distribution by Status'),
    (gs[0, 2], 'BMXBMI', 'BMI (kg/m²)', 'BMI Distribution by Status'),
]
for grid_cell, column, axis_label, panel_title in hist_panels:
    hist_ax = fig.add_subplot(grid_cell)
    for status in status_codes:
        subset = df_valid.loc[df_valid['DIABETES_STATUS'] == status, column].dropna()
        hist_ax.hist(subset, bins=30, alpha=0.5, color=DIABETES_COLORS[status],
                     label=DIABETES_LABELS[status])
    hist_ax.set_xlabel(axis_label)
    hist_ax.set_ylabel('Frequency')
    hist_ax.set_title(panel_title, fontweight='bold')
    hist_ax.legend(fontsize=8)

# 4. Key metrics by status (middle row, spans 2 columns)
ax4 = fig.add_subplot(gs[1, :2])
key_vars = ['BMXBMI', 'AVG_SYS_BP', 'WAIST_HEIGHT_RATIO', 'PHQ9_SCORE']
key_vars = [v for v in key_vars if v in df_valid.columns]
x = np.arange(len(key_vars))
width = 0.25
# The per-feature maximum is the same for every status group, so compute it once
max_vals = [df_valid[v].max() for v in key_vars]
for i, status in enumerate(status_codes):
    group = df_valid[df_valid['DIABETES_STATUS'] == status]
    means = [group[v].mean() for v in key_vars]
    # Express each group mean as a percentage of the feature's overall maximum
    # so features on different scales can share one axis
    norm_means = [m / mx * 100 for m, mx in zip(means, max_vals)]
    # Offsets of -width, 0, +width place the three status bars side by side
    ax4.bar(x + (i - 1) * width, norm_means, width, color=DIABETES_COLORS[status],
            label=DIABETES_LABELS[status], edgecolor='white')
ax4.set_xticks(x)
ax4.set_xticklabels(key_vars)
ax4.set_ylabel('Normalized Value (% of max)')
ax4.set_title('Key Features by Diabetes Status', fontweight='bold')
ax4.legend()

# 5. Prevalence by year (middle right): share of rows with status == 2 per cycle
ax5 = fig.add_subplot(gs[1, 2])
prev = df_valid.groupby('SURVEY_YEAR')['DIABETES_STATUS'].apply(
    lambda statuses: (statuses == 2).mean() * 100
)
ax5.bar(prev.index, prev.values, color='#b2182b', edgecolor='white')
ax5.set_ylabel('Diabetes Prevalence (%)')
ax5.set_title('Diabetes Prevalence by Year', fontweight='bold')
for i, val in enumerate(prev.values):
    # Bars sit at positions 0..n-1, so the position index is the x coordinate
    ax5.text(i, val + 0.3, f'{val:.1f}%', ha='center', fontsize=10)

# 6. Top correlations (bottom row, spans all): largest |correlation| first
ax6 = fig.add_subplot(gs[2, :])
top_n = 10
top_corr = correlations.reindex(correlations.abs().sort_values(ascending=False).index)[:top_n]
# Red for positive correlations, blue otherwise
colors = ['#b2182b' if v > 0 else '#2166ac' for v in top_corr.values]
ax6.barh(top_corr.index, top_corr.values, color=colors, edgecolor='white')
ax6.axvline(x=0, color='black', linewidth=0.8)
ax6.set_xlabel('Spearman Correlation')
ax6.set_title(f'Top {top_n} Features Correlated with Diabetes Status', fontweight='bold')
# Put the strongest correlation at the top of the chart
ax6.invert_yaxis()

fig.suptitle('NHANES Diabetes Prediction: EDA Summary', fontsize=16, fontweight='bold', y=1.02)

plt.tight_layout()
fig.savefig(FIGURES_DIR / 'eda_summary_dashboard.png', dpi=300, bbox_inches='tight')
plt.show()

---

## 10. Key Findings Summary

### Population Characteristics
- **Sample Size**: 11,698 adults with valid diabetes status
- **Prevalence**: ~19% diabetes, ~32% prediabetes, ~49% no diabetes
- **Survey Years**: 2015-2016 and 2017-2018 (stable prevalence)

### Strongest Risk Factors (by correlation)
1. **Age** - Strong positive correlation with diabetes status
2. **BMI / Waist circumference** - Central obesity strongly associated
3. **Blood pressure** - Elevated in diabetic group
4. **TG/HDL Ratio** - Insulin resistance marker

### Key Observations
1. **Clear separation by status** - Most features show progressive increase from no diabetes → prediabetes → diabetes
2. **Metabolic clustering** - Obesity, hypertension, and dyslipidemia cluster together
3. **Mental health link** - Higher depression scores in diabetic group
4. **Stable temporal trends** - No major shifts between 2015-2016 and 2017-2018

In [None]:
# Print the file name of every PNG in the figures directory, in sorted path order
saved_pngs = sorted(FIGURES_DIR.glob('*.png'))

print("\nSaved Figures:")
print("=" * 50)
for png_name in [png.name for png in saved_pngs]:
    print(f"  {png_name}")

In [None]:
# Closing report: how many PNGs are in the figures directory, and where it is
n_figures = sum(1 for _ in FIGURES_DIR.glob('*.png'))

print("\nPhase 5 EDA Complete!")
print("="*50)
print(f"Total figures generated: {n_figures}")
print(f"Output directory: {FIGURES_DIR}")