# ICU Mortality Model - Dataset Statistics

This notebook loads the event-wide dataset and computes comprehensive statistics for the entire dataset.

## Objective
- Load the event-wide dataset produced by 02_feature_engineering.ipynb
- Calculate min, max, mean, median, and missing percentage for all numeric features
- Create a single-row summary DataFrame with all statistics
- Save results for reference

## Statistics Computed
- **Min/Max**: Minimum and maximum values for each numeric feature
- **Mean/Median**: Central tendency measures
- **Missing %**: Percentage of missing values for each feature

## Setup and Configuration

In [None]:
import sys
import os
sys.path.append(os.path.join('..', 'src'))

import pandas as pd
import numpy as np
import json
import warnings
warnings.filterwarnings('ignore')

# Announce what this notebook does (two banner lines, one per row)
print(
    "=== ICU Mortality Model - Dataset Statistics ===",
    "Computing comprehensive statistics for preprocessed datasets...",
    sep="\n",
)

# Input/output locations, expressed relative to the notebook's working directory.
# The statistics file is written into the same preprocessing directory.
data_path = os.path.join('..', '..', 'protected_outputs', 'preprocessing')
output_path = os.path.join(data_path, 'dataset_statistics.parquet')

## Load Event-Wide Dataset

In [None]:
# Location of the event-wide table written by 02_feature_engineering.ipynb
event_wide_path = os.path.join('..', '..', 'protected_outputs', 'preprocessing', 'by_event_wide_df.parquet')

# Fail fast with an actionable message when the upstream output is missing
if not os.path.exists(event_wide_path):
    raise FileNotFoundError(f"Event-wide dataset not found at {event_wide_path}. Please run 02_feature_engineering.ipynb first.")

event_wide_df = pd.read_parquet(event_wide_path)

# Quick sanity report on what was just loaded
hospitalization_count = event_wide_df['hospitalization_id'].nunique()
event_times = event_wide_df['event_time']
memory_mb = event_wide_df.memory_usage(deep=True).sum() / 1024**2

print(f"✅ Loaded event-wide dataset: {event_wide_df.shape}")
print(f"Hospitalizations: {hospitalization_count}")
print(f"Time range: {event_times.min()} to {event_times.max()}")
print(f"Memory usage: {memory_mb:.1f} MB")

# Basic size information
print("\nDataset info:")
print(f"Total records: {len(event_wide_df):,}")
print(f"Total columns: {len(event_wide_df.columns)}")
# NOTE(review): this is the mean of 'disposition' over all rows, not over
# unique hospitalizations — confirm this is the intended mortality rate.
print(f"Mortality rate: {event_wide_df['disposition'].mean():.3f}")

## Identify Numeric Columns

In [None]:
# Columns left out of the per-feature statistics: the hospitalization
# identifier, the time columns, and 'disposition' (presumably the outcome
# label — it is what the notebook reports as the mortality rate).
exclude_columns = [
    'hospitalization_id',
    'event_time',
    'hour_24_start_dttm',
    'hour_24_end_dttm',
    'disposition',
]

# Keep every numeric-dtype column that is not explicitly excluded
numeric_columns = [
    name
    for name in event_wide_df.select_dtypes(include=[np.number]).columns.tolist()
    if name not in exclude_columns
]

print(f"✅ Identified {len(numeric_columns)} numeric columns for statistics")
print(f"Total features to analyze: {len(numeric_columns)}")

# Preview the first few columns together with their non-null counts
preview_limit = 10
print("\nSample numeric columns:")
for name in numeric_columns[:preview_limit]:
    populated = event_wide_df[name].notna().sum()
    print(f"  {name}: {populated:,} non-null values")

hidden_count = len(numeric_columns) - preview_limit
if hidden_count > 0:
    print(f"  ... and {hidden_count} more columns")

## Calculate Comprehensive Statistics

In [None]:
# Compute five summary statistics for every numeric feature and collect them
# in one flat dictionary keyed as "<column>_<statistic>".
print("Calculating comprehensive statistics for all numeric features...")

stats_dict = {}

# Order matters: it fixes the column order of the summary built from stats_dict
stat_names = ("min", "max", "mean", "median", "missing_pct")

for feature in numeric_columns:
    try:
        series = event_wide_df[feature]
        feature_stats = {
            "min": series.min(),
            "max": series.max(),
            "mean": series.mean(),
            "median": series.median(),
            # Share of rows where the feature is NaN, as a percentage
            "missing_pct": (series.isna().sum() / len(series)) * 100,
        }
    except Exception as err:
        # Best effort: report the failure and record NaN for all five statistics
        print(f"Warning: Could not calculate statistics for {feature}: {str(err)}")
        feature_stats = dict.fromkeys(stat_names, np.nan)

    for stat in stat_names:
        stats_dict[f"{feature}_{stat}"] = feature_stats[stat]

print(f"✅ Calculated statistics for {len(numeric_columns)} features")
print(f"Total statistics computed: {len(stats_dict)}")
print("Statistics per feature: 5 (min, max, mean, median, missing%)")

## Create Single-Row Summary DataFrame

In [None]:
# Turn the flat statistics dictionary into a one-row DataFrame and append
# dataset-level metadata columns.
print("Creating single-row summary DataFrame...")

# A list holding one dict yields exactly one row; keys become columns
summary_df = pd.DataFrame([stats_dict])

# Dataset-level metadata, appended after the per-feature statistics
metadata = {
    'total_records': len(event_wide_df),
    'total_hospitalizations': event_wide_df['hospitalization_id'].nunique(),
    'total_features_analyzed': len(numeric_columns),
    'overall_mortality_rate': event_wide_df['disposition'].mean(),
    'analysis_timestamp': pd.Timestamp.now(),
}
for meta_name, meta_value in metadata.items():
    summary_df[meta_name] = meta_value

print(f"✅ Created summary DataFrame: {summary_df.shape}")
print(f"Total columns in summary: {len(summary_df.columns)}")

# Show the first 15 statistic columns, transposed so each statistic is a row
print("\nSample statistics (first few features):")
stat_suffixes = ('_min', '_max', '_mean', '_median', '_missing_pct')
sample_cols = [name for name in summary_df.columns if name.endswith(stat_suffixes)][:15]
if sample_cols:
    print(summary_df[sample_cols].T.to_string())

## Summary Statistics Overview

In [None]:
# Print a human-readable overview of the single-row summary: dataset-level
# metadata first, then how missing data is distributed across features.
print("=== Dataset Statistics Overview ===")

# Dataset metadata (summary_df has one row, so read position 0)
print(f"Dataset size: {summary_df['total_records'].iloc[0]:,} records")
print(f"Hospitalizations: {summary_df['total_hospitalizations'].iloc[0]:,}")
print(f"Features analyzed: {summary_df['total_features_analyzed'].iloc[0]}")
print(f"Overall mortality rate: {summary_df['overall_mortality_rate'].iloc[0]:.3f}")

# Missing-data summary. Everything that reads `missing_values` lives inside
# this guard so the cell does not fail with a NameError when no
# '*_missing_pct' columns exist (e.g. no numeric features were analyzed).
missing_suffix = '_missing_pct'
missing_pct_cols = [col for col in summary_df.columns if col.endswith(missing_suffix)]
if missing_pct_cols:
    missing_values = summary_df[missing_pct_cols].iloc[0]
    print(f"\nMissing data overview:")
    # Note: the "<10%" bucket also counts features with no missing data
    print(f"  Features with no missing data: {(missing_values == 0).sum()}")
    print(f"  Features with <10% missing: {(missing_values < 10).sum()}")
    print(f"  Features with 10-50% missing: {((missing_values >= 10) & (missing_values < 50)).sum()}")
    print(f"  Features with >50% missing: {(missing_values >= 50).sum()}")
    print(f"  Average missing percentage: {missing_values.mean():.1f}%")

    # Show the five features with the lowest and highest missing percentages
    lowest_missing = missing_values.nsmallest(5)
    highest_missing = missing_values.nlargest(5)
    for label, ranked in (("lowest", lowest_missing), ("highest", highest_missing)):
        print(f"\nFeatures with {label} missing data:")
        for feature, pct in ranked.items():
            # Strip only the trailing suffix; str.replace would also remove
            # the same text if it appeared inside the feature name.
            feature_name = feature[:-len(missing_suffix)]
            print(f"  {feature_name}: {pct:.1f}% missing")
else:
    print("\nNo missing-percentage columns found; skipping missing data overview.")

## Save Results

In [None]:
# Save the single-row summary to parquet, falling back to CSV if that fails.
# The file goes into the preprocessing output directory; `data_path` is set
# in the setup cell and is the directory the input dataset was read from.
output_dir = data_path
output_path = os.path.join(output_dir, 'dataset_statistics.parquet')

try:
    summary_df.to_parquet(output_path, index=False)
    print(f"✅ Saved dataset statistics to: {output_path}")

    # Verify file was saved by reading its size back from disk
    file_size = os.path.getsize(output_path) / 1024  # KB
    print(f"File size: {file_size:.1f} KB")

except Exception as e:
    # Best effort: report the parquet failure, then keep the results as CSV.
    # A failure of the CSV write itself is allowed to propagate.
    print(f"❌ Error saving statistics: {str(e)}")
    csv_path = os.path.join(output_dir, 'dataset_statistics.csv')
    summary_df.to_csv(csv_path, index=False)
    print(f"✅ Saved as CSV instead: {csv_path}")

# Final recap; per-feature statistic columns are recognised by their suffix
stat_suffixes = ('_min', '_max', '_mean', '_median', '_missing_pct')
stat_column_count = sum(1 for col in summary_df.columns if col.endswith(stat_suffixes))

print("\n=== Analysis Complete ===")
print(f"Summary DataFrame shape: {summary_df.shape}")
print(f"Features analyzed: {len(numeric_columns)}")
print(f"Total statistics generated: {stat_column_count}")