# Data Preprocessing Pipeline - Disaster Early Warning System

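This notebook implements the complete data preprocessing pipeline:
1. Load raw weather datasets
2. Select and combine the primary datasets
3. Create synthetic disaster labels
4. Engineer features for ML training
5. Analyze feature importance
6. Prepare the feature matrix (data cleaning via median imputation)
7. Perform train/test split and feature standardization
8. Export processed data to SQLite
9. Verify data quality and feature distributions
10. Summarize the pipeline and next steps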

**Output**: Clean, labeled, and feature-engineered dataset ready for ML training

In [None]:
import sys
import os

# Add backend modules to path
sys.path.append('../backend')
sys.path.append('../')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

# Import our custom modules
from backend.utils.data_loader import DataLoader
from backend.utils.disaster_labeler import DisasterLabeler
from backend.utils.feature_engineer import FeatureEngineer

# Plot defaults shared by every figure in this notebook
plt.style.use('default')
sns.set_palette('husl')
plt.rcParams.update({'figure.figsize': (12, 8)})

# Report the environment so path problems surface early
startup_messages = (
    "✓ All modules imported successfully!",
    f"Working directory: {os.getcwd()}",
)
for message in startup_messages:
    print(message)

## 1. Load Raw Weather Datasets

In [None]:
# Build the loader against the raw dataset directory
data_loader = DataLoader(dataset_dir='../dataset')

# Read every dataset the loader provides
print("Loading weather datasets...")
datasets = data_loader.load_weather_datasets()

# Per-dataset overview: shape, numeric columns, missing values, memory footprint
print("\nDataset Summary:")
print("=" * 60)
dataset_info = data_loader.get_dataset_info()

for name, info in dataset_info.items():
    numeric_cols = info['numeric_columns']
    report_lines = [
        f"\n{name.upper()}:",
        f"  Shape: {info['shape']}",
        f"  Numeric columns: {len(numeric_cols)}",
        f"  Missing values: {info['missing_values']}",
        f"  Memory usage: {info['memory_usage_mb']:.2f} MB",
        f"  Key columns: {numeric_cols[:5]}",  # first five numeric columns only
    ]
    print("\n".join(report_lines))

## 2. Select Primary Dataset for Processing

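Based on the exploration, we prefer the `classification`, `rain_prediction`, and `global` datasets as sources for ML training. Every preferred dataset that was loaded is selected, and they are combined when more than one is available; if none of them was loaded, the first available dataset is used as a fallback.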

In [None]:
# Select the datasets to process.
# Preferred sources for ML training; any of them that was loaded is used.
primary_datasets = ['classification', 'rain_prediction', 'global']

# Keep only the preferred datasets that are actually present in `datasets`
selected_datasets = [name for name in primary_datasets if name in datasets]
for dataset_name in selected_datasets:
    print(f"✓ Selected {dataset_name}: {datasets[dataset_name].shape}")

if not selected_datasets:
    # With nothing loaded at all there is no fallback to pick; fail with an
    # explicit message instead of an IndexError from indexing an empty list.
    if not datasets:
        raise ValueError(
            "No weather datasets were loaded; check the dataset directory passed to DataLoader."
        )
    print("⚠ No suitable datasets found. Using first available dataset.")
    selected_datasets = [next(iter(datasets))]

# Combine selected datasets if multiple are available
if len(selected_datasets) > 1:
    print(f"\nCombining {len(selected_datasets)} datasets...")
    combined_df = data_loader.get_combined_dataset(selected_datasets)
    print(f"Combined dataset shape: {combined_df.shape}")
else:
    # Copy so later processing never mutates the loader's cached frame
    combined_df = datasets[selected_datasets[0]].copy()
    print(f"Using single dataset: {combined_df.shape}")

# Display basic info about combined dataset
print(f"\nCombined Dataset Info:")
print(f"Shape: {combined_df.shape}")
print(f"Columns: {list(combined_df.columns)}")
print(f"\nFirst few rows:")
print(combined_df.head())

## 3. Create Synthetic Disaster Labels

In [None]:
# Label every record with synthetic disaster information, then summarize
# how many disasters were produced and of which type.
disaster_labeler = DisasterLabeler()

print("Creating synthetic disaster labels...")
labeled_df = disaster_labeler.create_disaster_labels(
    combined_df,
    region_type='temperate'  # Can be adjusted based on data
)

# Boolean mask of rows flagged as a disaster; reused for every statistic below
is_disaster = labeled_df['disaster_occurred'] == 1

# Analyze disaster distribution
disaster_stats = {
    'total_records': len(labeled_df),
    'disaster_records': is_disaster.sum(),
    'disaster_percentage': is_disaster.mean() * 100
}

print(f"\nDisaster Labeling Results:")
print(f"Total records: {disaster_stats['total_records']:,}")
print(f"Disaster records: {disaster_stats['disaster_records']:,}")
print(f"Disaster percentage: {disaster_stats['disaster_percentage']:.2f}%")

# Show disaster type distribution
if disaster_stats['disaster_records'] > 0:
    disaster_types = labeled_df.loc[is_disaster, 'disaster_type'].value_counts()
    print(f"\nDisaster Type Distribution:")
    for disaster_type, count in disaster_types.items():
        percentage = (count / disaster_stats['disaster_records']) * 100
        print(f"  {disaster_type}: {count} ({percentage:.1f}%)")

    # Plot disaster distribution
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # Disaster occurrence pie chart.
    # value_counts() sorts by frequency, so the wedge order depends on the data.
    # Sort by label value and derive each wedge label from the value it counts;
    # this keeps labels correct when disasters are the majority class and when
    # only one class is present (a fixed two-item label list would not match).
    disaster_counts = labeled_df['disaster_occurred'].value_counts().sort_index()
    pie_labels = [
        'Disaster' if flag == 1 else 'No Disaster'
        for flag in disaster_counts.index
    ]
    ax1.pie(disaster_counts.values, labels=pie_labels, autopct='%1.1f%%', startangle=90)
    ax1.set_title('Disaster Occurrence Distribution')

    # Disaster type bar chart
    disaster_types.plot(kind='bar', ax=ax2, color='coral')
    ax2.set_title('Disaster Type Distribution')
    ax2.set_ylabel('Count')
    ax2.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()
else:
    print("⚠ No disasters identified with current thresholds. Consider adjusting criteria.")

## 4. Engineer Features for ML Training

In [None]:
# Run feature engineering on the labeled data and report what was added.
feature_engineer = FeatureEngineer(window_size=7)

print("Engineering features for ML training...")
featured_df = feature_engineer.engineer_features(
    labeled_df,
    datetime_col=None,  # Add if datetime column exists
    location_col='source_dataset' if 'source_dataset' in labeled_df.columns else None
)

print(f"\nFeature Engineering Results:")
print(f"Original columns: {len(labeled_df.columns)}")
print(f"After feature engineering: {len(featured_df.columns)}")

# Engineered features = columns present after engineering but not before.
# Preserve the DataFrame's column order (deduplicated) rather than going
# through a set, whose iteration order for strings varies between kernel
# restarts and would make this report non-reproducible.
original_cols = set(labeled_df.columns)
engineered_features = list(
    dict.fromkeys(col for col in featured_df.columns if col not in original_cols)
)

print(f"\nEngineered Features ({len(engineered_features)}):")
for i, feature in enumerate(engineered_features, 1):
    column = featured_df[feature]
    if pd.api.types.is_numeric_dtype(column):
        print(f"{i:2d}. {feature}: {column.mean():.3f} ± {column.std():.3f}")
    else:
        # mean/std are undefined for non-numeric columns; report the dtype instead
        print(f"{i:2d}. {feature}: (non-numeric, dtype={column.dtype})")

# Create feature summary
feature_summary = feature_engineer.create_feature_summary(featured_df)
print(f"\nFeature Summary Statistics:")
print(f"Total engineered features: {feature_summary['total_features']}")
print(f"Features with valid statistics: {len(feature_summary['feature_statistics'])}")

## 5. Feature Importance Analysis

In [None]:
# Feature importance is only computed when at least one disaster is labeled
has_disaster_labels = (
    'disaster_occurred' in featured_df.columns
    and featured_df['disaster_occurred'].sum() > 0
)

if not has_disaster_labels:
    print("⚠ Skipping feature importance analysis (no disaster labels found)")
else:
    print("Calculating feature importance...")

    # Ranking of features against the binary disaster target
    importance_df = feature_engineer.get_feature_importance_ranking(
        featured_df, target_col='disaster_occurred'
    )

    if importance_df.empty:
        print("⚠ Could not calculate feature importance")
    else:
        top_features = importance_df.head(10)
        top_10 = importance_df.head(10)

        # Text report of the ten highest-ranked features
        print("\nTop 10 Most Important Features:")
        print("=" * 60)
        for feature_name, correlation in zip(top_features['feature'], top_features['correlation']):
            print(f"{feature_name:25s}: {correlation:.4f}")

        # Horizontal bar chart of the same ten features, best at the top
        fig, ax = plt.subplots(figsize=(12, 8))
        bar_positions = range(len(top_10))
        ax.barh(bar_positions, top_10['correlation'], color='skyblue')
        ax.set_yticks(bar_positions)
        ax.set_yticklabels(top_10['feature'])
        ax.set_xlabel('Absolute Correlation with Disaster Occurrence')
        ax.set_title('Top 10 Feature Importance (Correlation with Disasters)')
        ax.invert_yaxis()
        fig.tight_layout()
        plt.show()

## 6. Prepare Data for ML Training

In [None]:
# Build the feature matrix X and target vector y used for ML training.
print("Preparing data for ML training...")

# Define feature columns (engineered + original weather variables)
weather_features = ['temperature', 'pressure', 'wind_speed', 'precipitation', 'humidity']
engineered_feature_names = [
    'pressure_drop_7d', 'wind_spike_max', 'rain_accumulation_7d',
    'humidity_trend', 'temp_deviation', 'pressure_velocity', 'wind_gust_ratio'
]

# Keep only the candidate features that exist in the engineered frame
available_features = [
    feature
    for feature in weather_features + engineered_feature_names
    if feature in featured_df.columns
]

print(f"Available features for ML: {len(available_features)}")
print(f"Features: {available_features}")

# Prepare feature matrix and target vector
if available_features and 'disaster_occurred' in featured_df.columns:
    X = featured_df[available_features].copy()
    y = featured_df['disaster_occurred'].copy()

    # Infinite values are not touched by fillna() and would flow unchanged into
    # scaling/training. Convert them to NaN so the median imputation below
    # covers them as well.
    infinite_count = int(np.isinf(X).sum().sum())
    if infinite_count:
        X = X.replace([np.inf, -np.inf], np.nan)
        print(f"Replaced {infinite_count} infinite values with NaN before imputation")

    # Handle missing values in features.
    # NOTE(review): the medians are computed on the full dataset, before the
    # train/test split performed in the next cell, so test rows influence the
    # imputed values. Consider imputing with training-set medians after the split.
    X = X.fillna(X.median())

    print(f"\nFeature matrix shape: {X.shape}")
    print(f"Target vector shape: {y.shape}")
    print(f"Positive class ratio: {y.mean():.4f}")

    # Check for any remaining issues
    print(f"\nData Quality Check:")
    print(f"Missing values in X: {X.isnull().sum().sum()}")
    print(f"Missing values in y: {y.isnull().sum()}")
    print(f"Infinite values in X: {np.isinf(X).sum().sum()}")

else:
    print("⚠ Cannot prepare ML data - missing features or target column")
    X, y = None, None

## 7. Train/Test Split

In [None]:
# Split the prepared data into train and test sets (stratified when possible)
# and fit a StandardScaler on the training portion only.
if X is not None and y is not None:
    print("Performing train/test split...")

    # Stratification needs at least two samples in EVERY class, not only in the
    # positive one: a class with a single member makes train_test_split raise.
    # With a single class there is nothing to stratify on.
    class_counts = y.value_counts()
    can_stratify = len(class_counts) >= 2 and class_counts.min() >= 2

    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.2,
        random_state=42,
        stratify=y if can_stratify else None
    )

    if can_stratify:
        print("✓ Stratified split completed")
    else:
        print("✓ Regular split completed (insufficient samples per class for stratification)")

    print(f"\nSplit Results:")
    print(f"Training set: {X_train.shape[0]} samples ({y_train.mean():.4f} positive rate)")
    print(f"Test set: {X_test.shape[0]} samples ({y_test.mean():.4f} positive rate)")

    # Feature scaling (optional, but recommended for some algorithms).
    # The scaler is fitted on the training set and only applied to the test set.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    print(f"✓ Feature scaling completed")

else:
    print("⚠ Skipping train/test split - no valid data available")
    X_train = X_test = y_train = y_test = None
    # Define the scaling outputs too, so later cells can test them for None
    # instead of hitting a NameError.
    scaler = X_train_scaled = X_test_scaled = None

## 8. Export Processed Data to SQLite

In [None]:
# Export the processed dataset, the train/test splits and per-feature metadata
# to a SQLite database. Export is best-effort: failures are reported, not raised.
print("Exporting processed data to SQLite...")

# Database file path
db_path = '../disaster_data.db'

# Initialized before the try block so the finally clause can tell whether a
# connection was ever opened.
conn = None
try:
    # Connect to SQLite database
    conn = sqlite3.connect(db_path)

    # Export full processed dataset
    featured_df.to_sql('processed_weather_data', conn, if_exists='replace', index=False)
    print(f"✓ Exported full dataset: {len(featured_df)} records")

    # Export training data if available
    if X_train is not None:
        # Combine features and target for training set
        train_df = pd.DataFrame(X_train, columns=available_features)
        train_df['disaster_occurred'] = y_train.values
        train_df.to_sql('training_data', conn, if_exists='replace', index=False)
        print(f"✓ Exported training data: {len(train_df)} records")

        # Export test data
        test_df = pd.DataFrame(X_test, columns=available_features)
        test_df['disaster_occurred'] = y_test.values
        test_df.to_sql('test_data', conn, if_exists='replace', index=False)
        print(f"✓ Exported test data: {len(test_df)} records")

    # Export feature metadata: summary statistics of each ML feature, computed
    # on the full engineered frame, plus its description when one is defined.
    feature_metadata = [
        {
            'feature_name': feature,
            'mean': featured_df[feature].mean(),
            'std': featured_df[feature].std(),
            'min': featured_df[feature].min(),
            'max': featured_df[feature].max(),
            'description': feature_engineer.feature_definitions.get(feature, 'Weather variable')
        }
        for feature in available_features
        if feature in featured_df.columns
    ]

    feature_metadata_df = pd.DataFrame(feature_metadata)
    feature_metadata_df.to_sql('feature_metadata', conn, if_exists='replace', index=False)
    print(f"✓ Exported feature metadata: {len(feature_metadata_df)} features")

    print(f"\n✓ All data exported successfully to {db_path}")

except Exception as e:
    print(f"✗ Error exporting to SQLite: {e}")

finally:
    # Always release the connection, including when an export step failed;
    # previously it was only closed on the success path.
    if conn is not None:
        conn.close()

## 9. Data Quality Verification

In [None]:
# Headline data-quality numbers for the engineered dataset
print("Verifying data quality and feature distributions...")

has_target = 'disaster_occurred' in featured_df.columns
disaster_total = featured_df['disaster_occurred'].sum() if has_target else 0

quality_metrics = {
    'total_records': len(featured_df),
    'total_features': len(featured_df.columns),
    'engineered_features': len(engineered_features),
    'disaster_records': disaster_total,
    'missing_values': featured_df.isnull().sum().sum(),
    'duplicate_records': featured_df.duplicated().sum()
}

print("\nData Quality Metrics:")
print('=' * 50)
for metric, value in quality_metrics.items():
    # 'total_records' -> 'Total Records'
    label = metric.replace('_', ' ').title()
    print(f"{label:25s}: {value:,}")

# Histograms for up to six ML features, laid out on a 2x3 grid
if available_features:
    print("\nFeature Distribution Analysis:")
    print('=' * 50)

    plotted_features = available_features[:6]
    n_features = len(plotted_features)

    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    axes = axes.flatten()

    for i, feature in enumerate(plotted_features):
        if feature not in featured_df.columns:
            continue
        panel = axes[i]
        series = featured_df[feature]

        series.hist(bins=30, ax=panel, alpha=0.7, color='lightblue')
        panel.set_title(f'{feature} Distribution')
        panel.set_ylabel('Frequency')

        # Mark the mean with a dashed line and an annotation box
        mean_val = series.mean()
        panel.axvline(mean_val, color='red', linestyle='--', alpha=0.7)
        panel.text(
            0.7, 0.9, f'Mean: {mean_val:.2f}',
            transform=panel.transAxes,
            bbox=dict(boxstyle='round', facecolor='white', alpha=0.8),
        )

    # Drop the grid cells that have no feature assigned
    for unused_axis in axes[n_features:]:
        fig.delaxes(unused_axis)

    plt.tight_layout()
    plt.show()

# Pairwise correlation of the leading ML features, with a report of strong pairs
if len(available_features) > 1:
    print("\nFeature Correlation Analysis:")
    key_features = available_features[:8]  # Limit to 8 features for readability
    corr_matrix = featured_df[key_features].corr()

    # Hide the upper triangle (and diagonal): it duplicates the lower triangle
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(
        corr_matrix,
        mask=mask,
        annot=True,
        cmap='coolwarm',
        center=0,
        square=True,
        linewidths=0.5,
        cbar_kws={"shrink": .8},
        ax=ax,
    )
    ax.set_title('Feature Correlation Matrix')
    fig.tight_layout()
    plt.show()

    # Collect each unordered feature pair whose |r| exceeds 0.7
    corr_columns = corr_matrix.columns
    n_columns = len(corr_columns)
    high_corr_pairs = [
        (corr_columns[row], corr_columns[col], corr_matrix.iloc[row, col])
        for row in range(n_columns)
        for col in range(row + 1, n_columns)
        if abs(corr_matrix.iloc[row, col]) > 0.7
    ]

    if not high_corr_pairs:
        print("\n✓ No highly correlated features found (good for ML)")
    else:
        print("\nHighly Correlated Feature Pairs (|r| > 0.7):")
        for feat1, feat2, corr in high_corr_pairs:
            print(f"  {feat1} <-> {feat2}: {corr:.3f}")

## 10. Pipeline Summary and Next Steps

In [None]:
# Headline numbers for the whole preprocessing run
banner = "=" * 80
print("\n" + banner)
print("DATA PREPROCESSING PIPELINE SUMMARY")
print(banner)

# Share of records labeled as a disaster; guard against an empty dataset
if quality_metrics['total_records'] > 0:
    disaster_rate = f"{(quality_metrics['disaster_records'] / quality_metrics['total_records'] * 100):.2f}%"
else:
    disaster_rate = "0%"

# Split sizes are zero when the train/test split was skipped
training_samples = 0 if X_train is None else len(X_train)
test_samples = 0 if X_test is None else len(X_test)

summary_stats = {
    'Input Datasets': len(datasets),
    'Selected Datasets': len(selected_datasets),
    'Total Records Processed': len(featured_df),
    'Original Features': len(labeled_df.columns),
    'Engineered Features': len(engineered_features),
    'Final Feature Count': len(available_features),
    'Disaster Records': quality_metrics['disaster_records'],
    'Disaster Rate': disaster_rate,
    'Training Samples': training_samples,
    'Test Samples': test_samples
}

for key in summary_stats:
    value = summary_stats[key]
    print(f"{key:25s}: {value}")

# One status line per pipeline stage
print("\n" + "=" * 80)
print("PIPELINE STATUS")
print("=" * 80)

# Only the split stage reflects runtime state: it is flagged when no split exists
if X_train is not None:
    split_icon, split_note = "✓", "Prepared data for ML training"
else:
    split_icon, split_note = "⚠", "Insufficient data for ML"

pipeline_status = [
    ("✓", "Data Loading", "Successfully loaded weather datasets"),
    ("✓", "Data Cleaning", "Applied standardization and missing value handling"),
    ("✓", "Disaster Labeling", f"Created {quality_metrics['disaster_records']} disaster labels"),
    ("✓", "Feature Engineering", f"Generated {len(engineered_features)} engineered features"),
    (split_icon, "Train/Test Split", split_note),
    ("✓", "Data Export", "Exported processed data to SQLite database"),
    ("✓", "Quality Verification", "Verified data quality and feature distributions")
]

for entry in pipeline_status:
    status, step, description = entry
    print(f"{status} {step:20s}: {description}")

# Roadmap for the work that follows preprocessing, then the closing banner
rule = "=" * 80

print("\n" + rule)
print("NEXT STEPS")
print(rule)

next_steps = [
    "1. MACHINE LEARNING MODEL TRAINING:",
    "   • Train binary classification model (disaster vs no disaster)",
    "   • Train multi-class model (disaster type classification)",
    "   • Evaluate model performance with cross-validation",
    "   • Save trained models for deployment",
    "",
    "2. MODEL EVALUATION:",
    "   • Calculate accuracy, precision, recall, F1-score",
    "   • Generate confusion matrices",
    "   • Analyze feature importance",
    "   • Test model on holdout data",
    "",
    "3. BACKEND INTEGRATION:",
    "   • Implement prediction engine",
    "   • Create API endpoints",
    "   • Integrate with Gemini AI for explanations",
    "   • Set up real-time prediction pipeline",
    "",
    "4. FRONTEND DEVELOPMENT:",
    "   • Build React dashboard",
    "   • Create visualization components",
    "   • Implement user interface",
    "   • Add real-time updates"
]

# One list entry per output line; empty strings act as blank separators
print("\n".join(next_steps))

print("\n" + rule)
print("DATA PREPROCESSING PIPELINE COMPLETE!")
print("Ready for ML model training and system development.")
print(rule)