### Machine Learning Preprocessing Pipeline

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

### Preprocessing Steps

In [2]:
preprocessing_steps = """
PREPROCESSING STEPS FOR PMJDY ML ANALYSIS:

1. DATA LOADING & INITIAL INSPECTION
   - Load the cleaned datasets
   - Check data types and missing values
   - Identify feature categories

2. HANDLING MISSING VALUES
   - Impute missing census data
   - Handle missing population values
   - Strategy: median imputation for numerical features

3. FEATURE SCALING & NORMALIZATION
   - Standardize continuous features
   - Keep binary flags as-is
   - Separate scaling for different feature groups

4. HANDLING CLASS IMBALANCE
   - Check target variable distributions
   - Apply SMOTE for imbalanced flags
   - Create balanced training sets

5. FEATURE ENGINEERING
   - Create interaction features
   - Generate polynomial features for key metrics
   - Add regional clustering features

6. DATA SPLITTING
   - Train-test split (80-20)
   - Stratified sampling for classification
   - Time-based split for temporal analysis

7. FINAL VALIDATION
   - Check for data leakage
   - Verify feature distributions
   - Export preprocessed datasets
"""

print(preprocessing_steps)


PREPROCESSING STEPS FOR PMJDY ML ANALYSIS:

1. DATA LOADING & INITIAL INSPECTION
   - Load the cleaned datasets
   - Check data types and missing values
   - Identify feature categories

2. HANDLING MISSING VALUES
   - Impute missing census data
   - Handle missing population values
   - Strategy: median imputation for numerical features

3. FEATURE SCALING & NORMALIZATION
   - Standardize continuous features
   - Keep binary flags as-is
   - Separate scaling for different feature groups

4. HANDLING CLASS IMBALANCE
   - Check target variable distributions
   - Apply SMOTE for imbalanced flags
   - Create balanced training sets

5. FEATURE ENGINEERING
   - Create interaction features
   - Generate polynomial features for key metrics
   - Add regional clustering features

6. DATA SPLITTING
   - Train-test split (80-20)
   - Stratified sampling for classification
   - Time-based split for temporal analysis

7. FINAL VALIDATION
   - Check for data leakage
   - Verify feature distribution

### STEP 1: DATA LOADING & INITIAL INSPECTION

In [3]:
print("STEP 1: DATA LOADING & INITIAL INSPECTION")

def load_and_inspect_data():
    """Load cleaned datasets and perform initial inspection"""
    
    print("\nLoading datasets...")
    
    # Load the ml_integrated_dataset as primary dataset
    ml_data = pd.read_csv('./cleaned datasets/ml_integrated_dataset.csv')
    print(f"Loaded ml_integrated_dataset: {ml_data.shape}")
    
    # Identify column types
    categorical_cols = ml_data.select_dtypes(include=['object']).columns.tolist()
    numerical_cols = ml_data.select_dtypes(include=['float64', 'int64']).columns.tolist()
    binary_flags = [col for col in numerical_cols if 'Flag' in col]
    continuous_features = [col for col in numerical_cols if col not in binary_flags]
    
    print(f"\nColumn categories:")
    print(f"  - Categorical: {len(categorical_cols)} columns")
    print(f"  - Continuous: {len(continuous_features)} columns")
    print(f"  - Binary flags: {len(binary_flags)} columns")
    
    # Check missing values
    missing_summary = ml_data.isnull().sum()
    missing_cols = missing_summary[missing_summary > 0]
    
    if len(missing_cols) > 0:
        print(f"\nMissing values found in {len(missing_cols)} columns:")
        for col, count in missing_cols.items():
            print(f"  - {col}: {count} ({count/len(ml_data)*100:.1f}%)")
    else:
        print("\nNo missing values found")
    
    return ml_data, categorical_cols, continuous_features, binary_flags

# Execute Step 1
print("\nExecuting Step 1...")

ml_data, categorical_cols, continuous_features, binary_flags = load_and_inspect_data()

print("\nStep 1 Complete")

STEP 1: DATA LOADING & INITIAL INSPECTION

Executing Step 1...

Loading datasets...
Loaded ml_integrated_dataset: (36, 52)

Column categories:
  - Categorical: 2 columns
  - Continuous: 45 columns
  - Binary flags: 5 columns

Missing values found in 16 columns:
  - Rural_Beneficiaries: 2 (5.6%)
  - Urban_Beneficiaries: 2 (5.6%)
  - Total_Beneficiaries: 2 (5.6%)
  - Balance_Rupees: 2 (5.6%)
  - RuPay_Cards: 2 (5.6%)
  - Rural_Percent: 2 (5.6%)
  - RuPay_Penetration: 2 (5.6%)
  - Avg_Balance_Rs: 2 (5.6%)
  - Rural_Urban_Ratio: 2 (5.6%)
  - Population: 7 (19.4%)
  - Households: 7 (19.4%)
  - Literacy_Rate: 7 (19.4%)
  - Rural_HH_Percent: 7 (19.4%)
  - Internet_Penetration: 7 (19.4%)
  - Phone_Penetration: 7 (19.4%)
  - Account_Density_Per_Lakh: 7 (19.4%)

Step 1 Complete


### STEP 2: HANDLING MISSING VALUES

In [4]:
print("STEP 2: HANDLING MISSING VALUES")

def handle_missing_values(df, continuous_features):
    """Impute missing values using appropriate strategies"""
    
    df_processed = df.copy()
    
    # Identify columns with missing values
    missing_cols = df_processed[continuous_features].columns[df_processed[continuous_features].isnull().any()].tolist()
    
    if len(missing_cols) > 0:
        print(f"\nImputing missing values in {len(missing_cols)} columns...")
        
        # Use median imputation for continuous features
        imputer = SimpleImputer(strategy='median')
        df_processed[missing_cols] = imputer.fit_transform(df_processed[missing_cols])
        
        print("Imputation strategy: MEDIAN (robust to outliers)")
        print("\nImputed values:")
        for col in missing_cols:
            median_val = imputer.statistics_[missing_cols.index(col)]
            print(f"  - {col}: {median_val:.2f}")
    else:
        print("\nNo missing values to impute")
    
    # Verify no missing values remain
    remaining_missing = df_processed[continuous_features].isnull().sum().sum()
    print(f"\nRemaining missing values: {remaining_missing}")
    
    return df_processed

# Execute Step 2
print("\nExecuting Step 2...")

ml_data_imputed = handle_missing_values(ml_data, continuous_features)

print("\nStep 2 Complete")

STEP 2: HANDLING MISSING VALUES

Executing Step 2...

Imputing missing values in 16 columns...
Imputation strategy: MEDIAN (robust to outliers)

Imputed values:
  - Rural_Beneficiaries: 4808972.50
  - Urban_Beneficiaries: 3763875.00
  - Total_Beneficiaries: 8289702.00
  - Balance_Rupees: 39484500000.00
  - RuPay_Cards: 6048789.00
  - Rural_Percent: 65.65
  - RuPay_Penetration: 68.00
  - Avg_Balance_Rs: 5161.10
  - Rural_Urban_Ratio: 1.92
  - Population: 25545198.00
  - Households: 7088008.00
  - Literacy_Rate: 67.43
  - Rural_HH_Percent: 67.10
  - Internet_Penetration: 1.78
  - Phone_Penetration: 47.93
  - Account_Density_Per_Lakh: 33918.95

Remaining missing values: 0

Step 2 Complete


### STEP 3: FEATURE SCALING & NORMALIZATION

In [5]:
print("STEP 3: FEATURE SCALING & NORMALIZATION")

def scale_features(df, continuous_features, binary_flags):
    """Apply appropriate scaling to different feature groups"""
    
    df_scaled = df.copy()
    
    # Separate features by their characteristics
    growth_features = [col for col in continuous_features if 'Growth' in col or 'CAGR' in col]
    rate_features = [col for col in continuous_features if 'Rate' in col or 'Percent' in col or 'Penetration' in col]
    absolute_features = [col for col in continuous_features if col not in growth_features + rate_features]
    
    print(f"\nFeature groups identified:")
    print(f"  - Growth metrics: {len(growth_features)} features")
    print(f"  - Rate/Percentage metrics: {len(rate_features)} features")
    print(f"  - Absolute values: {len(absolute_features)} features")
    
    # StandardScaler for growth features (can be negative)
    if growth_features:
        print("\nApplying StandardScaler to growth features...")
        scaler_growth = StandardScaler()
        df_scaled[growth_features] = scaler_growth.fit_transform(df[growth_features])
    
    # MinMaxScaler for rate features (bounded 0-100)
    if rate_features:
        print("Applying MinMaxScaler to rate features...")
        scaler_rate = MinMaxScaler()
        df_scaled[rate_features] = scaler_rate.fit_transform(df[rate_features])
    
    # RobustScaler for absolute features (handles outliers)
    if absolute_features:
        print("Applying RobustScaler to absolute features...")
        scaler_absolute = RobustScaler()
        df_scaled[absolute_features] = scaler_absolute.fit_transform(df[absolute_features])
    
    print(f"\nScaling complete for {len(continuous_features)} continuous features")
    print(f"Binary flags preserved as-is: {len(binary_flags)} features")
    
    return df_scaled, scaler_growth, scaler_rate, scaler_absolute

# Execute Step 3
print("\nExecuting Step 3...")

ml_data_scaled, scaler_g, scaler_r, scaler_a = scale_features(ml_data_imputed, continuous_features, binary_flags)

print("\nStep 3 Complete")

STEP 3: FEATURE SCALING & NORMALIZATION

Executing Step 3...

Feature groups identified:
  - Growth metrics: 7 features
  - Rate/Percentage metrics: 12 features
  - Absolute values: 26 features

Applying StandardScaler to growth features...
Applying MinMaxScaler to rate features...
Applying RobustScaler to absolute features...

Scaling complete for 45 continuous features
Binary flags preserved as-is: 5 features

Step 3 Complete


### STEP 4: HANDLING CLASS IMBALANCE

In [6]:
print("STEP 4: HANDLING CLASS IMBALANCE")

def check_and_balance_targets(df, binary_flags):
    """Check target distributions and apply SMOTE if needed"""
    
    print("\nTarget variable distributions:")
    print("-" * 40)
    
    imbalanced_targets = []
    
    for flag in binary_flags:
        counts = df[flag].value_counts()
        ratio = counts.min() / counts.max() if len(counts) == 2 else 1
        
        print(f"\n{flag}:")
        print(f"  Class 0: {counts.get(0, 0)} samples")
        print(f"  Class 1: {counts.get(1, 0)} samples")
        print(f"  Balance ratio: {ratio:.2f}")
        
        if ratio < 0.3:  # Significant imbalance
            imbalanced_targets.append(flag)
            print(f"IMBALANCED - needs balancing")
        else:
            print(f"Balanced")
    
    print(f"\n{len(imbalanced_targets)} targets need balancing: {imbalanced_targets}")
    
    # Prepare balanced datasets for each imbalanced target
    balanced_datasets = {}
    
    for target in imbalanced_targets:
        print(f"\nBalancing {target} using SMOTE...")
        
        # Prepare features and target
        feature_cols = [col for col in df.columns if col not in binary_flags + ['State/UT']]
        X = df[feature_cols]
        y = df[target]
        
        # Apply SMOTE
        smote = SMOTE(random_state=42, k_neighbors=3)
        X_balanced, y_balanced = smote.fit_resample(X, y)
        
        balanced_datasets[target] = {
            'X': X_balanced,
            'y': y_balanced,
            'original_ratio': y.value_counts()[1] / len(y),
            'balanced_ratio': y_balanced.value_counts()[1] / len(y_balanced)
        }
        
        print(f"  Original minority class: {y.value_counts()[1]} ({balanced_datasets[target]['original_ratio']:.1%})")
        print(f"  Balanced minority class: {y_balanced.value_counts()[1]} ({balanced_datasets[target]['balanced_ratio']:.1%})")
    
    return balanced_datasets, imbalanced_targets

# Execute Step 4
print("\nExecuting Step 4...")

balanced_data, imbalanced_flags = check_and_balance_targets(ml_data_scaled, binary_flags)

print("\nStep 4 Complete")

STEP 4: HANDLING CLASS IMBALANCE

Executing Step 4...

Target variable distributions:
----------------------------------------

High_Operative_Flag:
  Class 0: 11 samples
  Class 1: 25 samples
  Balance ratio: 0.44
Balanced

High_Growth_Flag:
  Class 0: 36 samples
  Class 1: 0 samples
  Balance ratio: 1.00
Balanced

High_RuPay_Flag:
  Class 0: 22 samples
  Class 1: 14 samples
  Balance ratio: 0.64
Balanced

High_Balance_Flag:
  Class 0: 9 samples
  Class 1: 27 samples
  Balance ratio: 0.33
Balanced

Rural_Dominated_Flag:
  Class 0: 22 samples
  Class 1: 14 samples
  Balance ratio: 0.64
Balanced

0 targets need balancing: []

Step 4 Complete


### STEP 5: FEATURE ENGINEERING

In [7]:
print("STEP 5: FEATURE ENGINEERING")

def engineer_features(df):
    """Create interaction and polynomial features"""
    
    df_engineered = df.copy()
    
    print("\nCreating interaction features...")
    
    # 1. Rural-Urban interaction features
    if 'Rural_Percent' in df.columns and 'Jul25_RuPay_Penetration' in df.columns:
        df_engineered['Rural_RuPay_Interaction'] = df['Rural_Percent'] * df['Jul25_RuPay_Penetration'] / 100
        print("  Rural_RuPay_Interaction")
    
    # 2. Growth-Operative interaction
    if 'Growth_2024_25' in df.columns and 'Jan25_Op_Rate' in df.columns:
        df_engineered['Growth_Operative_Interaction'] = df['Growth_2024_25'] * df['Jan25_Op_Rate'] / 100
        print("  Growth_Operative_Interaction")
    
    # 3. Account density squared (non-linear relationship)
    if 'Account_Density_Per_Lakh' in df.columns:
        df_engineered['Account_Density_Squared'] = df['Account_Density_Per_Lakh'] ** 2
        print("  Account_Density_Squared")
    
    # 4. Balance-Operative efficiency
    if 'Jul25_Avg_Balance' in df.columns and 'Jan25_Op_Rate' in df.columns:
        df_engineered['Balance_Efficiency'] = df['Jul25_Avg_Balance'] * df['Jan25_Op_Rate'] / 100
        print("  Balance_Efficiency")
    
    # 5. Regional clusters based on performance
    print("\nCreating regional clusters...")
    
    # Define regions based on geographic proximity
    northern_states = ['Delhi', 'Haryana', 'Himachal Pradesh', 'Jammu And Kashmir', 
                      'Punjab', 'Rajasthan', 'Uttarakhand', 'Chandigarh']
    southern_states = ['Andhra Pradesh', 'Karnataka', 'Kerala', 'Tamil Nadu', 
                      'Telangana', 'Puducherry', 'Lakshadweep']
    eastern_states = ['Bihar', 'Jharkhand', 'Odisha', 'West Bengal', 'Sikkim']
    western_states = ['Goa', 'Gujarat', 'Maharashtra', 'Dadra And Nagar Haveli And Daman And Diu']
    northeastern_states = ['Arunachal Pradesh', 'Assam', 'Manipur', 'Meghalaya', 
                          'Mizoram', 'Nagaland', 'Tripura']
    
    def assign_region(state):
        if state in northern_states:
            return 'North'
        elif state in southern_states:
            return 'South'
        elif state in eastern_states:
            return 'East'
        elif state in western_states:
            return 'West'
        elif state in northeastern_states:
            return 'Northeast'
        else:
            return 'Central'
    
    if 'State/UT' in df.columns:
        df_engineered['Region'] = df['State/UT'].apply(assign_region)
        
        # Create dummy variables for regions
        region_dummies = pd.get_dummies(df_engineered['Region'], prefix='Region')
        df_engineered = pd.concat([df_engineered, region_dummies], axis=1)
        print(f"  Regional dummies created: {region_dummies.columns.tolist()}")
    
    print(f"\nFeature engineering complete")
    print(f"  Original features: {df.shape[1]}")
    print(f"  Engineered features: {df_engineered.shape[1]}")
    print(f"  New features added: {df_engineered.shape[1] - df.shape[1]}")
    
    return df_engineered

# Execute Step 5
print("\nExecuting Step 5...")

ml_data_engineered = engineer_features(ml_data_scaled)

print("\nStep 5 Complete")

STEP 5: FEATURE ENGINEERING

Executing Step 5...

Creating interaction features...
  Growth_Operative_Interaction
  Account_Density_Squared

Creating regional clusters...
  Regional dummies created: ['Region_Central', 'Region_East', 'Region_North', 'Region_Northeast', 'Region_South', 'Region_West']

Feature engineering complete
  Original features: 52
  Engineered features: 61
  New features added: 9

Step 5 Complete


### STEP 6: DATA SPLITTING

In [8]:
print("STEP 6: DATA SPLITTING")

def split_data_for_ml(df, binary_flags):
    """Create train-test splits for different ML tasks"""
    
    splits = {}
    
    # Prepare feature columns (exclude state name and target variables)
    feature_cols = [col for col in df.columns if col not in binary_flags + ['State/UT', 'Region']]
    
    print(f"\nFeatures for ML: {len(feature_cols)} columns")
    
    for target in binary_flags:
        print(f"\nSplitting data for {target}...")
        
        X = df[feature_cols]
        y = df[target]
        
        # Stratified split to maintain class distribution
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
        
        splits[target] = {
            'X_train': X_train,
            'X_test': X_test,
            'y_train': y_train,
            'y_test': y_test,
            'train_size': len(X_train),
            'test_size': len(X_test),
            'train_positive_ratio': y_train.mean(),
            'test_positive_ratio': y_test.mean()
        }
        
        print(f"  Train: {splits[target]['train_size']} samples (Positive: {splits[target]['train_positive_ratio']:.1%})")
        print(f"  Test:  {splits[target]['test_size']} samples (Positive: {splits[target]['test_positive_ratio']:.1%})")
    
    return splits

# Execute Step 6
print("\nExecuting Step 6...")

data_splits = split_data_for_ml(ml_data_engineered, binary_flags)

print("\nStep 6 Complete")

STEP 6: DATA SPLITTING

Executing Step 6...

Features for ML: 54 columns

Splitting data for High_Operative_Flag...
  Train: 28 samples (Positive: 67.9%)
  Test:  8 samples (Positive: 75.0%)

Splitting data for High_Growth_Flag...
  Train: 28 samples (Positive: 0.0%)
  Test:  8 samples (Positive: 0.0%)

Splitting data for High_RuPay_Flag...
  Train: 28 samples (Positive: 39.3%)
  Test:  8 samples (Positive: 37.5%)

Splitting data for High_Balance_Flag...
  Train: 28 samples (Positive: 75.0%)
  Test:  8 samples (Positive: 75.0%)

Splitting data for Rural_Dominated_Flag...
  Train: 28 samples (Positive: 39.3%)
  Test:  8 samples (Positive: 37.5%)

Step 6 Complete


### STEP 7: FINAL VALIDATION & EXPORT

In [9]:
print("STEP 7: FINAL VALIDATION & EXPORT")

def validate_and_export(df_processed, splits, balanced_data):
    """Perform final validation and export preprocessed data"""
    
    print("\nPerforming final validation checks...")
    
    # Check 1: No missing values
    missing_count = df_processed.isnull().sum().sum()
    print(f"  Missing values: {missing_count}")
    
    # Check 2: All numerical features scaled
    numerical_cols = df_processed.select_dtypes(include=['float64', 'int64']).columns
    for col in numerical_cols:
        if 'Flag' not in col and 'Region_' not in col:
            col_std = df_processed[col].std()
            col_mean = df_processed[col].mean()
            if abs(col_mean) > 10 or col_std > 10:
                print(f"  ⚠ Warning: {col} may not be properly scaled (mean={col_mean:.2f}, std={col_std:.2f})")
    
    # Check 3: Binary flags intact
    binary_cols = [col for col in df_processed.columns if 'Flag' in col]
    for col in binary_cols:
        unique_vals = df_processed[col].unique()
        if not set(unique_vals).issubset({0, 1}):
            print(f"  ⚠ Warning: {col} has non-binary values: {unique_vals}")
    
    print("\nValidation complete")
    
    # Export preprocessed data
    print("\nExporting preprocessed datasets...")
    
    # 1. Main preprocessed dataset
    df_processed.to_csv('ml_preprocessed_full.csv', index=False)
    print("  ml_preprocessed_full.csv")
    
    # 2. Export train-test splits for each target
    for target, split_data in splits.items():
        # Train set
        train_df = pd.DataFrame(split_data['X_train'])
        train_df[target] = split_data['y_train']
        train_df.to_csv(f'ml_train_{target}.csv', index=False)
        
        # Test set
        test_df = pd.DataFrame(split_data['X_test'])
        test_df[target] = split_data['y_test']
        test_df.to_csv(f'ml_test_{target}.csv', index=False)
        
        print(f"  ml_train_{target}.csv & ml_test_{target}.csv")
    
    # 3. Export balanced datasets for imbalanced targets
    for target, balanced in balanced_data.items():
        balanced_df = pd.DataFrame(balanced['X'])
        balanced_df[target] = balanced['y']
        balanced_df.to_csv(f'ml_balanced_{target}.csv', index=False)
        print(f"  ml_balanced_{target}.csv")
    
    print("\n" + "="*80)
    print("PREPROCESSING PIPELINE COMPLETE")
    print("="*80)
    
    # Summary statistics
    print("\nFinal Dataset Summary:")
    print(f"  Total samples: {len(df_processed)}")
    print(f"  Total features: {len(df_processed.columns)}")
    print(f"  Engineered features: {len([c for c in df_processed.columns if 'Interaction' in c or 'Squared' in c or 'Region_' in c])}")
    print(f"  Binary targets: {len(binary_cols)}")
    print(f"  Balanced datasets created: {len(balanced_data)}")
    
    return df_processed

# Execute Step 7
print("\nExecuting Step 7...")

final_data = validate_and_export(ml_data_engineered, data_splits, balanced_data)

print("\nALL PREPROCESSING STEPS COMPLETED SUCCESSFULLY!")

STEP 7: FINAL VALIDATION & EXPORT

Executing Step 7...

Performing final validation checks...
  Missing values: 0

Validation complete

Exporting preprocessed datasets...
  ml_preprocessed_full.csv
  ml_train_High_Operative_Flag.csv & ml_test_High_Operative_Flag.csv
  ml_train_High_Growth_Flag.csv & ml_test_High_Growth_Flag.csv
  ml_train_High_RuPay_Flag.csv & ml_test_High_RuPay_Flag.csv
  ml_train_High_Balance_Flag.csv & ml_test_High_Balance_Flag.csv
  ml_train_Rural_Dominated_Flag.csv & ml_test_Rural_Dominated_Flag.csv

PREPROCESSING PIPELINE COMPLETE

Final Dataset Summary:
  Total samples: 36
  Total features: 61
  Engineered features: 8
  Binary targets: 5
  Balanced datasets created: 0

ALL PREPROCESSING STEPS COMPLETED SUCCESSFULLY!
