### RESEARCH QUESTION 1: DETERMINANTS OF HIGH PMJDY ACCOUNT OPERATIONALIZATION

**Objective**: Which state-level characteristics predict whether a state achieves greater than 75% operative account rate?

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, 
                           roc_auc_score, confusion_matrix, classification_report)
from sklearn.model_selection import LeaveOneOut, cross_val_score, StratifiedKFold
from sklearn.utils import resample
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

### STEP 1: DATA LOADING AND EXPLORATION

In [3]:
print("\n" + "="*80)
print("STEP 1: DATA LOADING AND EXPLORATION")
print("="*80)

# Load preprocessed datasets
train_data = pd.read_csv('ml_train_High_Operative_Flag.csv')
test_data = pd.read_csv('ml_test_High_Operative_Flag.csv')
full_data = pd.read_csv('ml_preprocessed_full.csv')

print(f"\nDataset Sizes:")
print(f"  Training set: {train_data.shape[0]} states, {train_data.shape[1]} features")
print(f"  Test set: {test_data.shape[0]} states, {test_data.shape[1]} features")
print(f"  Full dataset: {full_data.shape[0]} states")

# Target distribution
print(f"\nTarget Distribution (High_Operative_Flag):")
print(f"  Training - Class 0 (≤75%): {(train_data['High_Operative_Flag']==0).sum()} states")
print(f"  Training - Class 1 (>75%): {(train_data['High_Operative_Flag']==1).sum()} states")
print(f"  Training - Positive ratio: {train_data['High_Operative_Flag'].mean():.1%}")
print(f"  Test - Class 0 (≤75%): {(test_data['High_Operative_Flag']==0).sum()} states")
print(f"  Test - Class 1 (>75%): {(test_data['High_Operative_Flag']==1).sum()} states")
print(f"  Test - Positive ratio: {test_data['High_Operative_Flag'].mean():.1%}")


STEP 1: DATA LOADING AND EXPLORATION

Dataset Sizes:
  Training set: 28 states, 55 features
  Test set: 8 states, 55 features
  Full dataset: 36 states

Target Distribution (High_Operative_Flag):
  Training - Class 0 (≤75%): 9 states
  Training - Class 1 (>75%): 19 states
  Training - Positive ratio: 67.9%
  Test - Class 0 (≤75%): 2 states
  Test - Class 1 (>75%): 6 states
  Test - Positive ratio: 75.0%


### STEP 2: FEATURE SELECTION (Based on Methodology)

In [4]:
print("\n" + "="*80)
print("STEP 2: FEATURE SELECTION")
print("="*80)

# Select key features as per methodology
key_features = [
    # Primary predictors from methodology
    'Rural_Urban_Ratio',  # Rural-Urban Distribution Ratio (if not present, calculate)
    'RuPay_Penetration',   # RuPay Card Penetration Rate
    'Avg_Balance_Rs',      # Average Account Balance
    'Account_Density_Per_Lakh',  # Account Density per Lakh Population
    'CAGR_2020_25',        # Historical Growth Momentum
    
    # Additional relevant features from preprocessing
    'Jan25_Op_Rate',       # Current operative rate
    'Rural_Percent',       # Rural dominance
    'Growth_2024_25',      # Recent growth
    'Operative_Mean',      # Mean operative rate over time
    'Operative_Trend',     # Trend in operative rates
    
    # Engineered features
    'Growth_Operative_Interaction',  # Growth-Operative interaction
    'Account_Density_Squared',       # Non-linear relationship
    
    # Regional dummies
    'Region_North', 'Region_South', 'Region_East', 
    'Region_West', 'Region_Northeast', 'Region_Central'
]

# Check which features are available
available_features = [f for f in key_features if f in train_data.columns]
missing_features = [f for f in key_features if f not in train_data.columns]

print(f"\nAvailable features: {len(available_features)}")
print(f"Missing features: {missing_features}")

# If Rural_Urban_Ratio is missing, calculate it
if 'Rural_Urban_Ratio' not in train_data.columns and 'Rural_Beneficiaries' in train_data.columns:
    if 'Urban_Beneficiaries' in train_data.columns:
        train_data['Rural_Urban_Ratio'] = train_data['Rural_Beneficiaries'] / (train_data['Urban_Beneficiaries'] + 1)
        test_data['Rural_Urban_Ratio'] = test_data['Rural_Beneficiaries'] / (test_data['Urban_Beneficiaries'] + 1)
        available_features.append('Rural_Urban_Ratio')
        print("  ✓ Calculated Rural_Urban_Ratio")

# Update feature list
features_to_use = [f for f in available_features if f in train_data.columns]

print(f"\nFinal feature set: {len(features_to_use)} features")
print("\nFeature Categories:")
print(f"  - Rural/Urban metrics: {[f for f in features_to_use if 'Rural' in f or 'Urban' in f][:3]}")
print(f"  - Growth metrics: {[f for f in features_to_use if 'Growth' in f or 'CAGR' in f][:3]}")
print(f"  - Operative metrics: {[f for f in features_to_use if 'Operative' in f or 'Op_Rate' in f][:3]}")
print(f"  - Regional indicators: {[f for f in features_to_use if 'Region' in f][:3]}")

# Prepare data
X_train = train_data[features_to_use]
y_train = train_data['High_Operative_Flag']
X_test = test_data[features_to_use]
y_test = test_data['High_Operative_Flag']

print(f"\nTraining set: X={X_train.shape}, y={y_train.shape}")
print(f"Test set: X={X_test.shape}, y={y_test.shape}")


STEP 2: FEATURE SELECTION

Available features: 18
Missing features: []

Final feature set: 18 features

Feature Categories:
  - Rural/Urban metrics: ['Rural_Urban_Ratio', 'Rural_Percent']
  - Growth metrics: ['CAGR_2020_25', 'Growth_2024_25', 'Growth_Operative_Interaction']
  - Operative metrics: ['Jan25_Op_Rate', 'Operative_Mean', 'Operative_Trend']
  - Regional indicators: ['Region_North', 'Region_South', 'Region_East']

Training set: X=(28, 18), y=(28,)
Test set: X=(8, 18), y=(8,)


### STEP 3: BOOTSTRAP-ENHANCED LOGISTIC REGRESSION (PRIMARY MODEL)

In [5]:
print("\n" + "="*80)
print("STEP 3: BOOTSTRAP-ENHANCED LOGISTIC REGRESSION")
print("="*80)

def bootstrap_logistic_regression(X_train, y_train, X_test, n_iterations=1000):
    """
    Implement bootstrap-enhanced logistic regression with Firth's penalty approximation
    """
    n_samples = len(X_train)
    n_features = X_train.shape[1]
    
    # Store results from each bootstrap iteration
    predictions = []
    coefficients = []
    accuracies = []
    
    print(f"\nRunning {n_iterations} bootstrap iterations...")
    
    for i in range(n_iterations):
        # Bootstrap sample
        indices = np.random.choice(n_samples, n_samples, replace=True)
        X_boot = X_train.iloc[indices]
        y_boot = y_train.iloc[indices]
        
        # Fit logistic regression with regularization (approximates Firth's penalty)
        lr = LogisticRegression(penalty='l2', C=1.0, max_iter=1000, random_state=i)
        lr.fit(X_boot, y_boot)
        
        # Store predictions and coefficients
        pred = lr.predict(X_test)
        predictions.append(pred)
        coefficients.append(lr.coef_[0])
        
        # Calculate out-of-bag accuracy if possible
        oob_indices = list(set(range(n_samples)) - set(indices))
        if len(oob_indices) > 0:
            X_oob = X_train.iloc[oob_indices]
            y_oob = y_train.iloc[oob_indices]
            oob_pred = lr.predict(X_oob)
            oob_acc = accuracy_score(y_oob, oob_pred)
            accuracies.append(oob_acc)
        
        if (i + 1) % 100 == 0:
            print(f"  Completed {i + 1} iterations...")
    
    # Aggregate predictions using majority voting
    predictions_array = np.array(predictions)
    final_predictions = np.apply_along_axis(
        lambda x: np.bincount(x).argmax(), 0, predictions_array
    )
    
    # Calculate confidence (proportion of votes for winning class)
    confidence = np.mean(predictions_array == final_predictions[np.newaxis, :], axis=0)
    
    # Calculate coefficient statistics
    coefficients_array = np.array(coefficients)
    coef_mean = np.mean(coefficients_array, axis=0)
    coef_std = np.std(coefficients_array, axis=0)
    
    # BCa confidence intervals (simplified version)
    coef_ci_lower = np.percentile(coefficients_array, 2.5, axis=0)
    coef_ci_upper = np.percentile(coefficients_array, 97.5, axis=0)
    
    return {
        'predictions': final_predictions,
        'confidence': confidence,
        'coefficients_mean': coef_mean,
        'coefficients_std': coef_std,
        'coefficients_ci': (coef_ci_lower, coef_ci_upper),
        'oob_accuracy': np.mean(accuracies) if accuracies else None
    }

# Run bootstrap logistic regression
bootstrap_results = bootstrap_logistic_regression(X_train, y_train, X_test, n_iterations=1000)

print("\nBootstrap Logistic Regression Results:")
print(f"  Out-of-bag accuracy: {bootstrap_results['oob_accuracy']:.3f}")
print(f"  Average prediction confidence: {np.mean(bootstrap_results['confidence']):.3f}")

# Evaluate on test set
y_pred_bootstrap = bootstrap_results['predictions']
print("\nTest Set Performance:")
print(f"  Accuracy: {accuracy_score(y_test, y_pred_bootstrap):.3f}")
print(f"  Precision: {precision_score(y_test, y_pred_bootstrap):.3f}")
print(f"  Recall: {recall_score(y_test, y_pred_bootstrap):.3f}")
print(f"  F1-Score: {f1_score(y_test, y_pred_bootstrap):.3f}")


STEP 3: BOOTSTRAP-ENHANCED LOGISTIC REGRESSION

Running 1000 bootstrap iterations...
  Completed 100 iterations...
  Completed 200 iterations...
  Completed 300 iterations...
  Completed 400 iterations...
  Completed 500 iterations...
  Completed 600 iterations...
  Completed 700 iterations...
  Completed 800 iterations...
  Completed 900 iterations...
  Completed 1000 iterations...

Bootstrap Logistic Regression Results:
  Out-of-bag accuracy: 0.847
  Average prediction confidence: 0.908

Test Set Performance:
  Accuracy: 0.875
  Precision: 0.857
  Recall: 1.000
  F1-Score: 0.923


### STEP 4: SECONDARY MODELS FOR VALIDATION

In [6]:
print("\n" + "="*80)
print("STEP 4: SECONDARY MODELS FOR VALIDATION")
print("="*80)

# Initialize models
models = {
    'Logistic Regression': LogisticRegression(penalty='l2', C=1.0, max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=3, random_state=42),
    'SVM (RBF)': SVC(kernel='rbf', C=1.0, gamma='scale', probability=True, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=50, learning_rate=0.1, max_depth=2, random_state=42)
}

# Train and evaluate each model
results = {}

for name, model in models.items():
    print(f"\n{name}:")
    
    # Fit model
    model.fit(X_train, y_train)
    
    # Predictions
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else None
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    # AUC-ROC if probabilities available
    if y_pred_proba is not None:
        auc_roc = roc_auc_score(y_test, y_pred_proba)
    else:
        auc_roc = None
    
    results[name] = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'auc_roc': auc_roc,
        'predictions': y_pred
    }
    
    print(f"  Accuracy: {accuracy:.3f}")
    print(f"  Precision: {precision:.3f}")
    print(f"  Recall: {recall:.3f}")
    print(f"  F1-Score: {f1:.3f}")
    if auc_roc:
        print(f"  AUC-ROC: {auc_roc:.3f}")


STEP 4: SECONDARY MODELS FOR VALIDATION

Logistic Regression:
  Accuracy: 0.875
  Precision: 0.857
  Recall: 1.000
  F1-Score: 0.923
  AUC-ROC: 0.667

Random Forest:
  Accuracy: 0.875
  Precision: 0.857
  Recall: 1.000
  F1-Score: 0.923
  AUC-ROC: 1.000

SVM (RBF):
  Accuracy: 0.750
  Precision: 0.750
  Recall: 1.000
  F1-Score: 0.857
  AUC-ROC: 0.750

Gradient Boosting:
  Accuracy: 1.000
  Precision: 1.000
  Recall: 1.000
  F1-Score: 1.000
  AUC-ROC: 1.000


### STEP 5: LEAVE-ONE-OUT CROSS-VALIDATION (LOOCV)

In [7]:
print("\n" + "="*80)
print("STEP 5: LEAVE-ONE-OUT CROSS-VALIDATION")
print("="*80)

# Combine train and test for full LOOCV
X_full = pd.concat([X_train, X_test])
y_full = pd.concat([y_train, y_test])

print(f"\nPerforming LOOCV on {len(X_full)} states...")

# LOOCV for Logistic Regression
loo = LeaveOneOut()
lr_model = LogisticRegression(penalty='l2', C=1.0, max_iter=1000, random_state=42)

# Calculate LOOCV scores
loocv_scores = cross_val_score(lr_model, X_full, y_full, cv=loo, scoring='accuracy')

print(f"\nLOOCV Results (Logistic Regression):")
print(f"  Mean Accuracy: {np.mean(loocv_scores):.3f}")
print(f"  Std Deviation: {np.std(loocv_scores):.3f}")
print(f"  Min Accuracy: {np.min(loocv_scores):.3f}")
print(f"  Max Accuracy: {np.max(loocv_scores):.3f}")


STEP 5: LEAVE-ONE-OUT CROSS-VALIDATION

Performing LOOCV on 36 states...

LOOCV Results (Logistic Regression):
  Mean Accuracy: 0.889
  Std Deviation: 0.314
  Min Accuracy: 0.000
  Max Accuracy: 1.000


### STEP 6: FEATURE IMPORTANCE ANALYSIS

In [8]:
print("\n" + "="*80)
print("STEP 6: FEATURE IMPORTANCE ANALYSIS")
print("="*80)

# Get feature importance from Random Forest
rf_model = models['Random Forest']
feature_importance = pd.DataFrame({
    'feature': features_to_use,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features (Random Forest):")
for i, row in feature_importance.head(10).iterrows():
    print(f"  {i+1}. {row['feature']}: {row['importance']:.4f}")

# Calculate odds ratios from logistic regression coefficients
lr_model_final = LogisticRegression(penalty='l2', C=1.0, max_iter=1000, random_state=42)
lr_model_final.fit(X_train, y_train)

odds_ratios = np.exp(lr_model_final.coef_[0])
feature_odds = pd.DataFrame({
    'feature': features_to_use,
    'coefficient': lr_model_final.coef_[0],
    'odds_ratio': odds_ratios
}).sort_values('odds_ratio', ascending=False)

print("\nTop Features by Odds Ratios (Logistic Regression):")
print("Features with Odds Ratio > 1 (positive association):")
positive_features = feature_odds[feature_odds['odds_ratio'] > 1].head(5)
for i, row in positive_features.iterrows():
    print(f"  {row['feature']}: OR={row['odds_ratio']:.3f}")

print("\nFeatures with Odds Ratio < 1 (negative association):")
negative_features = feature_odds[feature_odds['odds_ratio'] < 1].head(5)
for i, row in negative_features.iterrows():
    print(f"  {row['feature']}: OR={row['odds_ratio']:.3f}")


STEP 6: FEATURE IMPORTANCE ANALYSIS

Top 10 Most Important Features (Random Forest):
  6. Jan25_Op_Rate: 0.3520
  9. Operative_Mean: 0.0893
  5. CAGR_2020_25: 0.0743
  1. Rural_Urban_Ratio: 0.0696
  11. Growth_Operative_Interaction: 0.0691
  10. Operative_Trend: 0.0652
  8. Growth_2024_25: 0.0627
  12. Account_Density_Squared: 0.0569
  7. Rural_Percent: 0.0398
  3. Avg_Balance_Rs: 0.0335

Top Features by Odds Ratios (Logistic Regression):
Features with Odds Ratio > 1 (positive association):
  Operative_Mean: OR=2.775
  Operative_Trend: OR=2.326
  Growth_2024_25: OR=1.779
  CAGR_2020_25: OR=1.747
  Region_West: OR=1.692

Features with Odds Ratio < 1 (negative association):
  Account_Density_Per_Lakh: OR=0.983
  Avg_Balance_Rs: OR=0.961
  RuPay_Penetration: OR=0.944
  Region_East: OR=0.855
  Region_North: OR=0.423


### STEP 7: FINAL SUMMARY AND RECOMMENDATIONS

In [11]:
print("\n" + "="*80)
print("SUMMARY OF RESULTS")
print("="*80)

print("\n1. MODEL PERFORMANCE COMPARISON:")
print("-" * 40)
performance_df = pd.DataFrame(results).T
print(performance_df[['accuracy', 'precision', 'recall', 'f1_score', 'auc_roc']].round(3))

best_model = performance_df['f1_score'].idxmax()
print(f"\nBest performing model (by F1-Score): {best_model}")

print("\n2. KEY PREDICTORS OF HIGH OPERATIONALIZATION (>75%):")
print("-" * 40)

# Identify top 3 predictors with odds ratios > 2.0
high_impact_features = feature_odds[feature_odds['odds_ratio'] > 2.0].head(3)
if len(high_impact_features) > 0:
    print("High-impact features (OR > 2.0):")
    for i, row in high_impact_features.iterrows():
        print(f"  • {row['feature']}: OR={row['odds_ratio']:.2f}")
else:
    print("Top 3 features by odds ratio:")
    for i, row in feature_odds.head(3).iterrows():
        print(f"  • {row['feature']}: OR={row['odds_ratio']:.2f}")

print("\n3. STATISTICAL VALIDATION:")
print("-" * 40)
print(f"  Bootstrap OOB Accuracy: {bootstrap_results['oob_accuracy']:.3f}")
print(f"  LOOCV Mean Accuracy: {np.mean(loocv_scores):.3f}")
print(f"  Test Set Best F1-Score: {performance_df['f1_score'].max():.3f}")

print("\n4. POLICY RECOMMENDATIONS:")
print("-" * 40)

# Based on top features, provide recommendations
if 'Jan25_Op_Rate' in feature_odds.head(3)['feature'].values:
    print("  • States should focus on improving current operative rates")
if 'Rural_Percent' in feature_odds.head(3)['feature'].values or 'Rural_Urban_Ratio' in feature_odds.head(3)['feature'].values:
    print("  • Rural-urban composition significantly impacts operationalization")
if 'RuPay_Penetration' in feature_odds.head(3)['feature'].values:
    print("  • RuPay card distribution is crucial for account activation")
if 'Account_Density_Per_Lakh' in feature_odds.head(3)['feature'].values:
    print("  • Account penetration density matters for operational success")
if any('Growth' in f for f in feature_odds.head(3)['feature'].values):
    print("  • Historical growth momentum predicts future operationalization")

print("\n" + "="*80)
print("ANALYSIS COMPLETE")
print("="*80)


SUMMARY OF RESULTS

1. MODEL PERFORMANCE COMPARISON:
----------------------------------------
                    accuracy precision recall  f1_score   auc_roc
Logistic Regression    0.875  0.857143    1.0  0.923077  0.666667
Random Forest          0.875  0.857143    1.0  0.923077       1.0
SVM (RBF)               0.75      0.75    1.0  0.857143      0.75
Gradient Boosting        1.0       1.0    1.0       1.0       1.0

Best performing model (by F1-Score): Gradient Boosting

2. KEY PREDICTORS OF HIGH OPERATIONALIZATION (>75%):
----------------------------------------
High-impact features (OR > 2.0):
  • Operative_Mean: OR=2.78
  • Operative_Trend: OR=2.33

3. STATISTICAL VALIDATION:
----------------------------------------
  Bootstrap OOB Accuracy: 0.847
  LOOCV Mean Accuracy: 0.889
  Test Set Best F1-Score: 1.000

4. POLICY RECOMMENDATIONS:
----------------------------------------
  • Historical growth momentum predicts future operationalization

ANALYSIS COMPLETE
