# In [None]:
# Heart Disease UCI Dataset - Feature Selection
# Statistical and ML-based Feature Selection Techniques

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import (
    SelectKBest, chi2, f_classif, mutual_info_classif,
    RFE, RFECV, SelectFromModel
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')

# Plotting defaults for every figure in this script. The matplotlib style is
# applied first and the seaborn palette second, in that order.
plt.style.use('default')
sns.set_palette("husl")

# Start-up banner
for banner_line in (
    "=== Heart Disease Prediction - Feature Selection ===",
    "Applying statistical and ML-based feature selection techniques...",
):
    print(banner_line)

# Load the preprocessed feature matrix and target column. When the CSV files
# are missing, build a seeded random stand-in dataset instead so the
# remaining sections can still execute.
try:
    X_scaled = pd.read_csv('../data/X_scaled.csv')
    y = pd.read_csv('../data/y.csv')['target']
except FileNotFoundError:
    print("❌ Preprocessed data not found. Please run 01_data_preprocessing.ipynb first.")
    print("Creating sample data for demonstration...")
    # Seed first, then draw features before labels so the sample is reproducible
    np.random.seed(42)
    feature_names = [
        'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
        'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal',
    ]
    n_samples, n_features = 303, 13
    sample_values = np.random.randn(n_samples, n_features)
    X_scaled = pd.DataFrame(sample_values, columns=feature_names)
    sample_labels = np.random.choice([0, 1], n_samples)
    y = pd.Series(sample_labels, name='target')
    print("✅ Sample data created")
else:
    print("✅ Preprocessed data loaded successfully")

# Quick overview of what was loaded
print(f"\nDataset shape: {X_scaled.shape}")
print(f"Features: {list(X_scaled.columns)}")
print(f"Target distribution: {dict(y.value_counts())}")

# 1. UNIVARIATE STATISTICAL TESTS
section_rule = "=" * 70
print("\n" + section_rule)
print("1. UNIVARIATE STATISTICAL TESTS")
print(section_rule)

# chi2 needs non-negative inputs, so it is run on a min-max rescaled copy
X_positive = pd.DataFrame(
    MinMaxScaler().fit_transform(X_scaled),
    columns=X_scaled.columns,
)

# Chi-square test (k='all' keeps every feature; only the scores are used)
print("🔍 Chi-square Test (for categorical/discrete features):")
chi2_selector = SelectKBest(chi2, k='all').fit(X_positive, y)
chi2_scores, chi2_pvalues = chi2_selector.scores_, chi2_selector.pvalues_

chi2_results = pd.DataFrame({
    'Feature': X_scaled.columns,
    'Chi2_Score': chi2_scores,
    'Chi2_P_value': chi2_pvalues,
})
chi2_results = chi2_results.sort_values('Chi2_Score', ascending=False)

print(chi2_results.round(4))

# F-test (ANOVA), computed on the original scaled features
print("\n🔍 F-test (ANOVA) - for continuous features:")
f_selector = SelectKBest(f_classif, k='all').fit(X_scaled, y)
f_scores, f_pvalues = f_selector.scores_, f_selector.pvalues_

f_results = pd.DataFrame({
    'Feature': X_scaled.columns,
    'F_Score': f_scores,
    'F_P_value': f_pvalues,
})
f_results = f_results.sort_values('F_Score', ascending=False)

print(f_results.round(4))

# Mutual Information (seeded for repeatable estimates)
print("\n🔍 Mutual Information:")
mi_scores = mutual_info_classif(X_scaled, y, random_state=42)
mi_results = pd.DataFrame({'Feature': X_scaled.columns, 'MI_Score': mi_scores})
mi_results = mi_results.sort_values('MI_Score', ascending=False)

print(mi_results.round(4))

# 2. VISUALIZE STATISTICAL TESTS
section_rule = "=" * 70
print("\n" + section_rule)
print("2. STATISTICAL TESTS VISUALIZATION")
print(section_rule)

fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Feature Selection - Statistical Tests', fontsize=16, fontweight='bold')

# Ascending order puts the highest-scoring feature at the top of each barh plot
chi2_sorted = chi2_results.sort_values('Chi2_Score', ascending=True)
f_sorted = f_results.sort_values('F_Score', ascending=True)
mi_sorted = mi_results.sort_values('MI_Score', ascending=True)

# One horizontal bar chart per test: (axis, data, score column, extra bar
# styling, x-axis label, title). The chi-square panel uses the default colour.
stat_panels = [
    (axes[0, 0], chi2_sorted, 'Chi2_Score', {},
     'Chi-square Score', 'Chi-square Test Scores'),
    (axes[0, 1], f_sorted, 'F_Score', {'color': 'orange'},
     'F-test Score', 'F-test (ANOVA) Scores'),
    (axes[1, 0], mi_sorted, 'MI_Score', {'color': 'green'},
     'Mutual Information Score', 'Mutual Information Scores'),
]
for panel_ax, panel_df, panel_col, panel_style, panel_xlabel, panel_title in stat_panels:
    bar_positions = range(len(panel_df))
    panel_ax.barh(bar_positions, panel_df[panel_col], alpha=0.7, **panel_style)
    panel_ax.set_yticks(bar_positions)
    panel_ax.set_yticklabels(panel_df['Feature'])
    panel_ax.set_xlabel(panel_xlabel)
    panel_ax.set_title(panel_title)
    panel_ax.grid(True, alpha=0.3)

# P-values comparison: chi-square vs F-test, with the 0.05 level marked
p_value_data = pd.DataFrame({
    'Feature': X_scaled.columns,
    'Chi2_P_value': chi2_pvalues,
    'F_P_value': f_pvalues,
})
p_value_melted = p_value_data.melt(
    id_vars=['Feature'], var_name='Test', value_name='P_value'
)
pvalue_ax = axes[1, 1]
sns.barplot(data=p_value_melted, x='P_value', y='Feature', hue='Test', ax=pvalue_ax)
pvalue_ax.axvline(x=0.05, color='red', linestyle='--', alpha=0.7, label='α=0.05')
pvalue_ax.set_xlabel('P-value')
pvalue_ax.set_title('Statistical Significance (P-values)')
pvalue_ax.legend()

plt.tight_layout()
plt.show()

# 3. MACHINE LEARNING-BASED FEATURE SELECTION
section_rule = "=" * 70
print("\n" + section_rule)
print("3. MACHINE LEARNING-BASED FEATURE SELECTION")
print(section_rule)

# Random Forest Feature Importance
print("🌳 Random Forest Feature Importance:")
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_scaled, y)
rf_importance = rf.feature_importances_

rf_results = pd.DataFrame({'Feature': X_scaled.columns, 'RF_Importance': rf_importance})
rf_results = rf_results.sort_values('RF_Importance', ascending=False)

print(rf_results.round(4))

# XGBoost Feature Importance
print("\n🚀 XGBoost Feature Importance:")
xgb_model = xgb.XGBClassifier(random_state=42, eval_metric='logloss').fit(X_scaled, y)
xgb_importance = xgb_model.feature_importances_

xgb_results = pd.DataFrame({'Feature': X_scaled.columns, 'XGB_Importance': xgb_importance})
xgb_results = xgb_results.sort_values('XGB_Importance', ascending=False)

print(xgb_results.round(4))

# Logistic Regression Coefficients: the magnitude of each L1-penalised
# coefficient is used as the importance, so the sign is discarded.
print("\n📊 Logistic Regression Coefficients (L1 regularization):")
lr_l1 = LogisticRegression(penalty='l1', solver='liblinear', random_state=42).fit(X_scaled, y)
lr_coef = np.abs(lr_l1.coef_[0])

lr_results = pd.DataFrame({'Feature': X_scaled.columns, 'LR_Coef_Abs': lr_coef})
lr_results = lr_results.sort_values('LR_Coef_Abs', ascending=False)

print(lr_results.round(4))

# 4. VISUALIZE ML-BASED FEATURE IMPORTANCE
section_rule = "=" * 70
print("\n" + section_rule)
print("4. ML-BASED FEATURE IMPORTANCE VISUALIZATION")
print(section_rule)

fig, axes = plt.subplots(1, 3, figsize=(18, 6))
fig.suptitle('Machine Learning-based Feature Importance', fontsize=16, fontweight='bold')

# Ascending order puts the most important feature at the top of each barh plot
rf_sorted = rf_results.sort_values('RF_Importance', ascending=True)
xgb_sorted = xgb_results.sort_values('XGB_Importance', ascending=True)
lr_sorted = lr_results.sort_values('LR_Coef_Abs', ascending=True)

# One panel per model: (axis, data, value column, bar colour, x label, title)
importance_panels = [
    (axes[0], rf_sorted, 'RF_Importance', 'green',
     'Importance Score', 'Random Forest Feature Importance'),
    (axes[1], xgb_sorted, 'XGB_Importance', 'purple',
     'Importance Score', 'XGBoost Feature Importance'),
    (axes[2], lr_sorted, 'LR_Coef_Abs', 'red',
     'Absolute Coefficient', 'Logistic Regression Coefficients'),
]
for panel_ax, panel_df, panel_col, panel_color, panel_xlabel, panel_title in importance_panels:
    bar_positions = range(len(panel_df))
    panel_ax.barh(bar_positions, panel_df[panel_col], alpha=0.7, color=panel_color)
    panel_ax.set_yticks(bar_positions)
    panel_ax.set_yticklabels(panel_df['Feature'])
    panel_ax.set_xlabel(panel_xlabel)
    panel_ax.set_title(panel_title)
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# 5. RECURSIVE FEATURE ELIMINATION (RFE)
section_rule = "=" * 70
print("\n" + section_rule)
print("5. RECURSIVE FEATURE ELIMINATION (RFE)")
print(section_rule)

# RFE with Random Forest, keeping 8 features
print("🔄 RFE with Random Forest:")
rfe_rf = RFE(
    estimator=RandomForestClassifier(n_estimators=50, random_state=42),
    n_features_to_select=8,
).fit(X_scaled, y)

rfe_rf_results = pd.DataFrame({
    'Feature': X_scaled.columns,
    'RFE_RF_Selected': rfe_rf.support_,
    'RFE_RF_Ranking': rfe_rf.ranking_,
})
rfe_rf_results = rfe_rf_results.sort_values('RFE_RF_Ranking')

print(rfe_rf_results)

# RFE with Logistic Regression, keeping 8 features
print("\n🔄 RFE with Logistic Regression:")
rfe_lr = RFE(
    estimator=LogisticRegression(random_state=42),
    n_features_to_select=8,
).fit(X_scaled, y)

rfe_lr_results = pd.DataFrame({
    'Feature': X_scaled.columns,
    'RFE_LR_Selected': rfe_lr.support_,
    'RFE_LR_Ranking': rfe_lr.ranking_,
})
rfe_lr_results = rfe_lr_results.sort_values('RFE_LR_Ranking')

print(rfe_lr_results)

# RFE with Cross-Validation: the number of features kept is chosen by
# 5-fold accuracy rather than fixed up front
print("\n🔄 RFE with Cross-Validation (Random Forest):")
rfecv = RFECV(
    estimator=RandomForestClassifier(n_estimators=50, random_state=42),
    step=1,
    cv=5,
    scoring='accuracy',
).fit(X_scaled, y)
cv_mean_scores = rfecv.cv_results_['mean_test_score']

print(f"Optimal number of features: {rfecv.n_features_}")
print(f"Cross-validation scores: {cv_mean_scores}")

rfecv_results = pd.DataFrame({
    'Feature': X_scaled.columns,
    'RFECV_Selected': rfecv.support_,
    'RFECV_Ranking': rfecv.ranking_,
})
rfecv_results = rfecv_results.sort_values('RFECV_Ranking')

print(rfecv_results)

# Plot RFECV results: mean CV accuracy against the number of features kept
feature_counts = range(1, len(cv_mean_scores) + 1)
plt.figure(figsize=(10, 6))
plt.plot(feature_counts, cv_mean_scores, marker='o', linewidth=2, markersize=6)
plt.axvline(x=rfecv.n_features_, color='red', linestyle='--',
            label=f'Optimal: {rfecv.n_features_} features')
plt.xlabel('Number of Features Selected')
plt.ylabel('Cross-Validation Accuracy')
plt.title('Recursive Feature Elimination with Cross-Validation')
plt.grid(True, alpha=0.3)
plt.legend()
plt.tight_layout()
plt.show()

# 6. FEATURE SELECTION BASED ON MODEL PERFORMANCE
section_rule = "=" * 70
print("\n" + section_rule)
print("6. MODEL-BASED FEATURE SELECTION")
print(section_rule)

# SelectFromModel with Random Forest, using the 'median' importance threshold
print("🎯 SelectFromModel with Random Forest (median threshold):")
sfm_rf = SelectFromModel(
    RandomForestClassifier(n_estimators=100, random_state=42),
    threshold='median',
).fit(X_scaled, y)

sfm_rf_selected = X_scaled.columns[sfm_rf.get_support()]
print(f"Selected features ({len(sfm_rf_selected)}): {list(sfm_rf_selected)}")

# SelectFromModel with L1 Logistic Regression (no explicit threshold given)
print("\n🎯 SelectFromModel with L1 Logistic Regression:")
sfm_lr = SelectFromModel(
    LogisticRegression(penalty='l1', solver='liblinear', random_state=42)
).fit(X_scaled, y)

sfm_lr_selected = X_scaled.columns[sfm_lr.get_support()]
print(f"Selected features ({len(sfm_lr_selected)}): {list(sfm_lr_selected)}")

# 7. COMBINE ALL FEATURE SELECTION RESULTS
section_rule = "=" * 70
print("\n" + section_rule)
print("7. COMPREHENSIVE FEATURE SELECTION SUMMARY")
print(section_rule)

# Each per-method table was sorted by its own score, so every column is
# re-indexed by feature name and pulled back in the original column order
# before being combined into one table.
ordered_features = X_scaled.columns
summary_sources = [
    (chi2_results, 'Chi2_Score'),
    (f_results, 'F_Score'),
    (mi_results, 'MI_Score'),
    (rf_results, 'RF_Importance'),
    (xgb_results, 'XGB_Importance'),
    (lr_results, 'LR_Coef_Abs'),
    (rfe_rf_results, 'RFE_RF_Ranking'),
    (rfe_lr_results, 'RFE_LR_Ranking'),
    (rfecv_results, 'RFECV_Ranking'),
]
summary_columns = {'Feature': ordered_features}
for source_frame, source_column in summary_sources:
    summary_columns[source_column] = (
        source_frame.set_index('Feature').loc[ordered_features, source_column]
    )
comprehensive_results = pd.DataFrame(summary_columns)

# Normalize scores for comparison: each score column is divided by its maximum
score_columns = ['Chi2_Score', 'F_Score', 'MI_Score', 'RF_Importance', 'XGB_Importance', 'LR_Coef_Abs']
for col in score_columns:
    column_max = comprehensive_results[col].max()
    comprehensive_results[f'{col}_Normalized'] = comprehensive_results[col] / column_max

# Combined score = row-wise mean of the normalized columns
normalized_cols = [name for name in comprehensive_results.columns if '_Normalized' in name]
comprehensive_results['Combined_Score'] = comprehensive_results[normalized_cols].mean(axis=1)

# Rank features (1 = highest combined score) and order the table by rank
comprehensive_results['Final_Ranking'] = comprehensive_results['Combined_Score'].rank(ascending=False)
comprehensive_results = comprehensive_results.sort_values('Final_Ranking')

print("📊 Comprehensive Feature Selection Results:")
print(comprehensive_results.round(3))

# 8. VISUALIZE COMPREHENSIVE RESULTS
section_rule = "=" * 70
print("\n" + section_rule)
print("8. COMPREHENSIVE RESULTS VISUALIZATION")
print(section_rule)

# Two stacked panels: per-method heatmap on top, combined ranking below
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10))

# Normalized scores heatmap (transposed: methods as rows, features as columns)
scores_for_heatmap = comprehensive_results.set_index('Feature')[normalized_cols]
sns.heatmap(
    scores_for_heatmap.T,
    annot=True,
    cmap='YlOrRd',
    ax=ax1,
    cbar_kws={'label': 'Normalized Score'},
)
ax1.set_ylabel('Selection Method')
ax1.set_title('Feature Selection Methods Comparison (Normalized Scores)')

# Combined scores, ascending so the best feature is drawn at the top
comprehensive_sorted = comprehensive_results.sort_values('Combined_Score', ascending=True)
combined_positions = range(len(comprehensive_sorted))
ax2.barh(combined_positions, comprehensive_sorted['Combined_Score'], alpha=0.7)
ax2.set_yticks(combined_positions)
ax2.set_yticklabels(comprehensive_sorted['Feature'])
ax2.set_xlabel('Combined Score')
ax2.set_title('Final Feature Ranking (Combined Score)')
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# 9. SELECT TOP FEATURES AND CREATE REDUCED DATASETS
section_rule = "=" * 70
print("\n" + section_rule)
print("9. FINAL FEATURE SELECTION")
print(section_rule)

# Number of features kept by the combined-score method
top_k_features = 8

# Method 1: Top combined score features. comprehensive_results is already
# ordered by Final_Ranking, so the first rows are the best ones.
top_combined_rows = comprehensive_results.head(top_k_features)
top_combined_features = top_combined_rows['Feature'].tolist()
print(f"🏆 Top {top_k_features} features (Combined Score):")
for i, (feature, score) in enumerate(
        zip(top_combined_features, top_combined_rows['Combined_Score']), 1):
    print(f"  {i}. {feature} (score: {score:.3f})")

# Method 2: RFECV selected features
rfecv_selected_features = rfecv_results.loc[rfecv_results['RFECV_Selected'], 'Feature'].tolist()
print(f"\n🔄 RFECV Selected features ({len(rfecv_selected_features)}):")
for i, feature in enumerate(rfecv_selected_features, 1):
    print(f"  {i}. {feature}")

# Method 3: Consensus features (selected by multiple methods).
# SelectFromModel flags are only counted when that selector kept something.
selection_methods = ['RFE_RF_Selected', 'RFE_LR_Selected', 'RFECV_Selected']
for sfm_flag, sfm_features in (
        ('SFM_RF_Selected', sfm_rf_selected),
        ('SFM_LR_Selected', sfm_lr_selected)):
    if len(sfm_features) > 0:
        comprehensive_results[sfm_flag] = comprehensive_results['Feature'].isin(sfm_features)
        selection_methods.append(sfm_flag)

# Add RFE selections to comprehensive results, one boolean flag per method
for rfe_frame, rfe_flag in (
        (rfe_rf_results, 'RFE_RF_Selected'),
        (rfe_lr_results, 'RFE_LR_Selected'),
        (rfecv_results, 'RFECV_Selected')):
    comprehensive_results = comprehensive_results.merge(
        rfe_frame[['Feature', rfe_flag]], on='Feature'
    )

# Count selections: how many of the boolean flag columns are True per feature
available_methods = [name for name in comprehensive_results.columns if name in selection_methods]
comprehensive_results['Selection_Count'] = comprehensive_results[available_methods].sum(axis=1)
consensus_mask = comprehensive_results['Selection_Count'] >= 2
consensus_features = comprehensive_results[consensus_mask].sort_values('Selection_Count', ascending=False)

print(f"\n🤝 Consensus features (selected by ≥2 methods):")
for consensus_name, consensus_count in zip(
        consensus_features['Feature'], consensus_features['Selection_Count']):
    print(f"  {consensus_name} (selected by {int(consensus_count)} methods)")

# Create reduced datasets
print(f"\n📊 Creating reduced datasets...")

# Dataset 1: Top combined score features
X_top_combined = X_scaled[top_combined_features]

# Dataset 2: RFECV features
X_rfecv = X_scaled[rfecv_selected_features]

# Dataset 3: Consensus features; when no feature reached consensus, fall
# back to the six best combined-score features
if len(consensus_features) > 0:
    consensus_feature_names = consensus_features['Feature'].tolist()
else:
    consensus_feature_names = top_combined_features[:6]
X_consensus = X_scaled[consensus_feature_names]

print(f"✅ Reduced datasets created:")
for dataset_label, dataset_frame in (
        ('Top Combined', X_top_combined),
        ('RFECV', X_rfecv),
        ('Consensus', X_consensus)):
    print(f"  - {dataset_label}: {dataset_frame.shape[1]} features")

# 10. SAVE FEATURE SELECTION RESULTS
print("\n" + "="*70)
print("10. SAVING FEATURE SELECTION RESULTS")
print("="*70)

# Persist the reduced datasets, the full comparison table and the selected
# feature lists. Every output goes through the same two directories that are
# created below ('../data' and '../results'), so the CSVs land in the tree
# the script prepared instead of a 'data/' or 'results/' folder relative to
# the current working directory.
#
# Saving is best-effort: any failure is reported and the script carries on
# to the summary section.
try:
    import json
    import os

    data_dir = '../data'
    results_dir = '../results'
    os.makedirs(data_dir, exist_ok=True)
    os.makedirs(results_dir, exist_ok=True)

    # Save reduced datasets
    X_top_combined.to_csv(os.path.join(data_dir, 'X_top_features.csv'), index=False)
    X_rfecv.to_csv(os.path.join(data_dir, 'X_rfecv_features.csv'), index=False)
    X_consensus.to_csv(os.path.join(data_dir, 'X_consensus_features.csv'), index=False)

    # Save feature selection results
    comprehensive_results.to_csv(
        os.path.join(results_dir, 'feature_selection_results.csv'), index=False
    )

    # Save feature lists
    feature_selections = {
        'top_combined_features': top_combined_features,
        'rfecv_selected_features': rfecv_selected_features,
        'consensus_features': consensus_feature_names,
        'all_features': X_scaled.columns.tolist()
    }

    # Explicit encoding keeps the JSON output independent of the platform default
    with open(os.path.join(results_dir, 'selected_features.json'), 'w', encoding='utf-8') as f:
        json.dump(feature_selections, f, indent=2)

    print("✅ Feature selection results saved successfully!")
    print("Files saved:")
    print("  - X_top_features.csv (Top combined score features)")
    print("  - X_rfecv_features.csv (RFECV selected features)")
    print("  - X_consensus_features.csv (Consensus features)")
    print("  - feature_selection_results.csv (Comprehensive results)")
    print("  - selected_features.json (Feature lists)")

except Exception as e:
    # Deliberately broad: a failed save must not abort the rest of the report
    print(f"⚠️ Error saving files: {e}")

# 11. FEATURE SELECTION SUMMARY
section_rule = "=" * 70
print("\n" + section_rule)
print("11. FEATURE SELECTION SUMMARY")
print(section_rule)

# Headline counts for each selection strategy
print("✅ Feature selection analysis completed successfully!")
print(f"📊 Original features: {X_scaled.shape[1]}")
print(f"📊 Selected features (Combined): {len(top_combined_features)}")
print(f"📊 Selected features (RFECV): {len(rfecv_selected_features)}")
print(f"📊 Selected features (Consensus): {len(consensus_feature_names)}")

# comprehensive_results is ordered by Final_Ranking, so head(5) is the top five
print("\n🏆 Top 5 most important features (Combined Score):")
top_five = comprehensive_results.head(5)
for i, (top_name, top_score) in enumerate(
        zip(top_five['Feature'], top_five['Combined_Score']), 1):
    print(f"  {i}. {top_name} (score: {top_score:.3f})")

# Fixed informational text, printed one line at a time
report_lines = [
    "\n🔍 Feature selection methods used:",
    "  ✓ Chi-square test",
    "  ✓ F-test (ANOVA)",
    "  ✓ Mutual Information",
    "  ✓ Random Forest importance",
    "  ✓ XGBoost importance",
    "  ✓ Logistic Regression coefficients",
    "  ✓ Recursive Feature Elimination (RFE)",
    "  ✓ RFE with Cross-Validation (RFECV)",
    "  ✓ SelectFromModel",
    "\n💡 Recommendations:",
    "  - Use top combined features for initial modeling",
    "  - Compare performance with RFECV features",
    "  - Consider consensus features for robust model",
    "  - Monitor overfitting with reduced feature sets",
    "\n🎯 Next steps:",
    "  1. ✅ Data preprocessing complete",
    "  2. ✅ PCA analysis complete",
    "  3. ✅ Feature selection complete",
    "  4. ⏳ Train supervised learning models (04_supervised_learning.ipynb)",
    "  5. ⏳ Apply unsupervised learning (05_unsupervised_learning.ipynb)",
    "  6. ⏳ Hyperparameter tuning (06_hyperparameter_tuning.ipynb)",
    "\n🎉 Ready to proceed to supervised learning models!",
]
for report_line in report_lines:
    print(report_line)

# Display feature comparison table: one row per original feature, showing
# which strategies picked it and its combined score
print("\n📊 Feature Selection Comparison:")
membership_mark = {True: '✓', False: '✗'}
comparison_data = []
for feature in X_scaled.columns:
    row_data = comprehensive_results[comprehensive_results['Feature'] == feature]
    if len(row_data) == 0:
        # Feature absent from the summary table: leave it out of the comparison
        continue
    comparison_data.append([
        feature,
        membership_mark[feature in top_combined_features],
        membership_mark[feature in rfecv_selected_features],
        membership_mark[feature in consensus_feature_names],
        f"{row_data['Combined_Score'].iloc[0]:.3f}",
    ])

comparison_df = pd.DataFrame(
    comparison_data,
    columns=['Feature', 'Top Combined', 'RFECV', 'Consensus', 'Score'],
)
print(comparison_df.to_string(index=False))
