In [4]:
# ======================
# 05_modeling_feature_importance_lightweight.py
# ======================

import os
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from scipy import sparse

# ======================
# Paths
# ======================
# Folder that holds the processed data splits used by this script.
OUTPUT_DIR = r"C:\Users\vibho\Downloads\Engineering\exoplanet-ai\data\processed"
# Path to the pickled feature pipeline (not referenced in the visible part of this script).
PIPELINE_FILE = r"C:\Users\vibho\Downloads\Engineering\exoplanet-ai\src\preprocessing\feature_pipeline.pkl"

# Every split file sits directly under OUTPUT_DIR, so build all four paths in one pass.
X_train_file, X_val_file, y_train_file, y_val_file = (
    os.path.join(OUTPUT_DIR, filename)
    for filename in ("X_train.npz", "X_val.npz", "y_train.npy", "y_val.npy")
)

# ======================
# Load processed data
# ======================
# Feature matrices are read with scipy.sparse.load_npz; label vectors with np.load.
X_train, X_val = (sparse.load_npz(path) for path in (X_train_file, X_val_file))
y_train, y_val = (np.load(path) for path in (y_train_file, y_val_file))

# ======================
# Load trained models
# ======================
# Map each model name to the estimator deserialized from
# "<name>_baseline_model.pkl" under OUTPUT_DIR.
# NOTE: joblib.load unpickles its input, so only point it at trusted files.
models = {
    model_name: joblib.load(
        os.path.join(OUTPUT_DIR, f"{model_name}_baseline_model.pkl")
    )
    for model_name in ("RandomForest", "XGBoost", "LogisticRegression")
}

# ======================
# Evaluate models
# ======================
print("🔹 Evaluating models...")
for name in models:
    model = models[name]
    y_pred = model.predict(X_val)

    # Headline metric first.
    acc = accuracy_score(y_val, y_pred)
    print(f"{name} Validation Accuracy: {acc:.4f}")

    # Per-class precision / recall / F1 breakdown.
    print(f"Classification Report for {name}:\n")
    report = classification_report(y_val, y_pred)
    print(report)

    # Raw counts of true vs. predicted labels.
    matrix = confusion_matrix(y_val, y_pred)
    print(f"Confusion Matrix for {name}:\n{matrix}\n")

# ======================
# Optional: Feature importances for tree-based models
# ======================
importances_dir = os.path.join(OUTPUT_DIR, "feature_importances")
os.makedirs(importances_dir, exist_ok=True)

# Upper bound on how many features each plot shows.
TOP_K = 20

for name in ["RandomForest", "XGBoost"]:
    model = models[name]
    if not hasattr(model, "feature_importances_"):
        print(f"{name} does not have feature_importances_ attribute")
        continue

    importances = np.asarray(model.feature_importances_)

    # A model with fewer than TOP_K features would make barh() fail on
    # mismatched lengths, so clamp to what is actually available.
    n_top = min(TOP_K, importances.size)

    # Column indices of the most important features, most important first.
    # Reversing before slicing keeps this correct even when n_top is 0.
    top20_idx = np.argsort(importances)[::-1][:n_top]
    top20_values = importances[top20_idx]

    # No feature names are available here, so label each bar with the
    # feature's column index (not its rank) so the plot can be traced back
    # to an actual input column.
    labels = [f"Feature {idx}" for idx in top20_idx]
    positions = list(range(n_top))

    fig, ax = plt.subplots(figsize=(10, 6))
    # Reverse both values and labels so the most important feature is drawn
    # at the top of the horizontal bar chart.
    ax.barh(positions, top20_values[::-1])
    ax.set_yticks(positions)
    ax.set_yticklabels(labels[::-1])
    ax.set_xlabel("Importance")
    ax.set_title(f"Top {n_top} Feature Importances - {name}")
    fig.tight_layout()

    # File name is kept unchanged so existing consumers still find the plot.
    plot_file = os.path.join(importances_dir, f"{name}_top20_features.png")
    fig.savefig(plot_file)
    plt.close(fig)  # release the figure; one is created per model
    print(f"{name} top {n_top} feature plot saved to: {plot_file}")

print("✅ Model evaluation complete.")


🔹 Evaluating models...
RandomForest Validation Accuracy: 0.9960
Classification Report for RandomForest:

              precision    recall  f1-score   support

           0       0.95      1.00      0.97       275
           1       1.00      1.00      1.00       463
           2       0.98      0.76      0.86        59
           3       1.00      0.75      0.86         4
           4       1.00      1.00      1.00      3152

    accuracy                           1.00      3953
   macro avg       0.99      0.90      0.94      3953
weighted avg       1.00      1.00      1.00      3953

Confusion Matrix for RandomForest:
[[ 274    0    1    0    0]
 [   0  463    0    0    0]
 [  14    0   45    0    0]
 [   0    1    0    3    0]
 [   0    0    0    0 3152]]

XGBoost Validation Accuracy: 1.0000
Classification Report for XGBoost:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       275
           1       1.00      1.00      1.00     