In [None]:
# HR Turnover Prediction Project

# ## 1. Introduction
# This notebook presents an end-to-end workflow for predicting employee turnover using machine learning.
# We will explore and visualize the data, preprocess it, train multiple models, and evaluate their performance.

# ## 2. Imports
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

from models import load_data

# ## 3. Load and Prepare Data
# Read the dataset through the project's `load_data` helper, separate the
# "left" target column from the remaining feature columns, and hold out 20% of
# the rows as a test set, stratified on the target.
df = load_data("data/HR_capstone_dataset.csv")
y = df["left"]
X = df.drop(columns="left")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,  # fixed seed so the split is repeatable
)

# ## 4. Train Models
# Candidate classifiers, keyed by the display name that is reused in the
# printed reports below and in the ROC plot legend.
# NOTE(review): `use_label_encoder` is reported as deprecated by recent XGBoost
# releases — confirm against the installed version before removing it.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(
        use_label_encoder=False,
        eval_metric="logloss",
        random_state=42,
    ),
}

# Fit every candidate on the training split, print its confusion matrix and
# classification report for the test split, and keep the fitted estimator
# under its display name for the visualizations.
trained_models = {}
for name in models:
    model = models[name]
    print(f"\n----- {name} -----")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
    print(
        "\nClassification Report:",
        classification_report(y_test, y_pred, digits=3),
        sep="\n",
    )
    trained_models[name] = model

# ## 5. Visualizations

# ### 5.1 Feature Importance — Random Forest
# Horizontal bar chart of the fitted Random Forest's `feature_importances_`,
# ordered from the highest score to the lowest, saved to plots/ and displayed.
rf_model = trained_models["Random Forest"]
importances = rf_model.feature_importances_
feature_names = X.columns
indices = np.argsort(importances)[::-1]  # argsort is ascending; reverse for highest first

# savefig does not create missing directories, so make sure the output
# folder exists before writing (keeps a fresh checkout runnable end to end).
os.makedirs("plots", exist_ok=True)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=importances[indices], y=feature_names[indices], ax=ax)
ax.set_title("Feature Importance — Random Forest")
ax.set_xlabel("Importance Score")
ax.set_ylabel("Feature")
fig.tight_layout()
fig.savefig("plots/feature_importance_rf.png")
plt.show()

# ### 5.2 Confusion Matrix — Random Forest
# Confusion matrix of the Random Forest on the test split, saved to plots/
# and displayed.

# savefig does not create missing directories, so make sure the output
# folder exists before writing.
os.makedirs("plots", exist_ok=True)

# Create the axes explicitly and hand them to from_estimator. Without `ax`,
# from_estimator opens its own figure, so a separately created
# plt.figure(figsize=...) would stay blank and its size would never be
# applied to the plot that gets saved.
fig, ax = plt.subplots(figsize=(6, 5))
disp = ConfusionMatrixDisplay.from_estimator(
    rf_model, X_test, y_test,
    display_labels=["Stayed", "Left"],
    cmap="Blues", colorbar=False,
    ax=ax,
)
disp.ax_.set_title("Confusion Matrix — Random Forest")
fig.tight_layout()
fig.savefig("plots/confusion_matrix_rf.png")
plt.show()

# ### 5.3 ROC Curves
# One ROC curve per trained model on the test split, with the area under each
# curve shown in the legend, plus the diagonal chance line for reference.

# savefig does not create missing directories, so make sure the output
# folder exists before writing.
os.makedirs("plots", exist_ok=True)

fig, ax = plt.subplots(figsize=(10, 7))
for name, model in trained_models.items():
    # Score each test row: use column 1 of predict_proba when the estimator
    # provides it, otherwise fall back to the raw decision_function output.
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test)[:, 1]
    else:
        y_proba = model.decision_function(X_test)
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    roc_auc = auc(fpr, tpr)
    ax.plot(fpr, tpr, label=f"{name} (AUC = {roc_auc:.3f})")

ax.plot([0, 1], [0, 1], "k--", label="Chance")
ax.set_title("ROC Curves — Model Comparison")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.legend()
fig.tight_layout()
fig.savefig("plots/roc_curves.png")
plt.show()

# ## 6. Summary
# - Random Forest and XGBoost were the top-performing models.
# - Satisfaction level and number of projects were the most important predictors.
# - Visualizations like confusion matrices and ROC curves validate model performance.

: 