In [1]:
import pandas as pd

raw_df=pd.read_csv("WA_Fn-UseC_-HR-Employee-Attrition.csv")
df=raw_df.copy()

print("Raw shape",raw_df.shape)
print("Columns:",raw_df.columns.tolist())

Raw shape (1470, 35)
Columns: ['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department', 'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount', 'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']


In [2]:
# Check if EmployeeNumber is unique
print("EmployeeNumber unique? ->", raw_df["EmployeeNumber"].is_unique)

# If not unique, show duplicates
dup_ids = raw_df[raw_df.duplicated("EmployeeNumber", keep=False)].sort_values("EmployeeNumber")
dup_ids.head(10)


EmployeeNumber unique? -> True


Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager


In [3]:
missing = raw_df.isna().sum().sort_values(ascending=False)
print(missing[missing > 0])


Series([], dtype: int64)


In [4]:
# If you haven’t already:
# df = raw_df.copy()

dept_job_map = (
    df.groupby("Department")["JobRole"]
      .unique()
      .apply(list)
      .to_dict()
)

dept_job_map


{'Human Resources': ['Human Resources', 'Manager'],
 'Research & Development': ['Research Scientist',
  'Laboratory Technician',
  'Manufacturing Director',
  'Healthcare Representative',
  'Research Director',
  'Manager'],
 'Sales': ['Sales Executive', 'Manager', 'Sales Representative']}

In [5]:
job_dept_map = (
    df.groupby("JobRole")["Department"]
      .unique()
      .apply(list)
      .to_dict()
)

job_dept_map


{'Healthcare Representative': ['Research & Development'],
 'Human Resources': ['Human Resources'],
 'Laboratory Technician': ['Research & Development'],
 'Manager': ['Sales', 'Research & Development', 'Human Resources'],
 'Manufacturing Director': ['Research & Development'],
 'Research Director': ['Research & Development'],
 'Research Scientist': ['Research & Development'],
 'Sales Executive': ['Sales'],
 'Sales Representative': ['Sales']}

In [6]:
# Start from your working copy
df = raw_df.copy()

# Create numeric target
df["Attrition_Flag"] = df["Attrition"].map({"Yes": 1, "No": 0})

# Quick check of class balance
print(df["Attrition"].value_counts())
print()
print(df["Attrition_Flag"].value_counts(normalize=True))


Attrition
No     1233
Yes     237
Name: count, dtype: int64

Attrition_Flag
0    0.838776
1    0.161224
Name: proportion, dtype: float64


In [7]:
# Columns we don't want as features
drop_cols = ["EmployeeNumber", "EmployeeCount", "Over18", "StandardHours"]

# Create a separate modeling dataframe
df_model = df.drop(columns=drop_cols)

print("Original df shape:", df.shape)
print("Model df shape:", df_model.shape)
print("Dropped columns:", drop_cols)


Original df shape: (1470, 36)
Model df shape: (1470, 32)
Dropped columns: ['EmployeeNumber', 'EmployeeCount', 'Over18', 'StandardHours']


In [8]:
# Target
y = df_model["Attrition_Flag"]

# Features: drop original text label + numeric flag
X = df_model.drop(columns=["Attrition", "Attrition_Flag"])

print("X shape:", X.shape)
print("y shape:", y.shape)


X shape: (1470, 30)
y shape: (1470,)


In [9]:
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()
num_cols = [c for c in X.columns if c not in cat_cols]

print("Categorical columns:", cat_cols)
print("Numeric columns count:", len(num_cols))


Categorical columns: ['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'OverTime']
Numeric columns count: 23


In [10]:
X_encoded = pd.get_dummies(
    X,
    columns=cat_cols,
    drop_first=True  # avoids dummy variable trap
)

print("X_encoded shape:", X_encoded.shape)
X_encoded.head()


X_encoded shape: (1470, 44)


Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,...,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Married,MaritalStatus_Single,OverTime_Yes
0,41,1102,1,2,2,94,3,2,4,5993,...,False,False,False,False,False,True,False,False,True,True
1,49,279,8,1,3,61,2,2,2,5130,...,False,False,False,False,True,False,False,True,False,False
2,37,1373,2,2,4,92,2,1,3,2090,...,True,False,False,False,False,False,False,False,True,True
3,33,1392,3,4,4,56,3,1,3,2909,...,False,False,False,False,True,False,False,True,False,True
4,27,591,2,1,1,40,3,1,2,3468,...,True,False,False,False,False,False,False,True,False,False


In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    stratify=y,      # keeps same Yes/No ratio in both splits
    random_state=42  # for reproducibility
)

print("X_train:", X_train.shape)
print("X_test:", X_test.shape)
print("y_train:", y_train.shape)
print("y_test:", y_test.shape)

print("\nTrain class proportion:")
print(y_train.value_counts(normalize=True))

print("\nTest class proportion:")
print(y_test.value_counts(normalize=True))


X_train: (1176, 44)
X_test: (294, 44)
y_train: (1176,)
y_test: (294,)

Train class proportion:
Attrition_Flag
0    0.838435
1    0.161565
Name: proportion, dtype: float64

Test class proportion:
Attrition_Flag
0    0.840136
1    0.159864
Name: proportion, dtype: float64


In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

rf = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    class_weight='balanced',
    n_jobs=-1
)

rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
y_prob = rf.predict_proba(X_test)[:, 1]  # for probability-based UI later

print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification report:\n", classification_report(y_test, y_pred, digits=3))


Confusion matrix:
 [[244   3]
 [ 43   4]]

Classification report:
               precision    recall  f1-score   support

           0      0.850     0.988     0.914       247
           1      0.571     0.085     0.148        47

    accuracy                          0.844       294
   macro avg      0.711     0.536     0.531       294
weighted avg      0.806     0.844     0.791       294



In [13]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix

# Pipeline: scaling + logistic regression
logreg_clf = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("logreg", LogisticRegression(
        class_weight="balanced",
        max_iter=1000,
        random_state=42
    ))
])

logreg_clf.fit(X_train, y_train)

y_pred_lr = logreg_clf.predict(X_test)
y_prob_lr = logreg_clf.predict_proba(X_test)[:, 1]

print("Confusion matrix (LogReg):\n", confusion_matrix(y_test, y_pred_lr))
print("\nClassification report (LogReg):\n", classification_report(y_test, y_pred_lr, digits=3))


Confusion matrix (LogReg):
 [[195  52]
 [ 15  32]]

Classification report (LogReg):
               precision    recall  f1-score   support

           0      0.929     0.789     0.853       247
           1      0.381     0.681     0.489        47

    accuracy                          0.772       294
   macro avg      0.655     0.735     0.671       294
weighted avg      0.841     0.772     0.795       294



In [14]:
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

# probabilities from your logistic regression model
y_scores = y_prob_lr  # already computed as predict_proba(... )[:, 1]

def evaluate_at_threshold(threshold):
    y_pred_custom = (y_scores >= threshold).astype(int)
    print(f"=== Threshold: {threshold} ===")
    print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_custom))
    print("\nClassification report:\n", classification_report(y_test, y_pred_custom, digits=3))

# Try a few thresholds
for t in [0.3, 0.4, 0.5, 0.6]:
    evaluate_at_threshold(t)


=== Threshold: 0.3 ===
Confusion matrix:
 [[147 100]
 [  7  40]]

Classification report:
               precision    recall  f1-score   support

           0      0.955     0.595     0.733       247
           1      0.286     0.851     0.428        47

    accuracy                          0.636       294
   macro avg      0.620     0.723     0.580       294
weighted avg      0.848     0.636     0.684       294

=== Threshold: 0.4 ===
Confusion matrix:
 [[177  70]
 [ 10  37]]

Classification report:
               precision    recall  f1-score   support

           0      0.947     0.717     0.816       247
           1      0.346     0.787     0.481        47

    accuracy                          0.728       294
   macro avg      0.646     0.752     0.648       294
weighted avg      0.850     0.728     0.762       294

=== Threshold: 0.5 ===
Confusion matrix:
 [[195  52]
 [ 15  32]]

Classification report:
               precision    recall  f1-score   support

           0      0.9

In [15]:
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score

# ---- TRAIN PERFORMANCE ----
y_train_pred = logreg_clf.predict(X_train)
y_train_prob = logreg_clf.predict_proba(X_train)[:, 1]

print("=== TRAIN SET ===")
print(classification_report(y_train, y_train_pred, digits=3))
print("Train ROC-AUC:", roc_auc_score(y_train, y_train_prob))
print("Train PR-AUC (average precision):", average_precision_score(y_train, y_train_prob))

# ---- TEST PERFORMANCE (same as before, but with extra metrics) ----
y_test_pred = logreg_clf.predict(X_test)
y_test_prob = logreg_clf.predict_proba(X_test)[:, 1]

print("\n=== TEST SET ===")
print(classification_report(y_test, y_test_pred, digits=3))
print("Test ROC-AUC:", roc_auc_score(y_test, y_test_prob))
print("Test PR-AUC (average precision):", average_precision_score(y_test, y_test_prob))


=== TRAIN SET ===
              precision    recall  f1-score   support

           0      0.955     0.775     0.856       986
           1      0.410     0.811     0.544       190

    accuracy                          0.781      1176
   macro avg      0.682     0.793     0.700      1176
weighted avg      0.867     0.781     0.805      1176

Train ROC-AUC: 0.8741646204761397
Train PR-AUC (average precision): 0.6871078566404933

=== TEST SET ===
              precision    recall  f1-score   support

           0      0.929     0.789     0.853       247
           1      0.381     0.681     0.489        47

    accuracy                          0.772       294
   macro avg      0.655     0.735     0.671       294
weighted avg      0.841     0.772     0.795       294

Test ROC-AUC: 0.8079076578516667
Test PR-AUC (average precision): 0.571784487599875


In [16]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

roc_scores = cross_val_score(
    logreg_clf,
    X_encoded,
    y,
    cv=cv,
    scoring="roc_auc"
)

pr_scores = cross_val_score(
    logreg_clf,
    X_encoded,
    y,
    cv=cv,
    scoring="average_precision"
)

print("CV ROC-AUC scores:", roc_scores)
print("Mean ROC-AUC:", roc_scores.mean(), "±", roc_scores.std())

print("\nCV PR-AUC scores:", pr_scores)
print("Mean PR-AUC:", pr_scores.mean(), "±", pr_scores.std())


CV ROC-AUC scores: [0.81893631 0.8466294  0.83719528 0.84443104 0.78120424]
Mean ROC-AUC: 0.8256792561303596 ± 0.024276126365144944

CV PR-AUC scores: [0.53251943 0.64823391 0.59281821 0.65801815 0.56912606]
Mean PR-AUC: 0.600143155571167 ± 0.04743610269797584


In [17]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score
import numpy as np

# ----- define models -----
models = {}

# 1) Logistic Regression (with scaling)
models["logreg"] = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("clf", LogisticRegression(
        class_weight="balanced",
        max_iter=1000,
        random_state=42
    ))
])

# 2) Random Forest (regularized to reduce overfitting)
models["rf"] = RandomForestClassifier(
    n_estimators=300,
    max_depth=5,
    min_samples_leaf=20,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)

# 3) Gradient Boosting (regularized)
models["gb"] = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=3,
    subsample=0.8,
    random_state=42
)

# ----- helper for evaluation -----
def evaluate_model(name, model, X_train, y_train, X_test, y_test):
    print(f"\n===== {name.upper()} =====")
    
    model.fit(X_train, y_train)
    
    # Train
    y_train_pred = model.predict(X_train)
    y_train_prob = model.predict_proba(X_train)[:, 1]
    print("---- TRAIN ----")
    print(classification_report(y_train, y_train_pred, digits=3))
    print("Train ROC-AUC:", roc_auc_score(y_train, y_train_prob))
    print("Train PR-AUC :", average_precision_score(y_train, y_train_prob))
    
    # Test
    y_test_pred = model.predict(X_test)
    y_test_prob = model.predict_proba(X_test)[:, 1]
    print("\n---- TEST ----")
    print(classification_report(y_test, y_test_pred, digits=3))
    print("Test ROC-AUC:", roc_auc_score(y_test, y_test_prob))
    print("Test PR-AUC :", average_precision_score(y_test, y_test_prob))
    
    return model

# ----- fit all models and store them -----
fitted_models = {}

for name, model in models.items():
    fitted_models[name] = evaluate_model(
        name, model,
        X_train, y_train,
        X_test, y_test
    )



===== LOGREG =====
---- TRAIN ----
              precision    recall  f1-score   support

           0      0.955     0.775     0.856       986
           1      0.410     0.811     0.544       190

    accuracy                          0.781      1176
   macro avg      0.682     0.793     0.700      1176
weighted avg      0.867     0.781     0.805      1176

Train ROC-AUC: 0.8741646204761397
Train PR-AUC : 0.6871078566404933

---- TEST ----
              precision    recall  f1-score   support

           0      0.929     0.789     0.853       247
           1      0.381     0.681     0.489        47

    accuracy                          0.772       294
   macro avg      0.655     0.735     0.671       294
weighted avg      0.841     0.772     0.795       294

Test ROC-AUC: 0.8079076578516667
Test PR-AUC : 0.571784487599875

===== RF =====
---- TRAIN ----
              precision    recall  f1-score   support

           0      0.944     0.895     0.919       986
           1      0.

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold

pipe_logreg = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("clf", LogisticRegression(
        class_weight="balanced",
        max_iter=1000,
        random_state=42
    ))
])

param_grid_lr = {
    "clf__C": [1.0, 0.5, 0.25, 0.1]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_lr = GridSearchCV(
    pipe_logreg,
    param_grid_lr,
    cv=cv,
    scoring="average_precision",  # focus on minority-class quality
    n_jobs=-1
)

grid_lr.fit(X_encoded, y)

print("Best C:", grid_lr.best_params_)
print("Best CV PR-AUC:", grid_lr.best_score_)
logreg_best = grid_lr.best_estimator_


Best C: {'clf__C': 0.1}
Best CV PR-AUC: 0.6117914978973933


In [19]:
from sklearn.ensemble import RandomForestClassifier

rf_base = RandomForestClassifier(
    n_estimators=300,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)

param_grid_rf = {
    "max_depth": [3, 4, 5],
    "min_samples_leaf": [10, 20, 50],
    "max_features": ["sqrt", 0.5]
}

grid_rf = GridSearchCV(
    rf_base,
    param_grid_rf,
    cv=cv,
    scoring="average_precision",
    n_jobs=-1
)

grid_rf.fit(X_encoded, y)

print("Best RF params:", grid_rf.best_params_)
print("Best RF CV PR-AUC:", grid_rf.best_score_)
rf_best = grid_rf.best_estimator_


Best RF params: {'max_depth': 4, 'max_features': 0.5, 'min_samples_leaf': 10}
Best RF CV PR-AUC: 0.5459519354601203


In [20]:
from sklearn.ensemble import GradientBoostingClassifier

gb_base = GradientBoostingClassifier(random_state=42)

param_grid_gb = {
    "n_estimators": [50, 100, 150],
    "learning_rate": [0.1, 0.05, 0.02],
    "max_depth": [2, 3],
    "subsample": [0.7, 0.8, 1.0]
}

grid_gb = GridSearchCV(
    gb_base,
    param_grid_gb,
    cv=cv,
    scoring="average_precision",
    n_jobs=-1
)

grid_gb.fit(X_encoded, y)

print("Best GB params:", grid_gb.best_params_)
print("Best GB CV PR-AUC:", grid_gb.best_score_)
gb_best = grid_gb.best_estimator_


Best GB params: {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 150, 'subsample': 0.7}
Best GB CV PR-AUC: 0.6079954523858395


In [21]:
# Using the best estimators you just got:
logreg_best = grid_lr.best_estimator_
rf_best = grid_rf.best_estimator_
gb_best = grid_gb.best_estimator_

from sklearn.metrics import classification_report, roc_auc_score, average_precision_score

def eval_on_split(name, model):
    print(f"\n===== {name} (BEST) =====")
    model.fit(X_train, y_train)

    # Train
    y_tr_pred = model.predict(X_train)
    y_tr_prob = model.predict_proba(X_train)[:, 1]
    print("---- TRAIN ----")
    print(classification_report(y_train, y_tr_pred, digits=3))
    print("Train ROC-AUC:", roc_auc_score(y_train, y_tr_prob))
    print("Train PR-AUC :", average_precision_score(y_train, y_tr_prob))

    # Test
    y_te_pred = model.predict(X_test)
    y_te_prob = model.predict_proba(X_test)[:, 1]
    print("\n---- TEST ----")
    print(classification_report(y_test, y_te_pred, digits=3))
    print("Test ROC-AUC:", roc_auc_score(y_test, y_te_prob))
    print("Test PR-AUC :", average_precision_score(y_test, y_te_prob))

    return model

logreg_best = eval_on_split("LOGREG", logreg_best)
rf_best = eval_on_split("RF", rf_best)
gb_best = eval_on_split("GB", gb_best)



===== LOGREG (BEST) =====
---- TRAIN ----
              precision    recall  f1-score   support

           0      0.949     0.771     0.851       986
           1      0.397     0.784     0.527       190

    accuracy                          0.773      1176
   macro avg      0.673     0.778     0.689      1176
weighted avg      0.860     0.773     0.798      1176

Train ROC-AUC: 0.8614764599124585
Train PR-AUC : 0.6790759882552997

---- TEST ----
              precision    recall  f1-score   support

           0      0.944     0.822     0.879       247
           1      0.443     0.745     0.556        47

    accuracy                          0.810       294
   macro avg      0.694     0.783     0.717       294
weighted avg      0.864     0.810     0.827       294

Test ROC-AUC: 0.8105779998277199
Test PR-AUC : 0.5868224559643914

===== RF (BEST) =====
---- TRAIN ----
              precision    recall  f1-score   support

           0      0.939     0.910     0.924       986
     

In [22]:
# Final tuned models
logreg_best = grid_lr.best_estimator_
rf_best = grid_rf.best_estimator_
gb_best = grid_gb.best_estimator_

fitted_models = {
    "logreg": logreg_best,   # main / recommended
    "rf": rf_best,           # alternative
    "gb": gb_best            # alternative + part of ensemble
}

RISK_THRESHOLD = 0.4  # fixed, not editable in UI


In [23]:
import numpy as np
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score

def ensemble_proba(fitted_models, X):
    probs = []
    for name, model in fitted_models.items():
        p = model.predict_proba(X)[:, 1]
        probs.append(p)
    probs = np.vstack(probs)          # (n_models, n_samples)
    return probs.mean(axis=0)         # (n_samples,)

def classify_from_proba(probs, threshold=RISK_THRESHOLD):
    return (probs >= threshold).astype(int)

ensemble_test_prob = ensemble_proba(fitted_models, X_test)
ensemble_test_pred = classify_from_proba(ensemble_test_prob)

print("=== ENSEMBLE (LOGREG + RF + GB) – TEST SET ===")
print("ROC-AUC:", roc_auc_score(y_test, ensemble_test_prob))
print("PR-AUC :", average_precision_score(y_test, ensemble_test_prob))
print("\nClassification report:\n",
      classification_report(y_test, ensemble_test_pred, digits=3))


=== ENSEMBLE (LOGREG + RF + GB) – TEST SET ===
ROC-AUC: 0.8147988629511586
PR-AUC : 0.5561941630713517

Classification report:
               precision    recall  f1-score   support

           0      0.924     0.834     0.877       247
           1      0.423     0.638     0.508        47

    accuracy                          0.803       294
   macro avg      0.673     0.736     0.693       294
weighted avg      0.844     0.803     0.818       294



In [24]:
import joblib

joblib.dump(logreg_best, "model_logreg_best.joblib")
joblib.dump(rf_best, "model_rf_best.joblib")
joblib.dump(gb_best, "model_gb_best.joblib")

print("Saved all models.")


Saved all models.


In [25]:
import joblib

# X_encoded is your final training feature matrix
feature_cols = X_encoded.columns

joblib.dump(feature_cols, "model_feature_columns.joblib")
print("Saved feature columns:", len(feature_cols))


Saved feature columns: 44
