### **Import Libraries & Data Preprocessing**

In [1]:
import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
import plotly.graph_objects as go
from catboost import CatBoostClassifier
from plotly.subplots import make_subplots
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTETomek
from collections import Counter


from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, 
                           roc_auc_score, confusion_matrix, classification_report,
                           precision_recall_curve, average_precision_score)

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
employee_df = pd.read_csv('data/WA_Fn-UseC_-HR-Employee-Attrition.csv')

In [4]:
# drop unnecessary columns
columns_to_drop = ['EmployeeCount', 'StandardHours', 'Over18', 'EmployeeNumber']
employee_df = employee_df.drop(columns=columns_to_drop, errors='ignore')

In [5]:
# encode target variable
employee_df['Attrition'] = employee_df['Attrition'].map({'Yes': 1, 'No': 0})

# identify categorical and numerical columns
categorical_cols = employee_df.select_dtypes(include=['object']).columns.tolist()
numerical_cols = employee_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
numerical_cols.remove('Attrition')  # remove target from numerical

print(f"Categorical columns: {categorical_cols}")
print(f"Numerical columns: {len(numerical_cols)} columns")

Categorical columns: ['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'OverTime']
Numerical columns: 23 columns


In [6]:
# encode categorical variables
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    employee_df[col] = le.fit_transform(employee_df[col])
    label_encoders[col] = le

In [7]:
# split features and target
X = employee_df.drop('Attrition', axis=1)
y = employee_df['Attrition']

In [8]:
# split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [9]:
print(f"\nOriginal Training set size: {X_train.shape}")
print(f"Original class distribution: {Counter(y_train)}")


Original Training set size: (1176, 30)
Original class distribution: Counter({0: 986, 1: 190})


Handling Class Imbalance

In [10]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
# can experiment different resampling strategies
resampling_methods = {
    'SMOTE': SMOTE(random_state=42, k_neighbors=5),
    'ADASYN': ADASYN(random_state=42, n_neighbors=5),
    'SMOTETomek': SMOTETomek(random_state=42)
}

In [12]:
# we'll use SMOTE as default
sampler = resampling_methods['SMOTE']
X_train_balanced, y_train_balanced = sampler.fit_resample(X_train_scaled, y_train)

In [13]:
print(f"After SMOTE - Training set size: {X_train_balanced.shape}")
print(f"After SMOTE - Class distribution: {Counter(y_train_balanced)}")

After SMOTE - Training set size: (1972, 30)
After SMOTE - Class distribution: Counter({0: 986, 1: 986})


In [14]:
# also prepare unscaled balanced data for CatBoost
X_train_balanced_unscaled, y_train_balanced_unscaled = SMOTE(random_state=42).fit_resample(X_train, y_train)

<br/>

### **Model Training**

In [15]:
results = {}

XGBoost

In [16]:
xgb_params = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.3],
    'n_estimators': [100, 200, 300],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

In [17]:
xgb_model = xgb.XGBClassifier(random_state=42, eval_metric='logloss')

xgb_grid = GridSearchCV(
    xgb_model, 
    {'max_depth': [5, 7], 'learning_rate': [0.1, 0.3], 'n_estimators': [100, 200]},
    cv=5, 
    scoring='f1',
    n_jobs=-1,
    verbose=0
)

In [18]:
xgb_grid.fit(X_train_balanced, y_train_balanced)
xgb_best = xgb_grid.best_estimator_

In [19]:
# predictions
xgb_pred = xgb_best.predict(X_test_scaled)
xgb_pred_proba = xgb_best.predict_proba(X_test_scaled)[:, 1]

In [20]:
# store results
results['XGBoost'] = {
    'accuracy': accuracy_score(y_test, xgb_pred),
    'precision': precision_score(y_test, xgb_pred),
    'recall': recall_score(y_test, xgb_pred),
    'f1': f1_score(y_test, xgb_pred),
    'roc_auc': roc_auc_score(y_test, xgb_pred_proba),
    'predictions': xgb_pred,
    'probabilities': xgb_pred_proba,
    'model': xgb_best
}

print(f"XGBoost Best Parameters: {xgb_grid.best_params_}")
print(f"XGBoost F1 Score: {results['XGBoost']['f1']:.4f}")

XGBoost Best Parameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 100}
XGBoost F1 Score: 0.3889


LightGBM

In [21]:
lgb_params = {
    'num_leaves': [31, 50],
    'learning_rate': [0.1, 0.3],
    'n_estimators': [100, 200],
    'min_child_samples': [20, 30]
}

In [22]:
lgb_model = lgb.LGBMClassifier(random_state=42, verbose=-1)

lgb_grid = GridSearchCV(
    lgb_model,
    lgb_params,
    cv=5,
    scoring='f1',
    n_jobs=-1,
    verbose=0
)

In [23]:
lgb_grid.fit(X_train_balanced, y_train_balanced)
lgb_best = lgb_grid.best_estimator_

In [24]:
# predictions
lgb_pred = lgb_best.predict(X_test_scaled)
lgb_pred_proba = lgb_best.predict_proba(X_test_scaled)[:, 1]

In [25]:
# store results
results['LightGBM'] = {
    'accuracy': accuracy_score(y_test, lgb_pred),
    'precision': precision_score(y_test, lgb_pred),
    'recall': recall_score(y_test, lgb_pred),
    'f1': f1_score(y_test, lgb_pred),
    'roc_auc': roc_auc_score(y_test, lgb_pred_proba),
    'predictions': lgb_pred,
    'probabilities': lgb_pred_proba,
    'model': lgb_best
}

print(f"LightGBM Best Parameters: {lgb_grid.best_params_}")
print(f"LightGBM F1 Score: {results['LightGBM']['f1']:.4f}")

LightGBM Best Parameters: {'learning_rate': 0.1, 'min_child_samples': 20, 'n_estimators': 100, 'num_leaves': 50}
LightGBM F1 Score: 0.3125


CatBoost

In [26]:
cat_features = [i for i, col in enumerate(X.columns) if col in categorical_cols]

catboost_model = CatBoostClassifier(
    iterations=200,
    learning_rate=0.1,
    depth=6,
    loss_function='Logloss',
    eval_metric='F1',
    random_seed=42,
    verbose=False
)

catboost_model.fit(
    X_train_balanced_unscaled, y_train_balanced_unscaled,
    eval_set=(X_test, y_test),
    early_stopping_rounds=50,
    verbose=False
)

<catboost.core.CatBoostClassifier at 0x1584572e0>

In [27]:
# predictions
cat_pred = catboost_model.predict(X_test)
cat_pred_proba = catboost_model.predict_proba(X_test)[:, 1]

In [28]:
results['CatBoost'] = {
    'accuracy': accuracy_score(y_test, cat_pred),
    'precision': precision_score(y_test, cat_pred),
    'recall': recall_score(y_test, cat_pred),
    'f1': f1_score(y_test, cat_pred),
    'roc_auc': roc_auc_score(y_test, cat_pred_proba),
    'predictions': cat_pred,
    'probabilities': cat_pred_proba,
    'model': catboost_model
}

In [29]:
print(f"CatBoost F1 Score: {results['CatBoost']['f1']:.4f}")

CatBoost F1 Score: 0.4522


Random Forest

In [30]:
rf_params = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

In [31]:
rf_model = RandomForestClassifier(random_state=42, n_jobs=-1)

rf_grid = GridSearchCV(
    rf_model,
    {'n_estimators': [100, 200], 'max_depth': [10, 20], 'min_samples_split': [2, 5]},
    cv=5,
    scoring='f1',
    n_jobs=-1,
    verbose=0
)

In [32]:
rf_grid.fit(X_train_balanced, y_train_balanced)
rf_best = rf_grid.best_estimator_

In [33]:
# predictions
rf_pred = rf_best.predict(X_test_scaled)
rf_pred_proba = rf_best.predict_proba(X_test_scaled)[:, 1]

In [34]:
results['RandomForest'] = {
    'accuracy': accuracy_score(y_test, rf_pred),
    'precision': precision_score(y_test, rf_pred),
    'recall': recall_score(y_test, rf_pred),
    'f1': f1_score(y_test, rf_pred),
    'roc_auc': roc_auc_score(y_test, rf_pred_proba),
    'predictions': rf_pred,
    'probabilities': rf_pred_proba,
    'model': rf_best
}

print(f"Random Forest Best Parameters: {rf_grid.best_params_}")
print(f"Random Forest F1 Score: {results['RandomForest']['f1']:.4f}")

Random Forest Best Parameters: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 200}
Random Forest F1 Score: 0.3768


<br/>

### **Model Comparison**

In [35]:
# create comparison dataframe
comparison_df = pd.DataFrame({
    model: {
        'Accuracy': results[model]['accuracy'],
        'Precision': results[model]['precision'],
        'Recall': results[model]['recall'],
        'F1-Score': results[model]['f1'],
        'ROC-AUC': results[model]['roc_auc']
    }
    for model in results.keys()
}).T

In [36]:
print("\nModel Performance Metrics:")
print(comparison_df.round(4))


Model Performance Metrics:
              Accuracy  Precision  Recall  F1-Score  ROC-AUC
XGBoost         0.8503     0.5600  0.2979    0.3889   0.8080
LightGBM        0.8503     0.5882  0.2128    0.3125   0.7940
CatBoost        0.7857     0.3824  0.5532    0.4522   0.7317
RandomForest    0.8537     0.5909  0.2766    0.3768   0.8054


In [37]:
best_model_name = comparison_df['F1-Score'].idxmax()
print(f"\n Best Model (by F1-Score): {best_model_name}")
print(f"   F1-Score: {comparison_df.loc[best_model_name, 'F1-Score']:.4f}")


 Best Model (by F1-Score): CatBoost
   F1-Score: 0.4522


Performance Comparison Bar Chart

In [38]:
fig = go.Figure()

metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC']
colors_list = ['#3873E5', '#00cc44', '#ffaa00', '#ff1e0e']

for i, model in enumerate(comparison_df.index):
    fig.add_trace(go.Bar(
        name=model,
        x=metrics,
        y=comparison_df.loc[model].values,
        marker_color=colors_list[i],
        text=comparison_df.loc[model].round(3).values,
        textposition='outside'
    ))

fig.update_layout(
    title='Model Performance Comparison',
    xaxis_title='Metrics',
    yaxis_title='Score',
    yaxis_range=[0, 1.1],
    barmode='group',
    height=500
)
fig.show()

<br/>

### **Save**

In [39]:
# check best model
best_model_name = comparison_df['F1-Score'].idxmax()
print(f"\nBest Model: {best_model_name}")

if best_model_name == 'CatBoost':
    catboost_model.save_model('models/best_model_catboost.cbm')
    model_metadata = {
        'model_type': 'CatBoost',
        'needs_scaling': False
    }
else:
    # save XGBoost, LightGBM, or RandomForest
    joblib.dump(results[best_model_name]['model'], f'models/best_model_{best_model_name.lower()}.pkl')
    joblib.dump(scaler, 'models/scaler.pkl')
    
    model_metadata = {
        'model_type': best_model_name,
        'needs_scaling': True
    }

# save all label encoders
joblib.dump(label_encoders, 'models/label_encoders.pkl')
joblib.dump(model_metadata, 'models/model_metadata.pkl')

print(f"Model saved successfully!")


Best Model: CatBoost
Model saved successfully!
