In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

In [2]:
df = pd.read_csv('./Human_Resources.csv')
df.head(5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [3]:
# dataframe size
df.shape

(1470, 35)

In [4]:
# column data types
df.dtypes

Age                          int64
Attrition                   object
BusinessTravel              object
DailyRate                    int64
Department                  object
DistanceFromHome             int64
Education                    int64
EducationField              object
EmployeeCount                int64
EmployeeNumber               int64
EnvironmentSatisfaction      int64
Gender                      object
HourlyRate                   int64
JobInvolvement               int64
JobLevel                     int64
JobRole                     object
JobSatisfaction              int64
MaritalStatus               object
MonthlyIncome                int64
MonthlyRate                  int64
NumCompaniesWorked           int64
Over18                      object
OverTime                    object
PercentSalaryHike            int64
PerformanceRating            int64
RelationshipSatisfaction     int64
StandardHours                int64
StockOptionLevel             int64
TotalWorkingYears   

In [5]:
# numerical and categorical columns
num_cols = df.columns[df.dtypes == 'int64']
cat_cols = df.columns[df.dtypes == 'object']

# checking if every column is included 
print(len(num_cols) + len(cat_cols) == len(df.columns))

True


In [6]:
cat_cols

Index(['Attrition', 'BusinessTravel', 'Department', 'EducationField', 'Gender',
       'JobRole', 'MaritalStatus', 'Over18', 'OverTime'],
      dtype='object')

In [7]:
num_cols

Index(['Age', 'DailyRate', 'DistanceFromHome', 'Education', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome',
       'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike',
       'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')

In [8]:
single_value_columns = []
for col in df.columns:
    if len(df[col].unique()) == 1:
        single_value_columns.append(col)

print(single_value_columns)


['EmployeeCount', 'Over18', 'StandardHours']


If one of your column in the dataset is having the same values, you can drop this column as it will not do any help to your model to differentiate between two different labels while on the other hand, it can even negatively affect your model by creating a bias in the data. <br>
Also EmployeeNumber is just unique ID of teh employee,so it cn also be dropped.

In [9]:
df = df.drop(columns=['EmployeeCount', 'Over18', 'StandardHours','EmployeeNumber'])
df.shape

(1470, 31)

In [10]:
num_cols = num_cols.difference(['EmployeeCount', 'EmployeeNumber', 'StandardHours'])
cat_cols = cat_cols.difference(['Over18'])
num_cols,cat_cols

(Index(['Age', 'DailyRate', 'DistanceFromHome', 'Education',
        'EnvironmentSatisfaction', 'HourlyRate', 'JobInvolvement', 'JobLevel',
        'JobSatisfaction', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
        'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction',
        'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
        'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
        'YearsSinceLastPromotion', 'YearsWithCurrManager'],
       dtype='object'),
 Index(['Attrition', 'BusinessTravel', 'Department', 'EducationField', 'Gender',
        'JobRole', 'MaritalStatus', 'OverTime'],
       dtype='object'))

Encoding:
- One Hot: Department, MaritalStatus, Job Role,Gender, OverTime, EducationField
- Ordinal: BusinessTravel

So we will do data transformations two ways when it comes to encoding: 
1. For general classification Models
2. For tree based models

In [11]:
# 2. Checking Missing Values
df.isnull().sum().sum()

0

So there are no missing values. <br>
Now we will be using non-tree based models like Logistic Regression and SVC, then tree based models like Random forest, XGBoost and Catboost. Hence the encoding and scaling will be different. <br>
Now we will start by splittng the data into train and test sets, with a 80:20 ratio. The train set will also be used for doing cross valaidation to tune hyperparameters and get the best model.
And then a final result on the test set.

In [12]:
# Making the input data matrix and target vector
X = df.drop('Attrition',axis=1)
y = df['Attrition'].map({
    'Yes': 1,
    'No': 0
})
X.shape, y.shape

((1470, 30), (1470,))

In [13]:
# Creating the train and test dataset
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(
    X,y,test_size=0.2,stratify=y,random_state=42
)

In [14]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((1176, 30), (294, 30), (1176,), (294,))

In [15]:
X_train

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,HourlyRate,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
1194,47,Travel_Rarely,1225,Sales,2,4,Life Sciences,2,Female,47,...,3,3,3,29,2,3,3,2,1,2
128,22,Travel_Rarely,594,Research & Development,2,1,Technical Degree,3,Male,100,...,3,3,1,3,2,3,2,1,2,1
810,46,Travel_Rarely,406,Sales,3,1,Marketing,1,Male,52,...,3,4,1,23,3,3,12,9,4,9
478,25,Travel_Rarely,622,Sales,13,1,Medical,2,Male,40,...,3,3,0,7,1,3,7,4,0,6
491,43,Travel_Frequently,1001,Research & Development,9,5,Medical,4,Male,72,...,3,2,1,10,3,3,8,7,4,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1213,23,Travel_Rarely,427,Sales,7,3,Life Sciences,3,Male,99,...,4,2,1,3,2,3,3,2,0,2
963,38,Travel_Rarely,1009,Sales,2,2,Life Sciences,2,Female,31,...,3,4,1,11,3,3,7,7,1,7
734,22,Travel_Rarely,217,Research & Development,8,1,Life Sciences,2,Male,94,...,3,1,1,4,3,2,4,3,1,1
1315,36,Travel_Rarely,430,Research & Development,2,4,Other,4,Female,73,...,4,4,1,15,2,3,1,0,0,0


In [17]:
y_train

1194    0
128     0
810     0
478     0
491     0
       ..
1213    1
963     0
734     0
1315    0
1292    0
Name: Attrition, Length: 1176, dtype: int64

In [18]:
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,StandardScaler

In [19]:
ohe_cols = ['Department','EducationField','JobRole','MaritalStatus','Gender','OverTime']
ord_cols = ['BusinessTravel']
std_scl_cols = num_cols

In [20]:
ordinal_encoder = OrdinalEncoder(categories=[['Non-Travel','Travel_Rarely', 'Travel_Frequently']])
X_train[ord_cols] = ordinal_encoder.fit_transform(X_train[ord_cols])
X_test[ord_cols] = ordinal_encoder.transform(X_test[ord_cols])
X_train[ord_cols], X_test[ord_cols]

(      BusinessTravel
 1194             1.0
 128              1.0
 810              1.0
 478              1.0
 491              2.0
 ...              ...
 1213             1.0
 963              1.0
 734              1.0
 1315             1.0
 1292             2.0
 
 [1176 rows x 1 columns],
       BusinessTravel
 1061             0.0
 891              1.0
 456              1.0
 922              1.0
 69               1.0
 ...              ...
 1269             1.0
 1352             1.0
 1236             1.0
 1023             1.0
 285              1.0
 
 [294 rows x 1 columns])

In [21]:
ohe = OneHotEncoder(sparse_output=False,handle_unknown='ignore')
ohe_train_array = ohe.fit_transform(X_train[ohe_cols])
# Transform X_test using the already-fitted encoder
ohe_test_array = ohe.transform(X_test[ohe_cols])

In [22]:
feature_names = ohe.get_feature_names_out(ohe_cols)
print("Feature names:", feature_names)

Feature names: ['Department_Human Resources' 'Department_Research & Development'
 'Department_Sales' 'EducationField_Human Resources'
 'EducationField_Life Sciences' 'EducationField_Marketing'
 'EducationField_Medical' 'EducationField_Other'
 'EducationField_Technical Degree' 'JobRole_Healthcare Representative'
 'JobRole_Human Resources' 'JobRole_Laboratory Technician'
 'JobRole_Manager' 'JobRole_Manufacturing Director'
 'JobRole_Research Director' 'JobRole_Research Scientist'
 'JobRole_Sales Executive' 'JobRole_Sales Representative'
 'MaritalStatus_Divorced' 'MaritalStatus_Married' 'MaritalStatus_Single'
 'Gender_Female' 'Gender_Male' 'OverTime_No' 'OverTime_Yes']


In [23]:
ohe_train_df = pd.DataFrame(ohe_train_array,columns=feature_names,index=X_train.index)
ohe_test_df = pd.DataFrame(ohe_test_array,columns=feature_names,index=X_test.index)


In [24]:
ohe_train_df.head(5)

Unnamed: 0,Department_Human Resources,Department_Research & Development,Department_Sales,EducationField_Human Resources,EducationField_Life Sciences,EducationField_Marketing,EducationField_Medical,EducationField_Other,EducationField_Technical Degree,JobRole_Healthcare Representative,...,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,Gender_Female,Gender_Male,OverTime_No,OverTime_Yes
1194,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
128,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
810,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
478,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
491,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0


In [25]:
ohe_test_df.head(5)

Unnamed: 0,Department_Human Resources,Department_Research & Development,Department_Sales,EducationField_Human Resources,EducationField_Life Sciences,EducationField_Marketing,EducationField_Medical,EducationField_Other,EducationField_Technical Degree,JobRole_Healthcare Representative,...,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,Gender_Female,Gender_Male,OverTime_No,OverTime_Yes
1061,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
891,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
456,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
922,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
69,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0


In [26]:
X_train = pd.concat([X_train.drop(columns=ohe_cols),ohe_train_df],axis = 1)
X_test = pd.concat([X_test.drop(columns=ohe_cols),ohe_test_df],axis = 1)
X_train.columns, X_test.columns

(Index(['Age', 'BusinessTravel', 'DailyRate', 'DistanceFromHome', 'Education',
        'EnvironmentSatisfaction', 'HourlyRate', 'JobInvolvement', 'JobLevel',
        'JobSatisfaction', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
        'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction',
        'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
        'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
        'YearsSinceLastPromotion', 'YearsWithCurrManager',
        'Department_Human Resources', 'Department_Research & Development',
        'Department_Sales', 'EducationField_Human Resources',
        'EducationField_Life Sciences', 'EducationField_Marketing',
        'EducationField_Medical', 'EducationField_Other',
        'EducationField_Technical Degree', 'JobRole_Healthcare Representative',
        'JobRole_Human Resources', 'JobRole_Laboratory Technician',
        'JobRole_Manager', 'JobRole_Manufacturing Director',
        'Jo

In [28]:
# Scaling the numerical columns/features
scaler = StandardScaler()
X_train[std_scl_cols] = scaler.fit_transform(X_train[std_scl_cols])
X_test[std_scl_cols] = scaler.transform(X_test[std_scl_cols])

In [29]:
X_train[std_scl_cols]

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
1194,1.090194,1.049455,-0.899915,1.064209,-0.658710,-0.908436,1.795282,1.762189,-0.647997,2.026752,...,-0.432065,0.240218,2.613100,2.261482,-0.605389,0.337621,-0.665706,-0.625365,-0.368024,-0.616406
128,-1.634828,-0.523449,-0.899915,-1.855332,0.260202,1.694111,0.373564,-0.986265,1.153526,-0.864408,...,-0.432065,0.240218,0.247430,-1.072675,-0.605389,0.337621,-0.830071,-0.905635,-0.056884,-0.897047
810,0.981193,-0.992080,-0.777610,-1.855332,-1.577622,-0.662913,0.373564,1.762189,0.252765,2.347706,...,-0.432065,1.160403,0.247430,1.492061,0.190962,0.337621,0.813578,1.336527,0.565398,1.348076
478,-1.307825,-0.453653,0.445433,-1.855332,-0.658710,-1.252169,0.373564,-0.986265,0.252765,-0.956202,...,-0.432065,0.240218,-0.935405,-0.559727,-1.401740,0.337621,-0.008246,-0.064824,-0.679165,0.506155
491,0.654191,0.491086,-0.043784,2.037390,1.179114,0.319180,0.373564,-0.070114,0.252765,-0.185956,...,-0.432065,-0.679966,0.247430,-0.175017,0.190962,0.337621,0.156119,0.775986,0.565398,0.786795
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1213,-1.525827,-0.939733,-0.288393,0.091029,0.260202,1.645007,0.373564,-0.986265,1.153526,-0.917721,...,2.314467,-0.679966,0.247430,-1.072675,-0.605389,0.337621,-0.665706,-0.625365,-0.679165,-0.616406
963,0.109186,0.511028,-0.899915,-0.882152,-0.658710,-1.694111,0.373564,-0.070114,-1.548758,0.075020,...,-0.432065,1.160403,0.247430,-0.046780,0.190962,0.337621,-0.008246,0.775986,-0.368024,0.786795
734,-1.634828,-1.463203,-0.166088,-1.855332,-0.658710,1.399483,-2.469873,-0.986265,-1.548758,-0.879886,...,-0.432065,-1.600151,0.247430,-0.944438,0.190962,-1.055510,-0.501341,-0.345095,-0.368024,-0.897047
1315,-0.108815,-0.932255,-0.899915,1.064209,1.179114,0.368285,0.373564,-0.070114,-0.647997,0.089853,...,2.314467,1.160403,0.247430,0.466167,-0.605389,0.337621,-0.994436,-1.185905,-0.679165,-1.177687


In [30]:
X_train.shape,y_train.shape

((1176, 49), (1176,))

# Model Training

In [40]:
from sklearn.model_selection import GridSearchCV, cross_val_predict, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import make_scorer, classification_report,recall_score, precision_score, f1_score, accuracy_score

In [41]:
# For attrition detection, recall for class 1 (quitters) is most important
# This custom scorer focuses on recall for the positive class (employees who quit)
scoring = {
    'recall' : make_scorer(recall_score,pos_label = 1),
    'precision': make_scorer(precision_score, pos_label=1),
    'f1': make_scorer(f1_score, pos_label=1),
    'accuracy': make_scorer(accuracy_score)
}

In [67]:
log_reg = LogisticRegression(max_iter=1000,penalty='elasticnet',solver='saga',random_state=42)

In [68]:
# Stratified K-Fold Cross Validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [69]:
# Define grid
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'l1_ratio': [0.1, 0.5, 0.9],
    'class_weight': [None, 'balanced']
}

In [70]:
grid_search = GridSearchCV(
    estimator= log_reg,
    param_grid=param_grid,
    scoring=scoring,
    refit='recall',    
    cv = cv,
    n_jobs=-1,
    verbose=3,
    return_train_score= True
)

In [71]:
# Fit on training data (X_train, y_train)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [72]:
# Best parameters and score
print( "Best Params:", grid_search.best_params_)
print("Best Recall Score:", grid_search.best_score_)

Best Params: {'C': 0.01, 'class_weight': 'balanced', 'l1_ratio': 0.1}
Best Recall Score: 0.7526315789473685


So when you use refit = 'recall', it uses recall from the scoring dictionary and what it does is,that after it gets doen with teh cross validaton loop,it trains on teh entire X_train with the modle taht had the best recall value. So you don't need to retarin it,it can be directly used for prediction.

In [73]:
y_pred = grid_search.best_estimator_.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.94      0.73      0.82       247
           1       0.35      0.77      0.48        47

    accuracy                           0.74       294
   macro avg       0.65      0.75      0.65       294
weighted avg       0.85      0.74      0.77       294



In [77]:
svc = SVC(probability=False, random_state=42)

In [78]:
svc_param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto'],
    'class_weight': [None, 'balanced']
}

grid_search_svc = GridSearchCV(
    estimator=svc,
    param_grid=svc_param_grid,
    scoring=scoring,  # or use make_scorer if using custom scoring
    cv=cv,
    verbose=3,
    n_jobs=-1,
    refit='recall'
)


In [79]:
grid_search_svc.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [80]:
# Best parameters and score
print( "Best Params:", grid_search_svc.best_params_)
print("Best Recall Score:", grid_search_svc.best_score_)

Best Params: {'C': 0.1, 'class_weight': 'balanced', 'gamma': 'scale', 'kernel': 'linear'}
Best Recall Score: 0.7315789473684211


In [81]:
y_pred = grid_search_svc.best_estimator_.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.92      0.80      0.86       247
           1       0.37      0.62      0.46        47

    accuracy                           0.77       294
   macro avg       0.64      0.71      0.66       294
weighted avg       0.83      0.77      0.79       294



precision: 	“When I predict positive, am I right?” 
recall : “Did I catch all actual positives?”

In [82]:
rf_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2'],
    'class_weight': [None, 'balanced']
}


In [85]:
gb_param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'subsample': [0.8, 1.0],
    'max_features': ['sqrt', 'log2']
}


In [83]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [84]:
rf_model = RandomForestClassifier(random_state=42)
gb_model = GradientBoostingClassifier(random_state=42)

In [86]:
grid_rf = GridSearchCV(
    estimator=rf_model,
    param_grid=rf_param_grid,
    scoring=scoring,  # or use make_scorer()
    cv=cv,
    n_jobs=-1,
    verbose=3,
    refit='recall'
)

grid_gb = GridSearchCV(
    estimator=gb_model,
    param_grid=gb_param_grid,
    scoring=scoring,
    cv=cv,
    n_jobs=-1,
    verbose=3,
    refit='recall'
)

In [87]:
grid_rf.fit(X_train,y_train)
# Best parameters and score
print( "Best Params:", grid_rf.best_params_)
print("Best Recall Score:", grid_rf.best_score_)

Fitting 5 folds for each of 96 candidates, totalling 480 fits
Best Params: {'class_weight': 'balanced', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}
Best Recall Score: 0.3


In [88]:
grid_gb.fit(X_train,y_train)
# Best parameters and score
print( "Best Params:", grid_gb.best_params_)
print("Best Recall Score:", grid_gb.best_score_)

Fitting 5 folds for each of 128 candidates, totalling 640 fits
Best Params: {'learning_rate': 0.1, 'max_depth': 3, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200, 'subsample': 0.8}
Best Recall Score: 0.4052631578947368


In [89]:
y_pred = grid_rf.best_estimator_.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.86      0.96      0.91       247
           1       0.48      0.21      0.29        47

    accuracy                           0.84       294
   macro avg       0.67      0.58      0.60       294
weighted avg       0.80      0.84      0.81       294



In [90]:
y_pred = grid_gb.best_estimator_.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.88      0.98      0.92       247
           1       0.68      0.28      0.39        47

    accuracy                           0.86       294
   macro avg       0.78      0.63      0.66       294
weighted avg       0.85      0.86      0.84       294



In [91]:
from imblearn.over_sampling import SMOTE

In [92]:
# Step 2: Apply SMOTE
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("Before SMOTE:", y_train.value_counts().to_dict())
print("After SMOTE:", y_train_res.value_counts().to_dict())

Before SMOTE: {0: 986, 1: 190}
After SMOTE: {0: 986, 1: 986}


In [94]:
grid_rf.fit(X_train_res,y_train_res)
# Best parameters and score
print( "Best Params:", grid_rf.best_params_)
print("Best Recall Score:", grid_rf.best_score_)

Fitting 5 folds for each of 96 candidates, totalling 480 fits
Best Params: {'class_weight': None, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best Recall Score: 0.9066861508485873


In [95]:
y_pred = grid_rf.best_estimator_.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.87      0.95      0.91       247
           1       0.46      0.23      0.31        47

    accuracy                           0.83       294
   macro avg       0.66      0.59      0.61       294
weighted avg       0.80      0.83      0.81       294



In [97]:
grid_gb.fit(X_train_res,y_train_res)
# Best parameters and score
print( "Best Params:", grid_gb.best_params_)
print("Best Recall Score:", grid_gb.best_score_)

Fitting 5 folds for each of 128 candidates, totalling 640 fits
Best Params: {'learning_rate': 0.1, 'max_depth': 3, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100, 'subsample': 0.8}
Best Recall Score: 0.9097267087114803


In [98]:
y_pred = grid_gb.best_estimator_.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.88      0.95      0.92       247
           1       0.57      0.34      0.43        47

    accuracy                           0.85       294
   macro avg       0.73      0.65      0.67       294
weighted avg       0.83      0.85      0.84       294



Using the newdatset for 

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Step 1: Train-test split (you already have this)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Step 2: Apply SMOTE
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("Before SMOTE:", y_train.value_counts().to_dict())
print("After SMOTE:", y_train_res.value_counts().to_dict())

# Step 3: Train a classifier on resampled data
clf = RandomForestClassifier(random_state=42, class_weight='balanced')  # optional class_weight

clf.fit(X_train_res, y_train_res)

# Step 4: Evaluate on original test set
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))


In [None]:
gb_param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'subsample': [0.8, 1.0],
    'max_features': ['sqrt', 'log2']
}


In [None]:
models_params = {
    'Logistic Regression': (LogisticRegression(max_iter=1000,random_state=42), {
        'penalty': ['l1', 'l2'],
        'C': [0.01, 0.1, 1, 10],
        'solver': ['liblinear']
    }),
    'KNN': (KNeighborsClassifier(), {
        'n_neighbors': [3, 5, 7],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    }),
    'SVC': (SVC(), {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto']
    }),
    'LDA': (
    LinearDiscriminantAnalysis(),
    [
        {'solver': ['svd']},  # no shrinkage
        {'solver': ['lsqr'], 'shrinkage': [None, 'auto']}  # shrinkage valid here
    ]
)
}

In [None]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
best_models = {}
results_summary = []

In [None]:
# for name, (model, params) in models_params.items():
#     print(f"Training {name}...")
    
#     # Grid search with recall scoring (for detecting quitters)
#     grid = GridSearchCV(model, params, cv=cv, scoring=scoring, n_jobs=-1)
#     grid.fit(X_train, y_train)
#     best_models[name] = grid
    
#     # Get predictions on test set
#     y_pred = grid.predict(X_test)
    
#     # Generate classification report
#     report = classification_report(y_test, y_pred, output_dict=True)
    
#     print(f"\n{name} Results:")
#     print(f"Best CV Recall Score (Class 1): {grid.best_score_:.4f}")
#     print(f"Best Params: {grid.best_params_}")
#     print("\nClassification Report:")
#     print(classification_report(y_test, y_pred))
    
#     # Extract key metrics for comparison
#     recall_class_1 = report['1']['recall']  # This is what you want to maximize
#     precision_class_1 = report['1']['precision']
#     f1_class_1 = report['1']['f1-score']
    
#     results_summary.append({
#         'Model': name,
#         'Recall_Class_1_Quitters': recall_class_1,
#         'Precision_Class_1_Quitters': precision_class_1,
#         'F1_Class_1_Quitters': f1_class_1,
#         'Overall_Accuracy': report['accuracy']
#     })
    
#     print(f"Key Metrics for Attrition Detection:")
#     print(f"- Recall for Class 1 (Quitters): {recall_class_1:.3f}")
#     print(f"- Precision for Class 1 (Quitters): {precision_class_1:.3f}")
#     print(f"- F1 for Class 1 (Quitters): {f1_class_1:.3f}")
#     print("-" * 60)

# # Summary comparison
# print("\nSUMMARY - Model Comparison for Employee Attrition:")
# df_results = pd.DataFrame(results_summary)
# df_results = df_results.sort_values('Recall_Class_1_Quitters', ascending=False)
# print(df_results.round(3))

# print(f"\nBest Model for Attrition Detection: {df_results.iloc[0]['Model']}")
# print(f"Best Recall for detecting quitters: {df_results.iloc[0]['Recall_Class_1_Quitters']:.3f}")

In [None]:
# for name, (model, params) in models_params.items():
#     print(f"\nTraining {name}...")
    
#     # Grid search with 5-fold CV on TRAINING data only
#     grid = GridSearchCV(model, params, cv=cv, scoring=scoring, n_jobs=-1)
#     grid.fit(X_train, y_train)
#     best_models[name] = grid
    
#     # Get cross-validation predictions on TRAINING data
#     # This gives us unbiased estimates without touching test data
#     cv_pred = cross_val_predict(grid.best_estimator_, X_train, y_train, cv=5)
    
#     # Classification report based on CV predictions (TRAINING DATA)
#     cv_report = classification_report(y_train, cv_pred, output_dict=True)
    
#     print(f"Best CV Recall Score (Class 1): {grid.best_score_:.4f}")
#     print(f"Best Params: {grid.best_params_}")
#     print("\nCross-Validation Classification Report (Training Data):")
#     print(classification_report(y_train, cv_pred))
    
#     # Extract key metrics for model selection
#     recall_class_1 = cv_report['1']['recall']
#     precision_class_1 = cv_report['1']['precision'] 
#     f1_class_1 = cv_report['1']['f1-score']
    
#     cv_results_summary.append({
#         'Model': name,
#         'CV_Recall_Class_1': recall_class_1,
#         'CV_Precision_Class_1': precision_class_1,
#         'CV_F1_Class_1': f1_class_1,
#         'CV_Overall_Accuracy': cv_report['accuracy'],
#         'GridSearch_Best_Score': grid.best_score_
#     })
    
#     print(f"Cross-Validation Metrics for Attrition Detection:")
#     print(f"- CV Recall for Class 1 (Quitters): {recall_class_1:.3f}")
#     print(f"- CV Precision for Class 1 (Quitters): {precision_class_1:.3f}")
#     print(f"- CV F1 for Class 1 (Quitters): {f1_class_1:.3f}")
#     print("-" * 50)

# # Model comparison and selection based on TRAINING data performance
# print("\n" + "="*60)
# print("MODEL SELECTION SUMMARY (Based on Training Data CV)")
# print("="*60)
# df_cv_results = pd.DataFrame(cv_results_summary)
# df_cv_results = df_cv_results.sort_values('CV_Recall_Class_1', ascending=False)
# print(df_cv_results.round(3))

# # Select best model
# best_model_name = df_cv_results.iloc[0]['Model']
# best_model = best_models[best_model_name]

# print(f"\nSELECTED BEST MODEL: {best_model_name}")
# print(f"Expected Recall for detecting quitters: {df_cv_results.iloc[0]['CV_Recall_Class_1']:.3f}")

# print("\n" + "="*60)
# print("STEP 2: FINAL EVALUATION ON TEST DATA (ONCE ONLY)")
# print("="*60)

# # STEP 2: FINAL EVALUATION ON TEST DATA (ONLY ONCE!)
# print(f"\nEvaluating selected model ({best_model_name}) on unseen test data...")

# # Final predictions on test data
# final_predictions = best_model.predict(X_test)

# # Final classification report
# print(f"\nFINAL TEST RESULTS for {best_model_name}:")
# print("="*40)
# print(classification_report(y_test, final_predictions))

# # Extract final test metrics
# final_report = classification_report(y_test, final_predictions, output_dict=True)
# final_recall_class_1 = final_report['1']['recall']
# final_precision_class_1 = final_report['1']['precision']
# final_f1_class_1 = final_report['1']['f1-score']

# print(f"\nFINAL PERFORMANCE SUMMARY:")
# print(f"- Test Recall for Class 1 (Quitters): {final_recall_class_1:.3f}")
# print(f"- Test Precision for Class 1 (Quitters): {final_precision_class_1:.3f}")
# print(f"- Test F1 for Class 1 (Quitters): {final_f1_class_1:.3f}")

# print(f"\nModel Generalization Check:")
# expected_recall = df_cv_results.iloc[0]['CV_Recall_Class_1']
# print(f"- Expected Recall (CV): {expected_recall:.3f}")
# print(f"- Actual Test Recall: {final_recall_class_1:.3f}")
# print(f"- Difference: {abs(expected_recall - final_recall_class_1):.3f}")

# if abs(expected_recall - final_recall_class_1) < 0.05:
#     print("✓ Model generalizes well!")
# else:
#     print("⚠ Model may be overfitting - consider more regularization")

# print("\n" + "="*60)
# print("DEPLOYMENT READY MODEL")
# print("="*60)
# print(f"Selected Model: {best_model_name}")
# print(f"Model Object: best_models['{best_model_name}']")
# print("This model is now ready for production deployment.")

In [None]:

for name, (model, params) in models_params.items():
    print(f"\n{name}:")
    print("-" * 30)
    
    # GridSearchCV with 5-fold CV, optimizing for recall of class 1
    grid = GridSearchCV(model, params, cv=cv, scoring=scoring, n_jobs=-1)
    grid.fit(X_train, y_train)
    best_models[name] = grid
    
    # Get predictions on training data using the best model from grid search
    best_model = grid.best_estimator_
    y_train_pred = best_model.predict(X_train)
    
    # Classification report for this model (training data)
    print(f"Best CV Recall Score for Class 1 (Quitters): {grid.best_score_:.4f}")
    print(f"Best Parameters: {grid.best_params_}")
    print("\nClassification Report (Training Data):")
    print(classification_report(y_train, y_train_pred))
    
    # Extract key metrics for comparison
    train_report = classification_report(y_train, y_train_pred, output_dict=True)
    recall_class_1 = train_report['1']['recall']
    precision_class_1 = train_report['1']['precision']
    f1_class_1 = train_report['1']['f1-score']
    
    results_summary.append({
        'Model': name,
        'CV_Recall_Score': grid.best_score_,
        'Train_Recall_Class_1': recall_class_1,
        'Train_Precision_Class_1': precision_class_1,
        'Train_F1_Class_1': f1_class_1
    })

# Compare all models and select the best one
print("\n" + "="*70)
print("MODEL COMPARISON SUMMARY")
print("="*70)
df_results = pd.DataFrame(results_summary)
df_results = df_results.sort_values('CV_Recall_Score', ascending=False)
print(df_results.round(4))

# Select best model based on CV recall score
best_model_name = df_results.iloc[0]['Model']
best_model = best_models[best_model_name]

print(f"\nBEST MODEL SELECTED: {best_model_name}")
print(f"CV Recall for Class 1 (Quitters): {df_results.iloc[0]['CV_Recall_Score']:.4f}")

# STEP 2: Final evaluation on test data (ONCE ONLY)
print("\n" + "="*70)
print("STEP 2: Final Test Evaluation (ONCE ONLY - NO FURTHER CHANGES ALLOWED)")
print("="*70)

# Final predictions on test data
y_test_pred = best_model.predict(X_test)

print(f"\nFinal Test Results for {best_model_name}:")
print(classification_report(y_test, y_test_pred))

# Extract the key metric you care about
test_report = classification_report(y_test, y_test_pred, output_dict=True)
test_recall_class_1 = test_report['1']['recall']

print(f"\nKEY RESULT:")
print(f"Test Recall for Class 1 (Employees who quit): {test_recall_class_1:.4f}")
print(f"This means your model catches {test_recall_class_1*100:.1f}% of employees who actually quit.")

print(f"\nModel is ready for deployment: best_models['{best_model_name}']")

In [None]:
# # STEP 1: MODEL SELECTION AND EVALUATION (TRAINING DATA ONLY)
# # For attrition detection, recall for class 1 (quitters) is most important
# scoring = make_scorer(recall_score, pos_label=1)

# models_params = {
#     'Logistic Regression': (
#         LogisticRegression(max_iter=1000, random_state=42), {
#             'penalty': ['l1', 'l2'],
#             'C': [0.01, 0.1, 1, 10],
#             'solver': ['liblinear']
#         }
#     ),
#     'KNN': (
#         KNeighborsClassifier(), {
#             'n_neighbors': [3, 5, 7],
#             'weights': ['uniform', 'distance'],
#             'metric': ['euclidean', 'manhattan']
#         }
#     ),
#     'SVC': (
#         SVC(), {
#             'C': [0.1, 1, 10],
#             'kernel': ['linear', 'rbf'],
#             'gamma': ['scale', 'auto']
#         }
#     ),
#     'LDA': (
#         LinearDiscriminantAnalysis(), {
#             'solver': ['svd', 'lsqr'],
#             'shrinkage': [None, 'auto']
#         }
#     ),
#     'Naive Bayes': (
#         GaussianNB(), {
#             'var_smoothing': [1e-9, 1e-8, 1e-7]
#         }
#     )
# }

# print("="*60)
# print("STEP 1: MODEL SELECTION (USING ONLY TRAINING DATA)")
# print("="*60)

# best_models = {}
# cv_results_summary = []

# for name, (model, params) in models_params.items():
#     print(f"\nTraining {name}...")
    
#     # Grid search with 5-fold CV on TRAINING data only
#     grid = GridSearchCV(model, params, cv=5, scoring=scoring, n_jobs=-1)
#     grid.fit(X_train, y_train)
#     best_models[name] = grid
    
#     # Get cross-validation predictions on TRAINING data
#     # This gives us unbiased estimates without touching test data
#     cv_pred = cross_val_predict(grid.best_estimator_, X_train, y_train, cv=5)
    
#     # Classification report based on CV predictions (TRAINING DATA)
#     cv_report = classification_report(y_train, cv_pred, output_dict=True)
    
#     print(f"Best CV Recall Score (Class 1): {grid.best_score_:.4f}")
#     print(f"Best Params: {grid.best_params_}")
#     print("\nCross-Validation Classification Report (Training Data):")
#     print(classification_report(y_train, cv_pred))
    
#     # Extract key metrics for model selection
#     recall_class_1 = cv_report['1']['recall']
#     precision_class_1 = cv_report['1']['precision'] 
#     f1_class_1 = cv_report['1']['f1-score']
    
#     cv_results_summary.append({
#         'Model': name,
#         'CV_Recall_Class_1': recall_class_1,
#         'CV_Precision_Class_1': precision_class_1,
#         'CV_F1_Class_1': f1_class_1,
#         'CV_Overall_Accuracy': cv_report['accuracy'],
#         'GridSearch_Best_Score': grid.best_score_
#     })
    
#     print(f"Cross-Validation Metrics for Attrition Detection:")
#     print(f"- CV Recall for Class 1 (Quitters): {recall_class_1:.3f}")
#     print(f"- CV Precision for Class 1 (Quitters): {precision_class_1:.3f}")
#     print(f"- CV F1 for Class 1 (Quitters): {f1_class_1:.3f}")
#     print("-" * 50)

# # Model comparison and selection based on TRAINING data performance
# print("\n" + "="*60)
# print("MODEL SELECTION SUMMARY (Based on Training Data CV)")
# print("="*60)
# df_cv_results = pd.DataFrame(cv_results_summary)
# df_cv_results = df_cv_results.sort_values('CV_Recall_Class_1', ascending=False)
# print(df_cv_results.round(3))

# # Select best model
# best_model_name = df_cv_results.iloc[0]['Model']
# best_model = best_models[best_model_name]

# print(f"\nSELECTED BEST MODEL: {best_model_name}")
# print(f"Expected Recall for detecting quitters: {df_cv_results.iloc[0]['CV_Recall_Class_1']:.3f}")

# print("\n" + "="*60)
# print("STEP 2: FINAL EVALUATION ON TEST DATA (ONCE ONLY)")
# print("="*60)

# # STEP 2: FINAL EVALUATION ON TEST DATA (ONLY ONCE!)
# print(f"\nEvaluating selected model ({best_model_name}) on unseen test data...")

# # Final predictions on test data
# final_predictions = best_model.predict(X_test)

# # Final classification report
# print(f"\nFINAL TEST RESULTS for {best_model_name}:")
# print("="*40)
# print(classification_report(y_test, final_predictions))

# # Extract final test metrics
# final_report = classification_report(y_test, final_predictions, output_dict=True)
# final_recall_class_1 = final_report['1']['recall']
# final_precision_class_1 = final_report['1']['precision']
# final_f1_class_1 = final_report['1']['f1-score']

# print(f"\nFINAL PERFORMANCE SUMMARY:")
# print(f"- Test Recall for Class 1 (Quitters): {final_recall_class_1:.3f}")
# print(f"- Test Precision for Class 1 (Quitters): {final_precision_class_1:.3f}")
# print(f"- Test F1 for Class 1 (Quitters): {final_f1_class_1:.3f}")

# print(f"\nModel Generalization Check:")
# expected_recall = df_cv_results.iloc[0]['CV_Recall_Class_1']
# print(f"- Expected Recall (CV): {expected_recall:.3f}")
# print(f"- Actual Test Recall: {final_recall_class_1:.3f}")
# print(f"- Difference: {abs(expected_recall - final_recall_class_1):.3f}")

# if abs(expected_recall - final_recall_class_1) < 0.05:
#     print("✓ Model generalizes well!")
# else:
#     print("⚠ Model may be overfitting - consider more regularization")

# print("\n" + "="*60)
# print("DEPLOYMENT READY MODEL")
# print("="*60)
# print(f"Selected Model: {best_model_name}")
# print(f"Model Object: best_models['{best_model_name}']")
# print("This model is now ready for production deployment.")

In [None]:
for name, (model, params) in models_params.items():
    grid = GridSearchCV(model, params, cv=5, scoring=scoring, n_jobs=-1)
    grid.fit(X_train, y_train)
    best_models[name] = grid
    print(f"{name}: Best recall Score = {grid.best_score_:.4f}")
    print(f"Best Params: {grid.best_params_}\n")


In [None]:
X_train['BusinessTravel'].unique()

In [None]:
X_train['Department'].unique()

In [None]:
X_train['EducationField'].unique()

In [None]:
X_train['JobRole'].unique()

In [None]:
X_train['Gender'].unique()

In [None]:
X_train['MaritalStatus'].unique()

In [None]:
X_train['OverTime'].unique()

In [None]:
df['Employee']