In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

In [2]:
df = pd.read_csv('./Human Resources Data/Human_Resources.csv')
df.head(5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [3]:
# numerical and categorical columns
num_cols = df.columns[df.dtypes == 'int64']
cat_cols = df.columns[df.dtypes == 'object']

# checking if every column is included 
print(len(num_cols) + len(cat_cols) == len(df.columns))
cat_cols,num_cols

True


(Index(['Attrition', 'BusinessTravel', 'Department', 'EducationField', 'Gender',
        'JobRole', 'MaritalStatus', 'Over18', 'OverTime'],
       dtype='object'),
 Index(['Age', 'DailyRate', 'DistanceFromHome', 'Education', 'EmployeeCount',
        'EmployeeNumber', 'EnvironmentSatisfaction', 'HourlyRate',
        'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome',
        'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike',
        'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours',
        'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
        'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
        'YearsSinceLastPromotion', 'YearsWithCurrManager'],
       dtype='object'))

In [4]:
single_value_columns = []
for col in df.columns:
    if len(df[col].unique()) == 1:
        single_value_columns.append(col)

print(single_value_columns)


['EmployeeCount', 'Over18', 'StandardHours']


In [5]:
df = df.drop(columns=['EmployeeCount', 'Over18', 'StandardHours','EmployeeNumber'])
df.shape

(1470, 31)

In [6]:
num_cols = num_cols.difference(['EmployeeCount', 'EmployeeNumber', 'StandardHours'])
cat_cols = cat_cols.difference(['Over18'])
num_cols,cat_cols

(Index(['Age', 'DailyRate', 'DistanceFromHome', 'Education',
        'EnvironmentSatisfaction', 'HourlyRate', 'JobInvolvement', 'JobLevel',
        'JobSatisfaction', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
        'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction',
        'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
        'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
        'YearsSinceLastPromotion', 'YearsWithCurrManager'],
       dtype='object'),
 Index(['Attrition', 'BusinessTravel', 'Department', 'EducationField', 'Gender',
        'JobRole', 'MaritalStatus', 'OverTime'],
       dtype='object'))

In [7]:
# 2. Checking Missing Values
df.isnull().sum().sum()

0

In [8]:
# Making the input data matrix and target vector
X = df.drop('Attrition',axis=1)
y = df['Attrition'].map({
    'Yes': 1,
    'No': 0
})
X.shape, y.shape

((1470, 30), (1470,))

In [9]:
# Creating the train and test dataset
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(
    X,y,test_size=0.2,stratify=y,random_state=42
)

X_train.shape,X_test.shape,y_train.shape,y_test.shape

((1176, 30), (294, 30), (1176,), (294,))

In [10]:
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,StandardScaler,LabelEncoder

In [11]:
ord_cols = ['BusinessTravel']
ordinal_encoder = OrdinalEncoder(categories=[['Non-Travel','Travel_Rarely', 'Travel_Frequently']])
X_train[ord_cols] = ordinal_encoder.fit_transform(X_train[ord_cols])
X_test[ord_cols] = ordinal_encoder.transform(X_test[ord_cols])
X_train[ord_cols], X_test[ord_cols]

(      BusinessTravel
 1194             1.0
 128              1.0
 810              1.0
 478              1.0
 491              2.0
 ...              ...
 1213             1.0
 963              1.0
 734              1.0
 1315             1.0
 1292             2.0
 
 [1176 rows x 1 columns],
       BusinessTravel
 1061             0.0
 891              1.0
 456              1.0
 922              1.0
 69               1.0
 ...              ...
 1269             1.0
 1352             1.0
 1236             1.0
 1023             1.0
 285              1.0
 
 [294 rows x 1 columns])

In [12]:
nominal_cols = ['Department','EducationField','JobRole','MaritalStatus','Gender','OverTime']

In [13]:
nom_encoder = OrdinalEncoder()

In [14]:
X_train[nominal_cols] = nom_encoder.fit_transform(X_train[nominal_cols])
X_test[nominal_cols] = nom_encoder.transform(X_test[nominal_cols])
X_train[nominal_cols], X_test[nominal_cols]

(      Department  EducationField  JobRole  MaritalStatus  Gender  OverTime
 1194         2.0             1.0      3.0            0.0     0.0       0.0
 128          1.0             5.0      2.0            1.0     1.0       0.0
 810          2.0             2.0      3.0            1.0     1.0       0.0
 478          2.0             3.0      8.0            1.0     1.0       0.0
 491          1.0             3.0      2.0            0.0     1.0       1.0
 ...          ...             ...      ...            ...     ...       ...
 1213         2.0             1.0      8.0            0.0     1.0       1.0
 963          2.0             1.0      7.0            0.0     0.0       0.0
 734          1.0             1.0      2.0            1.0     1.0       0.0
 1315         1.0             4.0      6.0            1.0     0.0       1.0
 1292         2.0             1.0      7.0            0.0     1.0       0.0
 
 [1176 rows x 6 columns],
       Department  EducationField  JobRole  MaritalStatus  G

So we have done ordinal encoding for both nominal and ordianl columns, there will be no scaling , and we will test teh forest based models here.

In [15]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, classification_report,recall_score, precision_score, f1_score, accuracy_score

In [16]:
# For attrition detection, recall for class 1 (quitters) is most important
# This custom scorer focuses on recall for the positive class (employees who quit)
scoring = {
    'recall' : make_scorer(recall_score,pos_label = 1),
    'precision': make_scorer(precision_score, pos_label=1),
    'f1': make_scorer(f1_score, pos_label=1),
    'accuracy': make_scorer(accuracy_score)
}

In [17]:
# Stratified K-Fold Cross Validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [18]:
rf_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2'],
    'class_weight': [None, 'balanced']
}

In [19]:
gb_param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'subsample': [0.8, 1.0],
    'max_features': ['sqrt', 'log2']
}

In [20]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [21]:
rf_model = RandomForestClassifier(random_state=42)
gb_model = GradientBoostingClassifier(random_state=42)

In [22]:
grid_rf = GridSearchCV(
    estimator=rf_model,
    param_grid=rf_param_grid,
    scoring=scoring,  # or use make_scorer()
    cv=cv,
    n_jobs=-1,
    verbose=2,
    refit='recall'
)

grid_gb = GridSearchCV(
    estimator=gb_model,
    param_grid=gb_param_grid,
    scoring=scoring,
    cv=cv,
    n_jobs=-1,
    verbose=1,
    refit='recall'
)

In [None]:
hfeef

In [23]:
grid_rf.fit(X_train,y_train)
# Best parameters and score
print( "Best Params:", grid_rf.best_params_)
print("Best Recall Score:", grid_rf.best_score_)

Fitting 5 folds for each of 96 candidates, totalling 480 fits
Best Params: {'class_weight': 'balanced', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}
Best Recall Score: 0.25263157894736843


In [24]:
grid_gb.fit(X_train,y_train)
# Best parameters and score
print( "Best Params:", grid_gb.best_params_)
print("Best Recall Score:", grid_gb.best_score_)

Fitting 5 folds for each of 128 candidates, totalling 640 fits
Best Params: {'learning_rate': 0.1, 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200, 'subsample': 0.8}
Best Recall Score: 0.4052631578947368


In [26]:
results_df = pd.DataFrame(grid_gb.cv_results_)
results_df  # recall-based scores

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,param_n_estimators,...,std_test_f1,rank_test_f1,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,split3_test_accuracy,split4_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy
0,0.606908,0.028619,0.035021,0.013682,0.01,3,sqrt,1,2,100,...,0.044724,114,0.838983,0.838298,0.846809,0.842553,0.846809,0.842690,0.003660,114
1,0.578538,0.059680,0.032049,0.004104,0.01,3,sqrt,1,2,100,...,0.037559,121,0.838983,0.838298,0.842553,0.842553,0.846809,0.841839,0.003047,121
2,1.183721,0.038983,0.033188,0.002338,0.01,3,sqrt,1,2,200,...,0.110979,87,0.847458,0.851064,0.859574,0.876596,0.859574,0.858853,0.010064,84
3,1.136354,0.029502,0.031769,0.005089,0.01,3,sqrt,1,2,200,...,0.112862,77,0.847458,0.851064,0.868085,0.876596,0.855319,0.859704,0.010949,76
4,0.579713,0.017297,0.035707,0.003282,0.01,3,sqrt,1,5,100,...,0.044724,114,0.838983,0.838298,0.846809,0.842553,0.846809,0.842690,0.003660,114
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,2.314678,0.136342,0.039164,0.004368,0.10,5,log2,2,2,200,...,0.087994,62,0.860169,0.859574,0.880851,0.880851,0.855319,0.867353,0.011147,56
124,1.147446,0.124924,0.038990,0.004394,0.10,5,log2,2,5,100,...,0.070008,50,0.881356,0.851064,0.876596,0.876596,0.851064,0.867335,0.013399,60
125,1.160317,0.067436,0.032982,0.008017,0.10,5,log2,2,5,100,...,0.075692,38,0.881356,0.863830,0.876596,0.876596,0.876596,0.874995,0.005879,28
126,2.141488,0.101892,0.039126,0.006188,0.10,5,log2,2,5,200,...,0.069974,37,0.877119,0.855319,0.889362,0.872340,0.855319,0.869892,0.013130,51


In [28]:
results_df.columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_learning_rate', 'param_max_depth', 'param_max_features',
       'param_min_samples_leaf', 'param_min_samples_split',
       'param_n_estimators', 'param_subsample', 'params', 'split0_test_recall',
       'split1_test_recall', 'split2_test_recall', 'split3_test_recall',
       'split4_test_recall', 'mean_test_recall', 'std_test_recall',
       'rank_test_recall', 'split0_test_precision', 'split1_test_precision',
       'split2_test_precision', 'split3_test_precision',
       'split4_test_precision', 'mean_test_precision', 'std_test_precision',
       'rank_test_precision', 'split0_test_f1', 'split1_test_f1',
       'split2_test_f1', 'split3_test_f1', 'split4_test_f1', 'mean_test_f1',
       'std_test_f1', 'rank_test_f1', 'split0_test_accuracy',
       'split1_test_accuracy', 'split2_test_accuracy', 'split3_test_accuracy',
       'split4_test_accuracy', 'mean_test_accuracy', 'std_test_accuracy',
  

In [37]:
cond = (results_df['param_learning_rate'] == 0.1) & 
        (results_df['param_max_depth'] == 3) &
        (results_df['param_max_features'] == 'sqrt') &
        (results_df['param_min_samples_leaf'] == 1) &
        (results_df['param_min_samples_split'] == 5) &
        (results_df['param_n_estimators'] == 200) &
        (results_df['param_subsample'] == 0.8) 

SyntaxError: invalid syntax (2413426759.py, line 1)

In [38]:
results_df[(results_df['param_learning_rate'] == 0.1) & 
        (results_df['param_max_depth'] == 3) &
        (results_df['param_max_features'] == 'sqrt') &
        (results_df['param_min_samples_leaf'] == 1) &
        (results_df['param_min_samples_split'] == 5) &
        (results_df['param_n_estimators'] == 200) &
        (results_df['param_subsample'] == 0.8) ]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,param_n_estimators,...,std_test_f1,rank_test_f1,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,split3_test_accuracy,split4_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy
70,1.249129,0.067759,0.041023,0.004853,0.1,3,sqrt,1,5,200,...,0.08524,1,0.902542,0.855319,0.893617,0.868085,0.876596,0.879232,0.017049,9


In [39]:
results_df[results_df['rank_test_recall'] == 1]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,param_n_estimators,...,std_test_f1,rank_test_f1,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,split3_test_accuracy,split4_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy
70,1.249129,0.067759,0.041023,0.004853,0.1,3,sqrt,1,5,200,...,0.08524,1,0.902542,0.855319,0.893617,0.868085,0.876596,0.879232,0.017049,9
78,1.269362,0.078742,0.03633,0.002296,0.1,3,sqrt,2,5,200,...,0.087079,3,0.898305,0.859574,0.893617,0.86383,0.880851,0.879235,0.015472,8


In [40]:
y_pred = grid_rf.best_estimator_.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.86      0.95      0.90       247
           1       0.41      0.19      0.26        47

    accuracy                           0.83       294
   macro avg       0.63      0.57      0.58       294
weighted avg       0.79      0.83      0.80       294



In [41]:
y_pred = grid_gb.best_estimator_.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.88      0.97      0.92       247
           1       0.62      0.28      0.38        47

    accuracy                           0.86       294
   macro avg       0.75      0.62      0.65       294
weighted avg       0.83      0.86      0.83       294

