# All Boosting Algorithms

Importing Dependencies

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

Reading Our CSV Data

In [2]:
original = pd.read_csv("/kaggle/input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv")
train = pd.read_csv("/kaggle/input/playground-series-s3e3/train.csv")
test = pd.read_csv("/kaggle/input/playground-series-s3e3/test.csv")

# Analyzing Data

Mapping Data Correctly

In [3]:
original["Attrition"] = original["Attrition"].map({"Yes": 1, "No": 0})

In [4]:
train = train.drop(["id"], axis = 1)
test = test.drop(["id"], axis = 1)
original = original.drop(["EmployeeNumber"], axis = 1)

In [5]:
original.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 34 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   int64 
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EnvironmentSatisfaction   1470 non-null   int64 
 10  Gender                    1470 non-null   object
 11  HourlyRate                1470 non-null   int64 
 12  JobInvolvement            1470 non-null   int64 
 13  JobLevel                  1470 non-null   int64 
 14  JobRole                 

In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1677 entries, 0 to 1676
Data columns (total 34 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1677 non-null   int64 
 1   BusinessTravel            1677 non-null   object
 2   DailyRate                 1677 non-null   int64 
 3   Department                1677 non-null   object
 4   DistanceFromHome          1677 non-null   int64 
 5   Education                 1677 non-null   int64 
 6   EducationField            1677 non-null   object
 7   EmployeeCount             1677 non-null   int64 
 8   EnvironmentSatisfaction   1677 non-null   int64 
 9   Gender                    1677 non-null   object
 10  HourlyRate                1677 non-null   int64 
 11  JobInvolvement            1677 non-null   int64 
 12  JobLevel                  1677 non-null   int64 
 13  JobRole                   1677 non-null   object
 14  JobSatisfaction         

Both Original and Train csv match now, So we can Concatenate

In [7]:
train = pd.concat([train, original], axis = 0)

In [8]:
train.reset_index(inplace = True)
train = train.drop(["index"], axis = 1)

In [9]:
train.describe()

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,...,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition
count,3147.0,3147.0,3147.0,3147.0,3147.0,3147.0,3147.0,3147.0,3147.0,3147.0,...,3147.0,3147.0,3147.0,3147.0,3147.0,3147.0,3147.0,3147.0,3147.0,3147.0
mean,36.451223,850.586273,8.921513,2.926279,1.0,2.741023,66.907531,2.757865,2.029234,2.762313,...,80.0,0.755323,10.97585,2.763902,2.765809,6.900858,4.183985,2.076263,4.156975,0.138862
std,8.815861,390.859919,7.961278,1.032051,0.0,1.089733,19.877918,0.680183,1.09459,1.100224,...,0.0,0.810688,7.509438,1.214918,0.675707,5.998015,3.601691,3.130744,3.572003,0.345858
min,18.0,102.0,1.0,1.0,1.0,1.0,30.0,1.0,1.0,1.0,...,80.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,30.0,543.0,2.0,2.0,1.0,2.0,50.0,2.0,1.0,2.0,...,80.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0,2.0,0.0
50%,35.0,852.0,7.0,3.0,1.0,3.0,67.0,3.0,2.0,3.0,...,80.0,1.0,9.0,3.0,3.0,5.0,3.0,1.0,3.0,0.0
75%,42.0,1198.5,13.0,4.0,1.0,4.0,84.0,3.0,3.0,4.0,...,80.0,1.0,15.0,3.0,3.0,9.0,7.0,2.0,7.0,0.0
max,60.0,3921.0,29.0,15.0,1.0,4.0,100.0,4.0,7.0,4.0,...,80.0,3.0,41.0,6.0,4.0,41.0,18.0,15.0,17.0,1.0


In [10]:
train.isnull().sum()

Age                         0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSinceLastPromotion     0
YearsWithCurrManager        0
Attrition 

Preprocessing Our Data

In [11]:
def preprocess(df):
    df = df.drop(["EmployeeCount", "Over18", "StandardHours"], axis=1)
    df["OverTime"] = df["OverTime"].map({"Yes": 1, "No": 0})
    df["Gender"] = df["Gender"].map({"Male": 1, "Female": 0})
    return df

In [12]:
train = preprocess(train)
test = preprocess(test)

In [13]:
train.tail()

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,HourlyRate,...,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition
3142,36,Travel_Frequently,884,Research & Development,23,2,Medical,3,1,41,...,3,1,17,3,3,5,2,0,3,0
3143,39,Travel_Rarely,613,Research & Development,6,1,Medical,4,1,42,...,1,1,9,5,3,7,7,1,7,0
3144,27,Travel_Rarely,155,Research & Development,4,3,Life Sciences,2,1,87,...,2,1,6,0,3,6,2,0,3,0
3145,49,Travel_Frequently,1023,Sales,2,3,Medical,4,1,63,...,4,0,17,3,2,9,6,0,8,0
3146,34,Travel_Rarely,628,Research & Development,8,3,Medical,2,1,82,...,1,0,6,3,4,4,3,1,2,0


Extracting meaningful and correlated columns

In [14]:
corr_df = train.corr()

Important Categorical Features

In [15]:
cat_feat = ["DistanceFromHome", "Gender", "MonthlyRate", "NumCompaniesWorked", "OverTime", "PerformanceRating", "BusinessTravel", "Department", "EducationField", "JobRole", "MaritalStatus", "PercentSalaryHike"]

----

# Extreme Gradient Boosting Model

Defining Our Train and Target Data

In [16]:
X = train.drop(["Attrition"], axis=1)
y = train["Attrition"]

Spliting Our Data into Train and Validation Sets

In [17]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

Encoding Our Categorical Data

In [18]:
le = LabelEncoder()

In [19]:
columns = ["BusinessTravel", "Department", "EducationField", "JobRole", "MaritalStatus"]

In [20]:
for column in columns:
    X[column] = le.fit_transform(X[column].astype("str"))

Spliting Data into Train and Validation Sets

In [21]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

Defining XGBoost Model

In [22]:
xgb = XGBRegressor()

Fitting and Predicting the Data

In [23]:
xgb.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=2, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [24]:
xgb_pred = xgb.predict(X_val)

Score for XGB Model

In [25]:
xgb_score = roc_auc_score(y_val, xgb_pred)

In [26]:
xgb_score

0.7664609053497943

----

# CatBoostRegressor Model

Defining Our Train and Target Data

In [27]:
X = train.drop(["Attrition"], axis=1)
y = train["Attrition"]

Converting cat_feat list to Categorical Data

In [28]:
for column in cat_feat:
    X[column] = X[column].astype("category")

Spliting Our Data into Train and Validation Sets

In [29]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

Creating Pools for our CBR Model

In [30]:
pool_train = Pool(X_train, y_train, cat_features = cat_feat)
pool_val = Pool(X_val, y_val, cat_features = cat_feat)

Defining our CBR Model

In [31]:
cbr = CatBoostRegressor()

Fitting and Predicting the Data

In [32]:
cbr.fit(pool_train)

Learning rate set to 0.046
0:	learn: 0.3420323	total: 69.3ms	remaining: 1m 9s
1:	learn: 0.3399516	total: 82.7ms	remaining: 41.2s
2:	learn: 0.3378184	total: 93.3ms	remaining: 31s
3:	learn: 0.3359472	total: 104ms	remaining: 25.8s
4:	learn: 0.3342527	total: 114ms	remaining: 22.8s
5:	learn: 0.3322343	total: 124ms	remaining: 20.6s
6:	learn: 0.3302963	total: 134ms	remaining: 19s
7:	learn: 0.3292829	total: 145ms	remaining: 17.9s
8:	learn: 0.3277633	total: 154ms	remaining: 17s
9:	learn: 0.3263485	total: 164ms	remaining: 16.2s
10:	learn: 0.3247079	total: 176ms	remaining: 15.8s
11:	learn: 0.3231087	total: 185ms	remaining: 15.3s
12:	learn: 0.3217713	total: 197ms	remaining: 15s
13:	learn: 0.3200514	total: 208ms	remaining: 14.6s
14:	learn: 0.3190512	total: 213ms	remaining: 14s
15:	learn: 0.3177940	total: 222ms	remaining: 13.7s
16:	learn: 0.3165161	total: 232ms	remaining: 13.4s
17:	learn: 0.3151945	total: 242ms	remaining: 13.2s
18:	learn: 0.3141222	total: 253ms	remaining: 13.1s
19:	learn: 0.3130763	

<catboost.core.CatBoostRegressor at 0x7f35962b1290>

In [33]:
cbr_pred = cbr.predict(X_val)

Score for our CBR Model

In [34]:
cbr_score = roc_auc_score(y_val, cbr_pred)

In [35]:
cbr_score

0.8496707818930042

----

# LightGradientBoostRegressor Model

Defining Our Train and Target Data

In [36]:
X = train.drop(["Attrition"], axis=1)
y = train["Attrition"]

In [37]:
for column in columns:
    X[column] = X[column].astype("category")

Spliting Our Data into Train and Validation Sets

In [38]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

Defining our LGBR Model

In [39]:
lgbrm = LGBMRegressor()

Fitting and Predicting the Data

In [40]:
lgbrm.fit(X_train, y_train)

LGBMRegressor()

In [41]:
lgbrm_pred = lgbrm.predict(X_val)

Score for our LGBR Model

In [42]:
lgbrm_score = roc_auc_score(y_val, lgbrm_pred)

In [43]:
lgbrm_score

0.8108436213991769

----

# Submissions

Converting cat_feat list to Categorical Data for Test Data

In [44]:
for column in cat_feat:
    test[column] = test[column].astype("category")

Reading Submissions.csv

In [45]:
submission = pd.read_csv("/kaggle/input/playground-series-s3e3/sample_submission.csv", usecols = ["id"])

Predicting Test CSV

In [46]:
cbr_testpred = cbr.predict(test)

In [47]:
submission["Attrition"] = cbr_testpred

Saving Our Final submission.csv

In [48]:
submission.to_csv("submission.csv", index = False)

----