<a href="https://colab.research.google.com/github/viveksharma-niet/TreeModelUpgrad/blob/main/GBM_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [40]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [41]:
data = pd.read_csv('https://raw.githubusercontent.com/viveksharma-niet/TreeModelUpgrad/refs/heads/main/WA_Fn-UseC_-HR-Employee-Attrition.csv')
data.head(5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [42]:
y = data['Attrition']
y.shape

(1470,)

In [43]:
X = data.drop('Attrition', axis=1)

In [44]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 34 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   BusinessTravel            1470 non-null   object
 2   DailyRate                 1470 non-null   int64 
 3   Department                1470 non-null   object
 4   DistanceFromHome          1470 non-null   int64 
 5   Education                 1470 non-null   int64 
 6   EducationField            1470 non-null   object
 7   EmployeeCount             1470 non-null   int64 
 8   EmployeeNumber            1470 non-null   int64 
 9   EnvironmentSatisfaction   1470 non-null   int64 
 10  Gender                    1470 non-null   object
 11  HourlyRate                1470 non-null   int64 
 12  JobInvolvement            1470 non-null   int64 
 13  JobLevel                  1470 non-null   int64 
 14  JobRole                 

In [45]:
mapping = {'Yes':1, 'No':0}
y = y.map(mapping)
y.value_counts()

Unnamed: 0_level_0,count
Attrition,Unnamed: 1_level_1
0,1233
1,237


In [46]:
int_data = X.select_dtypes(include=['int64'])
cat_data = X.select_dtypes(include=['object'])


In [47]:
dummy_var = pd.get_dummies(cat_data)

dummy_var.columns

Index(['BusinessTravel_Non-Travel', 'BusinessTravel_Travel_Frequently',
       'BusinessTravel_Travel_Rarely', 'Department_Human Resources',
       'Department_Research & Development', 'Department_Sales',
       'EducationField_Human Resources', 'EducationField_Life Sciences',
       'EducationField_Marketing', 'EducationField_Medical',
       'EducationField_Other', 'EducationField_Technical Degree',
       'Gender_Female', 'Gender_Male', 'JobRole_Healthcare Representative',
       'JobRole_Human Resources', 'JobRole_Laboratory Technician',
       'JobRole_Manager', 'JobRole_Manufacturing Director',
       'JobRole_Research Director', 'JobRole_Research Scientist',
       'JobRole_Sales Executive', 'JobRole_Sales Representative',
       'MaritalStatus_Divorced', 'MaritalStatus_Married',
       'MaritalStatus_Single', 'Over18_Y', 'OverTime_No', 'OverTime_Yes'],
      dtype='object')

In [48]:
final_data = pd.concat([int_data, dummy_var], axis=1)
final_data.head()

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,Over18_Y,OverTime_No,OverTime_Yes
0,41,1102,1,2,1,1,2,94,3,2,...,False,False,True,False,False,False,True,True,False,True
1,49,279,8,1,1,2,3,61,2,2,...,False,True,False,False,False,True,False,True,True,False
2,37,1373,2,2,1,4,4,92,2,1,...,False,False,False,False,False,False,True,True,False,True
3,33,1392,3,4,1,5,4,56,3,1,...,False,True,False,False,False,True,False,True,False,True
4,27,591,2,1,1,7,1,40,3,1,...,False,False,False,False,False,True,False,True,True,False


In [49]:
from sklearn.model_selection import train_test_split

In [50]:
X_train, X_test, y_train, y_test = train_test_split(final_data, y, train_size= 0.75,random_state=0)

In [51]:
from sklearn.ensemble import GradientBoostingClassifier

In [52]:
gbc = GradientBoostingClassifier(random_state=100)

In [53]:
gbc.fit(X_train, y_train)

In [54]:
gb_predictions = gbc.predict(X_test)

In [55]:
gb_predictions_pro = gbc.predict_proba(X_test)

In [56]:
gb_predictions_pro

array([[0.95425715, 0.04574285],
       [0.96266164, 0.03733836],
       [0.86753512, 0.13246488],
       [0.93446343, 0.06553657],
       [0.11595517, 0.88404483],
       [0.69684147, 0.30315853],
       [0.64887562, 0.35112438],
       [0.95196738, 0.04803262],
       [0.97038412, 0.02961588],
       [0.86642399, 0.13357601],
       [0.94437031, 0.05562969],
       [0.91453905, 0.08546095],
       [0.97008856, 0.02991144],
       [0.30172989, 0.69827011],
       [0.94679291, 0.05320709],
       [0.98937899, 0.01062101],
       [0.94587557, 0.05412443],
       [0.93925719, 0.06074281],
       [0.94572155, 0.05427845],
       [0.92824768, 0.07175232],
       [0.63031267, 0.36968733],
       [0.95095032, 0.04904968],
       [0.96466225, 0.03533775],
       [0.97249008, 0.02750992],
       [0.58256377, 0.41743623],
       [0.75837581, 0.24162419],
       [0.95692266, 0.04307734],
       [0.97394765, 0.02605235],
       [0.25660257, 0.74339743],
       [0.96627476, 0.03372524],
       [0.