In [None]:
import numpy as np
import pandas as pd

#visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go
import plotly.express as px

#machine learning libraries
from xgboost import XGBClassifier

#model selection
from sklearn.model_selection import train_test_split,GridSearchCV

#metrics
from sklearn.metrics import confusion_matrix,plot_confusion_matrix,precision_score,recall_score,f1_score,balanced_accuracy_score,accuracy_score
from sklearn import metrics

pd.set_option("display.max_columns",None)

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline


from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

# Preprocessing
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Pipeline
from sklearn import pipeline


# Loading the data

In [None]:
df = pd.read_csv("WA_Fn-UseC_-HR-Employee-Attrition.csv")

In [None]:
# Independent copy of the raw frame: a plain `df_td = df` would only create a
# second name for the same object, so edits through one name could leak into the other.
df_td = df.copy()

In [None]:
df.head()

# Exploratory Data Analysis

Let's see the basic statistics of our data

In [None]:
df.describe().T

In [None]:
df.info()

Here we have 9 categorical features. The rest are ordinal or continuous values.

Let's look at the categorical columns

In [None]:
categories = df.select_dtypes([object]).columns
print(categories)

In [None]:
df.select_dtypes([object]).head()

Let's look at the unique values of each category.

In [None]:
# List the distinct values found in every object-typed (string) column.
object_columns = df.select_dtypes([object]).columns
for column_name in object_columns:
    print(column_name, ":", df[column_name].unique())

| Feature | Comment |
| --- | --- |
| **Attrition** | This is the label of interest. We will convert this to a binary variable {1: yes, 0: no} |
| **BusinessTravel** | We will convert this to an ordinal variable {0: Non-Travel, 1: Travel_Rarely, 2: Travel_Frequently} |
| **Department** | This is purely categorical, so we will have to one-hot encode or convert to a dummy variable |
| **EducationField** | This is purely categorical, so we will have to one-hot encode or convert to a dummy variable |
| **Gender** | There are only 2 values in the dataset, so we can convert this to a binary variable. This is legally protected data, so we have to be careful that using this feature does not lead to any forms of gender discrimination. |
| **JobRole** | This is purely categorical, so we will have to one-hot encode or convert to a dummy variable. This category has a decent number of values. By converting it, we are increasing our dimensionality by 9. This could lead to the curse of dimensionality. |
| **MaritalStatus** | Categorical, but one could make the case to make it ordinal. If we made it ordinal, then the mapping would be {0: Single, 1: Married, 2: Divorced}. Again, we have to be careful as using this information to make a hiring decision would be illegal. |
| **Over18** | This feature only has 1 value, so we will drop it. |
| **OverTime** | We will convert this to a binary variable {0: No, 1: Yes} |

For now, we convert the ordinal and binary features to numeric columns and drop the constant column.

In [None]:
# Drop the single-valued Over18 column and replace the string codes of the label
# (Attrition) and of BusinessTravel / OverTime with integers.
# NOTE: this cell is not re-runnable as written - a second run raises KeyError on
# the drop, and .map() turns values that are no longer strings into NaN.
df = df.drop("Over18",axis=1)
df["Attrition"] = df["Attrition"].map({"No": 0, "Yes": 1})
# Ordered encoding: larger number = more travel.
df["BusinessTravel"] = df["BusinessTravel"].map({"Non-Travel": 0, "Travel_Rarely": 1,"Travel_Frequently": 2})
df["OverTime"] = df["OverTime"].map({"No": 0, "Yes": 1})

Let's look at the numerical columns.

In [None]:
numerical = df.select_dtypes([np.int64]).columns
print(numerical)

In [None]:
df.select_dtypes([np.int64]).head()

In the numerical features, we have 2 columns that have a constant value: StandardHours and EmployeeCount. We can drop those columns.

In [None]:
print(df["StandardHours"].unique())
print(df["EmployeeCount"].unique())
df = df.drop(["StandardHours","EmployeeCount"],axis=1)

In [None]:
numerical = df.select_dtypes([np.int64]).columns

In [None]:
corr = df[numerical].corr()

In [None]:
trace = go.Heatmap(
    z=np.abs(corr.values),
    x=corr.columns.values,
    y=corr.columns.values,
    colorscale="greys"
)
fig = go.Figure(data=trace)
fig.update_layout(
    title="Interactive correlation heatmap of numerical features",
    autosize=False,
    width=750,
    height=750
)
fig.show()

The only features that have a high correlation are MonthlyIncome and JobLevel.

Attrition is not strongly correlated with any other feature. Its highest correlation is with OverTime, and it is not that high.

In [None]:
corr["Attrition"][np.abs(corr["Attrition"]) > 0.1]

In [None]:
fig = px.histogram(df,x="BusinessTravel",color="Attrition")
fig.update_layout(
    autosize=False,
    title="Interactive histogram of BusinessTravel"
)
fig.show()

# Tejas

### EDA

In [None]:
df_td.describe().T

#### Dropping the columns EmployeeCount, Over18 and StandardHours because each has a single value for all rows. Also dropping EmployeeNumber, which is only an identifier.

In [None]:
# Rebind instead of mutating in place, so any other name still pointing at the
# original frame is unaffected; errors="ignore" lets the cell be re-run without
# a KeyError once the columns are already gone.
df_td = df_td.drop(columns=['EmployeeCount', 'Over18', 'StandardHours','EmployeeNumber'], errors="ignore")

In [None]:
# One summary row per column: name, number of distinct values, and the distinct values themselves.
dfColumns = [
    [column_name, df_td[column_name].nunique(), df_td[column_name].drop_duplicates().values]
    for column_name in df_td.columns
]
pd.DataFrame(dfColumns, columns = ['Features', 'Unique Number', 'Values'])

#### Encoding: 1 = Yes (the employee left the company), 0 = No (the employee stayed).

In [None]:
# Encode the label: 'Yes' -> 1, anything else -> 0.
# Guarded on dtype so that re-running the cell on the already-encoded column
# does not compare integers with 'Yes' and silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(df_td['Attrition']):
    df_td['Attrition'] = np.where(df_td['Attrition'] == 'Yes', 1, 0)

In [None]:
df_td["Attrition"].value_counts()

In [None]:
df_td['Attrition'].value_counts()/df_td.shape[0]*100

In [None]:
# Correlation Plot
f, ax = plt.subplots(figsize=(18, 14))
# df_td still contains string columns (Department, JobRole, ...). Select the numeric
# columns explicitly: older pandas dropped the others silently, pandas >= 2.0
# raises when asked to correlate strings.
corr = df_td.select_dtypes(include="number").corr()
hm = sns.heatmap(corr.round(2), annot=True, ax=ax, cmap="coolwarm",fmt='.2f', linewidths=.05)
f.subplots_adjust(top=0.95)
t= f.suptitle('HR-Employee-Attrition - Correlation Heatmap', fontsize=16)

#### The correlation plot shows that Attrition is not strongly correlated with any other variable, and that only a few pairs of variables are relatively strongly correlated. JobLevel and MonthlyIncome have the highest correlation, 0.95.

#### Some other correlated variables are as follows: 
- TotalWorkingYears and Monthly Income (0.78)
- Age and TotalWorkingYears (0.68)
- YearsAtCompany and YearsWithCurrManager (0.77)
- YearsInCurrentRole and YearsWithCurrManager (0.71)
- YearsInCurrentRole and YearsAtCompany (0.76)

In [None]:
# Overlay the Age histograms of the two Attrition classes (1 in blue, 0 in red).
fig = plt.figure(figsize=(10, 6))
for attrition_flag, colour in ((1, 'blue'), (0, 'red')):
    df_td.loc[df_td['Attrition'] == attrition_flag, 'Age'].hist(
        bins=30, color=colour, linewidth=1.0, alpha=0.5,
        label='Attrition=' + str(attrition_flag),
        xlabelsize=8, ylabelsize=8, grid=True,
    )
plt.legend()
plt.xlabel("Age")

#### Younger employees were more likely to leave the company

In [None]:
plt.figure(figsize=[11,7])
ax = sns.countplot(x="JobLevel", hue="Attrition", data=df_td, palette="Set1")

In [None]:
# Share of each Attrition value within every JobLevel (the shares of a level sum to 1).
# group_keys=False keeps the plain (JobLevel, Attrition) index; with the default
# on pandas >= 2.0, apply() would prepend the group key a second time.
(
    df_td.groupby(["JobLevel","Attrition"]).size()
    .groupby(level=0, group_keys=False)
    .apply(lambda counts: counts/counts.sum())
    .round(2)
)

#### Low job level has higher attrition rate

In [None]:
plt.figure(figsize=[11,7])
ax = sns.countplot(x="Department", hue="Attrition", data=df_td, palette="Set1")

In [None]:
# Doing some math
# Department head-counts, first for Attrition == 1 and then for Attrition == 0 ...
for attrition_flag in (1, 0):
    print(df_td.loc[df_td['Attrition'] == attrition_flag, 'Department'].value_counts())
# ... followed by the distinct per-column non-null counts of each department's rows.
for department in ('Research & Development', 'Sales', 'Human Resources'):
    print(df_td[df_td['Department'] == department].count().unique())

In [None]:
# Share of each Attrition value within every Department (the shares of a department sum to 1).
# group_keys=False keeps the plain (Department, Attrition) index; with the default
# on pandas >= 2.0, apply() would prepend the group key a second time.
(
    df_td.groupby(["Department","Attrition"]).size()
    .groupby(level=0, group_keys=False)
    .apply(lambda counts: counts/counts.sum())
    .round(2)
)

#### Sales and HR have higher attrition rates compared to research and development  department

In [None]:
plt.figure(figsize=[11,7])
ax = sns.countplot(x="Gender", hue="Attrition", data=df_td, palette="Set1")

In [None]:
# Doing Some math
# Gender head-counts, first for Attrition == 1 and then for Attrition == 0 ...
for attrition_flag in (1, 0):
    print(df_td.loc[df_td['Attrition'] == attrition_flag, 'Gender'].value_counts())
# ... followed by the distinct per-column non-null counts of each gender's rows.
for gender in ('Male', 'Female'):
    print(df_td[df_td['Gender'] == gender].count().unique())

In [None]:
# Share of each Attrition value within every Gender (the shares of a gender sum to 1).
# group_keys=False keeps the plain (Gender, Attrition) index; with the default
# on pandas >= 2.0, apply() would prepend the group key a second time.
(
    df_td.groupby(["Gender","Attrition"]).size()
    .groupby(level=0, group_keys=False)
    .apply(lambda counts: counts/counts.sum())
    .round(2)
)

#### Male employees were found to be more likely to leave

In [None]:
# Overlay the MonthlyIncome histograms of the two Attrition classes (1 in blue, 0 in red).
fig = plt.figure(figsize=(10, 6))
for attrition_flag, colour in ((1, 'blue'), (0, 'red')):
    df_td.loc[df_td['Attrition'] == attrition_flag, 'MonthlyIncome'].hist(
        bins=30, color=colour, linewidth=1.0, alpha=0.5,
        label='Attrition=' + str(attrition_flag),
        xlabelsize=8, ylabelsize=8, grid=True,
    )
plt.legend()
plt.xlabel("MonthlyIncome")

#### Attrition rate is higher at lower monthly income 

In [None]:
# Overlay the YearsAtCompany histograms of the two Attrition classes (1 in blue, 0 in red).
fig = plt.figure(figsize=(10, 6))
for attrition_flag, colour in ((1, 'blue'), (0, 'red')):
    df_td.loc[df_td['Attrition'] == attrition_flag, 'YearsAtCompany'].hist(
        bins=30, color=colour, linewidth=1.0, alpha=0.5,
        label='Attrition=' + str(attrition_flag),
        xlabelsize=8, ylabelsize=8, grid=True,
    )
plt.legend()
plt.xlabel("YearsAtCompany")

In [None]:
plt.figure(figsize=[22,7])
ax = sns.countplot(x="JobRole", hue="Attrition", data=df_td, palette="Set1")

In [None]:
# Share of each Attrition value within every JobRole (the shares of a role sum to 1).
# group_keys=False keeps the plain (JobRole, Attrition) index; with the default
# on pandas >= 2.0, apply() would prepend the group key a second time.
(
    df_td.groupby(["JobRole","Attrition"]).size()
    .groupby(level=0, group_keys=False)
    .apply(lambda counts: counts/counts.sum())
    .round(2)
)

#### Sales Representatives and Laboratory Technicians have higher attrition rates

In [None]:
plt.figure(figsize=[11,7])
ax = sns.countplot(x="MaritalStatus", hue="Attrition", data=df_td, palette="Set1")

In [None]:
# Share of each Attrition value within every MaritalStatus (the shares of a status sum to 1).
# group_keys=False keeps the plain (MaritalStatus, Attrition) index; with the default
# on pandas >= 2.0, apply() would prepend the group key a second time.
(
    df_td.groupby(["MaritalStatus","Attrition"]).size()
    .groupby(level=0, group_keys=False)
    .apply(lambda counts: counts/counts.sum())
    .round(2)
)

#### Employees who are single have higher rates of attrition compared to married and divorced workers  

In [None]:
plt.figure(figsize=[11,7])
ax = sns.countplot(x="BusinessTravel", hue="Attrition", data=df_td, palette="Set1")

#### Employees who travel frequently have higher attrition rates than those who travel rarely or do not travel at all

In [None]:
# Share of each Attrition value within every BusinessTravel category (the shares of a category sum to 1).
# group_keys=False keeps the plain (BusinessTravel, Attrition) index; with the default
# on pandas >= 2.0, apply() would prepend the group key a second time.
(
    df_td.groupby(["BusinessTravel","Attrition"]).size()
    .groupby(level=0, group_keys=False)
    .apply(lambda counts: counts/counts.sum())
    .round(2)
)

# Tejas Model 1 -  AdaBoost

In [None]:
cat_feats =  ['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'OverTime']

In [None]:
# Converting categorical variables to dummy variables
df_new = pd.get_dummies(df_td, columns = cat_feats,)

In [None]:
df_new.info()

### Preparing data for machine learning

In [None]:
X = df_new.drop('Attrition', axis = 1)
y = df_new['Attrition']
X.shape

In [None]:
# Train / validation split (70% / 30%), stratified on the label y and seeded
# with random_state=2021 so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y,stratify=y, test_size=0.30, random_state=2021)

In [None]:
# Standardization of the data
# The scaler is fitted on the training split only; the same fitted transform is
# then applied to both splits.
ss = StandardScaler().fit(X_train)
X_train, X_val = ss.transform(X_train), ss.transform(X_val)

# AdaBoost model

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
adabc = AdaBoostClassifier()

In [None]:
adabc.fit(X_train, y_train)

In [None]:
y_pred_adabc = adabc.predict(X_val)

In [None]:
# Validation accuracy
metrics.accuracy_score(y_val, y_pred_adabc)

In [None]:
pred_train_adabc=adabc.predict(X_train)

In [None]:
# Training accuracy
metrics.accuracy_score(y_train, pred_train_adabc)

In [None]:
print('Adaboost\n',metrics.classification_report(y_val, y_pred_adabc))

In [None]:
print('Confusion Matrix:Adaboost\n')
cm = metrics.confusion_matrix(y_val, y_pred_adabc)
tn, fp, fn, tp = cm.ravel()
print(cm)
# Result names deliberately differ from the sklearn functions imported at the top
# of the notebook (recall_score, precision_score, f1_score): rebinding those names
# to floats would break every later call to the functions.
recall = metrics.recall_score(y_val, y_pred_adabc)
specificity = tn / (tn+fp)  # true-negative rate
precision = metrics.precision_score(y_val, y_pred_adabc)
accuracy = metrics.accuracy_score(y_val, y_pred_adabc)
balanced_accuracy = metrics.balanced_accuracy_score(y_val, y_pred_adabc)
f1 = metrics.f1_score(y_val, y_pred_adabc)
print("\nMetrics on test data")
print('Recall Score :', round(recall,2))
print('Specificity :', round(specificity,2))
print('Precision Score :', round(precision,2))
print('Accuracy:', round(accuracy,2))
print('Balanced Accuracy:', round(balanced_accuracy,2))
print('F1 Score :', round(f1,2))

#### Apply cross validation

In [None]:
cv_ab = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

In [None]:
adabc = AdaBoostClassifier()
scores = cross_val_score(adabc, X=X_train, y=y_train, scoring='accuracy', cv=cv_ab, n_jobs=1, )
print('CV accuracy scores: %s' % scores)

In [None]:
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

## Adaboost model - pipeline and gridsearch

In [None]:
def print_results(results):
    """Print a fitted search object's best parameters and every candidate's CV score.

    `results` must expose `best_params_` and a `cv_results_` mapping with the keys
    'mean_test_score', 'std_test_score' and 'params'. One line is printed per
    candidate: the mean test score, twice its standard deviation (both rounded
    to 3 decimals) and the parameter combination.
    """
    print('BEST PARAMS: {}\n'.format(results.best_params_))

    cv_table = results.cv_results_
    mean_scores = cv_table['mean_test_score']
    score_stds = cv_table['std_test_score']
    for idx, candidate in enumerate(cv_table['params']):
        spread = round(score_stds[idx] * 2, 3)
        print('{} (+/-{}) for {}'.format(round(mean_scores[idx], 3), spread, candidate))

In [None]:
# Grid search over an AdaBoost pipeline (scaler -> classifier), scored on accuracy
# with the stratified 5-fold splitter cv_ab.
# NOTE(review): `ss` and `adabc` are rebound to fresh, unfitted objects here, so the
# scaler fitted in the standardization cell is no longer reachable through `ss`.
ss = preprocessing.StandardScaler()
adabc = AdaBoostClassifier()

steps = [('ss', ss),
         ('classifier', adabc)]

# Keys use the '<step name>__<parameter>' form to address the 'classifier' step.
parameters = {
    'classifier__n_estimators': [50,100,200,300],
    'classifier__learning_rate': [0.1,0.2,1,2],
}
pipe = pipeline.Pipeline(steps)

cv_pipe_ab = GridSearchCV(pipe, parameters, cv=cv_ab, n_jobs=-1, scoring = 'accuracy')
# NOTE(review): X_train was already standardized in an earlier cell, so the 'ss'
# step standardizes it a second time - presumably harmless, but redundant; confirm.
cv_pipe_ab.fit(X_train, y_train)

print_results(cv_pipe_ab)

In [None]:
best_estimator_ab_pipe = cv_pipe_ab.best_estimator_
best_estimator_ab_pipe

In [None]:
pred_pipe_ab = best_estimator_ab_pipe.predict(X_val)

In [None]:
# Testing # Validation Accuracy
metrics.accuracy_score(y_val, pred_pipe_ab)

In [None]:
# Training
pred_train_pipe_ab=best_estimator_ab_pipe.predict(X_train)
metrics.accuracy_score(y_train, pred_train_pipe_ab)

In [None]:
print('Adaboost with Hyperparamter Tuning\n',metrics.classification_report(y_val, pred_pipe_ab))

In [None]:
print('\nConfusion Matrix: Adaboost with Hyperparamter Tuning\n')
cm = metrics.confusion_matrix(y_val, pred_pipe_ab)
tn, fp, fn, tp = cm.ravel()
print(cm)
# Result names deliberately differ from the sklearn functions imported at the top
# of the notebook (recall_score, precision_score, f1_score): rebinding those names
# to floats would break every later call to the functions.
recall = metrics.recall_score(y_val, pred_pipe_ab)
specificity = tn / (tn+fp)  # true-negative rate
precision = metrics.precision_score(y_val, pred_pipe_ab)
accuracy = metrics.accuracy_score(y_val, pred_pipe_ab)
balanced_accuracy = metrics.balanced_accuracy_score(y_val, pred_pipe_ab)
f1 = metrics.f1_score(y_val, pred_pipe_ab)

print("\nMetrics on test data")
print('Recall Score :', round(recall,2))
print('Specificity :', round(specificity,2))
print('Precision Score :', round(precision,2))
print('Accuracy:', round(accuracy,2))
print('Balanced Accuracy:', round(balanced_accuracy,2))
print('F1 Score :', round(f1,2))

# Tejas Model 2 - Support vector machine Classifier

## Fit RBF Kernel SVM Classifier

In [None]:
from sklearn.svm import SVC

In [None]:
svc_rbf = SVC(kernel ='rbf')

In [None]:
svc_rbf

In [None]:
cv_svc = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

In [None]:
svc_rbf.fit(X_train, y_train)

In [None]:
# Validation # Testing
y_pred_svc_rbf = svc_rbf.predict(X_val)

In [None]:
metrics.accuracy_score(y_val, y_pred_svc_rbf)

In [None]:
# Train
pred_train_svc_rbf=svc_rbf.predict(X_train)

In [None]:
# Train
metrics.accuracy_score(y_train, pred_train_svc_rbf)

In [None]:
print('SVC-Fit RBF Kernel SVM Classifier\n',metrics.classification_report(y_val, y_pred_svc_rbf))

In [None]:
print('\nConfusion Matrix: SVC-Fit RBF Kernel SVM Classifier\n')
cm = metrics.confusion_matrix(y_val, y_pred_svc_rbf)
tn, fp, fn, tp = cm.ravel()
print(cm)
# Result names deliberately differ from the sklearn functions imported at the top
# of the notebook (recall_score, precision_score, f1_score): rebinding those names
# to floats would break every later call to the functions.
recall = metrics.recall_score(y_val, y_pred_svc_rbf)
specificity = tn / (tn+fp)  # true-negative rate
precision = metrics.precision_score(y_val, y_pred_svc_rbf)
accuracy = metrics.accuracy_score(y_val, y_pred_svc_rbf)
balanced_accuracy = metrics.balanced_accuracy_score(y_val, y_pred_svc_rbf)
f1 = metrics.f1_score(y_val, y_pred_svc_rbf)

print("\nMetrics on test data")
print('Recall Score :', round(recall,2))
print('Specificity :', round(specificity,2))
print('Precision Score :', round(precision,2))
print('Accuracy:', round(accuracy,2))
print('Balanced Accuracy:', round(balanced_accuracy,2))
print('F1 Score :', round(f1,2))

# Grid Search: Hyperparameter Tuning

In [None]:
# Code
# gamma and C values to try for the RBF-kernel SVC.
parameters = dict(
    gamma=[1, 0.1, 0.01, 0.001, 0.0001],
    C=[1, 10, 100, 1000],
)
svc_rbf_gs = GridSearchCV(estimator=svc_rbf, param_grid=parameters, cv=cv_svc)
svc_rbf_gs.fit(X_train, y_train)
print_results(svc_rbf_gs)

# Best cross-validated score and the winning estimator's C / gamma.
print('Best score for data:', svc_rbf_gs.best_score_)
for caption, attribute in (('Best C:', 'C'), ('Best Gamma:', 'gamma')):
    print(caption, getattr(svc_rbf_gs.best_estimator_, attribute))

In [None]:
best_estimator_svc_gs = svc_rbf_gs.best_estimator_
best_estimator_svc_gs

In [None]:
# Validation Predict
y_pred_svc_rbf_gs = best_estimator_svc_gs.predict(X_val)
# Training Predict
pred_train_svc_rbf_gs= best_estimator_svc_gs.predict(X_train)

In [None]:
# Testing Accuracy Score # Validation
metrics.accuracy_score(y_val, y_pred_svc_rbf_gs)

In [None]:
# Training Accuracy Score
metrics.accuracy_score(y_train, pred_train_svc_rbf_gs)

In [None]:
print('SVC-Fit RBF Kernel SVM Classifier with tuning\n',metrics.classification_report(y_val, y_pred_svc_rbf_gs))

In [None]:
print('\nConfusion Matrix:SVC-Fit RBF Kernel SVM Classifier with tuning-Grid Search')
cm = metrics.confusion_matrix(y_val, y_pred_svc_rbf_gs)
tn, fp, fn, tp = cm.ravel()
print(cm)
# Result names deliberately differ from the sklearn functions imported at the top
# of the notebook (recall_score, precision_score, f1_score): rebinding those names
# to floats would break every later call to the functions.
recall = metrics.recall_score(y_val, y_pred_svc_rbf_gs)
specificity = tn / (tn+fp)  # true-negative rate
precision = metrics.precision_score(y_val, y_pred_svc_rbf_gs)
accuracy = metrics.accuracy_score(y_val, y_pred_svc_rbf_gs)
balanced_accuracy = metrics.balanced_accuracy_score(y_val, y_pred_svc_rbf_gs)
f1 = metrics.f1_score(y_val, y_pred_svc_rbf_gs)

print("\nMetrics on test data")
print('Recall Score :', round(recall,2))
print('Specificity :', round(specificity,2))
print('Precision Score :', round(precision,2))
print('Accuracy:', round(accuracy,2))
print('Balanced Accuracy:', round(balanced_accuracy,2))
print('F1 Score :', round(f1,2))

# Helper functions

In [None]:
def training_run_classification(model,parameters,X_train,y_train,X_val,y_val,scoring=None):
    """Grid-search `model`, report validation metrics of the best estimator and return it.

    Runs a 4-fold GridSearchCV over `parameters` on the training data, prints the
    best parameter combination, then evaluates the best estimator on the
    validation data: accuracy, balanced accuracy, precision, recall, F1 and
    specificity are printed and the confusion matrix is plotted.

    Parameters
    ----------
    model : estimator handed to GridSearchCV.
    parameters : parameter grid handed to GridSearchCV.
    X_train, y_train : data the grid search is fitted on.
    X_val, y_val : data the best estimator is evaluated on.
    scoring : forwarded to GridSearchCV's `scoring` argument (default None).

    Returns
    -------
    The search's `best_estimator_`.
    """
    search = GridSearchCV(model,parameters,cv=4,n_jobs=-1,scoring=scoring)
    search.fit(X_train,y_train)
    best_model = search.best_estimator_
    print(search.best_params_)
    pred = best_model.predict(X_val)

    # Every metric is reached through the `metrics` module: earlier cells of the
    # notebook rebind the bare names recall_score / precision_score / f1_score to
    # floats, so calling those names here would raise a TypeError.
    cm = metrics.confusion_matrix(y_val,pred)
    acc = metrics.accuracy_score(y_val,pred)
    balanced_accuracy = metrics.balanced_accuracy_score(y_val,pred)
    precision = metrics.precision_score(y_val,pred)
    recall = metrics.recall_score(y_val,pred)
    f1 = metrics.f1_score(y_val,pred)
    # Row 0 of the matrix is taken as the negative class - assumes binary labels
    # whose smaller value is the negative one (e.g. 0/1); confirm for other data.
    specificity = cm[0,0]/np.sum(cm[0])

    print(f"Accuracy: {acc}")
    print(f"Balanced Accuracy: {balanced_accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"f1: {f1}")
    print(f"Specificity: {specificity}")
    # Draw the matrix that was already computed instead of calling
    # plot_confusion_matrix, which was removed in scikit-learn 1.2.
    metrics.ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=best_model.classes_).plot()
    plt.show()

    return best_model

# XGBoost

# Hyper-parameter grid for XGBClassifier. Single-element lists pin a value
# (n_jobs, scale_pos_weight, random_state) rather than searching over it.
parameters = {
    "n_estimators": [5,25,100,200],
    "max_depth": [None,2,5,10,20],
    "learning_rate": [0.1,1,10],
    "booster": ["gbtree","gblinear","dart"],
    # "hist" is the histogram-based tree method; the previous "hisat" was a typo
    # and is not a valid tree_method value.
    "tree_method": ["exact","approx","hist"],
    "n_jobs": [-1],
    "reg_alpha": [0.1,1,10],
    "reg_lambda": [0.1,1,10],
    "scale_pos_weight": [3],
    "random_state": [1]
}

# Uses the notebook's X_train / X_val split: the previous x_train, x_test and
# y_test names are not defined anywhere in the visible notebook.
xgb = training_run_classification(XGBClassifier(),parameters,X_train,y_train,X_val,y_val,scoring="balanced_accuracy")