# Employee Turnover Analytics

## Import Dependencies || Packages¶

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as matplot
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, precision_score, recall_score, confusion_matrix, precision_recall_curve
from sklearn import metrics

## Load the Dataset

In [2]:
data = pd.read_csv("emp_turnover.csv", encoding="ISO-8859-1")
data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,work_accident,promotion_last_5years,department,salary,attrition
0,0.38,0.53,2,157,3,0,0,sales,low,1
1,0.8,0.86,5,262,6,0,0,sales,medium,1
2,0.11,0.88,7,272,4,0,0,sales,medium,1
3,0.72,0.87,5,223,5,0,0,sales,low,1
4,0.37,0.52,2,159,3,0,0,sales,low,1


In [3]:
data = data.rename(
    columns={'satisfaction_level': 'satisfaction_level', 
            'last_evaluation': 'evaluation',
            'number_project': 'number_project',
            'average_montly_hours': 'avg_monthly_hours',
            'time_spend_company': 'time_spend_with_company',
            'Work_accident': 'work_accident',
            'promotion_last_5years': 'promotion',
            'department' : 'department',
             'salary':'salary',
            'attrition' : 'turnover_lebel'
})

In [4]:
category_variable = ['department','salary','turnover_lebel','promotion']
numerical_variable = ['satisfaction_level','evaluation','number_project','avg_monthly_hours','time_spend_with_company', 'work_accident']
categorical_data = pd.get_dummies(data[category_variable], drop_first=True)
numerical_data = data[numerical_variable]

df = pd.concat([categorical_data , numerical_data], axis=1)
df.head()

Unnamed: 0,turnover_lebel,promotion,department_RandD,department_accounting,department_hr,department_management,department_marketing,department_product_mng,department_sales,department_support,department_technical,salary_low,salary_medium,satisfaction_level,evaluation,number_project,avg_monthly_hours,time_spend_with_company,work_accident
0,1,0,0,0,0,0,0,0,1,0,0,1,0,0.38,0.53,2,157,3,0
1,1,0,0,0,0,0,0,0,1,0,0,0,1,0.8,0.86,5,262,6,0
2,1,0,0,0,0,0,0,0,1,0,0,0,1,0.11,0.88,7,272,4,0
3,1,0,0,0,0,0,0,0,1,0,0,1,0,0.72,0.87,5,223,5,0
4,1,0,0,0,0,0,0,0,1,0,0,1,0,0.37,0.52,2,159,3,0


In [5]:
# Create the X and y set
X = df.iloc[:,1:]
y = df.iloc[:,0]

# Define train and test
X_train, X_test, y_train, y_test = train_test_split(X , y, test_size=0.20, random_state=123, stratify=y)

## Training The KNN Algorithm

In [6]:
import math 
math.sqrt(len(y_test))

54.772255750516614

In [7]:
#Define the model:InitK-NN
classifier = KNeighborsClassifier(n_neighbors=10)
#Fit Model
classifier.fit(X_train,y_train)

KNeighborsClassifier(n_neighbors=10)

In [8]:
y_pred = classifier.predict(X_test)
y_pred

array([0, 1, 0, ..., 0, 0, 1], dtype=int64)

In [9]:
cm = confusion_matrix(y_test, y_pred)
cm

array([[2167,  119],
       [  57,  657]], dtype=int64)

In [10]:
print('Accuracy: ', metrics.accuracy_score(y_test, y_pred))
print('Precision: ', metrics.precision_score(y_test, y_pred))
print('Recall: ', metrics.recall_score(y_test, y_pred))
print('F1-Score: ', metrics.f1_score(y_test, y_pred))

Accuracy:  0.9413333333333334
Precision:  0.8466494845360825
Recall:  0.9201680672268907
F1-Score:  0.8818791946308724


In [11]:
print(metrics.classification_report(y_test, y_pred, labels=np.unique(y_pred)))


              precision    recall  f1-score   support

           0       0.97      0.95      0.96      2286
           1       0.85      0.92      0.88       714

    accuracy                           0.94      3000
   macro avg       0.91      0.93      0.92      3000
weighted avg       0.94      0.94      0.94      3000

