In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

df = pd.read_csv('https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/log_reg/employee-turnover-balanced.csv')

In [2]:
df.head()

Unnamed: 0,left_company,age,frequency_of_travel,department,commuting_distance,education,satisfaction_with_environment,gender,seniority_level,position,satisfaction_with_job,married_or_single,last_raise_pct,last_performance_rating,total_years_working,years_at_company,years_in_current_job,years_since_last_promotion,years_with_current_supervisor
0,No,37,Travel_Rarely,Sales,16,4,4,Male,2,Sales Executive,3,Divorced,19,3,9,1,0,0,0
1,No,39,Travel_Rarely,Research & Development,3,2,3,Male,2,Laboratory Technician,3,Divorced,15,3,11,10,8,0,7
2,No,52,Travel_Frequently,Research & Development,25,4,3,Female,4,Manufacturing Director,4,Married,22,4,31,9,8,0,0
3,No,50,Non-Travel,Sales,1,3,4,Female,2,Sales Executive,3,Married,12,3,19,18,7,0,13
4,No,44,Travel_Rarely,Research & Development,4,3,4,Male,2,Healthcare Representative,2,Single,12,3,10,5,2,2,3


In [3]:
df['left_company'] = df['left_company'].replace({'Yes': 1, 'No': 0})

In [4]:
from sklearn.model_selection import train_test_split

y = df['left_company']
X = df.iloc[:, 1:]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

print(f'Training examples: {X_train.shape[0]:,}')
print(f'Test examples: {X_test.shape[0]:,}')

Training examples: 800
Test examples: 200


In [5]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

cat_vars = list(df.dtypes[df.dtypes == object].index)
num_vars = list(df.dtypes[df.dtypes == 'int64'].index)
num_vars.remove('left_company')

processing_pipeline = ColumnTransformer(transformers=[
#     ('numscaling', StandardScaler(), num_vars),
    ('dummys', OneHotEncoder(drop='first'), cat_vars)]
)

modeling_pipeline = Pipeline([
    ('data_processing', processing_pipeline),
    ('dt', DecisionTreeClassifier())]
)

modeling_pipeline.fit(X_train, y_train)

In [6]:
pred_test = modeling_pipeline.predict(X_test)

pred_train = modeling_pipeline.predict(X_train)

In [7]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

accuracy_test = accuracy_score(y_test, pred_test)
accuracy_train = accuracy_score(y_train, pred_train)

precision_test = precision_score(y_test, pred_test)
precision_train = precision_score(y_train, pred_train)


recall_test = recall_score(y_test, pred_test)
recall_train = recall_score(y_train, pred_train)


print("Training Accuracy:", accuracy_train, "   Testing Accuracy:", accuracy_test)
print("Training Precision:", precision_train, "   Testing Precision:", precision_test)
print("Training Recall:", recall_train, "   Testing Recall:", recall_test)

Training Accuracy: 0.73125    Testing Accuracy: 0.68
Training Precision: 0.7384196185286104    Testing Precision: 0.73
Training Recall: 0.6948717948717948    Testing Recall: 0.6636363636363637


In [8]:
from sklearn.ensemble import RandomForestClassifier

rf_pipeline = Pipeline([
    ('data_processing', processing_pipeline),
    ('rf', RandomForestClassifier())]
)

rf_pipeline.fit(X_train, y_train)

In [9]:
pred_test = rf_pipeline.predict(X_test)

pred_train = rf_pipeline.predict(X_train)

In [10]:
accuracy_test = accuracy_score(y_test, pred_test)
accuracy_train = accuracy_score(y_train, pred_train)

precision_test = precision_score(y_test, pred_test)
precision_train = precision_score(y_train, pred_train)


recall_test = recall_score(y_test, pred_test)
recall_train = recall_score(y_train, pred_train)


print("Training Accuracy:", accuracy_train, "   Testing Accuracy:", accuracy_test)
print("Training Precision:", precision_train, "   Testing Precision:", precision_test)
print("Training Recall:", recall_train, "   Testing Recall:", recall_test)

Training Accuracy: 0.73125    Testing Accuracy: 0.7
Training Precision: 0.7204030226700252    Testing Precision: 0.7358490566037735
Training Recall: 0.7333333333333333    Testing Recall: 0.7090909090909091


### Now, we have 3 metrics on which to evaluate the model, let us see which is the most optimal

In [11]:
df["left_company"].value_counts()

0    500
1    500
Name: left_company, dtype: int64

### Since the data is very much balanced, I believe that accuracy in this case is the perfect metric to evaluate the models

#### Looking at the testing accuracy of the 2 models, over multiple iterations, it is hard to decide which is giving a better score. Both are very close to each other and depending on the split, give different values. Thus, it is hard to say which is performing better. However, after running it a few times, I have noticed that the random forest is giving a better value more often than not (in terms of test accuracy)