# Import libraries

In [11]:
import numpy as np
import pandas as pd
import plotly.express as px

from sklearn import svm
from sklearn import tree
from collections import Counter
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split


# Load and check dataset

In [2]:
df_data = pd.read_csv('dataset/HR_comma_sep.csv')
df_data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [3]:
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_montly_hours   14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   left                   14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   Department             14999 non-null  object 
 9   salary                 14999 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


# Data preparation

In [4]:
# separate input (features) and output (target)
y = df_data[['left']]
X = df_data.drop('left', axis=1)

In [5]:
# separate train set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.25)
print(X_train.shape)
print(X_test.shape)

(11249, 9)
(3750, 9)


In [6]:
# encode categorical features
encoder = OrdinalEncoder()
encoder.fit(X_train[['salary', 'Department']])
X_train[['salary', 'Department']] = encoder.transform(X_train[['salary', 'Department']])
X_test[['salary', 'Department']] = encoder.transform(X_test[['salary', 'Department']])
# encoder.inverse_transform(df_data[['salary']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


# Train some ML models

In [7]:
# create a dictionary of models
model_dict = {
                'LogisticRegression': LogisticRegression(max_iter=1000), 
                'GaussianNB': GaussianNB(), 
                'KNeighborsClassifier': KNeighborsClassifier(n_neighbors=1), 
                'DecisionTreeClassifier': DecisionTreeClassifier(min_samples_split=25),
                'SVM': svm.SVC(kernel='rbf',probability=False),
                'RandomForestClassifier': RandomForestClassifier(n_estimators = 10, min_samples_split=2, max_depth=30),
            }

In [26]:
model_scores = []
model_names = []
for name, model in model_dict.items():
    model.fit(X_train, np.ravel(y_train))
    prediction = model.predict(X_test)
    acc_score = accuracy_score(y_test, prediction)
    model_scores.append(round(acc_score, 4))
    model_names.append(name)
    
    print('*'*10 + name + '*'*10 + ': {}'.format(acc_score))

**********LogisticRegression**********: 0.7586666666666667
**********GaussianNB**********: 0.7994666666666667
**********KNeighborsClassifier**********: 0.9538666666666666
**********DecisionTreeClassifier**********: 0.9744
**********SVM**********: 0.7834666666666666
**********RandomForestClassifier**********: 0.9864


In [31]:
fig = px.line(
    x=model_names,
    y=model_scores,
    labels={'x':'Models', 'y':'Accuracy Score'},
    text=model_scores,
    # textposition='auto'
)
fig.update_traces(textposition='bottom right', textfont_size=14)

fig.show()

In [9]:
# k_range = range(1,26)
# scores=[]
# for k in k_range:
#     knn=KNeighborsClassifier(n_neighbors=k)
#     knn.fit(X_train, np.ravel(y_train))
#     y_pred=knn.predict(X_test)
#     scores.append(accuracy_score(y_test,y_pred))

# fig = px.line(x=k_range, y=scores, width=1000, labels={'x':'Value of k for KNN', 'y':'Testing accuracy'})
# fig.show()

In [10]:
# # The Random forest classifier seems to produce the best results so far, so let's try to optimise it!

# max_depth = range(1,50)
# scores=[]
# for r in max_depth:
#     clf_rf = RandomForestClassifier(n_estimators = 10, min_samples_split=2, max_depth=r)
#     clf_rf.fit(X_train, np.ravel(y_train))
#     y_pred=clf_rf.predict(X_test)
#     scores.append(accuracy_score(y_test,y_pred))
    
# fig = px.line(x=max_depth, y=scores, width=1000, labels={'x':'Value of r for Random Forest', 'y':'Testing accuracy'})
# fig.show()