In [58]:
# importing the necessary libraries

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
%matplotlib inline

## Preprocessing

In [59]:
# Load 'turnover.csv' (path relative to the working directory) into a DataFrame.
df = pd.read_csv('turnover.csv')

### Understanding the data

In [60]:
# Summarise the frame: column names, dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
satisfaction_level       14999 non-null float64
last_evaluation          14999 non-null float64
number_project           14999 non-null int64
average_montly_hours     14999 non-null int64
time_spend_company       14999 non-null int64
Work_accident            14999 non-null int64
left                     14999 non-null int64
promotion_last_5years    14999 non-null int64
sales                    14999 non-null object
salary                   14999 non-null object
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


In [61]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years
count,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0
mean,0.612834,0.716102,3.803054,201.050337,3.498233,0.14461,0.238083,0.021268
std,0.248631,0.171169,1.232592,49.943099,1.460136,0.351719,0.425924,0.144281
min,0.09,0.36,2.0,96.0,2.0,0.0,0.0,0.0
25%,0.44,0.56,3.0,156.0,3.0,0.0,0.0,0.0
50%,0.64,0.72,4.0,200.0,3.0,0.0,0.0,0.0
75%,0.82,0.87,5.0,245.0,4.0,0.0,0.0,0.0
max,1.0,1.0,7.0,310.0,10.0,1.0,1.0,1.0


In [62]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [63]:
# List the column labels of the raw data.
df.columns

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident', 'left',
       'promotion_last_5years', 'sales', 'salary'],
      dtype='object')

In [64]:
# Distinct values present in the 'left' column.
df.left.unique()

array([1, 0])

### Encoding 'sales' and 'salary'

In [65]:
# Distinct values present in the 'sales' column.
df.sales.unique()

array(['sales', 'accounting', 'hr', 'technical', 'support', 'management',
       'IT', 'product_mng', 'marketing', 'RandD'], dtype=object)

In [66]:
# Distinct values present in the 'salary' column.
df.salary.unique()

array(['low', 'medium', 'high'], dtype=object)

In [67]:
from sklearn.preprocessing import LabelEncoder

In [68]:
# Encoder instance used to map the string values of 'sales' to integer codes.
label_encoder = LabelEncoder()

In [69]:
# Fit the encoder on the 'sales' column and convert each value to its integer code.
# NOTE(review): scikit-learn documents LabelEncoder as intended for target labels (y);
# the integer codes imply an ordering among the 'sales' categories that a linear model
# may treat as meaningful - consider one-hot encoding this feature instead. TODO confirm.
sales_label = label_encoder.fit_transform(df.sales)
sales_label

array([7, 7, 7, ..., 8, 8, 8])

In [70]:
# One-hot encode 'salary' into indicator columns prefixed with 'sal'.
# NOTE(review): recent pandas releases return boolean indicator columns by default;
# pass dtype=int if 0/1 integers are required - confirm against the installed version.
salary_label = pd.get_dummies(df.salary, prefix='sal')
salary_label.head()

Unnamed: 0,sal_high,sal_low,sal_medium
0,0,1,0
1,0,0,1
2,0,0,1
3,0,1,0
4,0,1,0


In [71]:
# Append the salary indicator columns to the original frame, aligning rows on the index
# (join='inner' keeps only index labels present in both frames).
df_new = pd.concat([df, salary_label], axis=1, join='inner')

In [72]:
# Display the encoded 'sales' labels again before attaching them to df_new.
sales_label

array([7, 7, 7, ..., 8, 8, 8])

In [73]:
# Attach the integer-encoded 'sales' values as a new 'sales_encoded' column,
# then preview the result.
df_new = df_new.assign(sales_encoded=sales_label)
df_new.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary,sal_high,sal_low,sal_medium,sales_encoded
0,0.38,0.53,2,157,3,0,1,0,sales,low,0,1,0,7
1,0.8,0.86,5,262,6,0,1,0,sales,medium,0,0,1,7
2,0.11,0.88,7,272,4,0,1,0,sales,medium,0,0,1,7
3,0.72,0.87,5,223,5,0,1,0,sales,low,0,1,0,7
4,0.37,0.52,2,159,3,0,1,0,sales,low,0,1,0,7


In [74]:
# Remove the original 'sales' and 'salary' text columns now that their encoded
# counterparts are present in df_new.
# errors='ignore' keeps this cell safe to re-run: without it, a second execution
# raises KeyError because the columns have already been dropped from df_new.
df_new = df_new.drop(['sales','salary'], axis=1, errors='ignore')

In [75]:
# Preview df_new after the 'sales' and 'salary' columns have been removed.
df_new.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sal_high,sal_low,sal_medium,sales_encoded
0,0.38,0.53,2,157,3,0,1,0,0,1,0,7
1,0.8,0.86,5,262,6,0,1,0,0,0,1,7
2,0.11,0.88,7,272,4,0,1,0,0,0,1,7
3,0.72,0.87,5,223,5,0,1,0,0,1,0,7
4,0.37,0.52,2,159,3,0,1,0,0,1,0,7


## Training and Cross-Validation

In [78]:
# Feature matrix: every column of df_new except the 'left' column.
X = df_new.drop(labels='left', axis='columns')
X.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,sal_high,sal_low,sal_medium,sales_encoded
0,0.38,0.53,2,157,3,0,0,0,1,0,7
1,0.8,0.86,5,262,6,0,0,0,0,1,7
2,0.11,0.88,7,272,4,0,0,0,0,1,7
3,0.72,0.87,5,223,5,0,0,0,1,0,7
4,0.37,0.52,2,159,3,0,0,0,1,0,7


In [79]:
# Target vector: the 'left' column of df_new.
Y = df_new.loc[:, 'left']
Y.head()

0    1
1    1
2    1
3    1
4    1
Name: left, dtype: int64

In [None]:
# importing models
from sklearn.linear_model import LogisticRegression
from sklearn.