In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Load the employee-turnover CSV (the "balanced" variant, per its file name) directly
# from GitHub, then preview the first rows.
df = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/log_reg/employee-turnover-balanced.csv',
)
df.head()

Unnamed: 0,left_company,age,frequency_of_travel,department,commuting_distance,education,satisfaction_with_environment,gender,seniority_level,position,satisfaction_with_job,married_or_single,last_raise_pct,last_performance_rating,total_years_working,years_at_company,years_in_current_job,years_since_last_promotion,years_with_current_supervisor
0,No,37,Travel_Rarely,Sales,16,4,4,Male,2,Sales Executive,3,Divorced,19,3,9,1,0,0,0
1,No,39,Travel_Rarely,Research & Development,3,2,3,Male,2,Laboratory Technician,3,Divorced,15,3,11,10,8,0,7
2,No,52,Travel_Frequently,Research & Development,25,4,3,Female,4,Manufacturing Director,4,Married,22,4,31,9,8,0,0
3,No,50,Non-Travel,Sales,1,3,4,Female,2,Sales Executive,3,Married,12,3,19,18,7,0,13
4,No,44,Travel_Rarely,Research & Development,4,3,4,Male,2,Healthcare Representative,2,Single,12,3,10,5,2,2,3


### Question 1

In [2]:
# Count how many rows fall into each class of the target column `left_company`.
df["left_company"].value_counts()

No     500
Yes    500
Name: left_company, dtype: int64

#### Here we can see that there is a 50-50 split between the two classes (500 "No" and 500 "Yes"), thus there is no imbalance at all

### Question 2

In [3]:
# Split the column names by dtype.
# - cat_vars: text (object) columns. This list still contains the target
#   'left_company', so later cells have to exclude it before modelling.
# - num_vars: every numeric column. select_dtypes(include='number') also picks up
#   float and narrower integer dtypes, which an exact `== 'int64'` comparison
#   would silently leave out of both lists.
cat_vars = list(df.select_dtypes(include='object').columns)
num_vars = list(df.select_dtypes(include='number').columns)

In [4]:
cat_vars

['left_company',
 'frequency_of_travel',
 'department',
 'gender',
 'position',
 'married_or_single']

In [5]:
num_vars

['age',
 'commuting_distance',
 'education',
 'satisfaction_with_environment',
 'seniority_level',
 'satisfaction_with_job',
 'last_raise_pct',
 'last_performance_rating',
 'total_years_working',
 'years_at_company',
 'years_in_current_job',
 'years_since_last_promotion',
 'years_with_current_supervisor']

#### I would split them based on their dtype. When printing dtype, it gives the column name as index and the type. So we take the index values of all those where the value is object, for categorical, and int for num_vars. This works in this case as there are no float or other number types, else we would have had to account for those as well

### Question 3

In [6]:
# To check for collinearity, we need to see how heavily correlated our numerical columns are.
# numeric_only=True restricts the computation to numeric columns: without it, pandas 1.5
# emits a FutureWarning and pandas >= 2.0 raises on the string columns in df.
df.corr(numeric_only=True)

  df.corr()


Unnamed: 0,age,commuting_distance,education,satisfaction_with_environment,seniority_level,satisfaction_with_job,last_raise_pct,last_performance_rating,total_years_working,years_at_company,years_in_current_job,years_since_last_promotion,years_with_current_supervisor
age,1.0,0.012074,0.199138,0.001556,0.522604,0.095242,0.027851,0.003629,0.673804,0.38476,0.31001,0.242456,0.273679
commuting_distance,0.012074,1.0,0.033003,-0.019556,0.038915,0.023859,0.104421,0.089282,0.025593,0.023017,0.03189,0.047552,0.03152
education,0.199138,0.033003,1.0,-0.059586,0.080685,0.015148,0.013515,-0.014162,0.160822,0.091614,0.073181,0.077218,0.083453
satisfaction_with_environment,0.001556,-0.019556,-0.059586,1.0,0.009462,-0.00616,0.014812,0.006943,-0.027203,0.001339,0.023698,0.042132,0.021875
seniority_level,0.522604,0.038915,0.080685,0.009462,1.0,0.040606,-0.022683,-0.029956,0.779351,0.572724,0.478151,0.392935,0.430047
satisfaction_with_job,0.095242,0.023859,0.015148,-0.00616,0.040606,1.0,-0.037273,-0.08903,0.029119,0.07192,0.037591,0.038015,0.001472
last_raise_pct,0.027851,0.104421,0.013515,0.014812,-0.022683,-0.037273,1.0,0.792791,-0.004905,0.004435,0.039691,0.000615,0.060882
last_performance_rating,0.003629,0.089282,-0.014162,0.006943,-0.029956,-0.08903,0.792791,1.0,0.014877,0.022364,0.087038,0.030595,0.100502
total_years_working,0.673804,0.025593,0.160822,-0.027203,0.779351,0.029119,-0.004905,0.014877,1.0,0.685955,0.548494,0.423619,0.506007
years_at_company,0.38476,0.023017,0.091614,0.001339,0.572724,0.07192,0.004435,0.022364,0.685955,1.0,0.801423,0.630344,0.781147


In [7]:
# Compute the correlation matrix once (numeric columns only; required on pandas >= 2.0,
# where string columns otherwise make corr() raise).
corr_matrix = df.corr(numeric_only=True)

# Every column correlates perfectly with itself, so drop the diagonal explicitly
# instead of filtering on `< 1`, which would also hide a perfectly collinear pair.
off_diagonal = ~np.eye(len(corr_matrix), dtype=bool)

# Use the absolute value so strong negative correlations are flagged as well.
mask = (corr_matrix.abs() > 0.9) & off_diagonal

if mask.any().any():
    print("There are values between 0.9 and 1 in the DataFrame.")
else:
    print("There are no values between 0.9 and 1 in the DataFrame.")

There are no values between 0.9 and 1 in the DataFrame.


  mask = (df.corr() > 0.9) & (df.corr() < 1)


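#### Thus, we can see that no pair of numerical columns has a correlation above the 0.9 threshold (the strongest pairs in the table, such as `years_at_company` vs `years_in_current_job` and `last_raise_pct` vs `last_performance_rating`, are around 0.8). Hence, we will not be dropping any columns as they all provide valuable information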

### Question 4

In [8]:
# Separate the label from the predictors: y is the `left_company` column,
# X is every other column of df.
y = df['left_company']
X = df.drop(columns=['left_company'])

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=124,
)

### Question 5

In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric columns: fill missing values with the column median, then standardize.
num_pipeline = Pipeline(steps=[
    ('impute_missing', SimpleImputer(strategy='median')),
    ('standardize_num', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent category, then
# one-hot encode, dropping the first level of each variable and ignoring unseen
# categories at transform time.
cat_pipeline = Pipeline(steps=[
    ('impute_missing_cats', SimpleImputer(strategy='most_frequent')),
    ('create_dummies_cats', OneHotEncoder(drop='first', handle_unknown='ignore')),
])

# Start with a numeric-only transformer so its output width can be checked on its own.
processing_pipeline = ColumnTransformer(
    transformers=[('proc_numeric', num_pipeline, num_vars)],
)

In [11]:
# Sanity check: fit the current transformer on the full frame and display the
# (rows, columns) shape of the transformed output.
processing_pipeline.fit_transform(df).shape

(1000, 13)

In [12]:
# Number of numeric variables, to compare against the column count of the transformed output.
len(num_vars)

13

#### As we can see, the number of transformed columns matches the number of numerical variables. Now for the categorical variables,

In [13]:
# Rebind processing_pipeline to a categorical-only transformer so the one-hot output
# width can be checked in isolation. All of cat_vars is passed here, which still
# includes the target 'left_company'.
processing_pipeline = ColumnTransformer(transformers=[('create_dummies', cat_pipeline, cat_vars)])

In [14]:
# Sanity check: fit the categorical-only transformer on the full frame and display
# the (rows, columns) shape of the encoded output.
processing_pipeline.fit_transform(df).shape

(1000, 16)

In [15]:
# Print the distinct values of every categorical column, one column per line group.
for column_name in cat_vars:
    distinct_values = df[column_name].unique()
    print(distinct_values)

['No' 'Yes']
['Travel_Rarely' 'Travel_Frequently' 'Non-Travel']
['Sales' 'Research & Development' 'Human Resources']
['Male' 'Female']
['Sales Executive' 'Laboratory Technician' 'Manufacturing Director'
 'Healthcare Representative' 'Manager' 'Research Scientist'
 'Research Director' 'Sales Representative' 'Human Resources']
['Divorced' 'Married' 'Single']


#### This adds up to 22 distinct values across the 6 categorical variables. However, when we drop the redundant first column of each of these in one-hot encoding, it comes to 22 - 6 = 16, which matches

### Question 6

In [16]:
# Final preprocessing step: the numeric pipeline on the numeric columns plus the
# categorical pipeline on the categorical predictors.
# The target 'left_company' is excluded by name rather than by position
# (cat_vars[1:]), so the step keeps working if the column order ever changes.
feature_cat_vars = [col for col in cat_vars if col != 'left_company']
processing_pipeline = ColumnTransformer(transformers=[
    ('proc_numeric', num_pipeline, num_vars),
    ('create_dummies', cat_pipeline, feature_cat_vars),
])

In [17]:
from sklearn.linear_model import LogisticRegression

# Chain the preprocessing step and a default logistic regression into one estimator,
# then fit it on the training split only.
modeling_pipeline = Pipeline(steps=[
    ('data_processing', processing_pipeline),
    ('lm', LogisticRegression()),
])
modeling_pipeline.fit(X_train, y_train)

In [18]:
# Predict labels for both splits so train and test accuracy can be compared.
train_predictions = modeling_pipeline.predict(X_train)
test_predictions = modeling_pipeline.predict(X_test)

In [19]:
from sklearn.metrics import accuracy_score

# Report the accuracy of the predictions on the held-out test split.
print(f'Test data accuracy:{accuracy_score(y_test, test_predictions)!s}')

Test data accuracy:0.66


In [20]:
# Report the accuracy of the predictions on the training split, for comparison with the test accuracy.
print(f'Train data accuracy:{accuracy_score(y_train, train_predictions)!s}')

Train data accuracy:0.73375


### Question 7

#### From the results, we can see that the training and test accuracies (about 0.73 and 0.66) are reasonably close to each other, so the model does not appear to be overfitting much, which is a good thing. However, the accuracy itself is not that great. To fix this, multiple steps can be taken

#### We can work on creating more features, which is called feature engineering. This could involve combining features in ways that could be relevant to the data, for example as polynomial features

#### Another method is to use a different machine learning algorithm altogether. We might get better results from SVM or random forest algorithms

#### To evaluate this, we can analyze the ROC curve and the area under this curve. It is also good to look at precision and recall, beyond just the accuracy as a metric