In [177]:
# Dependencies

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests

from sklearn import tree
from sklearn import preprocessing

In [178]:
#Load IBM HR data set

ibm_df = pd.read_csv("../MachineLearningProject/IBM-Watson.csv")
ibm_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [179]:
# Want to view all the column names
originalheaders = ibm_df.dtypes.index
originalheaders

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [189]:
new_ibm_df = ibm_df.drop(columns=['DailyRate', 'EmployeeCount','EmployeeNumber', 'MonthlyIncome', 'MonthlyRate', 'Over18'])
new_ibm_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,HourlyRate,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,Female,94,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,Male,61,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,Male,92,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,Female,56,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,Male,40,...,4,80,1,6,3,3,2,2,2,2


In [181]:
# Want to view all the column names
newheaders = new_ibm_df.dtypes.index
newheaders

Index(['Age', 'Attrition', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'EmployeeCount',
       'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement',
       'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus',
       'NumCompaniesWorked', 'OverTime', 'PercentSalaryHike',
       'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')

In [182]:
# Check data quality
new_ibm_df.isnull().any().head()

Age                 False
Attrition           False
BusinessTravel      False
Department          False
DistanceFromHome    False
dtype: bool

In [183]:
#NOTE: not sure why businesstravel and dept aren't included
columns = ["BusinessTravel", "Department",
       "EducationField", "EmployeeCount",
       "Gender", "JobRole", "MaritalStatus", "OverTime",
       "StockOptionLevel"]
dummy = pd.get_dummies(new_ibm_df[columns])
dummy.head()

Unnamed: 0,EmployeeCount,StockOptionLevel,BusinessTravel_Non-Travel,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely,Department_Human Resources,Department_Research & Development,Department_Sales,EducationField_Human Resources,EducationField_Life Sciences,...,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,OverTime_No,OverTime_Yes
0,1,0,0,0,1,0,0,1,0,1,...,0,0,0,1,0,0,0,1,0,1
1,1,1,0,1,0,0,1,0,0,1,...,0,0,1,0,0,0,1,0,1,0
2,1,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,1
3,1,0,0,1,0,0,1,0,0,1,...,0,0,1,0,0,0,1,0,0,1
4,1,1,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,1,0,1,0


In [188]:
new_ibm_df = pd.concat([new_ibm_df, dummy], axis=1)
new_ibm_df = new_ibm_df.drop(columns, axis=1)
new_ibm_df.head()

Unnamed: 0,Age,Attrition,DistanceFromHome,Education,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,NumCompaniesWorked,...,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,OverTime_No,OverTime_Yes
0,41,Yes,1,2,2,94,3,2,4,8,...,0,0,0,1,0,0,0,1,0,1
1,49,No,8,1,3,61,2,2,2,1,...,0,0,1,0,0,0,1,0,1,0
2,37,Yes,2,2,4,92,2,1,3,6,...,0,0,0,0,0,0,0,1,0,1
3,33,No,3,4,4,56,3,1,3,1,...,0,0,1,0,0,0,1,0,0,1
4,27,No,2,1,1,40,3,1,2,9,...,0,0,0,0,0,0,1,0,1,0


In [185]:
new_ibm_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 49 columns):
Age                                  1470 non-null int64
Attrition                            1470 non-null object
DistanceFromHome                     1470 non-null int64
Education                            1470 non-null int64
EnvironmentSatisfaction              1470 non-null int64
HourlyRate                           1470 non-null int64
JobInvolvement                       1470 non-null int64
JobLevel                             1470 non-null int64
JobSatisfaction                      1470 non-null int64
NumCompaniesWorked                   1470 non-null int64
PercentSalaryHike                    1470 non-null int64
PerformanceRating                    1470 non-null int64
RelationshipSatisfaction             1470 non-null int64
StandardHours                        1470 non-null int64
TotalWorkingYears                    1470 non-null int64
TrainingTimesLastYear                

In [186]:
#Delete redundant columns
# Delete the following columns: BusinessTravel_Travel_Rarely, OverTime_No, MaritalStatus_Single,JobRole_Sales Representative,
# Gender_Female, EducationField_Marketing, Department_Sales



In [187]:
from sklearn.preprocessing import LabelEncoder
test = LabelEncoder()
y = test.fit_transform(new_ibm_df['Attrition'].values)
y

array([1, 0, 1, ..., 0, 0, 0], dtype=int64)

In [None]:
#KMeans
#Logistic regression
#Decision trees or random forest - do this
#Neural network - change features and see which are most important to monitor