In [54]:
## Import Library ##
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## For Data Encoding ##
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

## For Model Evaluation ##
from sklearn.model_selection import KFold

## Machine Learning Model ##
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
from xgboost.sklearn import XGBClassifier

## For Over Sampling
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler

## For Model Performance ##
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

In [55]:
## Model Performance
def evaluation(gt, pred):
    acc = accuracy_score(gt, pred)
    precision = precision_score(gt, pred)
    recall = recall_score(gt, pred)
    f1 = f1_score(gt, pred)
    matrix = confusion_matrix(gt, pred)
    
    return acc, precision, recall, f1, matrix

In [56]:
raw_data = pd.read_csv("./EmployeeAttrition.csv", index_col=0)
#raw_data = pd.read_csv("./fold/fold_0_train.csv")

In [57]:
categorical_col = []
numeric_col = []
for col in raw_data.columns:
    if raw_data[col].dtype == object and col != "Attrition":
        categorical_col.append(col)
        print(col, raw_data[col].unique())
        print("========================================================================")
    elif raw_data[col].dtype == int and col != "Attrition":
        numeric_col.append(col)

BusinessTravel ['Travel_Rarely' 'Travel_Frequently' 'Non-Travel']
Department ['Sales' 'Research & Development' 'Human Resources']
EducationField ['Life Sciences' 'Other' 'Medical' 'Marketing' 'Technical Degree'
 'Human Resources']
Gender ['Female' 'Male']
JobRole ['Sales Executive' 'Research Scientist' 'Laboratory Technician'
 'Manufacturing Director' 'Healthcare Representative' 'Manager'
 'Sales Representative' 'Research Director' 'Human Resources']
MaritalStatus ['Single' 'Married' 'Divorced']


In [58]:
## Encode Label
raw_data["Attrition"] = raw_data["Attrition"].astype("category").cat.codes

In [59]:
## Data Encoding (one-hot encoding)
one_hot_encoding_df = pd.get_dummies(raw_data, columns=categorical_col)

In [60]:
set(one_hot_encoding_df.columns) - set(numeric_col)

{'Age',
 'Attrition',
 'BusinessTravel_Non-Travel',
 'BusinessTravel_Travel_Frequently',
 'BusinessTravel_Travel_Rarely',
 'Department_Human Resources',
 'Department_Research & Development',
 'Department_Sales',
 'DistanceFromHome',
 'Education',
 'EducationField_Human Resources',
 'EducationField_Life Sciences',
 'EducationField_Marketing',
 'EducationField_Medical',
 'EducationField_Other',
 'EducationField_Technical Degree',
 'EmployeeNumber',
 'EnvironmentSatisfaction',
 'Gender_Female',
 'Gender_Male',
 'JobInvolvement',
 'JobLevel',
 'JobRole_Healthcare Representative',
 'JobRole_Human Resources',
 'JobRole_Laboratory Technician',
 'JobRole_Manager',
 'JobRole_Manufacturing Director',
 'JobRole_Research Director',
 'JobRole_Research Scientist',
 'JobRole_Sales Executive',
 'JobRole_Sales Representative',
 'JobSatisfaction',
 'MaritalStatus_Divorced',
 'MaritalStatus_Married',
 'MaritalStatus_Single',
 'MonthlyIncome',
 'NumCompaniesWorked',
 'PerformanceRating',
 'RelationshipSat

In [49]:
#one_hot_encoding_df = one_hot_encoding_df.drop(['DistanceFromHome','Education','EmployeeNumber','JobInvolvement','PerformanceRating','RelationshipSatisfaction','TrainingTimesLastYear','WorkLifeBalance','YearsSinceLastPromotion','BusinessTravel_Non-Travel','BusinessTravel_Travel_Frequently','BusinessTravel_Travel_Rarely','Department_Human Resources','Department_Research & Development','Department_Sales','EducationField_Human Resources','EducationField_Life Sciences','EducationField_Marketing','EducationField_Medical','EducationField_Other','EducationField_Technical Degree','Gender_Female','Gender_Male','JobRole_Healthcare Representative','JobRole_Human Resources','JobRole_Laboratory Technician','JobRole_Manager','JobRole_Manufacturing Director','JobRole_Research Director','JobRole_Research Scientist','JobRole_Sales Executive','JobRole_Sales Representative','MaritalStatus_Divorced','MaritalStatus_Married','MaritalStatus_Single'],axis=1)
#one_df.info()

In [61]:
# MinMaxScaler

minmax = MinMaxScaler()
data_minmax = minmax.fit_transform(one_hot_encoding_df)

In [62]:
one_hot_encoding_df = pd.DataFrame(data_minmax)
#one_hot_encoding_df.columns = ['Attrition','Education','EnvironmentSatisfaction','JobInvolvement','JobLevel','JobSatisfaction','MonthlyIncome','RelationshipSatisfaction','StockOptionLevel','TotalWorkingYears','TrainingTimesLastYear','WorkLifeBalance','YearsAtCompany', 'YearsInCurrentRole','YearsSinceLastPromotion','YearsWithCurrManager','Department_Human Resources','Department_Research & Development','Department_Sales','EducationField_Human Resources','EducationField_Life Sciences','EducationField_Marketing','EducationField_Medical','EducationField_Other','EducationField_Technical Degree','JobRole_Healthcare Representative','JobRole_Human Resources','JobRole_Laboratory Technician','JobRole_Manager','JobRole_Manufacturing Director','JobRole_Research Director','JobRole_Research Scientist','JobRole_Sales Executive','JobRole_Sales Representative']
one_hot_encoding_df.columns = ['Age','Attrition','DistanceFromHome','Education','EmployeeNumber','EnvironmentSatisfaction','JobInvolvement','JobLevel','JobSatisfaction','MonthlyIncome','NumCompaniesWorked','PerformanceRating','RelationshipSatisfaction','StockOptionLevel','TotalWorkingYears','TrainingTimesLastYear','WorkLifeBalance','YearsAtCompany','YearsInCurrentRole','YearsSinceLastPromotion','YearsWithCurrManager','BusinessTravel_Non-Travel','BusinessTravel_Travel_Frequently','BusinessTravel_Travel_Rarely','Department_Human Resources','Department_Research & Development','Department_Sales','EducationField_Human Resources','EducationField_Life Sciences','EducationField_Marketing','EducationField_Medical','EducationField_Other','EducationField_Technical Degree','Gender_Female','Gender_Male','JobRole_Healthcare Representative','JobRole_Human Resources','JobRole_Laboratory Technician','JobRole_Manager','JobRole_Manufacturing Director','JobRole_Research Director','JobRole_Research Scientist','JobRole_Sales Executive','JobRole_Sales Representative','MaritalStatus_Divorced','MaritalStatus_Married','MaritalStatus_Single']
#one_hot_encoding_df = one_hot_encoding_df.drop(['serial'],axis=1)
#one_hot_encoding_df.info()

#one_hot_encoding_df.columns = ['Age','Attrition','EnvironmentSatisfaction','JobLevel','JobSatisfaction','MonthlyIncome','NumCompaniesWorked','StockOptionLevel','TotalWorkingYears','YearsAtCompany','YearsInCurrentRole','YearsWithCurrManager']   
one_hot_encoding_df


Unnamed: 0,Age,Attrition,DistanceFromHome,Education,EmployeeNumber,EnvironmentSatisfaction,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,...,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single
0,0.547619,1.0,0.000000,0.25,0.000000,0.333333,0.666667,0.25,1.000000,0.262454,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,0.738095,0.0,0.250000,0.00,0.000484,0.666667,0.333333,0.25,0.333333,0.217009,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.452381,1.0,0.035714,0.25,0.001451,1.000000,0.333333,0.00,0.666667,0.056925,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.357143,0.0,0.071429,0.75,0.001935,1.000000,0.666667,0.00,0.666667,0.100053,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,0.214286,0.0,0.035714,0.00,0.002903,0.000000,0.666667,0.00,0.333333,0.129489,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,0.428571,0.0,0.785714,0.25,0.996613,0.666667,1.000000,0.25,1.000000,0.082254,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1466,0.500000,0.0,0.178571,0.00,0.997097,1.000000,0.333333,0.50,0.000000,0.472986,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1467,0.214286,0.0,0.107143,0.50,0.998065,0.333333,1.000000,0.25,0.333333,0.270300,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1468,0.738095,0.0,0.035714,0.50,0.998549,1.000000,0.333333,0.25,0.333333,0.230700,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [37]:
importance_dict = {}
for col, importance in zip(train_X.columns, np.mean(np.array(avg_feature_importance), axis=0)):
    importance_dict[col] = importance

sorted(importance_dict.items(), key=lambda x: -x[1])[:10]

[('MonthlyIncome', 0.07986930503472876),
 ('Age', 0.07023974439272818),
 ('EmployeeNumber', 0.06139997489183877),
 ('TotalWorkingYears', 0.05758442111230984),
 ('DistanceFromHome', 0.056886962612780936),
 ('YearsAtCompany', 0.04769892346849075),
 ('NumCompaniesWorked', 0.039689572832535096),
 ('YearsWithCurrManager', 0.03686524283575744),
 ('EnvironmentSatisfaction', 0.034365518357885025),
 ('JobSatisfaction', 0.033839804363389195)]

In [38]:
## Data Splitting and Model Learning (Random Forest)
avg_acc = 0
avg_precision = 0
avg_recall = 0
avg_f1 = 0
avg_confusion_matrix = []
avg_feature_importance = []

kf = KFold(n_splits=5)
fold_count = 0
for train_index, test_index in kf.split(one_hot_encoding_df):
    print("Training Data: %d, Testing Data: %d" % (len(train_index), len(test_index)))
    train_X = one_hot_encoding_df.loc[train_index, one_hot_encoding_df.columns != 'Attrition']
    train_y = one_hot_encoding_df.loc[train_index]["Attrition"]
    test_X = one_hot_encoding_df.loc[test_index, one_hot_encoding_df.columns != 'Attrition']
    test_y = one_hot_encoding_df.loc[test_index]["Attrition"]
    
    #sm = SMOTE(random_state=42)
    #X_res, Y_res = sm.fit_resample(train_X, train_y)
  
    model = RandomForestClassifier(n_estimators=600)
    model = model.fit(train_X, train_y)
    #model = model.fit(X_res, Y_res)
    test_predict = model.predict(test_X)
       
    avg_feature_importance.append(model.feature_importances_)
    
    acc, precision, recall, f1, matrix = evaluation(test_y, test_predict)
    print("Fold: %d, Accuracy: %f, Precision: %f, Recall: %f, F1: %f" % (fold_count + 1, round(acc, 3), round(precision, 3), round(recall, 3), round(f1, 3)))
    avg_acc += acc
    avg_precision += precision
    avg_recall += recall
    avg_f1 += f1
    avg_confusion_matrix.append(matrix)
    fold_count += 1

print("=================================================================================")
print("Avg Accuracy: %f, Avg Precision: %f, Avg Recall: %f, Avg F1: %f" % (round(avg_acc / kf.get_n_splits(), 3), \
                                                                           round(avg_precision / kf.get_n_splits(), 3), \
                                                                           round(avg_recall / kf.get_n_splits(), 3), \
                                                                           round(avg_f1 / kf.get_n_splits(), 3)))

Training Data: 1176, Testing Data: 294
Fold: 1, Accuracy: 0.827000, Precision: 0.333000, Recall: 0.085000, F1: 0.136000
Training Data: 1176, Testing Data: 294
Fold: 2, Accuracy: 0.861000, Precision: 0.562000, Recall: 0.209000, F1: 0.305000
Training Data: 1176, Testing Data: 294
Fold: 3, Accuracy: 0.810000, Precision: 0.588000, Recall: 0.169000, F1: 0.263000
Training Data: 1176, Testing Data: 294
Fold: 4, Accuracy: 0.840000, Precision: 0.444000, Recall: 0.178000, F1: 0.254000
Training Data: 1176, Testing Data: 294
Fold: 5, Accuracy: 0.864000, Precision: 0.636000, Recall: 0.163000, F1: 0.259000
Avg Accuracy: 0.840000, Avg Precision: 0.513000, Avg Recall: 0.161000, Avg F1: 0.243000


In [64]:
## Data Splitting and Model Learning (SVM)
avg_acc = 0
avg_precision = 0
avg_recall = 0
avg_f1 = 0
avg_confusion_matrix = []

kf = KFold(n_splits=5)
fold_count = 0
for train_index, test_index in kf.split(one_hot_encoding_df):
    print("Training Data: %d, Testing Data: %d" % (len(train_index), len(test_index)))
    train_X = one_hot_encoding_df.loc[train_index, one_hot_encoding_df.columns != 'Attrition']
    train_y = one_hot_encoding_df.loc[train_index]["Attrition"]
    test_X = one_hot_encoding_df.loc[test_index, one_hot_encoding_df.columns != 'Attrition']
    test_y = one_hot_encoding_df.loc[test_index]["Attrition"]
    
    sm = SMOTE(random_state=24)
    X_res, Y_res = sm.fit_resample(train_X, train_y)

    model = SVC(C=2,kernel='linear',gamma=10,decision_function_shape='ovo')
    model = model.fit(train_X, train_y)
    #model = model.fit(X_res, Y_res)
    test_predict = model.predict(test_X)
       
    acc, precision, recall, f1, matrix = evaluation(test_y, test_predict)
    print("Fold: %d, Accuracy: %f, Precision: %f, Recall: %f, F1: %f" % (fold_count + 1, round(acc, 3), round(precision, 3), round(recall, 3), round(f1, 3)))
    avg_acc += acc
    avg_precision += precision
    avg_recall += recall
    avg_f1 += f1
    avg_confusion_matrix.append(matrix)
    fold_count += 1

print("=================================================================================")
print("Avg Accuracy: %f, Avg Precision: %f, Avg Recall: %f, Avg F1: %f" % (round(avg_acc / kf.get_n_splits(), 3), \
                                                                           round(avg_precision / kf.get_n_splits(), 3), \
                                                                           round(avg_recall / kf.get_n_splits(), 3), \
                                                                           round(avg_f1 / kf.get_n_splits(), 3)))


Training Data: 1176, Testing Data: 294
Fold: 1, Accuracy: 0.840000, Precision: 0.000000, Recall: 0.000000, F1: 0.000000
Training Data: 1176, Testing Data: 294
Fold: 2, Accuracy: 0.854000, Precision: 0.000000, Recall: 0.000000, F1: 0.000000
Training Data: 1176, Testing Data: 294
Fold: 3, Accuracy: 0.813000, Precision: 0.667000, Recall: 0.136000, F1: 0.225000
Training Data: 1176, Testing Data: 294
Fold: 4, Accuracy: 0.847000, Precision: 0.000000, Recall: 0.000000, F1: 0.000000
Training Data: 1176, Testing Data: 294
Fold: 5, Accuracy: 0.864000, Precision: 0.800000, Recall: 0.093000, F1: 0.167000
Avg Accuracy: 0.844000, Avg Precision: 0.293000, Avg Recall: 0.046000, Avg F1: 0.078000


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [65]:
## Data Splitting and Model Learning (OVERSAMPLING-XGBOOST)
avg_acc = 0
avg_precision = 0
avg_recall = 0
avg_f1 = 0
avg_confusion_matrix = []

kf = KFold(n_splits=5)
fold_count = 0
for train_index, test_index in kf.split(one_hot_encoding_df):
    print("Training Data: %d, Testing Data: %d" % (len(train_index), len(test_index)))
    train_X = one_hot_encoding_df.loc[train_index, one_hot_encoding_df.columns != 'Attrition']
    train_y = one_hot_encoding_df.loc[train_index]["Attrition"]
    test_X = one_hot_encoding_df.loc[test_index, one_hot_encoding_df.columns != 'Attrition']
    test_y = one_hot_encoding_df.loc[test_index]["Attrition"]
    
    #ros = RandomOverSampler(random_state=24)
    #X_res, Y_res = ros.fit_resample(train_X, train_y)
    sm = SMOTE(random_state=20)
    X_res, Y_res = sm.fit_resample(train_X, train_y)
   
    xgb_model = XGBClassifier()
    #xgb_model.fit(train_X, train_y)
    xgb_model.fit(X_res, Y_res)
    
    y_pred = xgb_model.predict(test_X)
    accuracy_score(test_y,y_pred)
    test_predict = xgb_model.predict(test_X)
    #conf_matrix(test_y,y_pred)   
    acc, precision, recall, f1, matrix = evaluation(test_y, test_predict)
    print("Fold: %d, Accuracy: %f, Precision: %f, Recall: %f, F1: %f" % (fold_count + 1, round(acc, 3), round(precision, 3), round(recall, 3), round(f1, 3)))
    avg_acc += acc
    avg_precision += precision
    avg_recall += recall
    avg_f1 += f1
    avg_confusion_matrix.append(matrix)
    fold_count += 1

print("=================================================================================")
print("Avg Accuracy: %f, Avg Precision: %f, Avg Recall: %f, Avg F1: %f" % (round(avg_acc / kf.get_n_splits(), 3), \
                                                                           round(avg_precision / kf.get_n_splits(), 3), \
                                                                           round(avg_recall / kf.get_n_splits(), 3), \
                                                                           round(avg_f1 / kf.get_n_splits(), 3)))

Training Data: 1176, Testing Data: 294
Fold: 1, Accuracy: 0.844000, Precision: 0.519000, Recall: 0.298000, F1: 0.378000
Training Data: 1176, Testing Data: 294
Fold: 2, Accuracy: 0.874000, Precision: 0.594000, Recall: 0.442000, F1: 0.507000
Training Data: 1176, Testing Data: 294
Fold: 3, Accuracy: 0.799000, Precision: 0.500000, Recall: 0.305000, F1: 0.379000
Training Data: 1176, Testing Data: 294
Fold: 4, Accuracy: 0.861000, Precision: 0.583000, Recall: 0.311000, F1: 0.406000
Training Data: 1176, Testing Data: 294
Fold: 5, Accuracy: 0.867000, Precision: 0.625000, Recall: 0.233000, F1: 0.339000
Avg Accuracy: 0.849000, Avg Precision: 0.564000, Avg Recall: 0.318000, Avg F1: 0.402000
