In [766]:
#importing libraries and packages and raw data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.preprocessing import LabelEncoder
import seaborn as sb
data=pd.read_excel("http://data.iabac.org/exam/p2/data/INX_Future_Inc_Employee_Performance_CDS_Project2_Data_V1.8.xls")
data.head()

Unnamed: 0,EmpNumber,Age,Gender,EducationBackground,MaritalStatus,EmpDepartment,EmpJobRole,BusinessTravelFrequency,DistanceFromHome,EmpEducationLevel,...,EmpRelationshipSatisfaction,TotalWorkExperienceInYears,TrainingTimesLastYear,EmpWorkLifeBalance,ExperienceYearsAtThisCompany,ExperienceYearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition,PerformanceRating
0,E1001000,32,Male,Marketing,Single,Sales,Sales Executive,Travel_Rarely,10,3,...,4,10,2,2,10,7,0,8,No,3
1,E1001006,47,Male,Marketing,Single,Sales,Sales Executive,Travel_Rarely,14,4,...,4,20,2,3,7,7,1,7,No,3
2,E1001007,40,Male,Life Sciences,Married,Sales,Sales Executive,Travel_Frequently,5,4,...,3,20,2,3,18,13,1,12,No,4
3,E1001009,41,Male,Human Resources,Divorced,Human Resources,Manager,Travel_Rarely,10,4,...,2,23,2,2,21,6,12,6,No,3
4,E1001010,60,Male,Marketing,Single,Sales,Sales Executive,Travel_Rarely,16,4,...,4,10,1,3,2,2,2,2,No,3


In [767]:
#listing out the data columns
data.columns

Index(['EmpNumber', 'Age', 'Gender', 'EducationBackground', 'MaritalStatus',
       'EmpDepartment', 'EmpJobRole', 'BusinessTravelFrequency',
       'DistanceFromHome', 'EmpEducationLevel', 'EmpEnvironmentSatisfaction',
       'EmpHourlyRate', 'EmpJobInvolvement', 'EmpJobLevel',
       'EmpJobSatisfaction', 'NumCompaniesWorked', 'OverTime',
       'EmpLastSalaryHikePercent', 'EmpRelationshipSatisfaction',
       'TotalWorkExperienceInYears', 'TrainingTimesLastYear',
       'EmpWorkLifeBalance', 'ExperienceYearsAtThisCompany',
       'ExperienceYearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager', 'Attrition', 'PerformanceRating'],
      dtype='object')

In [768]:
#EmpDepartment feature label encoding
le = LabelEncoder()
data.loc[:,"EmpDepartment"] = le.fit_transform(data.loc[:,"EmpDepartment"])
le.classes_


array(['Data Science', 'Development', 'Finance', 'Human Resources',
       'Research & Development', 'Sales'], dtype=object)

In [769]:
#selecting data without outliers in TotalWorkExperienceInYears feature
data=data[data.TotalWorkExperienceInYears < 30]

In [770]:
#selecting data without outliers in EmpLastSalaryHikePercent feature
data=data[data.EmpLastSalaryHikePercent < 20]

In [771]:
#selecting the X variables for modeling
X=data.loc[:,["EmpDepartment","EmpJobLevel","EmpJobInvolvement","EmpLastSalaryHikePercent","EmpJobSatisfaction","EmpEnvironmentSatisfaction","EmpRelationshipSatisfaction","TotalWorkExperienceInYears","ExperienceYearsInCurrentRole","YearsSinceLastPromotion"]]
X.head()

Unnamed: 0,EmpDepartment,EmpJobLevel,EmpJobInvolvement,EmpLastSalaryHikePercent,EmpJobSatisfaction,EmpEnvironmentSatisfaction,EmpRelationshipSatisfaction,TotalWorkExperienceInYears,ExperienceYearsInCurrentRole,YearsSinceLastPromotion
0,5,2,3,12,4,4,4,10,7,0
1,5,2,3,12,1,4,4,20,7,1
3,3,5,2,15,4,2,2,23,6,12
4,5,2,3,14,1,1,4,10,2,2
6,5,1,3,15,2,4,4,4,2,2


In [772]:
#selecting the Target variable for modeling
y = data["PerformanceRating"]

In [789]:
#splitting the data by using train_test_split method
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=.2, random_state = 10)
X_train.to_excel('X_train_emp_per.xlsx',sheet_name='sheet1', index=False)
X_test.to_excel('X_test_emp_per.xlsx',sheet_name='sheet1', index=False)
y_train.to_excel('y_train_emp_per.xlsx',sheet_name='sheet1', index=False)
y_test.to_excel('y_test_emp_per.xlsx',sheet_name='sheet1', index=False)

In [790]:
#fitting into model
model=RandomForestClassifier()
model.fit(X_train,y_train)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [791]:
#predicting the model,checking accuracy score and confusion matrix
predictions = model.predict(X_test)
print(confusion_matrix(y_test, predictions))
print(accuracy_score(y_test, predictions))

[[ 29   4   0]
 [  1 160   0]
 [  0   3   0]]
0.9593908629441624


In [792]:
#importing joblib library
from sklearn.externals import joblib

In [793]:
#dumping model into a .ml file
joblib.dump(model, "model_emp_per.ml")

['model_emp_per.ml']