In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv("/Users/varun/Desktop/VS CODE/ML group project'/WA_Fn-UseC_-HR-Employee-Attrition.csv")
print(df.head())

   Age Attrition     BusinessTravel  DailyRate              Department  \
0   41       Yes      Travel_Rarely       1102                   Sales   
1   49        No  Travel_Frequently        279  Research & Development   
2   37       Yes      Travel_Rarely       1373  Research & Development   
3   33        No  Travel_Frequently       1392  Research & Development   
4   27        No      Travel_Rarely        591  Research & Development   

   DistanceFromHome  Education EducationField  EmployeeCount  EmployeeNumber  \
0                 1          2  Life Sciences              1               1   
1                 8          1  Life Sciences              1               2   
2                 2          2          Other              1               4   
3                 3          4  Life Sciences              1               5   
4                 2          1        Medical              1               7   

   ...  RelationshipSatisfaction StandardHours  StockOptionLevel  \
0  ...

In [3]:
for column in df.columns:
    print(f"{column}: Number of unique values {df[column].nunique()}")
    print("==========================================================")

Age: Number of unique values 43
Attrition: Number of unique values 2
BusinessTravel: Number of unique values 3
DailyRate: Number of unique values 886
Department: Number of unique values 3
DistanceFromHome: Number of unique values 29
Education: Number of unique values 5
EducationField: Number of unique values 6
EmployeeCount: Number of unique values 1
EmployeeNumber: Number of unique values 1470
EnvironmentSatisfaction: Number of unique values 4
Gender: Number of unique values 2
HourlyRate: Number of unique values 71
JobInvolvement: Number of unique values 4
JobLevel: Number of unique values 5
JobRole: Number of unique values 9
JobSatisfaction: Number of unique values 4
MaritalStatus: Number of unique values 3
MonthlyIncome: Number of unique values 1349
MonthlyRate: Number of unique values 1427
NumCompaniesWorked: Number of unique values 10
Over18: Number of unique values 1
OverTime: Number of unique values 2
PercentSalaryHike: Number of unique values 15
PerformanceRating: Number of uni

In [4]:
df.drop(['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours'], axis="columns", inplace=True)

In [5]:
categorical_col = []
for column in df.columns:
    if df[column].dtype == object and len(df[column].unique()) <= 50:
        categorical_col.append(column)
        
df['Attrition'] = df.Attrition.astype("category").cat.codes

In [6]:
label = LabelEncoder()
for column in categorical_col:
    df[column] = label.fit_transform(df[column])

In [7]:
df.drop_duplicates(inplace=True)

In [8]:
df

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1,2,1102,2,1,2,1,2,0,...,3,1,0,8,0,1,6,4,0,5
1,49,0,1,279,1,8,1,1,3,1,...,4,4,1,10,3,3,10,7,1,7
2,37,1,2,1373,1,2,2,4,4,1,...,3,2,0,7,3,3,0,0,0,0
3,33,0,1,1392,1,3,4,1,4,0,...,3,3,0,8,3,3,8,7,3,0
4,27,0,2,591,1,2,1,3,1,1,...,3,4,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,0,1,884,1,23,2,3,3,1,...,3,3,1,17,3,3,5,2,0,3
1466,39,0,2,613,1,6,1,3,4,1,...,3,1,1,9,5,3,7,7,1,7
1467,27,0,2,155,1,4,3,1,2,1,...,4,2,1,6,0,3,6,2,0,3
1468,49,0,1,1023,2,2,3,3,4,1,...,3,4,0,17,3,2,9,6,0,8


In [9]:
X = df.drop('Attrition', axis=1)
y = df.Attrition
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
X_train = MinMaxScaler().fit_transform(X_train)
X_test = MinMaxScaler().fit_transform(X_test)

In [11]:
model = LogisticRegression()
model.fit(X_train,y_train)
print(classification_report(y_test,model.predict(X_test)))
print("accuracy_score: ",accuracy_score(y_test,model.predict(X_test)))

              precision    recall  f1-score   support

           0       0.90      0.99      0.94       255
           1       0.77      0.26      0.38        39

    accuracy                           0.89       294
   macro avg       0.83      0.62      0.66       294
weighted avg       0.88      0.89      0.87       294

accuracy_score:  0.891156462585034


In [12]:
model1 = DecisionTreeClassifier()
model1.fit(X_train,y_train)
print(classification_report(y_test,model1.predict(X_test)))
print("accuracy_score: ",accuracy_score(y_test,model1.predict(X_test)))

              precision    recall  f1-score   support

           0       0.87      0.87      0.87       255
           1       0.17      0.18      0.18        39

    accuracy                           0.78       294
   macro avg       0.52      0.53      0.52       294
weighted avg       0.78      0.78      0.78       294

accuracy_score:  0.7789115646258503


In [13]:
model2 = RandomForestClassifier()
model2.fit(X_train,y_train)
print(classification_report(y_test,model2.predict(X_test)))
print("accuracy_score: ",accuracy_score(y_test,model2.predict(X_test)))

              precision    recall  f1-score   support

           0       0.88      0.99      0.93       255
           1       0.67      0.10      0.18        39

    accuracy                           0.87       294
   macro avg       0.77      0.55      0.55       294
weighted avg       0.85      0.87      0.83       294

accuracy_score:  0.8741496598639455


In [14]:
model3 = GaussianNB()
model3.fit(X_train,y_train)
print(classification_report(y_test,model3.predict(X_test)))
print("accuracy_score: ",accuracy_score(y_test,model3.predict(X_test)))

              precision    recall  f1-score   support

           0       0.93      0.89      0.91       255
           1       0.44      0.59      0.51        39

    accuracy                           0.85       294
   macro avg       0.69      0.74      0.71       294
weighted avg       0.87      0.85      0.86       294

accuracy_score:  0.8469387755102041


In [15]:
model4 = SVC(kernel='rbf')
model4.fit(X_train,y_train)
print(classification_report(y_test,model4.predict(X_test)))
print("accuracy_score: ",accuracy_score(y_test,model4.predict(X_test)))

              precision    recall  f1-score   support

           0       0.88      1.00      0.94       255
           1       1.00      0.13      0.23        39

    accuracy                           0.88       294
   macro avg       0.94      0.56      0.58       294
weighted avg       0.90      0.88      0.84       294

accuracy_score:  0.8843537414965986


In [16]:
model4 = SVC(kernel='linear')
model4.fit(X_train,y_train)
print(classification_report(y_test,model4.predict(X_test)))
print("accuracy_score: ",accuracy_score(y_test,model4.predict(X_test)))

              precision    recall  f1-score   support

           0       0.89      1.00      0.94       255
           1       0.88      0.18      0.30        39

    accuracy                           0.89       294
   macro avg       0.88      0.59      0.62       294
weighted avg       0.89      0.89      0.85       294

accuracy_score:  0.8877551020408163
