In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,precision_score,recall_score,f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB


In [2]:
df=pd.read_csv("ET.csv")
df.head()

Unnamed: 0,Job_Satisfaction,Performance_Rating,Years_At_Company,Work_Life_Balance,Distance_From_Home,Monthly_Income,Education_Level,Age,Num_Companies_Worked,Employee_Role,Annual_Bonus,Training_Hours,Department,Annual_Bonus_Squared,Annual_Bonus_Training_Hours_Interaction,Employee_Turnover
0,0.562326,0.141129,0.123989,0.347583,0.330353,0.328853,0.600933,0.31599,0.768736,0.090671,0.324786,0.669193,0.602932,0.105486,0.217344,0
1,0.017041,0.559047,0.511203,0.793908,0.42355,0.55345,0.742009,0.897146,0.380035,0.601633,0.694611,0.043271,0.800761,0.482484,0.030056,0
2,0.774699,0.604371,0.798174,0.2605,0.804034,0.1318,0.775178,0.830947,0.218726,0.972936,0.153476,0.701336,0.705275,0.023555,0.107638,1
3,0.628174,0.385249,0.230104,0.516809,0.272248,0.589249,0.482409,0.090507,0.402746,0.132842,0.305973,0.549688,0.600531,0.09362,0.16819,0
4,0.799183,0.199967,0.839029,0.247927,0.341934,0.076818,0.055356,0.68086,0.923341,0.493017,0.844094,0.793751,0.664679,0.712494,0.67,0


In [3]:
df=df.drop(columns=["Training_Hours","Annual_Bonus"])


## USE Naive Bayes

In [4]:
X=df.drop("Employee_Turnover",axis=1)
y=df["Employee_Turnover"]

# train and test data split 

X_train,X_test,y_train,y_test=train_test_split(
    X,y,test_size=0.2,random_state=42
)

# # LogisticRegression model use for the ML
gnb_model=GaussianNB()
gnb_model.fit(X_train,y_train)

y_pred=gnb_model.predict(X_test)

# print the scores
print("accuracy :",accuracy_score(y_test,y_pred)*100,"%")
print("preciision:",precision_score(y_test,y_pred)*100,"%")
print("recall score :" ,recall_score(y_test,y_pred)*100,"%")



accuracy : 84.07407407407408 %
preciision: 83.60655737704919 %
recall score : 81.6 %


## USE KNN

In [5]:
X=df.drop("Employee_Turnover",axis=1)
y=df["Employee_Turnover"]

# train and test data split 

X_train,X_test,y_train,y_test=train_test_split(
    X,y,test_size=0.2,random_state=42
)

scaler=StandardScaler()
X_train_scaled=scaler.fit_transform(X_train)
X_test_scaled=scaler.transform(X_test)

# # KNN model use for the ML
knn_model=KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train_scaled,y_train)

y_pred=knn_model.predict(X_test_scaled)

# print the scores
print("accuracy :",accuracy_score(y_test,y_pred)*100,"%")
print("preciision:",precision_score(y_test,y_pred)*100,"%")
print("recall score :" ,recall_score(y_test,y_pred)*100,"%")



accuracy : 79.25925925925927 %
preciision: 78.99159663865547 %
recall score : 75.2 %


## validation method use

In [6]:
X=df.drop("Employee_Turnover",axis=1)
y=df["Employee_Turnover"]

# train and test data split 

X_train,X_test,y_train,y_test=train_test_split(
    X,y,test_size=0.2,random_state=42
)

scaler=StandardScaler()
X_train_scaled=scaler.fit_transform(X_train)
X_test_scaled=scaler.transform(X_test)

# perameters define forthe cross validation
classifier =KNeighborsClassifier()
param_grid={"n_neighbors":[3,5,7,9]}

# use the gridsearchcv for cross validation
classifierCV=GridSearchCV(
    classifier,
    param_grid,
    cv=5,
    scoring="recall"
    
)

y_pred=knn_model.predict(X_test_scaled)

# print the scores
print("accuracy :",accuracy_score(y_test,y_pred)*100,"%")
print("preciision:",precision_score(y_test,y_pred)*100,"%")
print("recall score :" ,recall_score(y_test,y_pred)*100,"%")



accuracy : 79.25925925925927 %
preciision: 78.99159663865547 %
recall score : 75.2 %


## Pipeline

In [8]:
# cross validatioon for hyperparameters tunning using gridsearch

X=df.drop("Employee_Turnover",axis=1)
y=df["Employee_Turnover"]

X_train,X_test,y_train,y_test=train_test_split(
    X,y,test_size=0.2,random_state=42
)

# create pipeline
pipeline = Pipeline([
        ('scaler',StandardScaler()),
        ('knn',KNeighborsClassifier())
])

                                     
param_grid={"knn__n_neighbors":[3,5,7,9]}

classifierCV=GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring="recall"
    
)

classifierCV.fit(X_train,y_train)

y_pred=classifierCV.predict(X_test)

# print the scores
print("accuracy :",accuracy_score(y_test,y_pred)*100,"%")
print("preciision:",precision_score(y_test,y_pred)*100,"%")
print("recall score :" ,recall_score(y_test,y_pred)*100,"%")

# print the result
res=pd.DataFrame(classifierCV.cv_results_)
print("the best parameter according to the validation :",classifierCV.best_params_)


accuracy : 77.4074074074074 %
preciision: 77.58620689655173 %
recall score : 72.0 %
the best parameter according to the validation : {'knn__n_neighbors': 7}
