## Which is the best model: LR,KNN,GaussianNB?

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,log_loss
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV,StratifiedKFold,KFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
import warnings as ws
ws.filterwarnings("ignore")

In [2]:
hr=pd.read_csv("HR_comma_sep.csv")

In [3]:
hr.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.1,0.77,6,247,4,0,1,0,sales,low
3,0.92,0.85,5,259,5,0,1,0,sales,low
4,0.89,1.0,5,224,5,0,1,0,sales,low


In [4]:
hr=pd.get_dummies(hr,drop_first=True)
hr.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department_RandD,Department_accounting,Department_hr,Department_management,Department_marketing,Department_product_mng,Department_sales,Department_support,Department_technical,salary_low,salary_medium
0,0.38,0.53,2,157,3,0,1,0,False,False,False,False,False,False,True,False,False,True,False
1,0.8,0.86,5,262,6,0,1,0,False,False,False,False,False,False,True,False,False,False,True
2,0.1,0.77,6,247,4,0,1,0,False,False,False,False,False,False,True,False,False,True,False
3,0.92,0.85,5,259,5,0,1,0,False,False,False,False,False,False,True,False,False,True,False
4,0.89,1.0,5,224,5,0,1,0,False,False,False,False,False,False,True,False,False,True,False


In [5]:
X=hr.drop("left",axis=1)
y=hr['left']

## Logistic Regression

In [6]:
lr=LogisticRegression()

In [7]:
kfold=StratifiedKFold(n_splits=5,shuffle=True,random_state=23)

In [8]:
params={'penalty':['l1','l2','elasticnet',None],
        'solver':['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']}

In [9]:
gcv=GridSearchCV(lr,param_grid=params,cv=kfold,scoring='neg_log_loss')

In [10]:
gcv.fit(X,y)

In [11]:
print("best parameter :",gcv.best_params_)
print("best score :",gcv.best_score_)

best parameter : {'penalty': 'l2', 'solver': 'newton-cholesky'}
best score : -0.429305155426844


## Gaussian NB

In [12]:
nb=GaussianNB()

In [13]:
params={'var_smoothing':np.linspace(0,1,20)}

In [14]:
gcv=GridSearchCV(nb,param_grid=params,cv=kfold,scoring='neg_log_loss')
gcv.fit(X,y)

In [15]:
print("best parameter :",gcv.best_params_)
print("best score :",gcv.best_score_)

best parameter : {'var_smoothing': 0.05263157894736842}
best score : -0.5214781243205817


## KNN

In [16]:
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.pipeline import Pipeline

In [17]:
scaler=StandardScaler()
knn=KNeighborsClassifier()
pipe=Pipeline([('SCL',scaler),('KNN',knn)])

In [18]:
params={'KNN__n_neighbors':np.arange(1,51),'SCL':[StandardScaler(),MinMaxScaler()]}

In [19]:
gcv=GridSearchCV(pipe,param_grid=params,cv=kfold,scoring='neg_log_loss')
gcv.fit(X,y)

In [20]:
print("best parameter :",gcv.best_params_)
print("best score :",gcv.best_score_)

best parameter : {'KNN__n_neighbors': 50, 'SCL': StandardScaler()}
best score : -0.3150312991191325


## **Conclusion**

KNN is the best model because log loss is less and nearer to zero among the three