In [54]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.metrics import classification_report, roc_auc_score

In [8]:
df = pd.read_csv("Cases/Cancer/Cancer.csv", index_col = 0)

In [10]:
df.head()

Unnamed: 0_level_0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,Class
subjid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,40-49,premeno,15 to 19,0 to 2,yes,three,right,left_up,no,recurrence-events
2,50-59,ge40,15 to 19,0 to 2,no,one,right,central,no,no-recurrence-events
3,50-59,ge40,35 to 39,0 to 2,no,two,left,left_low,no,recurrence-events
4,40-49,premeno,35 to 39,0 to 2,yes,three,right,left_low,yes,no-recurrence-events
5,40-49,premeno,30 to 34,3 to 5,yes,two,left,right_up,no,recurrence-events


In [32]:
X = df.drop('Class', axis = 1)
y = df['Class']
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 24, test_size = .3)
ohe = OneHotEncoder(sparse_output = False, handle_unknown = 'ignore').set_output(transform = 'pandas')
nb = BernoulliNB()

In [34]:
pipe = Pipeline([('OHE', ohe), ('NB', nb)])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print(classification_report(y_test, y_pred))

                      precision    recall  f1-score   support

no-recurrence-events       0.84      0.81      0.82        69
   recurrence-events       0.32      0.35      0.33        17

            accuracy                           0.72        86
           macro avg       0.58      0.58      0.58        86
        weighted avg       0.73      0.72      0.73        86



In [40]:
y_pred_prob = pipe.predict_proba(X_test)[:,1]
roc_auc_score(y_test, y_pred_prob)

0.6730605285592498

In [48]:
kfold = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 24)
params = {'NB__alpha': np.linspace(0.0001, 3, 10)}
gcv = GridSearchCV(pipe, param_grid = params,cv = kfold, scoring = 'roc_auc')
gcv.fit(X,y)
print(gcv.best_params_, gcv.best_score_, sep = '\n')

{'NB__alpha': 2.6666777777777777}
0.7194010043041608


# Kernal naive bayes
- using hr dataset

In [66]:
df = pd.read_csv("Cases/human-resources-analytics/HR_comma_sep.csv")
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.1,0.77,6,247,4,0,1,0,sales,low
3,0.92,0.85,5,259,5,0,1,0,sales,low
4,0.89,1.0,5,224,5,0,1,0,sales,low


In [70]:
X = df.drop('left', axis = 1)
y = df['left']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 24, test_size = .3)
nbk = GaussianNB()
pipe = Pipeline([('OHE', ohe), ('NBK', nbk)])
pipe = pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print(classification_report(y_test, y_pred))

y_pred_proba = pipe.predict_proba(X_test)[:,1]
print(roc_auc_score(y_test, y_pred_proba))

              precision    recall  f1-score   support

           0       0.99      0.69      0.81      3445
           1       0.49      0.97      0.65      1054

    accuracy                           0.75      4499
   macro avg       0.74      0.83      0.73      4499
weighted avg       0.87      0.75      0.77      4499

0.8333627097545324


In [74]:
kfold = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 24)
params = {'NBK__var_smoothing': np.linspace(1e-15, 1, 10)}
gcv = GridSearchCV(pipe, cv = kfold, param_grid = params, scoring = 'roc_auc')
gcv.fit(X,y)

In [76]:
print(gcv.best_params_, gcv.best_score_, sep = '\n')

{'NBK__var_smoothing': 0.111111111111112}
0.9654732081687083
