In [31]:
from sklearn.naive_bayes import GaussianNB
import pandas as pd
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

In [32]:
diabetes = pd.read_csv("diabetes.csv")
X = diabetes.drop("Outcome", axis=1)
y = diabetes["Outcome"]

In [33]:
X_train, X_test, y_train, y_test = train_test_split(
X, y, random_state=42, test_size=0.2, shuffle=True)

In [34]:
gaussNB = GaussianNB()
gaussNB.fit(X_train, y_train)
cross_val_score(gaussNB, X_train, y_train, cv=3, scoring="accuracy")

array([0.77073171, 0.74634146, 0.7254902 ])

In [35]:
y_pred = gaussNB.predict(X_train)
conf_mx_1 = confusion_matrix(y_train, y_pred)
conf_mx_1

array([[338,  63],
       [ 89, 124]], dtype=int64)

In [36]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [37]:
gaussNB = GaussianNB()
gaussNB.fit(X_train_scaled, y_train)
cross_val_score(gaussNB, X_train_scaled, y_train, cv=3, scoring="accuracy")

array([0.77073171, 0.74634146, 0.7254902 ])

In [38]:
y_pred = gaussNB.predict(X_train_scaled)
conf_mx_2 = confusion_matrix(y_train, y_pred)
conf_mx_2

array([[338,  63],
       [ 89, 124]], dtype=int64)

In [39]:
params = [
    {"var_smoothing": [1e-3, 1e-2, 2e-3, 2e-2]}
]

grid_search = GridSearchCV(gaussNB, params, cv=3, 
                          scoring="neg_mean_squared_error",
                          return_train_score=True)

In [50]:
grid_search.fit(X_train_scaled, y_train)
best_estimator = grid_search.best_estimator_

best_estimator.fit(X_train_scaled, y_train)
cross_val_score(best_estimator, X_train_scaled, y_train, cv=3, scoring="accuracy")

array([0.77560976, 0.74634146, 0.73039216])

In [51]:
y_pred = best_estimator.predict(X_train_scaled)
conf_mx_3 = confusion_matrix(y_train, y_pred),
conf_mx_3

(array([[339,  62],
        [ 90, 123]], dtype=int64),)

In [48]:
grid_search.fit(X_train, y_train)
best_estimator = grid_search.best_estimator_

best_estimator.fit(X_train, y_train)
cross_val_score(best_estimator, X_train, y_train, cv=3, scoring="accuracy")

array([0.76585366, 0.74634146, 0.72058824])

In [49]:
y_pred = best_estimator.predict(X_train)
conf_mx_4 = confusion_matrix(y_train, y_pred)
conf_mx_4

array([[347,  54],
       [ 99, 114]], dtype=int64)

In [42]:
conf_mx_1 - conf_mx_2

array([[0, 0],
       [0, 0]], dtype=int64)

In [52]:
conf_mx_2 - conf_mx_3

array([[[-1,  1],
        [-1,  1]]], dtype=int64)

In [53]:
conf_mx_2 - conf_mx_4

array([[ -9,   9],
       [-10,  10]], dtype=int64)