Q1 (Gaussian Naïve Bayes Classifier) Implement a Gaussian Naïve Bayes
classifier on the Iris dataset from sklearn.datasets using:

(i) Step-by-step implementation

(ii) In-built function

In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Load the Iris dataset and pull out the feature matrix and the target vector.
data = load_iris()
X = data.data
y = data.target

# Hold out 30% of the samples for evaluation. The fixed random_state makes the
# split repeatable and stratify=y keeps the class proportions in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

def fit_gaussian_nb(X,y,var_smoothing=1e-9):
    """Estimate the parameters of a Gaussian Naive Bayes model.

    For every distinct label in ``y`` this computes the per-feature mean and
    (population) variance of the matching rows of ``X`` and the class prior,
    i.e. the fraction of samples carrying that label.

    Parameters
    ----------
    X : array-like
        Training samples, one row per sample. Lists are accepted and are
        converted to a NumPy array.
    y : array-like
        Class label of each row of ``X``.
    var_smoothing : float, default 1e-9
        Constant added to every variance so that a feature that is constant
        within a class does not lead to a division by zero when the
        likelihood is evaluated. The default reproduces the previously
        hard-coded value.

    Returns
    -------
    tuple
        ``(classes, means, vars_, priors)`` where ``classes`` is the sorted
        array of unique labels and the other three are dicts keyed by label.

    Raises
    ------
    ValueError
        If ``var_smoothing`` is negative.
    """
    if var_smoothing < 0:
        raise ValueError("var_smoothing must be non-negative")

    # Coerce once so boolean-mask indexing below also works for plain lists.
    X = np.asarray(X)
    y = np.asarray(y)

    classes = np.unique(y)
    n_samples = X.shape[0]
    means = {}
    vars_ = {}
    priors = {}
    for c in classes:
        Xc = X[y == c]  # rows that belong to class c
        means[c] = Xc.mean(axis=0)
        vars_[c] = Xc.var(axis=0) + var_smoothing
        priors[c] = Xc.shape[0] / n_samples
    return classes, means, vars_, priors

def gaussian_log_likelihood(x,mean,var):
    """Log-density of ``x`` under independent per-feature Gaussians.

    Computes ``-0.5 * sum(log(2*pi*var)) - 0.5 * sum((x - mean)**2 / var)``
    with the sums taken over the last (feature) axis.

    Parameters
    ----------
    x : array-like
        A single sample (1-D) or a batch of samples (2-D, one row per sample).
    mean : array-like
        Per-feature means, broadcastable against ``x``.
    var : array-like
        Per-feature variances (must be positive), broadcastable against ``x``.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        A scalar for a single sample; for a 2-D batch, one log-likelihood per
        row.
    """
    x = np.asarray(x, dtype=float)
    mean = np.asarray(mean, dtype=float)
    var = np.asarray(var, dtype=float)

    # atleast_1d keeps scalar inputs working with the axis=-1 reductions.
    log_norm_terms = np.atleast_1d(np.log(2 * np.pi * var))
    scaled_sq_dev = np.atleast_1d(((x - mean) ** 2) / var)

    # Reduce over features only, so a batch of rows yields one value per row.
    return -0.5 * np.sum(log_norm_terms, axis=-1) - 0.5 * np.sum(scaled_sq_dev, axis=-1)

def predict_gaussian_nb(X,model):
    """Predict a class label for every sample in ``X``.

    Each sample is scored against every class with
    ``gaussian_log_likelihood(x, mean_c, var_c) + log(prior_c)`` and assigned
    the class with the highest score (the first one on ties, as ``np.argmax``
    returns the first maximum).

    Parameters
    ----------
    X : array-like
        Samples to classify, one per row. Converted with ``np.asarray`` so
        that iteration always walks over rows.
    model : tuple
        ``(classes, means, vars_, priors)`` as returned by ``fit_gaussian_nb``.

    Returns
    -------
    numpy.ndarray
        Predicted label for each sample, in input order.
    """
    classes, means, vars_, priors = model

    # The log-priors do not depend on the sample, so compute them once
    # instead of once per (sample, class) pair.
    log_priors = {c: np.log(priors[c]) for c in classes}

    y_pred = []
    for x in np.asarray(X):
        scores = [
            gaussian_log_likelihood(x, means[c], vars_[c]) + log_priors[c]
            for c in classes
        ]
        y_pred.append(classes[np.argmax(scores)])
    return np.array(y_pred)

# Fit the hand-written classifier on the training split, label the held-out
# samples, and print the fraction that were classified correctly.
model = fit_gaussian_nb(X_train, y_train)
y_pred_manual = predict_gaussian_nb(X_test, model)
print(accuracy_score(y_true=y_test, y_pred=y_pred_manual))

0.9111111111111111


In [2]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Same data and the same stratified 70/30 split as the step-by-step version,
# so the two accuracies can be compared directly.
data = load_iris()
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# scikit-learn's built-in Gaussian Naive Bayes; fit() returns the estimator itself.
clf = GaussianNB().fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred))


0.9111111111111111


Q2 Explore the GridSearchCV tool in scikit-learn. This is a tool that is
often used for tuning hyperparameters of machine learning models. Use
this tool to find the best value of K for a K-NN classifier using any dataset.

In [7]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Data: Iris features/labels with a stratified, reproducible 70/30 split.
iris = load_iris()
X, y = iris.data, iris.target

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Search space: every neighbourhood size K from 1 to 30.
knn = KNeighborsClassifier()
param_grid = {"n_neighbors": list(range(1, 31))}

# 5-fold cross-validated grid search scored by accuracy; training-fold scores
# are kept as well so they can be shown next to the validation scores.
grid = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    return_train_score=True,
)
grid.fit(X_train, y_train)

best_k = grid.best_params_["n_neighbors"]
best_cv = grid.best_score_

# Evaluate the selected model on the held-out test split.
y_pred = grid.predict(X_test)
test_acc = accuracy_score(y_test, y_pred)

# Keep only the columns of interest and give them short, readable names.
results = (
    pd.DataFrame(grid.cv_results_)
    .loc[:, ["param_n_neighbors", "mean_test_score", "std_test_score", "mean_train_score"]]
    .rename(
        columns={
            "param_n_neighbors": "k",
            "mean_test_score": "cv_mean_accuracy",
            "std_test_score": "cv_std",
            "mean_train_score": "train_mean_accuracy",
        }
    )
)

print("Best K:", best_k)
print("Best CV Accuracy:", round(best_cv, 4))
print("Test Accuracy:", round(test_acc, 4))
print(results.head())



Best K: 9
Best CV Accuracy: 0.981
Test Accuracy: 0.9556
   k  cv_mean_accuracy    cv_std  train_mean_accuracy
0  1          0.961905  0.019048             1.000000
1  2          0.961905  0.019048             0.988095
2  3          0.961905  0.019048             0.966667
3  4          0.933333  0.038095             0.966667
4  5          0.942857  0.035635             0.973810
