1. (Gaussian Naïve Bayes Classifier) Implement Gaussian Naïve Bayes
Classifier on the Iris dataset from sklearn.datasets using
(i) Step-by-step implementation
(ii) In-built function

In [None]:
#1(i)
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Load the Iris bunch and pull out the feature matrix and the label vector.
iris_bunch = datasets.load_iris()
flowers_data, flowers_labels = iris_bunch.data, iris_bunch.target

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# identical on every Restart & Run All.
data_train, data_test, labels_train, labels_test = train_test_split(
    flowers_data,
    flowers_labels,
    test_size=0.3,
    random_state=42,
)




def train_my_gaussian_nb(train_x, train_y, var_epsilon=1e-6):
    """Estimate the per-class parameters of a Gaussian Naive Bayes model.

    For every distinct label in ``train_y`` the rows of ``train_x`` carrying
    that label are selected, and the column-wise mean, the column-wise
    (population) variance and the class prior are computed from them.

    Parameters
    ----------
    train_x : array-like, indexed as (samples, features)
        Training features. Lists and tuples are accepted; they are converted
        to an ndarray so that boolean-mask row selection works.
    train_y : array-like, one label per row of ``train_x``
        Training labels.
    var_epsilon : float, default 1e-6
        Constant added to every variance so that a feature which is constant
        within a class does not yield a zero variance (and a division by zero
        when the density is evaluated later).

    Returns
    -------
    class_list : numpy.ndarray
        Sorted unique labels, as returned by ``np.unique``.
    mean_dict : dict
        Maps each label to the array of per-feature means.
    var_dict : dict
        Maps each label to the array of per-feature variances plus
        ``var_epsilon``.
    prior_dict : dict
        Maps each label to the fraction of training rows with that label.
    """
    # Coerce once so plain Python sequences behave exactly like ndarrays below.
    train_x = np.asarray(train_x)
    train_y = np.asarray(train_y)

    total_rows = train_x.shape[0]
    class_list = np.unique(train_y)

    mean_dict = {}
    var_dict = {}
    prior_dict = {}

    for cls in class_list:
        # Boolean mask keeps only the rows that belong to the current class.
        cls_rows = train_x[train_y == cls]

        mean_dict[cls] = np.mean(cls_rows, axis=0)
        var_dict[cls] = np.var(cls_rows, axis=0) + var_epsilon

        # Prior = relative frequency of the class in the training data.
        prior_dict[cls] = cls_rows.shape[0] / total_rows

    return class_list, mean_dict, var_dict, prior_dict
def my_gaussian_pdf(x_value, mean_value, var_value):
    """Evaluate the univariate normal density at ``x_value``.

    Computes ``exp(-(x - mean)^2 / (2 * var)) / sqrt(2 * pi * var)``.
    The arithmetic is done with numpy, so scalars and arrays are both
    accepted and broadcast in the usual way.

    Parameters
    ----------
    x_value : float or array-like
        Point(s) at which the density is evaluated.
    mean_value : float or array-like
        Mean of the distribution.
    var_value : float or array-like
        Variance of the distribution (not the standard deviation).

    Returns
    -------
    float or numpy.ndarray
        The density value(s).
    """
    deviation = x_value - mean_value
    exponent = -(deviation ** 2) / (2 * var_value)
    normaliser = np.sqrt(2 * np.pi * var_value)
    return np.exp(exponent) / normaliser


def predict_one_sample(sample_x, class_list, mean_dict, var_dict, prior_dict):
    """Return the most probable class for a single sample.

    For each class the score is

        log(prior) + sum_j log N(x_j; mean_j, var_j)

    and the class with the highest score is returned. The Gaussian
    log-density is evaluated analytically,

        log N(x; m, v) = -0.5 * (log(2 * pi * v) + (x - m)^2 / v),

    instead of taking ``np.log`` of the density itself. The density
    underflows to 0.0 for samples far from a class mean, which would turn
    every class score into ``-inf`` and make the choice of class arbitrary.

    Parameters
    ----------
    sample_x : array-like
        Feature values of one sample.
    class_list : iterable
        Candidate class labels; each must be a key of the three dicts.
    mean_dict : dict
        Label -> per-feature means.
    var_dict : dict
        Label -> per-feature variances (must be strictly positive).
    prior_dict : dict
        Label -> prior probability of the class.

    Returns
    -------
    The label from ``class_list`` with the highest log-posterior score.
    On exact ties the label encountered first wins.
    """
    sample_x = np.asarray(sample_x, dtype=float)
    class_log_scores = {}

    for cls in class_list:
        feature_means = np.asarray(mean_dict[cls], dtype=float)
        feature_vars = np.asarray(var_dict[cls], dtype=float)

        # Sum of per-feature Gaussian log-densities (naive independence
        # assumption), computed directly in log space to avoid underflow.
        log_likelihood = -0.5 * np.sum(
            np.log(2.0 * np.pi * feature_vars)
            + (sample_x - feature_means) ** 2 / feature_vars
        )

        class_log_scores[cls] = np.log(prior_dict[cls]) + log_likelihood

    # max() over a dict returns the first key among equal maxima.
    return max(class_log_scores, key=class_log_scores.get)


def predict_my_gaussian_nb(test_x, class_list, mean_dict, var_dict, prior_dict):
    """Classify every row of ``test_x`` with the fitted Naive Bayes parameters.

    Each row ``test_x[i]`` is passed to ``predict_one_sample`` together with
    the class list and the per-class mean, variance and prior dictionaries.

    Parameters
    ----------
    test_x : object with ``.shape[0]`` and integer row indexing
        Samples to classify, one per row.
    class_list, mean_dict, var_dict, prior_dict
        Model parameters, forwarded unchanged to ``predict_one_sample``.

    Returns
    -------
    numpy.ndarray
        Predicted label for each row, in row order.
    """
    row_count = test_x.shape[0]
    predicted_labels = [
        predict_one_sample(test_x[row_idx], class_list, mean_dict, var_dict, prior_dict)
        for row_idx in range(row_count)
    ]
    return np.array(predicted_labels)


# Fit the hand-written model on the training split.
class_list, mean_dict, var_dict, prior_dict = train_my_gaussian_nb(
    data_train, labels_train
)

# Classify the held-out rows with the fitted parameters.
manual_predictions = predict_my_gaussian_nb(
    test_x=data_test,
    class_list=class_list,
    mean_dict=mean_dict,
    var_dict=var_dict,
    prior_dict=prior_dict,
)

# Fraction of held-out rows whose predicted label matches the true label.
manual_accuracy = accuracy_score(y_true=labels_test, y_pred=manual_predictions)
print("Manual Gaussian Naive Bayes accuracy:", manual_accuracy)


Manual Gaussian Naive Bayes accuracy: 0.9777777777777777


In [None]:
from sklearn.naive_bayes import GaussianNB

# 1(ii) Built-in implementation: fit scikit-learn's GaussianNB on the same
# training split so its accuracy can be compared with the manual version.
simple_gnb_model = GaussianNB().fit(data_train, labels_train)

sklearn_predictions = simple_gnb_model.predict(data_test)
sklearn_accuracy = accuracy_score(y_true=labels_test, y_pred=sklearn_predictions)
print("Sklearn GaussianNB accuracy:", sklearn_accuracy)

Sklearn GaussianNB accuracy: 0.9777777777777777


Q2) Explore the GridSearchCV tool in scikit-learn. This is a tool that is
often used for tuning hyperparameters of machine learning models. Use
this tool to find the best value of K for the K-NN classifier on any dataset.

In [None]:

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Breast-cancer dataset: feature matrix and class labels.
cancer_bunch = load_breast_cancer()
x_stuff, y_stuff = cancer_bunch.data, cancer_bunch.target

# Fixed-seed 70/30 split; only the training part is given to the grid search.
x_train_part, x_test_part, y_train_part, y_test_part = train_test_split(
    x_stuff, y_stuff, test_size=0.3, random_state=5
)

# Candidate neighbourhood sizes K = 1, 2, ..., 20.
k_values_list = list(range(1, 21))
grid_dictionary = {'n_neighbors': k_values_list}

# 5-fold cross-validated search over K, scored by accuracy.
knn_box = KNeighborsClassifier()
grid_machine = GridSearchCV(knn_box, grid_dictionary, scoring='accuracy', cv=5)
grid_machine.fit(x_train_part, y_train_part)

print("Best K value found:", grid_machine.best_params_)
print("Best cross-validation accuracy:", grid_machine.best_score_)

# Score the winning configuration on the held-out test part.
best_model_from_grid = grid_machine.best_estimator_
y_test_predictions = best_model_from_grid.predict(x_test_part)
final_test_accuracy = accuracy_score(y_test_part, y_test_predictions)
print("Test accuracy with best K:", final_test_accuracy)


Best K value found: {'n_neighbors': 10}
Best cross-validation accuracy: 0.9220569620253165
Test accuracy with best K: 0.9766081871345029
