In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import numpy as np

# Load the Iris data: feature matrix and integer class labels
iris = load_iris()
X = iris.data
y = iris.target

# Hold out 20% of the samples for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step-by-step Gaussian Naïve Bayes implementation

# Function to calculate mean and variance for each feature in each class
def calculate_class_statistics(X, y):
    """Compute the per-class, per-feature mean and variance of the data.

    Parameters
    ----------
    X : array-like
        Feature matrix whose rows are samples. Plain (nested) Python lists
        are accepted and converted with ``np.asarray``.
    y : array-like
        Class label for each row of ``X``.

    Returns
    -------
    dict
        Maps every distinct label found in ``y`` to a dict with keys
        ``'mean'`` and ``'variance'``, each reduced over the rows that carry
        that label. The variance is ``np.var``'s default (population, ddof=0).
    """
    # Coerce up front: the boolean-mask indexing below is only defined for
    # ndarrays, so list inputs would otherwise raise a TypeError.
    X = np.asarray(X)
    y = np.asarray(y)

    class_statistics = {}
    for class_value in np.unique(y):
        # Rows of X belonging to the current class
        X_class = X[y == class_value]
        class_statistics[class_value] = {
            'mean': np.mean(X_class, axis=0),
            'variance': np.var(X_class, axis=0)
        }
    return class_statistics

# Function to calculate Gaussian probability density function (PDF)
def gaussian_pdf(x, mean, variance, epsilon=1e-6):
    """Evaluate the univariate Gaussian density, element-wise.

    Parameters
    ----------
    x : scalar or array-like
        Point(s) at which to evaluate the density.
    mean : scalar or array-like
        Mean of the distribution (broadcast against ``x``).
    variance : scalar or array-like
        Variance of the distribution (broadcast against ``x``).
    epsilon : float, optional
        Smoothing term added to ``variance`` before it is used. Defaults to
        ``1e-6``, the value that was previously hard-coded; pass ``0`` to
        evaluate the unsmoothed density.

    Returns
    -------
    scalar or ndarray
        ``exp(-(x - mean)**2 / (2 * v)) / sqrt(2 * pi * v)`` with
        ``v = variance + epsilon``.
    """
    # Smooth the variance so a constant feature (variance 0) does not cause
    # a division by zero.
    smoothed_variance = variance + epsilon
    exponent = np.exp(-((x - mean)**2) / (2 * smoothed_variance))
    return (1 / (np.sqrt(2 * np.pi * smoothed_variance))) * exponent

# Function to calculate the posterior probability for each class
def calculate_posterior_probability(x, class_statistics, prior_probabilities):
    """Score a single sample against every class.

    For each class in ``class_statistics`` the score is the class prior
    multiplied by the product of the per-feature Gaussian densities of ``x``
    (the naive conditional-independence assumption). The scores are not
    normalised, so they do not sum to one.

    Parameters
    ----------
    x : array-like
        Feature values of one sample.
    class_statistics : dict
        Maps class label -> ``{'mean': ..., 'variance': ...}``.
    prior_probabilities : dict
        Maps class label -> prior probability; must contain every label
        present in ``class_statistics``.

    Returns
    -------
    dict
        Maps class label -> unnormalised posterior score.
    """
    # NOTE(review): multiplying many densities can underflow to 0.0 when the
    # number of features is large; summing log-densities would avoid that.
    return {
        label: prior_probabilities[label]
        * np.prod(gaussian_pdf(x, params['mean'], params['variance']))
        for label, params in class_statistics.items()
    }

# Function to make predictions
def predict(X, class_statistics, prior_probabilities):
    """Predict a class label for every sample in ``X``.

    Parameters
    ----------
    X : iterable
        Samples to classify; each element is passed on as one sample.
    class_statistics : dict
        Per-class ``{'mean': ..., 'variance': ...}`` statistics.
    prior_probabilities : dict
        Per-class prior probabilities.

    Returns
    -------
    list
        For each sample, the label with the highest posterior score, in the
        same order as ``X``.
    """
    def most_probable_class(sample):
        # Label whose posterior score is largest for this sample
        scores = calculate_posterior_probability(sample, class_statistics, prior_probabilities)
        return max(scores, key=scores.get)

    return [most_probable_class(sample) for sample in X]

# Train the model: per-class feature statistics plus class priors
class_statistics = calculate_class_statistics(X_train, y_train)

# Priors are the relative class frequencies in the training labels. A single
# np.unique pass yields the counts instead of re-masking y_train once per class.
train_classes, train_counts = np.unique(y_train, return_counts=True)
prior_probabilities = {
    class_value: count / len(y_train)
    for class_value, count in zip(train_classes, train_counts)
}

# Evaluate the model on the testing data
y_pred = predict(X_test, class_statistics, prior_probabilities)

# predict() returns a plain list; convert it explicitly so the comparison is
# guaranteed to be element-wise instead of relying on NumPy's reflected ==.
accuracy = np.mean(np.asarray(y_pred) == y_test)
print(f"Custom Gaussian Naïve Bayes Accuracy: {accuracy}")

Custom Gaussian Naïve Bayes Accuracy: 1.0


**Reasoning**:
The custom Gaussian Naïve Bayes model has been implemented, trained, and evaluated. The next step is to train scikit-learn's built-in `GaussianNB` classifier on the same train/test split and compare its accuracy with that of the custom implementation.



In [2]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# In-built Gaussian Naïve Bayes implementation

# fit() returns the estimator itself, so construction and training are chained
gnb = GaussianNB().fit(X_train, y_train)

# Predict labels for the held-out test samples
y_pred_inbuilt = gnb.predict(X_test)

# Accuracy = fraction of test labels predicted correctly
accuracy_inbuilt = accuracy_score(y_test, y_pred_inbuilt)
print(f"In-built Gaussian Naïve Bayes Accuracy: {accuracy_inbuilt}")

In-built Gaussian Naïve Bayes Accuracy: 1.0


## Explore GridSearchCV for K-NN

### Subtask:
Choose a suitable dataset for K-NN (e.g., the Iris dataset), split the data into training and testing sets, define a parameter grid for the number of neighbors (K) for K-NN, use `GridSearchCV` to find the best value of K, train a K-NN model with the best K found by GridSearchCV, and evaluate the K-NN model.


**Reasoning**:
Load the Iris dataset and split it into training and testing sets as per the instructions.



In [3]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load the Iris data again so this section can be read on its own
iris = load_iris()
X = iris.data
y = iris.target

# 80/20 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

**Reasoning**:
Import the necessary modules, instantiate a `KNeighborsClassifier`, define the parameter grid, and instantiate `GridSearchCV`.



In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Base K-NN estimator; its n_neighbors value is supplied by the grid search
knn = KNeighborsClassifier()

# Candidate values of K: the odd numbers 3, 5, 7, 9, 11
param_grid = {'n_neighbors': list(range(3, 12, 2))}

# Search over param_grid, scoring each candidate with 5-fold cross-validation
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5)

**Reasoning**:
Fit the GridSearchCV object to the training data and print the best value of K.



In [5]:
# Run the cross-validated search. Only the training split is passed in, so
# the test split plays no part in choosing K.
grid_search.fit(X_train, y_train)

# Report the n_neighbors value selected by the search
print(f"Best K value found by GridSearchCV: {grid_search.best_params_['n_neighbors']}")

Best K value found by GridSearchCV: 3


**Reasoning**:
Train a K-NN model with the best K found by GridSearchCV, make predictions on the test set, and calculate the accuracy.



In [6]:
from sklearn.metrics import accuracy_score

# Fit a standalone K-NN classifier using the K chosen by the grid search
best_k = grid_search.best_params_['n_neighbors']
knn_best = KNeighborsClassifier(n_neighbors=best_k).fit(X_train, y_train)

# Predict labels for the held-out test samples
y_pred = knn_best.predict(X_test)

# Accuracy = fraction of test samples whose predicted label matches the true one
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of K-NN model with best K on the test set: {accuracy}")

Accuracy of K-NN model with best K on the test set: 1.0


## Summary:

### Data Analysis Key Findings

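*   Both the custom and the built-in (`GaussianNB`) Gaussian Naïve Bayes implementations achieved perfect accuracy (1.0) on the Iris test set (the 20% hold-out, 30 samples).
*   Using `GridSearchCV` with 5-fold cross-validation over the candidates K ∈ {3, 5, 7, 9, 11}, the best value for the `n_neighbors` parameter (K) of the K-NN classifier on the Iris dataset was found to be 3.
*   A K-NN model trained with the best K (K=3) achieved an accuracy of 1.0 on the test set. Because the test set is so small, these perfect scores should not be read as evidence that the models are error-free in general.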

