In [1]:
from sklearn import datasets
from math import exp, sqrt, pi
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.naive_bayes import GaussianNB
import pandas as pd

In [2]:
def load_iris_dataset():
    """Load scikit-learn's bundled Iris data as a pandas DataFrame.

    Returns
    -------
    pandas.DataFrame
        One column per entry of the dataset's ``feature_names`` followed by
        a ``'target'`` column holding the dataset's ``target`` array.
    """
    bunch = datasets.load_iris()
    features = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    # assign() appends the label column after the feature columns.
    return features.assign(target=bunch.target)

In [3]:
def summarize_dataset(dataset):
    """Summarise every column of ``dataset`` as a ``(mean, std, count)`` tuple.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Columns to summarise; ``mean()``, ``std()`` and ``count()`` are called
        on it with their default arguments.

    Returns
    -------
    list of tuple
        One ``(mean, std, count)`` tuple per column, in column order.
    """
    column_means = dataset.mean()
    column_stds = dataset.std()
    column_counts = dataset.count()
    # zip already yields 3-tuples, so materialising it gives the final list.
    return list(zip(column_means, column_stds, column_counts))

In [4]:
def separate_by_class(dataset):
    """Group the rows of ``dataset`` by the value of its ``'target'`` column.

    Returns the pandas GroupBy object; iterating it yields
    ``(class_value, sub_frame)`` pairs.
    """
    grouped = dataset.groupby(by='target')
    return grouped

In [5]:
def summarize_by_class(dataset):
    """Compute per-class feature statistics.

    The rows are grouped by ``'target'``; within each group the ``'target'``
    column is dropped and every remaining column is summarised.

    Returns
    -------
    dict
        Maps each class value to the list returned by ``summarize_dataset``
        for that class's rows.
    """
    return {
        class_value: summarize_dataset(group.drop(columns='target'))
        for class_value, group in separate_by_class(dataset)
    }

In [6]:
def calculate_probability(x, mean, stdev):
    """Evaluate the Gaussian probability density at ``x``.

    Parameters
    ----------
    x : float
        Point at which the density is evaluated.
    mean : float
        Mean of the distribution.
    stdev : float
        Standard deviation of the distribution.

    Returns
    -------
    float
        ``1 / (sqrt(2*pi) * stdev) * exp(-(x - mean)**2 / (2 * stdev**2))``.
    """
    squared_deviation = (x - mean)**2
    twice_variance = 2 * stdev**2
    gaussian_kernel = exp(-(squared_deviation / twice_variance))
    normaliser = sqrt(2 * pi) * stdev
    return (1 / normaliser) * gaussian_kernel

In [7]:
def calculate_class_probabilities(summaries, row):
    """Score one sample against every class with Gaussian naive Bayes.

    For each class the score is the class prior (that class's row count
    divided by the total row count) multiplied by the Gaussian density of
    every feature value in ``row``. The scores are not normalised, so they
    are only meaningful relative to each other.

    Parameters
    ----------
    summaries : dict
        Maps class value -> list of ``(mean, stdev, count)`` tuples, one per
        feature, in feature order.
    row : sequence, numpy.ndarray or pandas.Series
        Feature values in the same order as the per-class summaries. Extra
        trailing values are ignored.

    Returns
    -------
    dict
        Maps each class value to its unnormalised score.
    """
    # The features are read by position. Indexing a label-indexed pandas
    # Series with an integer (``row[i]``) relies on a positional fallback that
    # pandas has deprecated, so convert such inputs to a plain array first.
    values = row.to_numpy() if hasattr(row, "to_numpy") else row

    # Every feature of a class carries that class's row count; read it from
    # the first feature's summary.
    total_rows = sum(class_summaries[0][2] for class_summaries in summaries.values())

    probabilities = {}
    for class_value, class_summaries in summaries.items():
        # Start from the prior, then multiply in each feature's likelihood.
        probability = class_summaries[0][2] / total_rows
        for i, (mean, stdev, _) in enumerate(class_summaries):
            probability *= calculate_probability(values[i], mean, stdev)
        probabilities[class_value] = probability
    return probabilities

In [8]:
def predict(summaries, row):
    """Return the class with the highest score for ``row``, as an ``int``.

    The scores come from ``calculate_class_probabilities``; when several
    classes tie, the first one in the dictionary's iteration order wins.
    """
    scores = calculate_class_probabilities(summaries, row)
    best_class, _ = max(scores.items(), key=lambda item: item[1])
    return int(best_class)

In [9]:
def train_naive_bayes_scratch(dataset):
    """Fit the from-scratch Gaussian naive Bayes model and predict every row.

    The per-class statistics are computed from ``dataset`` and then used to
    classify each row of that same ``dataset``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature columns plus a ``'target'`` column. Every column other than
        ``'target'`` is treated as a feature.

    Returns
    -------
    list of int
        Predicted class for each row, in row order.
    """
    summaries = summarize_by_class(dataset)
    # Select the features by name instead of slicing off the last column:
    # summarize_by_class drops 'target' by name, so the values scored here
    # must be exactly those columns, in the same order, wherever 'target'
    # happens to sit in the frame.
    features = dataset.drop(columns='target')
    # Iterating the NumPy array yields plain positional rows, which is what
    # the scoring code indexes into.
    return [predict(summaries, row) for row in features.to_numpy()]

In [10]:
def train_naive_bayes_sklearn(X, y):
    """Fit scikit-learn's ``GaussianNB`` on ``(X, y)`` and predict ``X``.

    Returns the predictions the fitted model makes for the same ``X`` it was
    trained on.
    """
    model = GaussianNB()
    model.fit(X, y)
    return model.predict(X)

In [11]:
# Load the Iris dataset into a DataFrame: one column per feature plus a
# 'target' column with the class labels.
iris_df = load_iris_dataset()

In [12]:
# Train Naive Bayes from scratch.
# Drop any prediction columns already attached to iris_df (present when this
# cell is re-run) so that only the measurement columns are used as features
# and the cell gives the same result every time it is executed.
scratch_training_df = iris_df.drop(
    columns=['naive_bayes_scratch_pred', 'naive_bayes_sklearn_pred'],
    errors='ignore',
)
naive_bayes_scratch_pred = train_naive_bayes_scratch(scratch_training_df)

# Add scratch predictions to the DataFrame
iris_df['naive_bayes_scratch_pred'] = naive_bayes_scratch_pred

# Calculate and print confusion matrix and accuracy score for scratch predictions
cnf_scratch = confusion_matrix(iris_df['target'], iris_df['naive_bayes_scratch_pred'])
acc_scratch = accuracy_score(iris_df['target'], iris_df['naive_bayes_scratch_pred'])
print("Confusion Matrix (Scratch):")
print(cnf_scratch)
print("Accuracy Score (Scratch):", acc_scratch)

Confusion Matrix (Scratch):
[[50  0  0]
 [ 0 47  3]
 [ 0  3 47]]
Accuracy Score (Scratch): 0.96


In [13]:
# Train Naive Bayes using scikit-learn.
# Build the feature matrix from the measurement columns only. Dropping just
# 'target' would also pass the prediction columns stored on iris_df to the
# model as features, leaking label-derived information into training.
sklearn_features = iris_df.drop(
    columns=['target', 'naive_bayes_scratch_pred', 'naive_bayes_sklearn_pred'],
    errors='ignore',
)
naive_bayes_sklearn_pred = train_naive_bayes_sklearn(sklearn_features, iris_df['target'])

# Add scikit-learn predictions to the DataFrame
iris_df['naive_bayes_sklearn_pred'] = naive_bayes_sklearn_pred

# Calculate and print confusion matrix and accuracy score for scikit-learn predictions
cnf_sklearn = confusion_matrix(iris_df['target'], iris_df['naive_bayes_sklearn_pred'])
acc_sklearn = accuracy_score(iris_df['target'], iris_df['naive_bayes_sklearn_pred'])
print("Confusion Matrix (scikit-learn):")
print(cnf_sklearn)
print("Accuracy Score (scikit-learn):", acc_sklearn)

Confusion Matrix (scikit-learn):
[[50  0  0]
 [ 0 47  3]
 [ 0  3 47]]
Accuracy Score (scikit-learn): 0.96


In [14]:
# Show the DataFrame, now including the prediction columns, under a heading.
print("Updated DataFrame:", iris_df, sep="\n")

Updated DataFrame:
     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                  5.1               3.5                1.4               0.2   
1                  4.9               3.0                1.4               0.2   
2                  4.7               3.2                1.3               0.2   
3                  4.6               3.1                1.5               0.2   
4                  5.0               3.6                1.4               0.2   
..                 ...               ...                ...               ...   
145                6.7               3.0                5.2               2.3   
146                6.3               2.5                5.0               1.9   
147                6.5               3.0                5.2               2.0   
148                6.2               3.4                5.4               2.3   
149                5.9               3.0                5.1               1.8   

     tar