In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# 1. Separate dataset into features (X) and labels (y)

In [2]:
data_file = "mushrooms_data.csv"
features_file = "mushroom_feature_names.csv"
data = pd.read_csv(data_file)
features = pd.read_csv(
    features_file, header=None
) 

features_list = features[0].tolist()

data.columns = features_list
X = data.drop("class", axis=1)
y = data["class"]  # Labels

data

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
1,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
2,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
3,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
4,e,x,y,y,t,a,f,c,b,n,...,s,w,w,p,w,o,p,k,n,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8118,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8119,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8120,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8121,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


# 2. Split the dataset into training and test sets

In [3]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 3. Difference between Naive Bayes and LDA 
Naive Bayes assumes independence between features, whereas LDA assumes normally distributed classes with equal covariance matrices.

# 4. Model parameters of Naive Bayes
The model parameters for Naive Bayes include class priors and feature likelihoods. Each parameter plays a role in computing the likelihood of observing a certain class given the features.

# 5. Implement Naive Bayes from scratch

In [4]:
class NaiveBayesClassifier:
    def __init__(self):
        self.class_probs = {}  
        self.feature_probs = {} 

    def fit(self, X, y):
        total_samples = len(y)
        classes, counts = np.unique(y, return_counts=True)
        for cls, count in zip(classes, counts):
            self.class_probs[cls] = count / total_samples

        for feature in X.columns:
            self.feature_probs[feature] = {} 
            for cls in classes:
                feature_values = X[y == cls][feature]
                value_counts = feature_values.value_counts()
                for value, count in value_counts.items():
                    self.feature_probs[feature][(value, cls)] = (
                        count / counts[np.where(classes == cls)]
                    )

    def predict(self, X):
        predictions = []
        for _, sample in X.iterrows():
            max_prob, max_class = -1, None
            for cls in self.class_probs:
                cls_prob = np.log(self.class_probs[cls]) 
                for feature in X.columns:
                    value = sample[feature]
                    if (value, cls) in self.feature_probs[feature]:
                        cls_prob += np.log(self.feature_probs[feature][(value, cls)])
                    else:
                        cls_prob += np.log(1e-10)
                if cls_prob > max_prob or max_class is None:
                    max_prob = cls_prob
                    max_class = cls
            predictions.append(max_class)
        return predictions

    def calculate_accuracy(self, y_true, y_pred):
        correct = sum(1 for true, pred in zip(y_true, y_pred) if true == pred)
        return correct / len(y_true)


nb_classifier = NaiveBayesClassifier()
nb_classifier.fit(X_train, y_train)

predicted_test = nb_classifier.predict(X_test)

accuracy = nb_classifier.calculate_accuracy(y_test, predicted_test)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9969230769230769


# 6. Evaluate the models

In [5]:
def calculate_precision(y_true, y_pred, target_class):
    true_positive = sum(
        1
        for true, pred in zip(y_true, y_pred)
        if pred == target_class and true == target_class
    )
    predicted_positive = sum(1 for pred in y_pred if pred == target_class)
    return true_positive / predicted_positive if predicted_positive > 0 else 0


def calculate_recall(y_true, y_pred, target_class):
    true_positive = sum(
        1
        for true, pred in zip(y_true, y_pred)
        if pred == target_class and true == target_class
    )
    actual_positive = sum(1 for true in y_true if true == target_class)
    return true_positive / actual_positive if actual_positive > 0 else 0


def calculate_f1_score(precision, recall):
    return (
        2 * (precision * recall) / (precision + recall)
        if (precision + recall) > 0
        else 0
    )


def confusion_matrix(y_true, y_pred):
    unique_classes = list(set(y_true))
    matrix = []
    for true_cls in unique_classes:
        row = []
        for pred_cls in unique_classes:
            count = sum(
                1
                for true, pred in zip(y_true, y_pred)
                if true == true_cls and pred == pred_cls
            )
            row.append(count)
        matrix.append(row)
    return matrix


predicted_train = nb_classifier.predict(X_train)
accuracy_train = nb_classifier.calculate_accuracy(y_train, predicted_train)
precision_train = calculate_precision(y_train, predicted_train, "p")
recall_train = calculate_recall(y_train, predicted_train, "p")
f1_train = calculate_f1_score(precision_train, recall_train)
conf_matrix_train = confusion_matrix(y_train, predicted_train)

predicted_test = nb_classifier.predict(X_test)
accuracy_test = nb_classifier.calculate_accuracy(y_test, predicted_test)
precision_test = calculate_precision(y_test, predicted_test, "p")
recall_test = calculate_recall(y_test, predicted_test, "p")
f1_test = calculate_f1_score(precision_test, recall_test)
conf_matrix_test = confusion_matrix(y_test, predicted_test)

print("Training Set Metrics:")
print(f"Accuracy: {accuracy_train}")
print(f"Precision (class 'p'): {precision_train}")
print(f"Recall (class 'p'): {recall_train}")
print(f"F1-score (class 'p'): {f1_train}")
print(f"Confusion Matrix:")
for row in conf_matrix_train:
    print(row)

print("\nTest Set Metrics:")
print(f"Accuracy: {accuracy_test}")
print(f"Precision (class 'p'): {precision_test}")
print(f"Recall (class 'p'): {recall_test}")
print(f"F1-score (class 'p'): {f1_test}")
print(f"Confusion Matrix:")
for row in conf_matrix_test:
    print(row)

Training Set Metrics:
Accuracy: 0.9964604493690367
Precision (class 'p'): 0.9936708860759493
Recall (class 'p'): 0.9990454979319122
F1-score (class 'p'): 0.9963509439949231
Confusion Matrix:
[3140, 3]
[20, 3335]

Test Set Metrics:
Accuracy: 0.9969230769230769
Precision (class 'p'): 0.9935649935649936
Recall (class 'p'): 1.0
F1-score (class 'p'): 0.9967721110393803
Confusion Matrix:
[772, 0]
[5, 848]


# 7. Misclassified samples

In [6]:
misclassified_train = [
    (true, pred) for true, pred in zip(y_train, predicted_train) if true != pred
]

false_positives_train = sum(
    1 for true, pred in misclassified_train if pred == "p" and true != pred
)
false_negatives_train = sum(
    1 for true, pred in misclassified_train if true == "p" and true != pred
)

misclassified_test = [
    (true, pred) for true, pred in zip(y_test, predicted_test) if true != pred
]

false_positives_test = sum(
    1 for true, pred in misclassified_test if pred == "p" and true != pred
)
false_negatives_test = sum(
    1 for true, pred in misclassified_test if true == "p" and true != pred
)

print("Training Set Misclassified Samples:")
print(f"False Positives (FP): {false_positives_train}")
print(f"False Negatives (FN): {false_negatives_train}")

print("\nTest Set Misclassified Samples:")
print(f"False Positives (FP): {false_positives_test}")
print(f"False Negatives (FN): {false_negatives_test}")

Training Set Misclassified Samples:
False Positives (FP): 20
False Negatives (FN): 3

Test Set Misclassified Samples:
False Positives (FP): 5
False Negatives (FN): 0


# 8. Report model parameters

In [7]:
class_priors = nb_classifier.class_probs
feature_probs = nb_classifier.feature_probs

print("Class Priors:")
for cls, prob in class_priors.items():
    print(f"Class '{cls}': {prob}")

print("\nConditional Probabilities (Feature|Class):")
for feature, value_probs in feature_probs.items():
    print(f"\nFeature: {feature}")
    for (value, cls), prob in value_probs.items():
        print(f"Class '{cls}': Probability for '{value}' of '{feature}': {prob}")

Class Priors:
Class 'e': 0.5163127116035703
Class 'p': 0.48368728839642966

Conditional Probabilities (Feature|Class):

Feature: cap-shape
Class 'e': Probability for 'x' of 'cap-shape': [0.45782414]
Class 'e': Probability for 'f' of 'cap-shape': [0.38390462]
Class 'e': Probability for 'b' of 'cap-shape': [0.09567809]
Class 'e': Probability for 'k' of 'cap-shape': [0.05633383]
Class 'e': Probability for 's' of 'cap-shape': [0.00625931]
Class 'p': Probability for 'x' of 'cap-shape': [0.43493478]
Class 'p': Probability for 'f' of 'cap-shape': [0.39580019]
Class 'p': Probability for 'k' of 'cap-shape': [0.15685651]
Class 'p': Probability for 'b' of 'cap-shape': [0.01145402]
Class 'p': Probability for 'c' of 'cap-shape': [0.0009545]

Feature: cap-surface
Class 'e': Probability for 'f' of 'cap-surface': [0.37377049]
Class 'e': Probability for 'y' of 'cap-surface': [0.35737705]
Class 'e': Probability for 's' of 'cap-surface': [0.26885246]
Class 'p': Probability for 'y' of 'cap-surface': [0.44