In [1]:
import pandas as pd
import numpy as np 
from collections import defaultdict
from math import sqrt, pi, exp

In [2]:
def gaussian_prob(x, mean, var):
    if var == 0:
        var = 1e-6
    exponent = exp(-((x - mean) ** 2) / (2 * var))
    return (1 / sqrt(2 * pi * var)) * exponent

In [3]:
class GaussianNaiveBayes:
    def __init__(self):
        self.class_priors = {} 
        self.class_means = defaultdict(dict)  
        self.class_vars = defaultdict(dict)  
        self.classes = None
    
    def fit(self, X, y):
        self.classes = np.unique(y)
        n_samples = len(y)
        
        for cls in self.classes:
            self.class_priors[cls] = np.sum(y == cls) / n_samples
        
        for cls in self.classes:
            X_cls = X[y == cls]
            for feature_idx in range(X.shape[1]):
                self.class_means[cls][feature_idx] = np.mean(X_cls[:, feature_idx])
                self.class_vars[cls][feature_idx] = np.var(X_cls[:, feature_idx])
    
    def predict(self, X):
        predictions = []
        for sample in X:
            posteriors = {}
            for cls in self.classes:
                posterior = np.log(self.class_priors[cls])
                for feature_idx, x in enumerate(sample):
                    mean = self.class_means[cls][feature_idx]
                    var = self.class_vars[cls][feature_idx]
                    posterior += np.log(gaussian_prob(x, mean, var) + 1e-10) 
                posteriors[cls] = posterior
            predictions.append(max(posteriors, key=posteriors.get))
        return np.array(predictions)

In [None]:
data = pd.read_csv('D:/Code/Bayes/letter-recognition.data', header = None)
X = data.iloc[:, 1:].values
Y = data.iloc[:, 0].values
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,T,2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8
1,I,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10
2,D,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9
3,N,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8
4,G,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10


: 

In [22]:
model = GaussianNaiveBayes()
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)

In [24]:
accuracy = np.mean(Y_pred == Y_test) * 100
print(f'Accuracy: {accuracy:.2f}%')

Accuracy: 62.52%


In [25]:
idx = np.random.randint(0, len(X_test))
sample = X_test[idx]
true_label = Y_test[idx]
pred_label = model.predict([sample])[0]
print(f'Sample features: {sample}')
print(f'True label: {true_label}, Predicted: {pred_label}')

Sample features: [ 4  8  5  6  4 10  5  2  6 11  4  7  3  7  3  9]
True label: D, Predicted: J
