In [31]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import numpy as np


with open('car.data') as f:
  header = f.readline().strip().split(',')
  data = np.loadtxt(f, delimiter=',', dtype=str)


X_encoded = np.zeros_like(data[:, :-1], dtype=np.float64)
for i in range(data.shape[1] - 1):
  if isinstance(data[0, i], str):
    le = LabelEncoder()
    X_encoded[:, i] = le.fit_transform(data[:, i])
  else:
    X_encoded[:, i] = data[:, i].astype(np.float64)

y = X_encoded[:, -1]


class NaiveBayes:
    def __init__(self):
        self.class_prior = {}
        self.class_mean = {}
        self.class_var = {}
        self.unique_classes = []
        
    def fit(self, X_train, y_train):
        self.unique_classes = np.unique(y_train)
        n_samples, n_features = X_train.shape
        
        for class_label in self.unique_classes:
            X_class = X_train[y_train == class_label]
            self.class_prior[class_label] = X_class.shape[0] / n_samples
        

        for class_label in self.unique_classes:
            X_class = X_train[y_train == class_label]
            self.class_mean[class_label] = np.mean(X_class, axis=0)
            self.class_var[class_label] = np.var(X_class, axis=0)
    
    def predict(self, X_test):
        n_samples, n_features = X_test.shape
        y_pred = np.zeros(n_samples, dtype=self.unique_classes.dtype)
        
        for i in range(n_samples):
            posteriors = []
            for class_label in self.unique_classes:
                prior = np.log(self.class_prior[class_label])
                likelihood = 0
                for j in range(n_features):
                    likelihood += np.log(self.pdf(X_test[i,j], self.class_mean[class_label][j], self.class_var[class_label][j]))
                posteriors.append(prior + likelihood)
            y_pred[i] = self.unique_classes[np.argmax(posteriors)]
            
        return y_pred
    
    def pdf(self, x, mean, var):
        return (1.0 / np.sqrt(2 * np.pi * var)) * np.exp(-(x - mean)**2 / (2 * var))



def kfold(data, n_folds):
  np.random.shuffle(data)  
  total_size = len(data)
  fold_size = total_size // n_folds
  print(fold_size)
  accuracies = list()
  for i in range(n_folds):
    start_test = i * fold_size
    end_test = (i + 1) * fold_size
    test = data[start_test:end_test]
    train = np.vstack((data[:start_test], data[end_test:]))
    


    train_x, train_y = train[:, :-1], train[:, -1]
    test_x, test_y = test[:, :-1], test[:, -1]
    nb = NaiveBayes()
    nb.fit(train_x, train_y)
    y_pred = nb.predict(test_x)
    accuracy = accuracy_score(test_y, y_pred)
    accuracies.append(accuracy)
  return accuracies

n_folds = 10
accuracies = kfold(X_encoded, n_folds)
accuracies = np.array(accuracies)
print(f"Average Accuracy Cars Dataset: {np.mean(accuracies)*100}%")
for i in range(len(accuracies)):
  print("Fold {} accuracy score: {:.1%}".format(i + 1, accuracies[i]))

172
Average Accuracy Cars Dataset: 23.313953488372093%
Fold 1 accuracy score: 23.3%
Fold 2 accuracy score: 22.1%
Fold 3 accuracy score: 24.4%
Fold 4 accuracy score: 21.5%
Fold 5 accuracy score: 22.7%
Fold 6 accuracy score: 20.9%
Fold 7 accuracy score: 24.4%
Fold 8 accuracy score: 23.3%
Fold 9 accuracy score: 23.3%
Fold 10 accuracy score: 27.3%
