In [1]:
import numpy as np
import random

class KFold:
    def __init__(self, n_splits=10, shuffle=False, random_state=None):
        if n_splits < 2:
            raise ValueError("Number of splits must be greater than 1")
        self.n_splits = n_splits
        self.shuffle = shuffle
        self.random_state = random_state

    def split(self, X):
        if len(X) < self.n_splits:
            raise ValueError("Number of samples must be greater than or equal to the number of splits")
        indices = list(range(len(X)))

        if self.shuffle:
            random.seed(self.random_state)
            random.shuffle(indices)

        fold_sizes = [len(X) // self.n_splits] * self.n_splits
        fold_sizes[:len(X) % self.n_splits] = [fold_sizes[i] + 1 for i in range(len(X) % self.n_splits)]

        current = 0
        for fold_size in fold_sizes:
            start, stop = current, current + fold_size
            yield indices[start:stop], indices[:start] + indices[stop:]

            current = stop


def fit_naive_bayes(X_train, y_train):
    classes = np.unique(y_train)
    n_classes = len(classes)
    n_features = X_train.shape[1]
    priors = np.zeros(n_classes)
    means = np.zeros((n_classes, n_features))
    variances = np.zeros((n_classes, n_features))

    for i, c in enumerate(classes):
        X_c = X_train[y_train == c]
        priors[i] = len(X_c) / len(X_train)
        means[i] = X_c.mean(axis=0)
        variances[i] = X_c.var(axis=0)

    return classes, priors, means, variances

def predict_naive_bayes(X_test, classes, priors, means, variances):
    n_samples, n_features = X_test.shape
    n_classes = len(classes)
    log_posteriors = np.zeros((n_samples, n_classes))

    for i, c in enumerate(classes):
        log_prior = np.log(priors[i])
        log_likelihood=float()
        try:
          log_likelihood = -0.5 * np.sum(np.log(2 * np.pi * variances[i]) + ((X_test - means[i]) ** 2) / variances[i], axis=1)
        except:
          pass
        log_posteriors[:, i] = log_prior + log_likelihood

    return classes[np.argmax(log_posteriors, axis=1)]

def k_fold_cross_validation(X, y, k=10, random_state=None):
    if len(X) != len(y):
        raise ValueError("X and y must have the same length")
    kf = KFold(n_splits=k, shuffle=True, random_state=random_state)
    accuracies = []

    for train_idx, test_idx in kf.split(X):
        X_train, y_train = X[train_idx], y[train_idx]
        X_test, y_test = X[test_idx], y[test_idx]

        classes, priors, means, variances = fit_naive_bayes(X_train, y_train)
        y_pred = predict_naive_bayes(X_test, classes, priors, means, variances)

        accuracy = np.mean(y_pred == y_test)
        accuracies.append(accuracy)

    return accuracies
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd

In [3]:
import pandas as pd
import urllib.request
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

#Download Hayes-Roth dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/hayes-roth/hayes-roth.data"
filename = "hayes-roth.csv"
urllib.request.urlretrieve(url, filename)

#Download Car Evaluation dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data"
filename = "car.csv"
urllib.request.urlretrieve(url, filename)

#Download Breast Cancer dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer/breast-cancer.data"
filename = "breast-cancer.csv"
urllib.request.urlretrieve(url, filename)

#Read datasets into pandas dataframes
hayes_roth_df = pd.read_csv("hayes-roth.csv", header=None, names=["class", "a1", "a2", "a3", "a4"])
car_df = pd.read_csv("car.csv", header=None, names=["buying", "maint", "doors", "persons", "lug_boot", "safety", "class"])
breast_cancer_df = pd.read_csv("breast-cancer.csv", header=None, names=["class", "age", "menopause", "tumor_size", "inv_nodes", "node_caps", "deg_malig", "breast", "breast_quad", "irradiat"])

#Encode categorical variables
le = LabelEncoder()
hayes_roth_df["class"] = le.fit_transform(hayes_roth_df["class"])
hayes_roth_df["a1"] = le.fit_transform(hayes_roth_df["a1"])
hayes_roth_df["a2"] = le.fit_transform(hayes_roth_df["a2"])
hayes_roth_df["a3"] = le.fit_transform(hayes_roth_df["a3"])
hayes_roth_df["a4"] = le.fit_transform(hayes_roth_df["a4"])

car_df["buying"] = le.fit_transform(car_df["buying"])
car_df["maint"] = le.fit_transform(car_df["maint"])
car_df["doors"] = le.fit_transform(car_df["doors"])
car_df["persons"] = le.fit_transform(car_df["persons"])
car_df["lug_boot"] = le.fit_transform(car_df["lug_boot"])
car_df["safety"] = le.fit_transform(car_df["safety"])
car_df["class"] = le.fit_transform(car_df["class"])

breast_cancer_df = breast_cancer_df.replace("?", np.nan)
breast_cancer_df = breast_cancer_df.dropna()
breast_cancer_df = breast_cancer_df.apply(LabelEncoder().fit_transform)

X_hayes_roth = hayes_roth_df.iloc[:, 1:].values
y_hayes_roth = hayes_roth_df.iloc[:, 0].values

X_car_evaluation = car_df.iloc[:, :-1].values
y_car_evaluation = car_df.iloc[:, -1].values

X_breast_cancer = breast_cancer_df.iloc[:, :-1].values
y_breast_cancer = breast_cancer_df.iloc[:, -1].values

#Replace missing values with most frequent value
imp = SimpleImputer(strategy="most_frequent")
X_breast_cancer = imp.fit_transform(X_breast_cancer)

#Perform k-fold cross-validation on each dataset
# Hayes-Roth dataset
k=100
print("Hayes-Roth dataset")
accuracies = k_fold_cross_validation(X_hayes_roth, y_hayes_roth, k)
print("Mean accuracy: ", np.mean(accuracies))
print("Std accuracy: ", np.std(accuracies))
# Car Evaluation dataset
print("\nCar Evaluation dataset")
accuracies = k_fold_cross_validation(X_car_evaluation, y_car_evaluation, k)
print("Mean accuracy: ", np.mean(accuracies))
print("Std accuracy: ", np.std(accuracies))
# Breast Cancer dataset
print("\nBreast Cancer dataset")
accuracies = k_fold_cross_validation(X_breast_cancer, y_breast_cancer, k)
print("Mean accuracy: ", np.mean(accuracies))
print("Std accuracy: ", np.std(accuracies))

Hayes-Roth dataset
Mean accuracy:  0.3280522607163827
Std accuracy:  0.0021553782017168345

Car Evaluation dataset
Mean accuracy:  0.19736786052409416
Std accuracy:  0.20035612631538974

Breast Cancer dataset
Mean accuracy:  0.7591503649635039
Std accuracy:  0.09549910115516948


In [4]:
nb_accuracies = []
for df in [hayes_roth_df, car_df, breast_cancer_df]:
    X = df.drop(columns=["class"]).values
    y = df["class"].values

    accuracies = k_fold_cross_validation(X, y, random_state=42, k=100)
    mean_accuracy = np.mean(accuracies)
    nb_accuracies.append(mean_accuracy)
#print(nb_accuracies)
from scipy.stats import ttest_1samp

weka_accuracies = [0.906667, 0.6, 0.81]
weka_baseline=1/3
alpha=0.05
p_val=float()
for i, dataset_name in enumerate(["Hayes-Roth", "Car Evaluation", "Breast Cancer"]):
    t_stat, p_val = ttest_1samp(nb_accuracies, weka_baseline)
    print(f"{dataset_name}:")
    print(f"  Naive Bayes mean accuracy: {nb_accuracies[i]:.4f}")
    print(f"  Weka Naive Bayes mean accuracy: {weka_accuracies[i]:.4f}")
    print(f"  t-statistic: {t_stat:.4f}")
    print(f"  p-value: {p_val:.4f}")
#     print('We reject the null hypothesis.' if p_val < alpha else 'Null hypothesis cannot be rejected.')


Hayes-Roth:
  Naive Bayes mean accuracy: 0.3281
  Weka Naive Bayes mean accuracy: 0.9067
  t-statistic: 0.4860
  p-value: 0.6750
Car Evaluation:
  Naive Bayes mean accuracy: 0.1984
  Weka Naive Bayes mean accuracy: 0.6000
  t-statistic: 0.4860
  p-value: 0.6750
Breast Cancer:
  Naive Bayes mean accuracy: 0.6860
  Weka Naive Bayes mean accuracy: 0.8100
  t-statistic: 0.4860
  p-value: 0.6750
