### Mount Google Drive

In [None]:
from google.colab import drive

# Make the user's Google Drive available under /content/drive; the data
# loading code below reads the SPECT files from that mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### General code

In [None]:
import pandas as pd

def get_X_and_y(is_train_data=True, data_dir='/content/drive/MyDrive/SPECT'):
    '''Load one split of the SPECT dataset as (X, y).

    The file `SPECT.train` or `SPECT.test` inside `data_dir` is read as a
    header-less CSV whose first column is the label and whose remaining
    columns are the features.

    Parameters:
        is_train_data: read `SPECT.train` when truthy, `SPECT.test` otherwise.
        data_dir: directory containing the SPECT files. Defaults to the
            Google Drive location used by this notebook, so existing calls
            keep working; pass another directory to run outside Colab.

    Returns:
        A tuple (X, y): X is the feature matrix (a DataFrame holding every
        column except the first, which keeps its original integer column
        labels 1..n) and y is the label vector (the first column, a Series).
    '''
    filename_ending = 'train' if is_train_data else 'test'

    data = pd.read_csv(f'{data_dir}/SPECT.{filename_ending}', header=None)

    # Column 0 holds the label; everything after it is a feature.
    return data.iloc[:, 1:], data.iloc[:, 0]

# Load both splits once; every classifier below reuses these four objects.
X_train, y_train = get_X_and_y(is_train_data=True)
X_test, y_test = get_X_and_y(is_train_data=False)

### Applying the scikit-learn implementation of the Naive Bayes classifier

In [None]:
from sklearn.naive_bayes import BernoulliNB

# Instantiate the classifier. Per the scikit-learn documentation, `binarize`
# is the threshold used to map feature values to booleans, so with 0 every
# value greater than 0 is treated as 1.
clfBNB = BernoulliNB(binarize=0)

# Train the model on the training split
clfBNB.fit(X_train, y_train)

# Test the model: the cell output is the score on the held-out test split
# (mean accuracy, per the scikit-learn documentation)
clfBNB.score(X_test, y_test)

0.7486631016042781

### Implementation (from scratch) of the Naive Bayes classifier

In [None]:
class NaiveBayesClassifier:
  '''Naive Bayes classifier for binary (0/1) features and binary labels.

  Probabilities are plain relative frequencies counted on the training data
  (no smoothing). After `fit`, `pbrf[f'y_is_{c}']` holds for class `c`:
    'X_is_1': per-feature P(feature = 1 | y = c)
    'X_is_0': per-feature P(feature = 0 | y = c)
    'prob':   the class prior P(y = c)

  Features are addressed by position, so any column labels (or a plain list
  of values in `predict`) are accepted.
  '''

  def __init__(self):
    # State lives on the instance: a class-level dict would be shared, and
    # overwritten, by every classifier created from this class.
    self.nr_of_features = 0

    # probabilities based on relative frequencies
    self.pbrf = {
        'y_is_1': {
            'X_is_1': [],
            'X_is_0': [],
            'prob': 0,
        },
        'y_is_0': {
            'X_is_1': [],
            'X_is_0': [],
            'prob': 0,
        }
    }

  @staticmethod
  def _as_bit(value):
    '''Return `value` as the int 0 or 1; raise ValueError for anything else.'''
    bit = int(value)

    if bit != value or bit not in (0, 1):
      raise ValueError(f'expected a binary value (0 or 1), got {value!r}')

    return bit

  def fit(self, X_train, y_train):
    '''Estimate the class priors and per-feature conditional probabilities.

    Parameters:
      X_train: DataFrame of 0/1 features, one row per sample.
      y_train: 0/1 labels, looked up with the row index labels of X_train.

    Raises:
      ValueError: if X_train has no rows, or a feature/label is not 0 or 1.
    '''
    self.nr_of_features = len(X_train.columns)

    # Per class, how many samples have each feature equal to 1.
    X_is_1 = {
        'y_is_1': [0] * self.nr_of_features,
        'y_is_0': [0] * self.nr_of_features,
    }

    nr_of_samples = {
        'y_is_1': 0,
        'y_is_0': 0,
    }

    for i, x in X_train.iterrows():
      y_is_1_or_0 = f'y_is_{self._as_bit(y_train[i])}'

      nr_of_samples[y_is_1_or_0] += 1

      # Iterate by position so the column labels do not matter.
      for j, value in enumerate(x):
        X_is_1[y_is_1_or_0][j] += self._as_bit(value)

    total_nr_of_samples = nr_of_samples['y_is_1'] + nr_of_samples['y_is_0']

    if total_nr_of_samples == 0:
      raise ValueError('cannot fit on an empty training set')

    pbrf = {}

    for y_key, nr_in_class in nr_of_samples.items():
      if nr_in_class:
        prob_X_is_1 = [count / nr_in_class for count in X_is_1[y_key]]
        prob_X_is_0 = [1 - p for p in prob_X_is_1]
      else:
        # Class never seen: its prior is 0, so its joint probability is 0
        # whatever the features are; avoid dividing by zero.
        prob_X_is_1 = [0.0] * self.nr_of_features
        prob_X_is_0 = [0.0] * self.nr_of_features

      pbrf[y_key] = {
          'X_is_1': prob_X_is_1,
          'X_is_0': prob_X_is_0,
          'prob': nr_in_class / total_nr_of_samples,
      }

    # Assign a fresh dict so refitting never mutates state seen elsewhere.
    self.pbrf = pbrf

  def get_1_or_0_label_prob(self, features, label = 1):
    '''Return the unnormalised probability P(features | label) * P(label).

    Parameters:
      features: the sample's 0/1 feature values in column order (a DataFrame
        row, list or array); only the first `nr_of_features` are read.
      label: the class to evaluate, 1 (default) or 0.
    '''
    values = list(features)
    class_probs = self.pbrf[f'y_is_{label}']

    label_prob = 1

    for j in range(self.nr_of_features):
      label_prob *= class_probs[f'X_is_{self._as_bit(values[j])}'][j]

    label_prob *= class_probs['prob']

    return label_prob


  def predict(self, features):
    '''Return the more probable label (1 or 0) for one sample; ties give 0.'''
    return int(self.get_1_or_0_label_prob(features) > self.get_1_or_0_label_prob(features, 0))

  def score(self, X_test, y_test):
    '''Return the fraction of rows of X_test whose prediction equals y_test.

    Raises:
      ValueError: if X_test has no rows.
    '''
    nr_of_test_samples = len(X_test.index)

    if nr_of_test_samples == 0:
      raise ValueError('cannot score on an empty test set')

    nr_of_correct_predictions = 0

    for i, x in X_test.iterrows():

      if y_test[i] == self.predict(x):
        nr_of_correct_predictions += 1

    return nr_of_correct_predictions / nr_of_test_samples

# Instantiate the from-scratch classifier
clfNB = NaiveBayesClassifier()

# Train the model: estimate the relative-frequency probabilities
clfNB.fit(X_train, y_train)

# Test the model: fraction of test rows whose predicted label is correct
clfNB.score(X_test, y_test)

0.7754010695187166