In [1]:
from collections import defaultdict

import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [2]:
# Load the spam dataset: 'label' is the target, every other column is a feature.
df = pd.read_csv('spam_dataset.csv')
y = np.array(df['label'])
x = np.array(df.drop('label', axis=1))
# NOTE(review): the preview lists an 'Unnamed: 0' column. If that is a row index
# stored in the CSV it is currently being fed to the model as a feature -- confirm,
# and load with index_col=0 if so.
# NOTE(review): message_length and num_special_chars hold counts rather than 0/1
# flags; a classifier that only tests `value == 1` treats almost all of their
# values as "absent" -- consider binarizing or dropping them.
df.head()

Unnamed: 0,money,free,win,prize,discount,offer,urgent,cash,guaranteed,click,...,act_now,apply_now,risk_free,winner,bonus,million,message_length,contains_url,num_special_chars,label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,58,1,1,0
1,1,1,1,1,0,1,1,1,1,1,...,1,1,0,1,1,1,250,1,9,1
2,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,283,1,12,1
3,0,0,1,1,0,0,0,0,0,1,...,0,0,0,0,0,1,42,0,4,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,24,0,6,0


In [3]:
# Hold out 20% of the rows for evaluation. The seed is pinned so that
# "Restart Kernel -> Run All" reproduces the same split and the same metrics.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [4]:
class NaiveBayes:
    """Bernoulli Naive Bayes classifier over binary "presence" features.

    A feature counts as present for a sample only when its value equals 1;
    every other value (0, counts, lengths, ...) is treated as absent.

    Attributes:
        class_priors: dict mapping class label -> P(class), estimated from the
            training labels.
        words_probs: dict mapping (feature_idx, class label) -> smoothed
            P(feature == 1 | class).
        classes: sorted array of the distinct labels seen by ``fit``
            (``None`` until ``fit`` has been called).
    """

    def __init__(self):
        self.class_priors = {}
        # Plain dict: a missing key should fail loudly instead of silently
        # materialising an empty dict that later breaks inside np.log.
        self.words_probs = {}
        self.classes = None
        # (n_classes, n_features) matrix mirroring words_probs; used by predict.
        self._feature_probs = None

    def fit(self, x, y):
        """Estimate class priors and per-class feature probabilities.

        Args:
            x: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of n_samples class labels.

        Raises:
            ValueError: if ``x`` is not 2-D, if ``x`` and ``y`` disagree on the
                number of samples, or if the training set is empty.
        """
        x = np.asarray(x)
        y = np.asarray(y)
        if x.ndim != 2:
            raise ValueError(f"x must be 2-D (n_samples, n_features), got ndim={x.ndim}")
        if y.ndim != 1 or y.shape[0] != x.shape[0]:
            raise ValueError(
                f"x and y disagree on the number of samples: {x.shape[0]} vs {y.shape}"
            )
        n_rows = y.shape[0]
        if n_rows == 0:
            raise ValueError("cannot fit on an empty training set")

        self.classes = np.unique(y)
        # Start from a clean slate so refitting never keeps stale classes/features.
        self.class_priors = {}
        self.words_probs = {}

        present = (x == 1)
        feature_probs = np.empty((len(self.classes), x.shape[1]), dtype=float)
        for row, clas in enumerate(self.classes):
            in_class = (y == clas)
            n_in_class = np.sum(in_class)
            self.class_priors[clas] = n_in_class / n_rows
            # Laplace (add-one) smoothing over the two outcomes present/absent
            # keeps every probability strictly inside (0, 1), so logs stay finite.
            feature_probs[row] = (present[in_class].sum(axis=0) + 1) / (n_in_class + 2)
            for feature_idx, prob in enumerate(feature_probs[row]):
                self.words_probs[(feature_idx, clas)] = prob
        self._feature_probs = feature_probs

    def predict(self, x):
        """Return the most probable class label for each row of ``x``.

        Args:
            x: 2-D array-like of shape (n_samples, n_features) with the same
                number of features that was used in ``fit``.

        Returns:
            1-D numpy array of predicted labels, one per sample. When several
            classes tie, the first one in ``self.classes`` order is returned.

        Raises:
            RuntimeError: if called before ``fit``.
            ValueError: if ``x`` is not 2-D or has the wrong number of features.
        """
        if self.classes is None or self._feature_probs is None:
            raise RuntimeError("NaiveBayes.predict() called before fit()")

        x = np.asarray(x)
        if x.shape[0] == 0:
            # Nothing to classify.
            return self.classes[:0]
        n_features = self._feature_probs.shape[1]
        if x.ndim != 2 or x.shape[1] != n_features:
            raise ValueError(
                f"x must have shape (n_samples, {n_features}), got {x.shape}"
            )

        present = (x == 1).astype(float)
        log_on = np.log(self._feature_probs)
        log_off = np.log1p(-self._feature_probs)
        log_prior = np.log(
            np.array([self.class_priors[clas] for clas in self.classes], dtype=float)
        )
        # log P(c) + sum_j [present_j * log p_jc + (1 - present_j) * log(1 - p_jc)],
        # rearranged so the per-sample part is a single matrix product.
        scores = log_prior + log_off.sum(axis=1) + present @ (log_on - log_off).T
        # argmax returns the first maximum, i.e. the first class in sorted order.
        return self.classes[np.argmax(scores, axis=1)]

In [5]:
# Create the classifier and estimate its class priors and per-feature
# probabilities from the training split.
nb = NaiveBayes()
nb.fit(x_train, y_train)

In [6]:
# Predict a label for every row of the held-out test split.
y_pred = nb.predict(x_test)

In [7]:
# Evaluate on the held-out split. confusion_matrix and classification_report are
# imported in the notebook's top import cell.
# Confusion matrix: rows are true labels, columns are predicted labels.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[26  0]
 [ 0 14]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        26
           1       1.00      1.00      1.00        14

    accuracy                           1.00        40
   macro avg       1.00      1.00      1.00        40
weighted avg       1.00      1.00      1.00        40

