## Spam Filter using Naive Bayes

### Import Library

In [34]:
# !pip install spacy
# !python -m spacy download en_core_web_sm
# !pip install pandas
# !pip install numpy
# !pip install -U scikit-learn
import pandas as pd
import math
import numpy as np
import spacy
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

### Load DataSet

In [35]:
nlp = spacy.load('en_core_web_sm')
spam_df = pd.read_csv("spam.csv")
spam_df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [36]:
X = spam_df.Message
Y = spam_df['Category'].apply(lambda x: 1 if x == 'spam' else 0)

### 0. Data Preprocessing

In [37]:
def preprocess_text(text):
    doc = nlp(text)
    filtered_text = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    return ' '.join(filtered_text)

X_new = X.apply(lambda x: preprocess_text(x))

### 1. Extract Features

In [38]:
cv = CountVectorizer()
def nb_train_test_split(X, Y):
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.33)
    x_train_count = cv.fit_transform(x_train.values).toarray()
    x_test_count = cv.transform(x_test.values).toarray()
    y_train_array = np.array(y_train)
    y_test_array = np.array(y_test)
    return x_train_count, x_test_count, y_train_array, y_test_array

x_train, x_test, y_train, y_test = nb_train_test_split(X_new,Y)

### Bayes Theorem

Bayes formulation
$$\begin{equation}
P\left(A|B\right)= \dfrac{P\left(B|A\right)P\left(A\right)}{P\left(B\right)}
\end{equation}$$

If $B$ is our data $\mathcal{D}$, $A$ and $w$ are parameters we need to estimate:

$$\begin{align}
    \underbrace{P(w|\mathcal{D})}_{Posterior}= \dfrac{1}{\underbrace{P(\mathcal{D})}_{Normalization}} \overbrace{P(\mathcal{D}|w)}^{\text{Likelihood}} \overbrace{P(w)}^{Prior}
    \end{align}$$

### 2. Train Our Model

In [39]:
class NaiveBayesModel:
    def __init__(self):
        self.hist = dict()
        self.mean = dict()
        self.std = dict()
        
    def _gauss(self, std, mean, x):
        f = (1 / (std * math.sqrt(2 * math.pi))) * math.exp(-((x - mean)**2) / (2 * std**2))
        return f
    
    def _likelihood(self, data, hypo):
        std = self.std[hypo]
        mean = self.mean[hypo]

        res = 1
        n_attr = len(data)

        for j in range(n_attr):
            p_xi_hypo = self._gauss(std[j], mean[j], data[j])
            res *= p_xi_hypo

        return res
    
    def _update(self, data):
        for hypo in self.hist.keys():
            self.hist[hypo] = self._likelihood(data, hypo) * self.hist[hypo]

        s = sum(self.hist.values())
        for hypo in self.hist.keys():
            self.hist[hypo] = self.hist[hypo] / s
         
    def _max_hypo(self):
        """
        Find label with the highest probability
        -----------
        return: label of data
        """ 

        max_hypo = 0
        for hypo in self.hist.keys():
            if self.hist[hypo] > self.hist[max_hypo]:
                max_hypo = hypo
        return max_hypo
    
    def _predict(self, data):
        """
        Predict label for only 1 data sample
        ------------
        Parameters:
        data: data sample
        -----------
        return: label of data
        """ 
        self._update(data)
        return self._max_hypo()
        
    def predict(self, data):
        """Parameters:
        Data: test data
        ----------
        return labels of test data"""
        
        pred = np.zeros(len(data))
        for i, c in enumerate(data):
            pred[i] = self._predict(c)
        return pred

    def score(self, X_test, y_test):
        pred = self.predict(X_test)
        return sum(y_test == pred) / len(pred)
    
    def fit(self, X, y):
        """Parameters:
        X: training data
        y: labels of training data"""

        n = len(X)
        # number of spam species
        n_species = len(set(y))

        for hypo in range(0, n_species):

            y_hypo = [label == hypo for label in y]

            rows = X[y_hypo]

            count_each_hypo = np.sum(y_hypo)

            # Prior
            probability = count_each_hypo / n

            self.hist[hypo] = probability

            # Each hypothesis represented by its mean and standard derivation
            # mean and standard derivation should be calculated for each column (or each attribute)

            count_of_attribute = len(X[0])

            mean_hypo = [0.0] * count_of_attribute
            std_hypo = [0.0] * count_of_attribute

            # traversal and calculate mean, std for each column (attribute)
            for j in range(count_of_attribute):
                col_j = [row[j] for row in X]
                mean_hypo[j] = sum(col_j) / len(col_j)
                std_hypo[j] = (sum((x - mean_hypo[j])**2 for x in col_j) / len(col_j))**0.5

            self.mean[hypo] = mean_hypo
            self.std[hypo] = std_hypo

In [40]:
model = NaiveBayesModel()
model.fit(x_train, y_train)

### 3. Evaluate Our Model

In [41]:
accuracy_score = model.score(x_test, y_test)

  res *= p_xi_hypo
  self.hist[hypo] = self.hist[hypo] / s
  res *= p_xi_hypo


In [42]:
print("Accuracy of our gaussian naive bayes model: ", accuracy_score)

Accuracy of our gaussian naive bayes model:  0.8814573137574769
