## Spam Filter using Naive Bayes

### Import Libraries

In [99]:
# !pip install spacy
# !python -m spacy download en_core_web_sm
import pandas as pd
import numpy as np
import spacy
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

### Load Dataset

In [100]:
# spaCy English pipeline; the preprocessing step below uses it for
# lemmas and stop-word / punctuation flags.
nlp = spacy.load("en_core_web_sm")

# Labelled messages; the `Category` and `Message` columns are read below.
spam_df = pd.read_csv('spam.csv')
spam_df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [101]:
# Features: the raw message text.
X = spam_df['Message']
# Target: 1 for rows labelled 'spam', 0 for everything else.
Y = spam_df['Category'].eq('spam').astype('int64')

### 0. Data Preprocessing

In [102]:
def preprocess_text(text):
    """Lemmatise ``text`` and drop stop words and punctuation.

    The text is run through the module-level spaCy pipeline ``nlp``. The
    lemma of every token that is neither a stop word nor punctuation is
    kept, and the kept lemmas are returned joined by single spaces.
    """
    lemmas = []
    for tok in nlp(text):
        # Skip tokens that carry no content for the bag-of-words features.
        if tok.is_stop or tok.is_punct:
            continue
        lemmas.append(tok.lemma_)
    return ' '.join(lemmas)

# Clean every message; `preprocess_text` is passed directly, no wrapper needed.
X_new = X.apply(preprocess_text)

### 1. Extract Features

In [103]:
# Shared vectorizer: fitted inside `nb_train_test_split` on the training split.
cv = CountVectorizer()
def nb_train_test_split(X, Y, test_size=0.33, random_state=42):
    """Split the data and convert each split into token-count arrays.

    The module-level ``cv`` vectorizer is fitted on the training messages
    only and then reused to transform the test messages, so the vocabulary
    is learned without looking at the held-out split.

    Parameters
    ----------
    X : pandas.Series
        Message texts (``.values`` is read from each split).
    Y : array-like
        Labels aligned with ``X``.
    test_size : float, default 0.33
        Fraction of the rows held out for testing.
    random_state : int or None, default 42
        Seed forwarded to ``train_test_split`` so the split, and therefore
        the reported accuracy, is reproducible across runs. Pass ``None``
        to get a different random split on every call.

    Returns
    -------
    tuple
        ``(x_train_count, x_test_count, y_train_array, y_test_array)``:
        dense count matrices for the train and test messages, followed by
        the matching label arrays.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )
    # Fit on the training messages only; the test split is transformed with
    # the already-learned vocabulary.
    x_train_count = cv.fit_transform(x_train.values).toarray()
    x_test_count = cv.transform(x_test.values).toarray()
    y_train_array = np.array(y_train)
    y_test_array = np.array(y_test)
    return x_train_count, x_test_count, y_train_array, y_test_array

x_train, x_test, y_train, y_test = nb_train_test_split(X_new, Y)

### 2. Train Our Model

In [104]:
# `fit` returns the estimator itself, so construction and training can be chained.
model = MultinomialNB().fit(x_train, y_train)
# Keep the fitted estimator as the cell's displayed value.
model

### 3. Evaluate Our Model

In [105]:
# Mean accuracy of the fitted classifier on the held-out test split.
accuracy_score = model.score(X=x_test, y=y_test)

In [106]:
# The classifier trained above is MultinomialNB, so label the result accordingly.
print("Accuracy of our multinomial naive bayes model: ", accuracy_score)

Accuracy of our gaussian naive bayes model:  0.9853181076672104
