## Import all required libraries

In [1]:
import pickle
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

## Load Dataset from File

In [None]:
# Read the labelled message dataset into a DataFrame; the file is decoded
# as latin-1 rather than pandas' default encoding.
data = pd.read_csv(
    "./spam-dataset.csv",
    encoding="latin-1",
)
# Last expression of the cell: shows the first rows as a quick sanity check.
data.head()

## Verify dataset is correctly labeled

In [None]:
# Print the first few messages of each class (spam first, then ham) so the
# labelling can be eyeballed.
for label in ("spam", "ham"):
    print(data.loc[data["class"] == label, "message"].head())

## Set Feature as 'x' and Label as 'y'

In [None]:
# x holds the raw message texts (feature), y the matching class labels.
x, y = (np.array(data[column]) for column in ("message", "class"))

## Using CountVectorizer to Vectorize Input Feature Texts

In [None]:
# Learn the vocabulary from every message in x and turn the messages into a
# document-term count matrix X in one pass.
# NOTE(review): the vocabulary is fit on the full dataset, i.e. before the
# train/test split performed later in this notebook.
cv = CountVectorizer()
X = cv.fit_transform(raw_documents=x)

## Split Vectorized Dataset and Fit to Model

In [None]:
# Multinomial Naive Bayes classifier trained on the token counts.
model = MultinomialNB()

# Hold out 33% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Fit on the training portion only (kept as the cell's last expression).
model.fit(X_train, y_train)

## Export CountVectorizer and Model

In [None]:
# Persist the fitted vectorizer and the trained classifier so they can be
# reloaded together later. The `with` blocks guarantee each file is flushed
# and closed even if pickling raises, instead of leaving the handle to the
# garbage collector.
with open("./cv.pickle", "wb") as cv_file:
    pickle.dump(cv, cv_file)

with open("./spam-detection-model.sav", "wb") as model_file:
    pickle.dump(model, model_file)

## Import CountVectorizer and Model From Saved Files

In [2]:
# Restore the vectorizer and classifier written by the export cell.
# SECURITY: unpickling can execute arbitrary code, so only load pickle files
# that were produced by this notebook or come from a trusted source.
# The `with` blocks close each file handle deterministically.
with open("./cv.pickle", "rb") as cv_file:
    cv = pickle.load(cv_file)

with open("./spam-detection-model.sav", "rb") as model_file:
    model = pickle.load(model_file)

## Test Model with Sample Text

### Non-spam Text Examples

In [3]:
# Sample listing text to classify.
text = """
FREE DONATION GIVEAWAY
Need gone
works good
Just smashed on the back other then that it’s works completely fine
Open sim
£80-100"""
# Flatten the sample onto a single line: every newline becomes a space.
text = " ".join(text.split("\n"))

# Encode with the loaded vectorizer (as a dense array) and classify.
vectorised_input = cv.transform([text]).toarray()
prediction = model.predict(vectorised_input)
print(prediction)

['ham']


In [4]:
# Second sample listing text to classify.
text = """Garden items for free works just fine
Small drawers
39/40, height 61cm"""
# Flatten the sample onto a single line: every newline becomes a space.
text = " ".join(text.split("\n"))

# Encode with the loaded vectorizer (as a dense array) and classify.
vectorised_input = cv.transform([text]).toarray()
prediction = model.predict(vectorised_input)
print(prediction)

['ham']


### Spam Text Example

In [5]:
# Sample promotional text (the same three lines repeated three times).
text = """
FREE GIVEAWAY
Call 070239381 CODE FREENOW
If you sign up now you will win a 1000$ prize GUARANTEED. 
FREE GIVEAWAY
Call 070239381 CODE FREENOW
If you sign up now you will win a 1000$ prize GUARANTEED. 
FREE GIVEAWAY
Call 070239381 CODE FREENOW
If you sign up now you will win a 1000$ prize GUARANTEED. 
"""
# Flatten the sample onto a single line: every newline becomes a space.
text = " ".join(text.split("\n"))

# Encode with the loaded vectorizer (as a dense array) and classify.
vectorised_input = cv.transform([text]).toarray()
prediction = model.predict(vectorised_input)
print(prediction)

['spam']
