## Import all required libraries

In [9]:
import pickle
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

## Load Dataset from File

In [39]:
# Read the spam dataset from the local CSV, decoding it as Latin-1, and preview
# the first rows to confirm the 'class' and 'message' columns are present.
# NOTE(review): the preview also shows 'Unnamed: N' columns that appear empty;
# only 'class' and 'message' are used later in this notebook.
data = pd.read_csv("./spam-dataset.csv", encoding= 'latin-1')
data.head()

Unnamed: 0,class,message,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


## Verify dataset is correctly labeled

In [43]:
# Print the first five messages for each label so the spam/ham tagging can be
# checked by eye.
for label in ('spam', 'ham'):
    print(data.loc[data['class'] == label].head()['message'])

2     Free entry in 2 a wkly comp to win FA Cup fina...
5     FreeMsg Hey there darling it's been 3 week's n...
8     WINNER!! As a valued network customer you have...
9     Had your mobile 11 months or more? U R entitle...
11    SIX chances to win CASH! From 100 to 20,000 po...
Name: message, dtype: object
0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
6    Even my brother is not like to speak with me. ...
Name: message, dtype: object


## Set Feature as 'x' and Label as 'y'

In [13]:
# x holds the raw message texts (feature); y holds the matching 'class' labels.
x, y = (np.array(data[column]) for column in ("message", "class"))

## Using CountVectorizer to Vectorize Input Feature Texts

In [14]:
# Learn the token vocabulary from every message and encode the messages as a
# document-term count matrix (one row per message).
# NOTE(review): the vocabulary is fitted on the full dataset before the
# train/test split in the next section, so tokens from held-out rows influence
# the feature space — consider splitting first and fitting on the training
# messages only.
cv = CountVectorizer()
X = cv.fit_transform(x)

## Split Vectorized Dataset and Fit to Model

In [15]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Fit a multinomial Naive Bayes classifier on the training counts. The cell
# ends on the fit call so the fitted estimator is shown as the cell output.
model = MultinomialNB()
model.fit(X_train, y_train)

MultinomialNB()

## Export CountVectorizer and Model

In [11]:
# Persist the fitted vectorizer and the trained classifier so they can be
# reloaded later without retraining.
# The context managers guarantee each file is flushed and closed, even if
# pickling raises part-way through.
with open("./cv.pickle", "wb") as cv_file:
    pickle.dump(cv, cv_file)

with open("./spam-detection-model.sav", "wb") as model_file:
    pickle.dump(model, model_file)

## Import CountVectorizer and Model From Saved Files

In [18]:
# Reload the vectorizer and classifier written by the export step.
# SECURITY: pickle.load can execute arbitrary code embedded in the file, so
# only load artifacts that were produced by this notebook or another trusted
# source.
# The context managers close each file handle as soon as it has been read.
with open("./cv.pickle", "rb") as cv_file:
    cv = pickle.load(cv_file)

with open("./spam-detection-model.sav", "rb") as model_file:
    model = pickle.load(model_file)

## Test Model with Sample Text

### Non-spam Text Example

In [35]:
# Sample listing to classify.
text = """
FREE DONATION GIVEAWAY
Need gone
works good
Just smashed on the back other then that it’s works completely fine
Open sim
£80-100"""
# Collapse the listing onto a single line.
text = text.replace('\n', ' ')

# Encode the text with the fitted vocabulary. The sparse row is passed straight
# to predict(), which accepts sparse input, so no dense copy is materialised.
vectorised_input = cv.transform([text])
print(model.predict(vectorised_input))

['ham']


In [62]:
# Sample listing to classify.
text = """Garden items for free works just fine
Small drawers
39/40, height 61cm"""
# Collapse the listing onto a single line.
text = text.replace('\n', ' ')

# Encode the text with the fitted vocabulary. The sparse row is passed straight
# to predict(), which accepts sparse input, so no dense copy is materialised.
vectorised_input = cv.transform([text])
print(model.predict(vectorised_input))

['ham']


### Spam Text Example

In [40]:
# Sample promotional message to classify.
text = """
FREE GIVEAWAY
Call 070239381 CODE FREENOW
If you sign up now you will win a 1000$ prize GUARANTEED. 
"""
# Collapse the message onto a single line.
text = text.replace('\n', ' ')

# Encode the text with the fitted vocabulary. The sparse row is passed straight
# to predict(), which accepts sparse input, so no dense copy is materialised.
vectorised_input = cv.transform([text])
print(model.predict(vectorised_input))

['spam']
