# Building an SMS spam classifier with Multinomial Naïve Bayes

Link to the [dataset](https://www.kaggle.com/uciml/sms-spam-collection-dataset)

In [33]:
from pathlib import Path
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

import joblib

In [34]:
# Resolve locations relative to the directory the notebook is launched from.
wd_path = Path.cwd()
data_path = wd_path / "data"

In [35]:
# Load the raw SMS collection; the file is read as latin-1.
spam = pd.read_csv(
    data_path / "datasets_483_982_spam.csv",
    encoding="latin-1",
)

In [36]:
# Peek at the first five rows to see the raw column layout.
spam.head(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [37]:
# Column dtypes and non-null counts, to see which columns actually carry data.
spam.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


## We keep v1 as the target and v2 as the message

In [38]:
# Keep only the label (v1) and the message text (v2).
# The explicit copy makes `spam` an independent frame, so adding columns to it
# later does not raise pandas' SettingWithCopyWarning.
spam = spam[["v1", "v2"]].copy()

In [39]:
# Summary statistics for both text columns (count, unique values, most frequent value).
spam.describe(include="all")

Unnamed: 0,v1,v2
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


The labels are imbalanced: ham messages far outnumber spam messages.

In [40]:
# Number of messages per label.
spam.loc[:, "v1"].value_counts()

ham     4825
spam     747
Name: v1, dtype: int64

Mapping the label: ham=0, spam=1

In [41]:
# Encode the label as an integer target: ham=0, spam=1.
# `assign` builds a new frame rather than writing into a column subset of the
# loaded data, which avoids pandas' SettingWithCopyWarning.
spam = spam.assign(target=spam["v1"].map({"ham": 0, "spam": 1}))

# `map` yields NaN for any label missing from the dict; fail loudly here
# instead of passing NaN targets on to the classifier.
assert spam["target"].notna().all(), "unexpected label in v1"

In [42]:
# Features are the raw message texts; the target is the 0/1 label.
X, y = spam["v2"], spam["target"]

Using a count vectorizer to tokenize and transform the messages into a matrix

In [43]:
# Bag-of-words vectorizer with scikit-learn's default settings.
cv = CountVectorizer()

In [44]:
# Vectorize straight from the message column so this cell can be re-run:
# reading from `X` would feed the already-vectorized matrix back into the
# vectorizer on a second execution.
# NOTE(review): the vocabulary is learned from every message, including the
# ones later held out as the test set; fitting on the training split only
# would give a stricter evaluation.
X = cv.fit_transform(spam["v2"])

In [45]:
# Hold out 33% of the messages for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [46]:
# Multinomial Naive Bayes classifier with default hyperparameters.
clf = MultinomialNB()

In [47]:
# Train the classifier on the training split.
clf.fit(X=X_train, y=y_train)

MultinomialNB()

In [48]:
# Score the classifier on the held-out split.
clf.score(X=X_test, y=y_test)

0.9793365959760739

The score is quite good without too much pre-processing

Let's look at the classification report, since a single score can hide weak spam detection when the labels are imbalanced.

In [49]:
# Predicted labels for the held-out messages.
y_pred = clf.predict(X=X_test)

In [50]:
# Per-class precision / recall / F1 (0 = ham, 1 = spam).
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

precision    recall  f1-score   support

           0       0.99      0.99      0.99      1587
           1       0.93      0.92      0.92       252

    accuracy                           0.98      1839
   macro avg       0.96      0.95      0.96      1839
weighted avg       0.98      0.98      0.98      1839



Using joblib to create pickles from the count vectorizer and the model

In [51]:
# Persist the trained classifier. The output directory is created first so the
# dump does not fail with FileNotFoundError when `model/` does not exist yet.
clf_dir = wd_path.joinpath("model")
clf_dir.mkdir(parents=True, exist_ok=True)
joblib.dump(clf, clf_dir.joinpath("spam_clf.pkl"))

['d:\\github\\jedha-fs-s8-project\\sms spam detector\\model\\spam_clf.pkl']

In [52]:
# Persist the fitted vectorizer next to the model. The output directory is
# created first so the dump does not fail with FileNotFoundError when `model/`
# does not exist yet.
cv_dir = wd_path.joinpath("model")
cv_dir.mkdir(parents=True, exist_ok=True)
joblib.dump(cv, cv_dir.joinpath("cv.pkl"))

['d:\\github\\jedha-fs-s8-project\\sms spam detector\\model\\cv.pkl']

In [53]:
# Sample message used to smoke-test the fitted vectorizer and classifier.
X_t0 = (
    "Funny fact Nobody teaches volcanoes 2 erupt, tsunamis 2 arise, "
    "hurricanes 2 sway aroundn no 1 teaches hw 2 choose a wife "
    "Natural disasters just happens"
)

In [54]:
# Vectorize the sample with the in-memory vectorizer, then classify it.
sample_counts = cv.transform([X_t0])
clf.predict(sample_counts)

array([1], dtype=int64)

In [55]:
# Reload the vectorizer that was dumped to model/cv.pkl above.
# NOTE: joblib.load unpickles the file, so only load files from a trusted source.
cv2 = joblib.load(wd_path.joinpath("model/cv.pkl"))

Testing the count vectorizer pickle

In [56]:
# Same prediction as before, but the sample goes through the reloaded vectorizer.
reloaded_counts = cv2.transform([X_t0])
clf.predict(reloaded_counts)

array([1], dtype=int64)