In [3]:
import pandas as pd

#data lodading
dataset_path = "spam.csv"
spam = pd.read_csv(dataset_path, sep=",", encoding="latin-1")
spam = spam.rename(columns={"v1": "class", "v2": "text"})
spam = spam.drop(spam.columns[2:], axis=1)

In [4]:
#defining the text processing method to eliminate punctuation and stopwords

import string

#importing stopwords from the natural language toolkit library 
from nltk.corpus import stopwords

def text_process(mess):
    nopunc =[char for char in mess if char not in string.punctuation]
    nopunc=''.join(nopunc)
    return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

spam['text'].apply(text_process)

0       [Go, jurong, point, crazy, Available, bugis, n...
1                          [Ok, lar, Joking, wif, u, oni]
2       [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3           [U, dun, say, early, hor, U, c, already, say]
4       [Nah, dont, think, goes, usf, lives, around, t...
                              ...                        
5567    [2nd, time, tried, 2, contact, u, U, å£750, Po...
5568                   [Ì, b, going, esplanade, fr, home]
5569                     [Pity, mood, Soany, suggestions]
5570    [guy, bitching, acted, like, id, interested, b...
5571                                   [Rofl, true, name]
Name: text, Length: 5572, dtype: object

In [6]:
#splitting of test and training data

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test  = train_test_split(spam['text'],spam['class'],test_size=0.2, random_state = 42)

In [7]:
#importing our model and methods

#we choose to work with a multinomial Naive Bayes model
from sklearn.naive_bayes import MultinomialNB


#CountVectorizer converts a collection of text documents to a matrix of token counts
from sklearn.feature_extraction.text import CountVectorizer


#The TfidfTransformer converts a collection of documents to a matrix of TF-IDF features. 
# TFIDF is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus
from sklearn.feature_extraction.text import TfidfTransformer



#result presentation
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


#fitting the vectorized X_train to the prediction model
cv = CountVectorizer(max_features = 1500)
cv.fit(X_train)

X_train_cv = cv.transform(X_train)

X_test_cv = cv.transform(X_test)


#fitting the vectorized X train and the Y train to the MNB model
mnb = MultinomialNB(alpha = 0.5)
mnb.fit(X_train_cv,y_train)

y_mnb = mnb.predict(X_test_cv)

print('Naive Bayes Accuracy: ', accuracy_score( y_mnb , y_test))

print('Naive Bayes classification report: ', classification_report(y_mnb, y_test))

Naive Bayes Accuracy:  0.9829596412556054
Naive Bayes classification report:                precision    recall  f1-score   support

         ham       0.99      0.99      0.99       974
        spam       0.91      0.96      0.93       141

    accuracy                           0.98      1115
   macro avg       0.95      0.98      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [13]:
#pipeline creation to accelerate sample predictions

from sklearn.pipeline import Pipeline

pipeline = Pipeline([
   ( 'bow',CountVectorizer(analyzer=text_process)),
    ('tfidf',TfidfTransformer()),
    ('classifier',mnb),
])


pipeline.fit(X_train,y_train)

Pipeline(steps=[('bow',
                 CountVectorizer(analyzer=<function text_process at 0x0000018AFEE72C10>)),
                ('tfidf', TfidfTransformer()),
                ('classifier', MultinomialNB(alpha=0.5))])

In [14]:
pipeline.fit(X_train,y_train)

Pipeline(steps=[('bow',
                 CountVectorizer(analyzer=<function text_process at 0x0000018AFEE72C10>)),
                ('tfidf', TfidfTransformer()),
                ('classifier', MultinomialNB(alpha=0.5))])

In [16]:
import joblib

joblib.dump(mnb, 'spam_detector_model.pkl')
joblib.dump(text_process, 'text_process.pkl')
joblib.dump(pipeline, 'spam_detection_pipeline.pkl')
joblib.dump(X_train, 'xtrain.pkl')
joblib.dump(y_train, 'ytrain.pkl')

['ytrain.pkl']