In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

In [2]:
data_json_file = 'SpamData/01_Processing/email-text-data.json'

In [3]:
data = pd.read_json(data_json_file)

In [4]:
data.tail()

Unnamed: 0,CATEGORY,MESSAGE,FILE_NAME
5791,0,"I'm one of the 30,000 but it's not working ver...",00609.dd49926ce94a1ea328cce9b62825bc97
5792,0,Damien Morton quoted:\n\n>W3C approves HTML 4 ...,00957.e0b56b117f3ec5f85e432a9d2a47801f
5793,0,"On Mon, 2002-07-22 at 06:50, che wrote:\n\n\n\...",01127.841233b48eceb74a825417d8d918abf8
5794,0,"Once upon a time, Manfred wrote :\n\n\n\n> I w...",01178.5c977dff972cd6eef64d4173b90307f0
5795,0,"If you run Pick, and then use the ""New FTOC"" b...",00747.352d424267d36975a7b40b85ffd0885e


In [5]:
vectorizer  = CountVectorizer(stop_words='english')

In [6]:
all_features = vectorizer.fit_transform(data.MESSAGE)

In [7]:
all_features.shape

(5796, 102694)

In [8]:
vectorizer.vocabulary_

{'dear': 32719,
 'homeowner': 48034,
 'rates': 76350,
 'lowest': 59365,
 'point': 72297,
 '40': 7824,
 'years': 98506,
 'help': 47200,
 'best': 23129,
 'rate': 76347,
 'situation': 82318,
 'matching': 60930,
 'needs': 64750,
 'hundreds': 48607,
 'lenders': 58021,
 'home': 48006,
 'improvement': 51399,
 'refinance': 77074,
 'second': 80968,
 'mortgage': 63026,
 'equity': 38990,
 'loans': 59058,
 'perfect': 70478,
 'credit': 30975,
 'service': 81359,
 '100': 1496,
 'free': 42773,
 'owners': 68715,
 'new': 64988,
 'buyers': 25617,
 'obligation': 66813,
 'just': 55049,
 'quick': 75547,
 'simple': 82172,
 'form': 42425,
 'jump': 55000,
 'start': 84135,
 'future': 43330,
 'plans': 71939,
 'today': 88039,
 'visit': 92921,
 'http': 48497,
 '61': 10092,
 '145': 2275,
 '116': 1873,
 '186': 2748,
 'user0201': 91339,
 'index': 51639,
 'asp': 20429,
 'afft': 17606,
 'qm10': 75108,
 'unsubscribe': 90955,
 'light': 58472,
 'watch': 94281,
 'attention': 20740,
 'computer': 29755,
 'users': 91367,
 'sp

In [9]:
x_train, x_test , y_train ,y_test = train_test_split(all_features, data.CATEGORY, 
                                                     test_size=0.3, random_state=88)

In [10]:
x_train.shape

(4057, 102694)

In [11]:
classifier = MultinomialNB()

In [12]:
classifier.fit(x_train, y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [13]:
#number of students correctly predicted
nr_correct = (y_test == classifier.predict(x_test)).sum()

#number of students incorrectly predicted
nr_incorrect = (y_test != classifier.predict(x_test)).sum()

#fraction of documents classified incorrectly == Accuracy
fraction_wrong  = nr_incorrect/(nr_correct + nr_incorrect )

In [14]:
print(f'{nr_correct} documents classified correctly')
print(f'{nr_incorrect} documents classified incorrectly')
print(f'{1-fraction_wrong:.2%} fraction of documents classified incorrectly')


1660 documents classified correctly
79 documents classified incorrectly
95.46% fraction of documents classified incorrectly


In [15]:
recall_score(y_test, classifier.predict(x_test))

0.8646209386281588

In [16]:
precision_score(y_test, classifier.predict(x_test))

0.9917184265010351

In [17]:
f1_score(y_test, classifier.predict(x_test))

0.9238187078109933

In [21]:
example = ["I want to gets some free viagra",
           "Lets go to Murree",
           "I will go to learn python",
           "need Mortage? reply to arrange a call with spacialist and get a  quote"]

In [22]:
doc_term_matrix = vectorizer.transform(example)

In [23]:
classifier.predict(doc_term_matrix)

array([0, 0, 0, 1])