In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer # vocabulary generation
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import recall_score, precision_score, f1_score

In [2]:
DATA_JSON_FILE = 'C:/github/SpamData/01_Processing/email-text-data.json'

In [3]:
data = pd.read_json(DATA_JSON_FILE)
data.tail()

Unnamed: 0,CATEGORY,MESSAGE,FILE_NAME
995,1,This is a multi-part message in MIME format.\n...,00497.353a61b265f11dd0bae116c0149abbe1
996,1,PROMOTE YOUR PRODUCT OR SERVICE TO MILLIONS TO...,00498.7f293b818e2e46d3a8bad44eda672947
997,1,<html>\n\n<head>\n\n</head>\n\n<body>\n\n\n\n<...,00499.257302b8f6056eb85e0daa37bfcd2c68
998,1,As to\n\n\n\n\n\n\n\nWant to refinance?\n\n\n\...,00500.87320162ab5b79f67978406cf909c3d1
999,1,"Dear Sirs,\n\nWe know your esteemed company in...",00501.32679091b0520132ad888ef3b134ce48


In [4]:
data.shape # not in index order

(5847, 3)

In [5]:
data.sort_index(inplace=True) # sort according to index

In [6]:
data.tail() # last five rows

Unnamed: 0,CATEGORY,MESSAGE,FILE_NAME
5842,0,http://news.bbc.co.uk/1/hi/england/2515127.stm...,01396.61983fbe6ec43f55fd44e30fce24ffa6
5843,0,"> >-- be careful when using this one.) Also, t...",01397.9f9ef4c2a8dc012d80f2ce2d3473d3b7
5844,0,">>>>> ""SM"" == Skip Montanaro <skip@pobox.com> ...",01398.169b51731fe569f42169ae8f948ec676
5845,0,"So then, ""Mark Hammond"" <mhammond@skippinet.co...",01399.ca6b00b7b341bbde9a9ea3dd6a7bf896
5846,0,"Hi there,\n\n\n\nNow this is probably of no us...",01400.f897f0931e461e7b2e964d28e927c35e


In [7]:
vectorizer = CountVectorizer(stop_words='english')

In [8]:
all_features = vectorizer.fit_transform(data.MESSAGE)

In [9]:
all_features.shape

(5847, 102277)

In [10]:
vectorizer.vocabulary_ # individual word present

{'doctype': 34757,
 'html': 48296,
 'public': 73722,
 'w3c': 93388,
 'dtd': 36243,
 'transitional': 88204,
 'en': 38314,
 'head': 46845,
 'meta': 61469,
 'content': 30159,
 '3d': 6370,
 'text': 86628,
 'charset': 27724,
 '3dwindows': 7282,
 '1252': 2023,
 'http': 48321,
 'equiv': 38871,
 '3dcontent': 6893,
 'ype': 98641,
 'mshtml': 63171,
 '00': 0,
 '2314': 4227,
 '1000': 1495,
 '3dgenerator': 6972,
 'body': 24336,
 'inserted': 51935,
 'calypso': 26491,
 'table': 85768,
 'border': 24524,
 '3d0': 6371,
 'cellpadding': 27303,
 'cellspacing': 27311,
 '3d2': 6510,
 'id': 49648,
 '3d_calyprintheader_': 6743,
 'ules': 89863,
 '3dnone': 7115,
 'style': 84374,
 'color': 29286,
 'black': 23840,
 'display': 34299,
 'width': 95083,
 '100': 1494,
 'tbody': 86106,
 'tr': 88067,
 'td': 86192,
 'colspan': 29309,
 '3d3': 6555,
 'hr': 48197,
 '3dblack': 6848,
 'noshade': 65596,
 'size': 82015,
 '3d1': 6402,
 'end': 38378,
 'font': 42116,
 '000000': 4,
 'face': 40374,
 '3dverdana': 7268,
 'arial': 20078

In [11]:
X_train, X_test, y_train, y_test = train_test_split(all_features, data.CATEGORY)

In [12]:
X_train.shape

(4385, 102277)

In [13]:
X_test.shape

(1462, 102277)

In [14]:
classifier = MultinomialNB() # model

In [15]:
classifier.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [16]:
nr_correct = (y_test == classifier.predict(X_test)).sum()

In [17]:
print(f'{nr_correct} documents classfied correctly')

1391 documents classfied correctly


In [18]:
nr_incorrect = y_test.size - nr_correct

In [19]:
print(f'Number of documents incorrectly classified is {nr_incorrect}')

Number of documents incorrectly classified is 71


In [20]:
fraction_wrong = nr_incorrect / (nr_correct + nr_incorrect)
print(f'The (testing) accuracy of the model is {1-fraction_wrong:.2%}')

The (testing) accuracy of the model is 95.14%


In [21]:
classifier.score(X_test, y_test)

0.9514363885088919

In [22]:
recall_score(y_test, classifier.predict(X_test))

0.8574468085106383

In [23]:
precision_score(y_test, classifier.predict(X_test))

0.9901719901719902

In [24]:
f1_score(y_test, classifier.predict(X_test))

0.9190421892816419

In [31]:
examples = ['i cant pick the call right now, please send a message',
         'congratulations youre awarded $500',
         'need a mortgage? Reply to arrange a call with a specialist and get a quote',
         'We are going to implement two techniques',
         'Confirm once youve registered',
         'want bitcoin for free? register now link below']

In [32]:
mail_class = vectorizer.transform(examples)

In [33]:
classifier.predict(mail_class)

array([0, 1, 1, 0, 0, 1], dtype=int64)