In [3]:
import pandas as pd
import numpy as np
import spacy

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import  CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, ConfusionMatrixDisplay

In [4]:
df = pd.read_csv('email.csv')
df.head()

Unnamed: 0,text,spam
0,Subject: re : 1 . your thur . / fri . austin t...,0
1,Subject: re : alex ' s paper comments : 1 . ...,0
2,"Subject: rice cfos conference christie , thi...",0
3,Subject: new computers hi lyn : hope things ...,0
4,"Subject: intranet site dear dale / all , tha...",0


In [5]:
df.spam.value_counts()

0    4358
1    1368
Name: spam, dtype: int64

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    df.text, df.spam, test_size=0.2, random_state=42
)

In [112]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(4580,)
(1146,)
(4580,)
(1146,)


In [113]:
type(X_train)

pandas.core.series.Series

In [114]:
X_train[:5]

5000    Subject: all graphics software available , che...
1520    Subject: vlady gorny  barbara ,  i called vlad...
2083    Subject: viagra is it the right medication for...
527     Subject: re : qian ( frank ) feng interview wi...
4318    Subject: notification from sky bank # 6521 - 3...
Name: text, dtype: object

In [115]:
y_train[:5]

5000    1
1520    0
2083    1
527     0
4318    1
Name: spam, dtype: int64

In [116]:
vector = CountVectorizer()
X_train_cv = vector.fit_transform(X_train)
X_train_cv

<4580x33759 sparse matrix of type '<class 'numpy.int64'>'
	with 566886 stored elements in Compressed Sparse Row format>

In [117]:
X_train_cv.toarray()[:5][0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [118]:
X_train_cv.shape

(4580, 33759)

In [119]:
dir(vector)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_char_ngrams',
 '_char_wb_ngrams',
 '_check_feature_names',
 '_check_n_features',
 '_check_stop_words_consistency',
 '_check_vocabulary',
 '_count_vocab',
 '_get_param_names',
 '_get_tags',
 '_limit_features',
 '_more_tags',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_sort_features',
 '_stop_words_id',
 '_validate_data',
 '_validate_params',
 '_validate_vocabulary',
 '_warn_for_unused_params',
 '_white_spaces',
 '_word_ngrams',
 'analyzer',
 'binary',
 'build_analyzer',
 'build_preprocessor',
 'build_tokenizer',
 'decode',
 'decode_error',
 'dtype',
 'encoding',
 'fit',

In [120]:
vector.vocabulary_

{'subject': 29343,
 'all': 4022,
 'graphics': 14680,
 'software': 28409,
 'available': 5183,
 'cheap': 7562,
 'oem': 21968,
 'versions': 32104,
 'good': 14566,
 'morning': 20712,
 'we': 32622,
 'offer': 21978,
 'latest': 18370,
 'packages': 22657,
 'of': 21972,
 'and': 4307,
 'publishinq': 24802,
 'from': 13890,
 'corel': 9034,
 'macromedia': 19296,
 'adobe': 3609,
 'others': 22427,
 '80': 2683,
 'photoshop': 23429,
 'cs': 9396,
 '140': 501,
 'studio': 29301,
 'mx': 20999,
 '2004': 812,
 '120': 414,
 'acrobat': 3449,
 'professionai': 24481,
 '150': 538,
 'premiere': 24208,
 'pro': 24406,
 '90': 2914,
 'corei': 9033,
 'desiqner': 10252,
 '10': 290,
 'quickbooks': 25044,
 'professional': 24483,
 'edition': 11518,
 '75': 2548,
 'pagemaker': 22683,
 '70': 2419,
 'xara': 33353,
 'vl': 32318,
 'audition': 5075,
 'discreet': 10656,
 'max': 19776,
 '115': 397,
 'golive': 14553,
 '135': 487,
 'after': 3751,
 'effects': 11582,
 'standard': 28928,
 '45': 1708,
 'elements': 11728,
 '125': 439,
 'p

In [121]:
x_train_np = X_train_cv.toarray()
x_train_np[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [122]:
np.where(x_train_np[0]!=0)

(array([  290,   369,   397,   414,   439,   487,   501,   538,   812,
          972,  1176,  1708,  1862,  2419,  2548,  2683,  2914,  3449,
         3609,  3751,  3852,  4022,  4307,  5075,  5183,  5604,  6011,
         6663,  7562,  8964,  9033,  9034,  9278,  9396, 10252, 10656,
        11518, 11582, 11728, 13768, 13890, 14553, 14566, 14680, 15642,
        16151, 18370, 18501, 18934, 19240, 19296, 19776, 20700, 20712,
        20752, 20999, 21968, 21972, 21978, 22427, 22657, 22683, 22705,
        23429, 24207, 24208, 24406, 24467, 24481, 24483, 24802, 25044,
        25045, 28093, 28409, 28928, 29301, 29343, 29464, 31330, 32104,
        32318, 32622, 33353], dtype=int64),)

In [123]:
x_train_np[0][290]

1

In [124]:
model = MultinomialNB()
model.fit(X_train_cv, y_train)

MultinomialNB()

In [125]:
x_test_cv = vector.transform(X_test)

In [126]:
y_pred = model.predict(x_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       869
           1       0.99      0.99      0.99       277

    accuracy                           0.99      1146
   macro avg       0.99      0.99      0.99      1146
weighted avg       0.99      0.99      0.99      1146



In [105]:
email =[
    'Subject: undeliverable : home based business for grownups  your message  subject : home based business for grownups  sent : sun , 21 jan 2001 09 : 24 : 27 + 0100  did not reach the following recipient ( s ) :  75 @ tfi . kpn . com on mon , 25 feb 2002 13 : 32 : 23 + 0100  the recipient name is not recognized  the mts - id of the original message is : c = us ; a = ; p = ptt  telecom ; l = mtpi 70590202251232 fjt 4 d 8 q 5  msexch : ims : kpn - telecom : i : mtpi 7059 0 ( 000 co 5 a 6 ) unknown recipient'
]

email_count = vector.transform(email)
model.predict(email_count)

array([1], dtype=int64)