#### Importing the libraries

In [115]:
import pandas as pd
import numpy as np

#### Exploratory Data Analysis

In [116]:
DATA_CSV = 'spam_ham_dataset.csv'

In [117]:
df = pd.read_csv(DATA_CSV) 

In [118]:
df.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [119]:
df.shape

(5171, 4)

In [120]:
df.drop('Unnamed: 0', axis=1, inplace=True)

In [121]:
df.columns = ['category', 'message', 'spam']

In [122]:
df.groupby('category').describe()

Unnamed: 0_level_0,spam,spam,spam,spam,spam,spam,spam,spam
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
ham,3672.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
spam,1499.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0


In [123]:
df.head()

Unnamed: 0,category,message,spam
0,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,spam,"Subject: photoshop , windows , office . cheap ...",1
4,ham,Subject: re : indian springs\r\nthis deal is t...,0


#### Train test split

In [124]:
from sklearn.model_selection import train_test_split

In [125]:
X_train, X_test, y_train, y_test = train_test_split(df.message,df.spam)

#### Vectorization

In [126]:
from sklearn.feature_extraction.text import CountVectorizer

In [127]:
v = CountVectorizer()
X_train_count = v.fit_transform(X_train.values)
X_train_count.toarray()[:5]

array([[0, 0, 0, ..., 0, 0, 0],
       [3, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [3, 1, 0, ..., 0, 0, 0]])

In [128]:
v.vocabulary_

{'subject': 37129,
 'fw': 18417,
 'enron': 16030,
 'forms': 18004,
 'weehaukan': 41210,
 'fax': 17176,
 'is': 22537,
 'on': 28924,
 'the': 38250,
 'form': 17983,
 'but': 9061,
 'here': 20368,
 'it': 22598,
 'again': 4707,
 '201': 973,
 '558': 2380,
 '4622': 2070,
 'thanks': 38234,
 'notice': 28331,
 'regarding': 32986,
 'entry': 16101,
 'of': 28708,
 'orders': 29134,
 'and': 5448,
 'instructions': 22160,
 'please': 30747,
 'do': 14521,
 'not': 28316,
 'transmit': 39008,
 'or': 29101,
 'your': 42457,
 'painewebber': 29540,
 'account': 4206,
 'by': 9154,
 'mail': 25557,
 'transmitted': 39010,
 'will': 41508,
 'be': 7228,
 'accepted': 4165,
 'responsible': 33423,
 'for': 17923,
 'carrying': 9647,
 'out': 29300,
 'such': 37192,
 'privacy': 31460,
 'confidentiality': 11608,
 'reserves': 33361,
 'right': 33717,
 'to': 38678,
 'monitor': 27100,
 'review': 33552,
 'content': 11841,
 'all': 5060,
 'communications': 11327,
 'sent': 35077,
 'received': 32735,
 'its': 22625,
 'employees': 15865,
 

#### Training a model

In [129]:
from sklearn.naive_bayes import MultinomialNB

In [130]:
spam_detect_model = MultinomialNB().fit(X_train_count,y_train)

In [131]:
emails = [
    "Hey! We're going to play football, do you want to go?",
    "Upto 20% discount on parking, exclusive offer just for you. Don't miss this reward!"
]
emails_count = v.transform(emails)
spam_detect_model.predict(emails_count)

array([0, 1])

#### Model performance

In [None]:
X_test_count = v.transform(X_test)
spam_detect_model.score(X_test_count, y_test)

0.9791183294663574

#### Sklearn Pipeline

In [None]:
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', MultinomialNB())
])

In [None]:
pipeline.fit(X_train,y_train)

Pipeline(steps=[('vectorizer', CountVectorizer()),
                ('classifier', MultinomialNB())])

In [None]:
predictions = pipeline.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

In [None]:
print(classification_report(predictions,y_test))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99       909
           1       0.95      0.98      0.97       384

    accuracy                           0.98      1293
   macro avg       0.97      0.98      0.98      1293
weighted avg       0.98      0.98      0.98      1293

