In [1]:
#We will work on a small dataset of about 5,700 emails labelled as spam or not spam (ham).
#The dataset can be found on Kaggle here: https://www.kaggle.com/balakishan77/spam-or-ham-email-classification
#There are much bigger datasets on Kaggle too, with 500,000 emails,
#but for ease of training and understanding we are using this smaller one.

## Loading libraries

In [1]:
import numpy as np 
import pandas as pd

import nltk
from nltk.corpus import stopwords

## Loading the data and checking for nulls and duplicates

In [2]:
# Read the spam/ham e-mail CSV (path is relative to the working directory).
data = pd.read_csv(filepath_or_buffer="spamham.csv")
# Show the first five rows as a quick sanity check of the load.
data.head(n=5)

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [3]:
# Print the column labels, then show (rows, columns) as the cell's output.
column_labels = data.columns
print(column_labels)
data.shape

Index(['text', 'spam'], dtype='object')


(5728, 2)

In [4]:
# Remove fully duplicated rows in place (the first occurrence is kept),
# then count the missing values in each column.
data.drop_duplicates(keep='first', inplace=True)
data.isna().sum()

text    0
spam    0
dtype: int64

In [5]:
# Dimensions of the frame after de-duplication.
row_count, column_count = data.shape
(row_count, column_count)

(5695, 2)

## Tokenizing our dataset and creating a function for that

In [6]:
from nltk.tokenize import RegexpTokenizer

def clean_text(text):
    r"""Normalise raw text into one lowercase, space-separated string of word tokens.

    The text is tokenised with the regular expression ``\w+``, so punctuation
    and whitespace act only as separators and are dropped.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The tokens joined by single spaces and lowercased.
    """
    word_pattern = r'\w+'
    tokens = RegexpTokenizer(word_pattern).tokenize(text)

    # Join back into a sentence: callers expect a string here, not a token list.
    sentence = " ".join(tokens)
    return sentence.lower()

#Unlike previously, we will download only the stopwords corpus
nltk.download('stopwords')
#(stopwords is also imported in the libraries cell at the top of the notebook)
from nltk.corpus import stopwords

from functools import lru_cache


@lru_cache(maxsize=None)
def _english_stopwords():
    """Load the NLTK English stop-word list once and return it as a frozenset."""
    return frozenset(stopwords.words('english'))


def clean_with_stopwords(clean_text):
    """Return the lowercased words of ``clean_text`` that are not English stop words.

    The stop-word list is loaded a single time and cached as a frozenset, so
    repeated calls (one per message when vectorising) neither re-read the list
    nor scan it linearly for every word.

    Parameters
    ----------
    clean_text : str
        Whitespace-separated text, typically the output of ``clean_text()``.
        The parameter name shadows that function inside this body only; it is
        kept unchanged for backward compatibility with keyword callers.

    Returns
    -------
    list of str
        Lowercased words in their original order, stop words removed.
        An empty or whitespace-only input yields an empty list.
    """
    stop_words = _english_stopwords()

    # Lowercase each word once, then filter against the cached set.
    lowered_words = (word.lower() for word in clean_text.split())
    return [word for word in lowered_words if word not in stop_words]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
def process_text(text):
    """Run the full preprocessing pipeline on one raw message.

    The text is first normalised by ``clean_text`` and the result is then
    passed to ``clean_with_stopwords``.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    list of str
        Whatever ``clean_with_stopwords`` returns for the cleaned text
        (the list of remaining tokens).
    """
    return clean_with_stopwords(clean_text(text))

In [9]:
# Preview the preprocessing on the first five messages only.
preview_messages = data['text'].head()
preview_messages.apply(process_text)

0    [subject, naturally, irresistible, corporate, ...
1    [subject, stock, trading, gunslinger, fanny, m...
2    [subject, unbelievable, new, homes, made, easy...
3    [subject, 4, color, printing, special, request...
4    [subject, money, get, software, cds, software,...
Name: text, dtype: object

## Vectorizing our data

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

# Keep a handle on the fitted vectorizer: its learned vocabulary is needed to
# transform any new message into the same feature space the classifier uses.
# process_text is passed as the analyzer, so it produces the tokens to count.
# NOTE(review): the vocabulary is fitted on every row, i.e. before the
# train/test split, so test-set words are part of the feature space.
vectorizer = CountVectorizer(analyzer=process_text)
messages_in_vector = vectorizer.fit_transform(data['text'])

In [11]:
# (number of messages, number of distinct tokens) of the count matrix.
n_messages, n_terms = messages_in_vector.shape
(n_messages, n_terms)

(5695, 37189)

## Train-Test Split

In [12]:
from sklearn.model_selection import train_test_split
# Features are the token-count matrix; labels are the values of the "spam" column.
X, y = messages_in_vector, data["spam"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=7,
)

In [13]:
# Print one shape per line for each piece of the split.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)), sep="\n")

(4556, 37189)
(1139, 37189)
(4556,)
(1139,)


## Creating our model

In [14]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the training counts.
# fit() returns the estimator itself, so the bare name below displays it.
spam_filter = MultinomialNB().fit(X_train, y_train)
spam_filter

MultinomialNB()

In [15]:
from sklearn.metrics import confusion_matrix, classification_report

# Score the held-out messages, then tabulate true labels against predicted labels.
actual = y_test
predictions = spam_filter.predict(X_test)

confusion_matrix(y_true=actual, y_pred=predictions)

array([[860,   7],
       [  4, 268]], dtype=int64)

In [16]:
# Per-class precision, recall and F1 for the test-set predictions.
report = classification_report(actual, predictions)
print(report)

              precision    recall  f1-score   support

           0       1.00      0.99      0.99       867
           1       0.97      0.99      0.98       272

    accuracy                           0.99      1139
   macro avg       0.98      0.99      0.99      1139
weighted avg       0.99      0.99      0.99      1139

