# Email Spam Detection with Natural Language Processing

In [16]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string

In [3]:
# Load the email dataset from the working directory and preview its first rows.
df = pd.read_csv('emails.csv')
df.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [4]:
# (rows, columns) of the loaded frame, before any cleaning.
df.shape

(5728, 2)

In [5]:
# Column labels available in the frame.
df.columns

Index(['text', 'spam'], dtype='object')

### Remove duplicate rows

In [6]:
# Keep only the first occurrence of each fully identical row, then report
# the resulting (rows, columns).
df = df.drop_duplicates()
print(df.shape)

(5695, 2)


### Count the missing values in each column

In [7]:
# Per-column count of null entries.
missing_per_column = df.isna().sum()
print(missing_per_column)

text    0
spam    0
dtype: int64


### Download the stopwords

In [8]:
# Fetch the NLTK stopwords corpus that the tokenizer defined later in this
# notebook reads via stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
def process(text):
    """Tokenize a message for the bag-of-words model.

    Removes every character found in ``string.punctuation``, splits the
    remainder on whitespace, and drops tokens whose lowercase form is in
    NLTK's English stopword list.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    list of str
        The surviving tokens, in their original order and original casing
        (tokens are compared in lowercase but not lowercased in the output).
    """
    # Build the stopword lookup once per call, as a set. Previously
    # stopwords.words('english') was called again for every token and the
    # returned list was scanned linearly, which dominates the runtime when
    # this function is used as a CountVectorizer analyzer over a full corpus.
    stop_words = set(stopwords.words('english'))

    # Strip all punctuation characters in a single pass.
    nopunc = text.translate(str.maketrans('', '', string.punctuation))

    return [word for word in nopunc.split() if word.lower() not in stop_words]

# Sanity-check the tokenizer on the first few emails.
df['text'].head().apply(process)

0    [Subject, naturally, irresistible, corporate, ...
1    [Subject, stock, trading, gunslinger, fanny, m...
2    [Subject, unbelievable, new, homes, made, easy...
3    [Subject, 4, color, printing, special, request...
4    [Subject, money, get, software, cds, software,...
Name: text, dtype: object

### Convert the text into a matrix of token counts

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Use `process` as the analyzer so each email is tokenized by our own
# punctuation/stopword rules, then count token occurrences per email.
# NOTE(review): the vocabulary is fit on every row of df['text'], i.e. before
# the train/test split performed in the next cell - confirm this is intended.
vectorizer = CountVectorizer(analyzer=process)
message = vectorizer.fit_transform(df['text'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
xtrain, xtest, ytrain, ytest = train_test_split(
    message,
    df['spam'],
    test_size=0.20,
    random_state=0,
)
print(message.shape)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes model on the training token counts.
classifier = MultinomialNB()
classifier = classifier.fit(xtrain, ytrain)

### Compare the classifier's predictions with the actual labels on the training set

In [None]:
# Predicted labels for the training rows, followed by the true labels.
train_predictions = classifier.predict(xtrain)
print(train_predictions)
print(ytrain.values)

In [None]:
# Evaluate the model on the training set.
# These scores are computed on the same rows the classifier was fit on, so
# they describe how well the model fits the training data, not how well it
# generalizes.
# Fix: the import previously misspelled `accuracy_score` as `accucary_score`,
# which raised ImportError before anything in this cell could run.
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
pred = classifier.predict(xtrain)
print(classification_report(ytrain,pred))
print()
print('Confusion Matrix: \n', confusion_matrix(ytrain, pred))
print('Accuracy: \n', accuracy_score(ytrain, pred))

In [None]:
# Predicted labels for the held-out rows, followed by the true labels.
test_predictions = classifier.predict(xtest)
print(test_predictions)
print(ytest.values)

### Evaluate the model on the test set

In [None]:
# Evaluate the model on the held-out test set.
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# Fix: this previously called `Classifier.predict` (capital C), which raised
# NameError; the fitted model is bound to `classifier`.
pred = classifier.predict(xtest)
print(classification_report(ytest, pred))
print()
print('Confusion Matrix: \n', confusion_matrix(ytest, pred))
print('Accuracy: \n', accuracy_score(ytest, pred))