<a href="https://colab.research.google.com/github/vishalgimhan/NLP-Projects/blob/main/Spam_Email_Detection_using_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#import the Libraries
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string

In [2]:
#Load the spam e-mail dataset.
#NOTE(review): the path points into /content/drive, so this assumes Google Drive
#has already been mounted; no mount cell is visible in this notebook - confirm.
DATA_PATH = '/content/drive/MyDrive/Datasets/spam_emails.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [3]:
#Explore the data: (number of rows, number of columns) of the loaded frame
df.shape

(5728, 2)

In [4]:
#List the column labels of the frame
df.columns

Index(['text', 'spam'], dtype='object')

In [8]:
#Remove exact duplicate rows, then show the resulting (rows, columns).
#The result is reassigned instead of using inplace=True, so the step reads as a
#plain expression and the frame is not mutated behind the reader's back.
df = df.drop_duplicates()
df.shape

(5695, 2)

In [9]:
#Count missing values per column
df.isna().sum()

text    0
spam    0
dtype: int64

# NLP

In [10]:
#StopWords: fetch NLTK's stopword corpus, which stopwords.words('english') reads
#in the text-cleaning step further down
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [13]:
#Removing stopwords and punctuations

#Translation table that deletes every character listed in string.punctuation
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)
#Lazily filled cache so the NLTK stopword list is only loaded once
_STOPWORD_CACHE = {}

def process(text, stop_words=None):
  """Strip punctuation from `text` and drop stopwords.

  Parameters
  ----------
  text : str
      Raw message text.
  stop_words : collection of str, optional
      Lower-case words to discard. When omitted, NLTK's English stopword
      list is loaded on first use and reused on every later call.

  Returns
  -------
  list of str
      The whitespace-separated tokens of `text`, in their original casing,
      after punctuation characters have been removed and stopwords
      (compared case-insensitively) have been filtered out.
  """
  if stop_words is None:
    # stopwords.words() builds the whole word list anew on each call; doing
    # that once per token made this function needlessly slow. Load it a
    # single time and keep it as a frozenset for constant-time lookups.
    if 'english' not in _STOPWORD_CACHE:
      _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    stop_words = _STOPWORD_CACHE['english']

  # Delete punctuation in one pass instead of rebuilding the string char by char
  nopunc = text.translate(_PUNCT_TABLE)

  return [word for word in nopunc.split() if word.lower() not in stop_words]

#Sanity check: run the cleaning function on the first few messages only
df['text'].head().apply(process)

0    [Subject, naturally, irresistible, corporate, ...
1    [Subject, stock, trading, gunslinger, fanny, m...
2    [Subject, unbelievable, new, homes, made, easy...
3    [Subject, 4, color, printing, special, request...
4    [Subject, money, get, software, cds, software,...
Name: text, dtype: object

In [14]:
#Convert text into a matrix of token counts
from sklearn.feature_extraction.text import CountVectorizer

#Keep a handle on the fitted vectorizer: without it, new text could never be
#mapped onto the same vocabulary via vectorizer.transform(...)
vectorizer = CountVectorizer(analyzer=process)

#NOTE(review): the vocabulary is learned from every row, i.e. before the
#train/test split performed later, so test-set tokens influence the feature
#space - confirm this is acceptable for the reported scores.
message = vectorizer.fit_transform(df['text'])

In [18]:
#Dimensions of the token-count matrix produced by fit_transform
print(message.shape)

(5695, 37229)


In [19]:
#Split the data: hold out 20% of the rows for testing, with a fixed seed so the
#partition is repeatable
from sklearn.model_selection import train_test_split

xtrain, xtest, ytrain, ytest = train_test_split(
    message,
    df['spam'],
    test_size=0.2,
    random_state=0,
)

In [21]:
#Fit a multinomial Naive Bayes model on the training token counts
from sklearn.naive_bayes import MultinomialNB

classifier = (
    MultinomialNB()
    .fit(xtrain, ytrain)
)

In [23]:
#Side-by-side look at predicted vs. actual labels for the training rows
train_predictions = classifier.predict(xtrain)
train_actuals = ytrain.values
print(train_predictions)
print(train_actuals)

[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]


In [25]:
#Score the model on the rows it was trained on (per-class report, confusion
#matrix, overall accuracy)
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

ypred = classifier.predict(xtrain)

train_report = classification_report(ytrain, ypred)
train_confusion = confusion_matrix(ytrain, ypred)
train_accuracy = accuracy_score(ytrain, ypred)

print(train_report)
print()
print("Confusion Matrix: \n", train_confusion)
print("Accuracy: \n", train_accuracy)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3457
           1       0.99      1.00      0.99      1099

    accuracy                           1.00      4556
   macro avg       0.99      1.00      1.00      4556
weighted avg       1.00      1.00      1.00      4556


Confusion Matrix: 
 [[3445   12]
 [   1 1098]]
Accuracy: 
 0.9971466198419666


In [None]:
#Classifier's prediction vs actual value on testing data
#(this cell previously repeated the training-data comparison verbatim; it sits
#directly before the test-set evaluation, so it should inspect the held-out rows)
print(classifier.predict(xtest))
print(ytest.values)

In [26]:
#Score the model on the held-out test rows (per-class report, confusion matrix,
#overall accuracy)
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

ypred2 = classifier.predict(xtest)

test_report = classification_report(ytest, ypred2)
test_confusion = confusion_matrix(ytest, ypred2)
test_accuracy = accuracy_score(ytest, ypred2)

print(test_report)
print()
print("Confusion Matrix: \n", test_confusion)
print("Accuracy: \n", test_accuracy)

              precision    recall  f1-score   support

           0       1.00      0.99      0.99       870
           1       0.97      1.00      0.98       269

    accuracy                           0.99      1139
   macro avg       0.98      0.99      0.99      1139
weighted avg       0.99      0.99      0.99      1139


Confusion Matrix: 
 [[862   8]
 [  1 268]]
Accuracy: 
 0.9920983318700615
