In [8]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string

In [9]:
# Read the e-mail dataset CSV into a DataFrame; the trailing bare `df`
# renders the frame as the cell output.
df = pd.read_csv("../Dataset/email-dataset.csv")
df

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
...,...,...
5690,Subject: re : research and development charges...,0
5691,"Subject: re : receipts from visit jim , than...",0
5692,Subject: re : enron case study update wow ! a...,0
5693,"Subject: re : interest david , please , call...",0


In [10]:
# df.to_csv("../src/email-dataset.csv")

In [11]:
# (rows, columns) of the loaded frame
df.shape

(5695, 2)

In [12]:
# Column labels of the frame
df.columns

Index(['text', 'spam'], dtype='object')

In [13]:
# Discard rows that are exact duplicates by rebinding `df` to the
# de-duplicated frame, then show the resulting (rows, columns).
df = df.drop_duplicates()
df.shape

(5695, 2)

In [14]:
# Count missing values in each column
df.isnull().sum()

text    0
spam    0
dtype: int64

In [15]:
# Download the NLTK "stopwords" corpus; process() reads it through
# stopwords.words('english').
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\varun\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [17]:
def process(text, stop_words=None):
    """Strip punctuation from ``text`` and drop stop words.

    Parameters
    ----------
    text : str
        Raw message text.
    stop_words : iterable of str, optional
        Lower-case words to discard. When omitted, NLTK's English stop-word
        list is used (the ``stopwords`` corpus must already be downloaded).

    Returns
    -------
    list of str
        The whitespace-separated tokens that remain, in their original
        order and casing.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build the lookup once. Previously the NLTK list was re-read and scanned
    # linearly for every single token, which made long e-mails very slow.
    stop_words = set(stop_words)

    punctuation = set(string.punctuation)
    nopunc = ''.join(char for char in text if char not in punctuation)

    # Compare case-insensitively but keep the token's original casing.
    return [word for word in nopunc.split() if word.lower() not in stop_words]
# Preview the tokenisation on the first rows only (head()), leaving `df` unchanged
df['text'].head().apply(process)

0    [Subject, naturally, irresistible, corporate, ...
1    [Subject, stock, trading, gunslinger, fanny, m...
2    [Subject, unbelievable, new, homes, made, easy...
3    [Subject, 4, color, printing, special, request...
4    [Subject, money, get, software, cds, software,...
Name: text, dtype: object

In [22]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Bag-of-words count vectorizer with default settings; this is the one that
# is fitted, used for the classifiers and pickled in the cells below.
cv = CountVectorizer()
# TF-IDF vectorizer capped at 3000 features.
# NOTE(review): `tfidf` is not used by any other visible cell - confirm whether it is still needed.
tfidf = TfidfVectorizer(max_features=3000)

In [23]:
# Learn the vocabulary from the raw `text` column and build the document-term
# count matrix. process() is not applied here, so the features come from
# CountVectorizer's own default tokenisation.
# NOTE(review): the vocabulary is fitted on the full corpus before the
# train/test split below - confirm that this is intended.
message = cv.fit_transform(df["text"])

In [32]:
# Split the data into 80% training and 20% testing; random_state=0 is passed
# so the split does not change between runs.
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(message, df['spam'], test_size=0.20, random_state=0)


In [30]:
# Creating and training the SVM classifier (RBF kernel) on the training split
from sklearn.svm import SVC
SVM_classifier = SVC(kernel = "rbf", random_state = 0)
SVM_classifier.fit(xtrain, ytrain)

SVC(random_state=0)

In [31]:
# Quick visual check: SVM predictions on the training split, then the true labels
print(SVM_classifier.predict(xtrain))
print(ytrain.values)

[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]


In [33]:
# Evaluating the SVM Classifier model on the training data set:
# classification report, confusion matrix and overall accuracy.
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
pred = SVM_classifier.predict(xtrain)
print(classification_report(ytrain, pred))
print()
print("Confusion Matrix: \n", confusion_matrix(ytrain, pred))
print("Accuracy: \n", accuracy_score(ytrain, pred))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      3457
           1       0.99      0.87      0.92      1099

    accuracy                           0.97      4556
   macro avg       0.97      0.93      0.95      4556
weighted avg       0.97      0.97      0.96      4556


Confusion Matrix: 
 [[3446   11]
 [ 148  951]]
Accuracy: 
 0.9651009657594382


In [34]:
# Quick visual check: SVM predictions on the held-out test split, then the true labels
print(SVM_classifier.predict(xtest))
print(ytest.values)

[1 0 0 ... 0 0 0]
[1 0 0 ... 0 0 0]


In [36]:
# Evaluating the SVM Classifier model on the test data set:
# classification report, confusion matrix and overall accuracy.
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
pred = SVM_classifier.predict(xtest)
print(classification_report(ytest, pred))
print()
print("Confusion Matrix: \n", confusion_matrix(ytest, pred))
print("Accuracy: \n", accuracy_score(ytest, pred))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97       870
           1       0.97      0.82      0.89       269

    accuracy                           0.95      1139
   macro avg       0.96      0.91      0.93      1139
weighted avg       0.95      0.95      0.95      1139


Confusion Matrix: 
 [[864   6]
 [ 49 220]]
Accuracy: 
 0.95171202809482


In [25]:
# Creating and training the Naive Bayes classifier on the same training split
# (xtrain, ytrain) that the SVM was fitted on.
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB().fit(xtrain, ytrain)

In [26]:
# Quick visual check: Naive Bayes predictions on the training split, then the true labels
print(classifier.predict(xtrain))
print(ytrain.values)

[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]


In [27]:
# Evaluating the Naive Bayes Classifier model on the training data set:
# classification report, confusion matrix and overall accuracy.
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
pred = classifier.predict(xtrain)
print(classification_report(ytrain, pred))
print()
print("Confusion Matrix: \n", confusion_matrix(ytrain, pred))
print("Accuracy: \n", accuracy_score(ytrain, pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3457
           1       0.99      1.00      0.99      1099

    accuracy                           1.00      4556
   macro avg       0.99      1.00      0.99      4556
weighted avg       1.00      1.00      1.00      4556


Confusion Matrix: 
 [[3441   16]
 [   1 1098]]
Accuracy: 
 0.996268656716418


In [33]:
# Quick visual check: Naive Bayes predictions on the held-out test split, then the true labels
print(classifier.predict(xtest))
print(ytest.values)

[1 0 0 ... 0 0 0]
[1 0 0 ... 0 0 0]


In [34]:
# Evaluating the Naive Bayes Classifier model on the test data set:
# classification report, confusion matrix and overall accuracy.
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
pred = classifier.predict(xtest)
print(classification_report(ytest, pred))
print()
print("Confusion Matrix: \n", confusion_matrix(ytest, pred))
print("Accuracy: \n", accuracy_score(ytest, pred))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99       870
           1       0.97      1.00      0.98       269

    accuracy                           0.99      1139
   macro avg       0.98      0.99      0.99      1139
weighted avg       0.99      0.99      0.99      1139


Confusion Matrix: 
 [[861   9]
 [  1 268]]
Accuracy: 
 0.9912203687445127


## Testing

In [36]:
# Tokenise the sample e-mail with process(); the result is a list of words.
# NOTE(review): in the visible notebook `test` is only assigned in a later
# cell, so a fresh Restart & Run All would raise NameError here - confirm and
# move that definition above this cell.
testt = process(test)

In [38]:
# CountVectorizer.transform treats every element of its argument as a separate
# document. Passing the token list returned by process() therefore produced one
# row per word, and predict(...)[0] classified only the first word of the
# e-mail. Vectorise the whole e-mail as a single document instead - as raw
# text, which is what `cv` was fitted on (so `testt` is not needed here).
message = cv.transform([test])

In [40]:
# Print the predicted `spam` label for the first row of `message`
print(classifier.predict(message)[0])

1


In [41]:
import pickle 

In [43]:
# Serialise the fitted Naive Bayes model to ../classifier.pkl
with open("../classifier.pkl", "wb") as f:
    pickle.dump(classifier, f)

In [44]:
# Serialise the fitted CountVectorizer to ../vectorizer.pkl; the classifier
# was trained on this vectorizer's output, so the two files belong together.
with open("../vectorizer.pkl", "wb") as v:
    pickle.dump(cv, v)

In [45]:
# Read both artefacts back from disk as `classi` (model) and `cvv` (vectorizer).
# pickle.load can execute arbitrary code, so only load files you created yourself.
with open("../classifier.pkl", "rb") as f:
    classi = pickle.load(f)

with open("../vectorizer.pkl", "rb") as v:
    cvv = pickle.load(v)

In [51]:
test = '''1 new job for 'full stack engineer'
     Your job alert for full stack engineer
1 new job in Pune, Maharashtra, India matches your preferences.

DoorMonk	
Full Stack Engineer
DoorMonk Â· Pune, Maharashtra, India (Remote)

See all jobs
 
premium
Varun More
See jobs where you're a top applicant
Try Premium for free
 
 
 
This email was intended for Varun More (ðŸ’»Computer Science + ðŸ§ Artificial Intelligence | Python, C++, C Programming, JavaScript | GSSoC'22). Learn why we included this.

 
You are receiving Job Alert emails.

Manage job alerts  Â·  Unsubscribe  Â·  Help
LinkedIn
 
Â© 2022 LinkedIn Ireland Unlimited Company, Wilton Plaza, Wilton Place, Dublin 2. LinkedIn is a registered business name of LinkedIn Ireland Unlimited Company. LinkedIn and the LinkedIn logo are registered trademarks of LinkedIn.
 
'''

In [52]:
# Tokenise the sample e-mail with process(); the result is a list of words
test_text = process(test)

In [53]:
# transform() treats each list element as its own document. Feeding it the
# token list from process() yielded one row per word, so predict(...)[0] only
# classified the first word. Vectorise the complete e-mail as one raw document,
# which is the form of input the pickled vectorizer was fitted on.
test_text_m = cvv.transform([test])

In [54]:
# Print the predicted `spam` label for the first row, using the classifier reloaded from disk
print(classi.predict(test_text_m)[0])

0


Accuracy for Support Vector Machine Classifier is _0.9651009657594382_ on the training dataset.

Accuracy for Support Vector Machine Classifier is _0.95171202809482_ on test dataset.


Accuracy for Multinomial Naive Bayes Classifier is _0.996268656716418_ on the training dataset.

Accuracy for Multinomial Naive Bayes Classifier is _0.9912203687445127_ on test dataset.

Since the Multinomial Naive Bayes Classifier achieves the higher accuracy on both the training and the test dataset, we decided to go with the Multinomial Naive Bayes Classifier for this problem statement.