In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

import nltk
from nltk.corpus import stopwords

%matplotlib inline


In [2]:
# Characters kept verbatim by tokenize_func: ASCII letters and digits.
_ALLOWED_CHARS = frozenset(
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    'abcdefghijklmnopqrstuvwxyz'
    '0123456789'
)
_ENGLISH_STOPWORDS = None  # filled lazily by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def tokenize_func(text, stop_words=None):
    """Split ``text`` into alphanumeric tokens with stop words removed.

    ASCII letters and digits are kept; every whitespace character (space,
    tab, newline, ...) acts as a token separator; all other characters are
    dropped without inserting a separator, so "don't" becomes "dont".
    Case is preserved, and the stop-word comparison is case-sensitive.

    Parameters
    ----------
    text : str
        Raw message text.
    stop_words : collection of str, optional
        Tokens to discard. Defaults to NLTK's English stop-word list, which
        is loaded once and cached instead of being re-read for every word.

    Returns
    -------
    list of str
        Remaining tokens, in their original order.
    """
    if stop_words is None:
        stop_words = _english_stopwords()

    # Map tabs/newlines to a space instead of deleting them; deleting them
    # would glue together words that sit on adjacent lines.
    cleaned = "".join(
        ch if ch in _ALLOWED_CHARS else (" " if ch.isspace() else "")
        for ch in text
    )
    return [word for word in cleaned.split() if word not in stop_words]


In [3]:
def strip_subject_prefix(text):
    """Drop a leading 'Subject: re :' or 'Subject:' marker from an e-mail body.

    The marker and the single character that follows it are removed
    (14 and 9 characters respectively); texts that do not start with
    either marker are returned unchanged.
    """
    if text.startswith('Subject: re :'):
        return text[14:]
    if text.startswith('Subject:'):
        return text[9:]
    return text


# E-mail corpus: rename the numeric `spam` flag column to `label` and
# convert its 0/1 values to the strings 'ham'/'spam'.
df = pd.read_csv('emails.csv').rename(columns={'spam': 'label'})
df['label'] = df['label'].map({0: 'ham', 1: 'spam'})
# One pass over the column instead of a row-by-row `iloc` read/write loop.
df['text'] = df['text'].map(strip_subject_prefix)

# Second corpus. Despite the name it is not test-only: the split cell
# divides it into train and test portions just like `df`.
test_df = pd.read_csv('spam.csv')


In [4]:
# Fixed seed so that the train/test partition, and therefore every report
# below, is reproducible across kernel restarts.
RANDOM_STATE = 42

# train_test_split returns [X_train, X_test, y_train, y_test].
first_df = train_test_split(df.text, df.label, random_state=RANDOM_STATE)
second_df = train_test_split(test_df.v2, test_df.v1, random_state=RANDOM_STATE)

email_train, email_test, email_train_labels, email_test_labels = first_df
sms_train, sms_test, sms_train_labels, sms_test_labels = second_df

# Pool both corpora: the first dataset's rows come first, then the second's.
train_data = list(email_train) + list(sms_train)
train_labels = list(email_train_labels) + list(sms_train_labels)
test_data = list(email_test) + list(sms_test)
test_labels = list(email_test_labels) + list(sms_test_labels)


In [5]:
# Bag-of-words counts using the custom analyzer. fit_transform learns the
# vocabulary and builds the count matrix in one pass, so the training
# corpus is tokenized once rather than once for fit and again for transform.
bow_transformer = CountVectorizer(analyzer=tokenize_func)
message_bow = bow_transformer.fit_transform(train_data)

# Re-weight the raw counts with TF-IDF.
tfidf_transformer = TfidfTransformer()
messages_tfidf = tfidf_transformer.fit_transform(message_bow)

# Multinomial Naive Bayes on the TF-IDF features.
spam_detect_model = MultinomialNB().fit(messages_tfidf, train_labels)
# Predictions on the same rows the model was trained on (in-sample).
all_predictions = spam_detect_model.predict(messages_tfidf)


In [15]:
# In-sample report: predictions are for the same rows the model was fitted on.
train_report = classification_report(y_true=train_labels, y_pred=all_predictions)
print(train_report)

              precision    recall  f1-score   support

         ham       0.93      1.00      0.96      6867
        spam       1.00      0.66      0.80      1608

    accuracy                           0.94      8475
   macro avg       0.96      0.83      0.88      8475
weighted avg       0.94      0.94      0.93      8475



In [31]:
# Score the held-out split with the already-fitted vectorizer, TF-IDF
# transformer and classifier (transform only, nothing is re-fitted here).
mess_test = bow_transformer.transform(test_data)
test = tfidf_transformer.transform(mess_test)
test_pred = spam_detect_model.predict(test)

holdout_report = classification_report(y_true=test_labels, y_pred=test_pred)
print(holdout_report)


              precision    recall  f1-score   support

         ham       0.92      1.00      0.96      2301
        spam       1.00      0.63      0.78       524

    accuracy                           0.93      2825
   macro avg       0.96      0.82      0.87      2825
weighted avg       0.94      0.93      0.93      2825



In [34]:
# Separate dataset, loaded only here: convert its 0/1 labels to the
# 'ham'/'spam' strings the classifier was trained on.
validate_df = pd.read_csv('spam_or_not_spam.csv')
validate_df['label'] = validate_df['label'].map({0: 'ham', 1: 'spam'})

# Same already-fitted transformers and model as the held-out evaluation;
# note this overwrites mess_test / test / test_pred from that cell.
mess_test = bow_transformer.transform(validate_df['email'])
test = tfidf_transformer.transform(mess_test)
test_pred = spam_detect_model.predict(test)

validation_report = classification_report(y_true=validate_df['label'], y_pred=test_pred)
print(validation_report)


              precision    recall  f1-score   support

         ham       0.86      1.00      0.92      2500
        spam       0.97      0.17      0.29       499

    accuracy                           0.86      2999
   macro avg       0.91      0.58      0.61      2999
weighted avg       0.88      0.86      0.82      2999

