In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [27]:
import email
import email.policy
import os
import random
from bs4 import BeautifulSoup
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

In [3]:
base_dir = '/content/drive/MyDrive/project_data/'

In [4]:
label = 'spam'

In [5]:
files = os.listdir(base_dir + label)
len(files)

501

In [6]:
spam_email_dir = os.listdir(base_dir + 'spam')
ham_email_dir = os.listdir(base_dir + 'ham')

In [7]:
def load_email(is_spam, filename):
    directory = base_dir + ('spam' if is_spam else 'ham')
    with open(os.path.join(directory, filename), 'rb') as f:
        return email.parser.BytesParser(policy=email.policy.default).parse(f)

In [8]:
spam_emails = [load_email(True, filename) for filename in spam_email_dir]
ham_emails = [load_email(False, filename) for filename in ham_email_dir]

In [9]:
random.shuffle(spam_emails)
random.shuffle(ham_emails)
len(ham_emails)

2551

In [None]:
for mail in spam_emails:
    payload = mail.get_payload()
    print(mail['Content-Type'])

In [11]:
def process_email(emails, label, data_dictionary, default_topic=None):
    for mail in emails:
        payload = mail.get_payload()
        if isinstance(payload, list):
            process_email(payload, label, data_dictionary, default_topic=mail["Subject"])
        else:
            if 'Content-Type' in mail.keys():
                if 'html' in mail['Content-Type'].lower():
                    try: 
                        soup = BeautifulSoup(mail.get_content())
                        print(soup)
                        topic = mail['Subject']
                        if topic == None:
                            topic = default_topic
                        content = soup.body.text
                        data_dictionary['topic'].append(topic)
                        data_dictionary['content'].append(content)
                        data_dictionary['label'].append(label)
                    except:
                        pass
                elif "plain" in mail['Content-Type'].lower():
                    try: 
                        topic = mail['Subject']
                        if topic == None:
                            topic = default_topic
                        content = mail.get_content()
                        data_dictionary['topic'].append(topic)
                        data_dictionary['content'].append(content)
                        data_dictionary['label'].append(label)
                    except:
                        pass
                else:
                    pass

In [None]:
data_dictionary = {'topic': [], 'content': [], 'label': []}
process_email(spam_emails, 1, data_dictionary)
process_email(ham_emails, 0, data_dictionary)
df = pd.DataFrame(data_dictionary)
df.dropna(inplace=True)
df = df.sample(frac=1)

In [None]:
data_dictionary

In [14]:
df.head(10)

Unnamed: 0,topic,content,label
101,Get the Child Support You Deserve ...,1) Join the Web's Hottest & Fastest Growing Co...,1
918,RE: [ILUG] Newby to Linux looking for informat...,"Hello all,\nFirstly I'd like to thank all of y...",0
1582,Internet saturation (but not in Iceland),Gary's news service at teledyn.com has an arti...,0
1939,Re: revocation of grlygrl201@,"\nWell Beberg, unless you're really into Anime...",0
1899,Re: xmms and .mp3 files.,humberto@hpcf.upr.edu wrote:\n\n>Redhat 8 disa...,0
699,Re[2]: Selling Wedded Bliss (was Re: Ouch...),"On Mon, 9 Sep 2002 bitbitch@magnesium.net wrot...",0
1111,Re: From,"This is not an exmh problem, but an interactio...",0
2427,RE: sprint delivers the next big thing??,"right Mike,\n\ni will agree to disagree but i ...",0
2613,[Razor-users] removing Razor1,Forgive me for being a partially stupid end-us...,0
1797,Pensioners and housebuyers suffer,"URL: http://www.newsisfree.com/click/-2,841883...",0


In [16]:
stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]
def preprocess_text(content):
    content = content.lower()
    cleaner = re.compile('<.*?>')
    content = re.sub(cleaner, '', content)
    content = content.replace('\n',' ')
    content = re.sub(r"[^a-zA-Z0-9]+", ' ', content)
    for stopword in stopwords:
        content = content.replace(stopword + " ", "")
        content = content.replace(" " + stopword, "")
    return content

In [18]:
topic_and_contents = []
for (topic, content) in zip(df["topic"], df["content"]):
    topic_and_contents.append(preprocess_text(topic + " " + content))
df["topic_content"] = topic_and_contents

In [20]:
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(df["topic_content"])
x = x.toarray()

X = []            #Vector after encode email
for i in x:
    X.append(i.flatten())
Y = df['label']

In [22]:
Y

101     1
918     0
1582    0
1939    0
1899    0
       ..
1311    0
1321    0
1594    0
2305    0
61      1
Name: label, Length: 2658, dtype: int64

In [24]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

print("Number of emails in traning: ",len(y_train))
print("Number of emails in testing: ",len(y_test))

Number of emails in traning:  2126
Number of emails in testing:  532


In [25]:
from sklearn.naive_bayes import GaussianNB
clf_NB = GaussianNB()
clf_NB.fit(X_train, y_train)

GaussianNB()

In [28]:
y_pred = clf_NB.predict(X_test)
print (f"Accuracy in testing dataset:",(100*accuracy_score(y_test, y_pred)))

Accuracy in testing dataset: 97.74436090225564
