# **Spam Email Classification**

There are various supervised learning algorithms that can be used for classification problems. One of the applications of these learning algorithms is classifying spam emails. Here, various learning algorithms such as Decision tree, Random forest, SVC, Logistic Regression and MLP are used to classify the mails and their results are compared. The email data used for the analysis is obtained from the spamassassin website (https://spamassassin.apache.org/old/publiccorpus/).

## Downloading dataset

In [8]:
from urllib.request import urlretrieve
import tarfile

# function to download dataset from the spam corpus using URLs
def download_file(url, folderPath):
    filename = url.split('/')[-1]

    if not os.path.isdir(folderPath):
        os.makedirs(folderPath)
         
    filepath, headers = urlretrieve(url, os.path.join(folderPath, filename))
    return filepath

def extract_file(filepath, extractionPath):
    tar = tarfile.open(filepath)
    folderPath = os.path.join(extractionPath, filepath.split('/')[-1].split('.')[0])
    tar.extractall(folderPath)
    return folderPath

## Preprocessing and Feature Extraction

In [9]:
import re
import copy

def get_processed_msg(message_orig):
    """
    function to preprocess the email message body to remove unnecessary content
    
    Arguments:
    message_orig -- raw text content of the email body
    
    Returns:
    message -- email body after processing the text contents
    """
    
    message = copy.deepcopy(message_orig)
    
    # remove white space
    message = message.replace("\n", " ")
    
    # change the texts to lower case
    message = message.lower()
    
    # remove html tags and parse them
    message = re.sub(r"<(“[^”]*”|'[^’]*’|[^'”>])*>", " ", message)
    
    # replace emails with 'email'
    message = re.sub(r"[\S]+@[\S]+\.[\S]+", "EMAIL", message)
    
    # replace URLs with 'url'
    message = re.sub(r"http[s]?://[\S]+", "URL", message)
    
    # replace the currency symbols with 'currency'
    message = re.sub(r"\$([ ]?(\d)+)?", "AMOUNT", message)
    
    # replace numbers with 'number'
    message = re.sub(r"\b(\d)+\b", "NUMBER", message)
    
    # remove unnecessary punctuations and special characters
    message = re.sub(r"[!@#$%^&*()_+\-=\[\]{};`~':\"\\|,.<>\/?]+", " ", message)
    
    return message

In [49]:
import glob
import email

def get_mail_contents(folderPath):
    """
    function to fetch the message text contents from list of emails
    
    Arguments:
    mail_type -- type of email data to be fetched - spam/ham
    
    Returns:
    messages -- an array of text content from body of list of emails
    """
    
    messages = []
    mailList = glob.glob(folderPath + "/*/*", recursive = True)
    for email_file in mailList:
        message = ""
        try:
            fp = open(email_file, encoding= 'latin-1')
            email_content = email.message_from_file(fp)
            for part in email_content.walk():
                if part.get_content_type() == 'text/plain':
                    message = part.get_payload()
        except:
            print("Error in parsing document %r" % email_file)
        messages.append(get_processed_msg(message))

    return messages

In [50]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.utils import shuffle

# creating features using the 5000 most frequently found words in the mail dataset
def get_frequent_words(messages, featureCount = 5000):
    vectorizer = CountVectorizer(stop_words = 'english', max_features = featureCount)
    vectorizer.fit(messages)
    frequentWords = vectorizer.get_feature_names_out()
    return frequentWords

def extract_features(messages, wordList):
    vectorizer = CountVectorizer(vocabulary=wordList)
    X = vectorizer.fit_transform(messages).toarray()
    return X

In [57]:
from sklearn.model_selection import train_test_split

def get_messages(url):
    downloadPath = os.path.join("dataset")
    extractionPath = os.path.join(downloadPath, "extracted")
    
    filePath = download_file(url, downloadPath)
    folderPath = extract_file(filePath, extractionPath)
    messages = get_mail_contents(folderPath)
    
    return messages

def load_dataset(urls, testSize, featureCount):
    
    messages = []
    
    for url in urls["spam"]:
        messages.extend(get_messages(url))
        
    spamCount = len(messages)
    
    for url in urls["ham"]:
        messages.extend(get_messages(url))
    
    hamCount = len(messages) - spamCount
    
    y = np.concatenate((np.ones((spamCount)), np.zeros((hamCount))))
    
    train_messages, test_messages, train_y, test_y = train_test_split(messages, y, test_size=testSize, random_state = 4)
       
    vocabulary = get_frequent_words(train_messages, featureCount)
    
    train_X = extract_features(train_messages, vocabulary)
    test_X = extract_features(test_messages, vocabulary)
    
    return train_X, test_X, train_y, test_y

In [52]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

def display_classifier_metrics(classifier, y_actual, y_pred):
    print("\n\n", classifier)
    print("Confusion matrics : ", confusion_matrix(y_actual, y_pred))
    print("Precision : ", precision_score(y_actual, y_pred))
    print("Recall : ", recall_score(y_actual, y_pred))
    print("F1 score : ", f1_score(y_actual, y_pred))
    print("Accuracy score : ", accuracy_score(y_actual, y_pred))

## Training the models

In [58]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
import os

urls = {'spam': ["https://spamassassin.apache.org/old/publiccorpus/20021010_spam.tar.bz2", "https://spamassassin.apache.org/old/publiccorpus/20030228_spam.tar.bz2", "https://spamassassin.apache.org/old/publiccorpus/20030228_spam_2.tar.bz2", "https://spamassassin.apache.org/old/publiccorpus/20050311_spam_2.tar.bz2"],
        'ham': ["https://spamassassin.apache.org/old/publiccorpus/20021010_easy_ham.tar.bz2", "https://spamassassin.apache.org/old/publiccorpus/20021010_hard_ham.tar.bz2", "https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham.tar.bz2", "https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham_2.tar.bz2","https://spamassassin.apache.org/old/publiccorpus/20030228_hard_ham.tar.bz2"]}
test_size = 0.33
featureCount = 5000 

X_train, X_test, y_train, y_test = load_dataset(urls, test_size, featureCount)

classifiers = {
    'svc' : SVC(),
    'decision tree' : DecisionTreeClassifier(),
    'random forest' : RandomForestClassifier(),
    'logistic regression' : LogisticRegression(solver = 'newton-cg'),
    'mlp' : MLPClassifier()
    }

## Result

In [59]:
for classifier_name, classifier in classifiers.items():
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    display_classifier_metrics(classifier_name, y_test, y_pred)



 svc
Confusion matrics :  [[2284   21]
 [ 728  515]]
Precision :  0.960820895522388
Recall :  0.414320193081255
F1 score :  0.5789769533445756
Accuracy score :  0.7888951521984217


 decision tree
Confusion matrics :  [[2148  157]
 [  19 1224]]
Precision :  0.8863142650253439
Recall :  0.9847144006436042
F1 score :  0.9329268292682927
Accuracy score :  0.9503945885005637


 random forest
Confusion matrics :  [[2200  105]
 [  23 1220]]
Precision :  0.9207547169811321
Recall :  0.9814963797264682
F1 score :  0.9501557632398754
Accuracy score :  0.963923337091319






 logistic regression
Confusion matrics :  [[2144  161]
 [  26 1217]]
Precision :  0.8831640058055152
Recall :  0.9790828640386162
F1 score :  0.9286531858069439
Accuracy score :  0.9472942502818489


 mlp
Confusion matrics :  [[2198  107]
 [  16 1227]]
Precision :  0.9197901049475262
Recall :  0.9871279163314561
F1 score :  0.9522700814901048
Accuracy score :  0.9653325817361894


The result of the prediction using various learning algorithms shows that SVC has the least accuracy and F1 score while all the other algorithms perform well in classifying spam emails.