# **Spam Email Classification**

One of the applications of supervised learning algorithms is classifying spam emails. The email data used for the following analysis was obtained from the spamassassin website (https://spamassassin.apache.org/old/publiccorpus/).

## Downloading dataset

In [36]:
from urllib.request import urlretrieve
import tarfile
import os

# function to download dataset from the spam corpus using URLs
def download_file(url, folderPath):
    """
    Downloads the tar file from the given url
    
    Arguments:
    url -- the URL of the compressed file from spamassassin website
    folderPath -- the destination path where the file will be downloaded
    
    Returns:
    filePath -- the absolute path of the downloaded file
    """
    filename = url.split('/')[-1]

    if not os.path.isdir(folderPath):
        os.makedirs(folderPath)
         
    filepath, headers = urlretrieve(url, os.path.join(folderPath, filename))
    return filepath

In [37]:
def extract_file(filepath, extractionPath):
    """
    method to extract the downloaded compressed file
    
    Arguments:
    filepath -- compressed file path
    extractionPath -- destination folder where the file will be extracted
    
    Returns:
    folderPath -- the path of the extracted file
    """
    tar = tarfile.open(filepath)
    folderPath = os.path.join(extractionPath, filepath.split('/')[-1].split('.')[0])
    tar.extractall(folderPath)
    return folderPath

## Preprocessing and Feature Extraction

In [38]:
import re
import copy

def get_processed_msg(message_orig):
    """
    function to preprocess the email message body to remove unnecessary content
    
    Arguments:
    message_orig -- raw text content of the email body
    
    Returns:
    message -- email body after processing the text contents
    """
    
    message = copy.deepcopy(message_orig)
    
    # remove white space
    message = message.replace("\n", " ")
    
    # change the texts to lower case
    message = message.lower()
    
    # remove html tags and parse them
    message = re.sub(r"<(“[^”]*”|'[^’]*’|[^'”>])*>", " ", message)
    
    # replace emails with 'email'
    message = re.sub(r"[\S]+@[\S]+\.[\S]+", "EMAIL", message)
    
    # replace URLs with 'url'
    message = re.sub(r"http[s]?://[\S]+", "URL", message)
    
    # replace the currency symbols with 'currency'
    message = re.sub(r"\$([ ]?(\d)+)?", "AMOUNT", message)
    
    # replace numbers with 'number'
    message = re.sub(r"\b(\d)+\b", "NUMBER", message)
    
    # remove unnecessary punctuations and special characters
    message = re.sub(r"[!@#$%^&*()_+\-=\[\]{};`~':\"\\|,.<>\/?]+", " ", message)
    
    return message

In [39]:
import glob
import email

def get_mail_contents(folderPath):
    """
    method to fetch the message text contents from list of emails
    
    Arguments:
    folderPath -- the folder which has a list of the email files
    
    Returns:
    messages -- an array of text content from the body of the list of emails from the given folder
    """
    
    messages = []
    mailList = glob.glob(folderPath + "/*/*", recursive = True)
    for email_file in mailList:
        message = ""
        try:
            fp = open(email_file, encoding= 'latin-1')
            email_content = email.message_from_file(fp)
            for part in email_content.walk():
                if part.get_content_type() == 'text/plain':
                    message = part.get_payload()
        except:
            print("Error in parsing document %r" % email_file)
        messages.append(get_processed_msg(message))

    return messages

In [40]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# creating features using the 5000 most frequently found words in the mail dataset
def get_frequent_words(messages, featureCount = 5000):
    """
    method to get the most frequently present words in the messages
    
    Arguments:
    messages -- list of messages obtained from the email body
    featureCount -- number of words to be extracted
    
    Returns:
    frequentWords -- 'frequentCount' number of frequently present words in the messages
    """
    vectorizer = CountVectorizer(stop_words = 'english', max_features = featureCount)
    vectorizer.fit(messages)
    frequentWords = vectorizer.get_feature_names_out()
    return frequentWords

In [41]:
def extract_features(messages, wordList):
    """
    method to extract feature from the messages using a vocabulary
    
    Arguments:
    messages -- list of messages from which the features are extracted
    wordList -- vocabulary which is used to extract the features
    
    Returns:
    X -- an array of records with features extracted using the vocabulary
    """
    vectorizer = CountVectorizer(vocabulary=wordList)
    X = vectorizer.fit_transform(messages).toarray()
    return X

In [42]:
def get_messages(url):
    """
    method to fetch the messages from the url
    
    Arguments:
    url -- the source file URL
    
    Returns:
    messages -- list of messages preprocessed after downloaded from the spamassassin website
    """
    downloadPath = os.path.join("dataset")
    extractionPath = os.path.join(downloadPath, "extracted")
    
    filePath = download_file(url, downloadPath)
    folderPath = extract_file(filePath, extractionPath)
    messages = get_mail_contents(folderPath)
    
    return messages

In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split

def load_dataset(urls, testSize = 0.33, featureCount = 5000):
    """
    method to load the dataset
    
    Arguments:
    urls -- list of urls to download the email files
    testSize -- train test dataset split
    featureCount -- number of features to be extracted from the messages using frequently occuring words
    
    Returns:
    train_X -- training input features
    test_X -- test input features
    train_y -- training output values
    test_y -- test output values
    """
    messages = []
    
    for url in urls["spam"]:
        messages.extend(get_messages(url))
        
    spamCount = len(messages)
    
    for url in urls["ham"]:
        messages.extend(get_messages(url))
    
    hamCount = len(messages) - spamCount
    
    y = pd.DataFrame({
        'spam' : np.concatenate((np.ones((spamCount)), np.zeros((hamCount))))
    })
    
    train_messages, test_messages, train_y, test_y = train_test_split(messages, y, test_size=testSize, random_state = 4)
       
    vocabulary = get_frequent_words(train_messages, featureCount)
    
    train_X = extract_features(train_messages, vocabulary)
    train_X = pd.DataFrame(train_X, columns=vocabulary)
    test_X = extract_features(test_messages, vocabulary)
    test_X = pd.DataFrame(test_X, columns=vocabulary)
    
    return train_X, test_X, train_y, test_y

## Training the models

In [44]:
urls = {'spam': ["https://spamassassin.apache.org/old/publiccorpus/20021010_spam.tar.bz2", "https://spamassassin.apache.org/old/publiccorpus/20030228_spam.tar.bz2", "https://spamassassin.apache.org/old/publiccorpus/20030228_spam_2.tar.bz2", "https://spamassassin.apache.org/old/publiccorpus/20050311_spam_2.tar.bz2"],
        'ham': ["https://spamassassin.apache.org/old/publiccorpus/20021010_easy_ham.tar.bz2", "https://spamassassin.apache.org/old/publiccorpus/20021010_hard_ham.tar.bz2", "https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham.tar.bz2", "https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham_2.tar.bz2","https://spamassassin.apache.org/old/publiccorpus/20030228_hard_ham.tar.bz2"]}

X_train, X_test, y_train, y_test = load_dataset(urls)

In [45]:
# examining the dataset obtained so far
print('X train shape : ', X_train.shape)
print('y train shape : ', y_train.shape)
print('X test shape : ', X_test.shape)
print('y test shape : ', y_test.shape)

X train shape :  (7203, 5000)
y train shape :  (7203, 1)
X test shape :  (3548, 5000)
y test shape :  (3548, 1)


Since the number of features and the total number of records are not very significantly different, a logistic regression classifier is used as the learning algorithm to classify the emails.

In [46]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(max_iter = 5000)

clf.fit(X_train, np.ravel(y_train))

## Result

In [47]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

y_pred = clf.predict(X_test)
print("Confusion matrics : ", confusion_matrix(y_test, y_pred))
print("Precision : ", precision_score(y_test, y_pred))
print("Recall : ", recall_score(y_test, y_pred))
print("F1 score : ", f1_score(y_test, y_pred))
print("Accuracy score : ", accuracy_score(y_test, y_pred))

Confusion matrics :  [[2183  122]
 [  17 1226]]
Precision :  0.9094955489614244
Recall :  0.9863234111021721
F1 score :  0.9463527595522965
Accuracy score :  0.9608229988726043


The result of the prediction using logistic regression algorithm shows an accuracy of 96% with default hyperparameters and iteration = 5000.