# ECS750P MSc Project - Spam Email Detection

#### Imports the necessary modules

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn import metrics
from sklearn.metrics import precision_recall_fscore_support
from sklearn import svm
import os

#### Function to load one file

In [2]:
# Treat the entire message as a string, and removes the '\n' and '\r'
def load_one_file(filename):
    x = ""
    with open(filename, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            line = line.strip('\n')
            line = line.strip('\r')
            x += line
    return x

#### Function to load all files from a folder

In [3]:
# Iterate through all files in the specified folder and load the data
def load_files_from_dir(rootdir):
    x = []
    list = os.listdir(rootdir)
    for i in range(0, len(list)):
        path = os.path.join(rootdir, list[i])
        if os.path.isfile(path):
            v = load_one_file(path)
            x.append(v)
    return x

#### Function to load all the emails

In [4]:
# The folder where the data stored is located, stores normal emails inn ham and spam emails in spam
def load_all_files():
    ham = []
    spam = []
    # load from the first folder enron1
    for i in range(1, 2): 
        path = "data/enron%d/ham/" % i
        print("Load %s" % path)
        ham += load_files_from_dir(path)
        path = "data/enron%d/spam/" % i
        print("Load %s" % path)
        spam += load_files_from_dir(path)
    return ham, spam

In [5]:
ham, spam = load_all_files()

Load data/enron1/ham/
Load data/enron1/spam/


#### Use bag-of-words modelling to vectorise email samples, ham with label 0, spam with label 1

In [6]:
def get_features_by_wordbag(ham, spam):
    x = ham + spam
    y = [0] * len(ham) + [1] * len(spam)
    vectorizer = CountVectorizer(
        decode_error = 'ignore',
        strip_accents = 'ascii',
        max_features = 5000,
        stop_words = 'english',
        max_df = 1.0,
        min_df = 1)
    x = vectorizer.fit_transform(x)
    x = x.toarray()
    return x, y

In [7]:
x, y = get_features_by_wordbag(ham, spam)

#### Split into train and test subsets.

In [8]:
# 60% train and 40% test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=0)

#### Building a Naive Bayes model

In [9]:
def nb_model(x_train, x_test, y_train, y_test):
    print("Naive Bayes Model")
    clf = GaussianNB()
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print("Accuracy Score: ", metrics.accuracy_score(y_test, y_pred))
    final_scores = precision_recall_fscore_support(y_test, y_pred, average='weighted')
    print("Precision =  %f Recall = %f F1 Score = %f" % final_scores[:3])
    print(metrics.confusion_matrix(y_test, y_pred))

In [10]:
nb_model(x_train, x_test, y_train, y_test)

Naive Bayes Model
Accuracy Score:  0.9545674238762687
Precision =  0.955284 Recall = 0.954567 F1 Score = 0.954800
[[1405   58]
 [  36  570]]


#### The accuracy was 95.46% and the accuracy of the evaluation results is shown in the table below
  

|   | Related | Unrelated |
|:--------:|:--------:|:--------:|
|  Detected   |  1405   |  58   |
|  Undetected   |  36   |  570   |

#### Building a Support Vector Machine (SVM) model

In [11]:
def svm_model(x_train, x_test, y_train, y_test):
    print("SVM Model")
    clf = svm.SVC()
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print("Accuracy Score: ", metrics.accuracy_score(y_test, y_pred))
    final_scores = precision_recall_fscore_support(y_test, y_pred, average='weighted')
    print("Precision =  %f Recall = %f F1 Score = %f" % final_scores[:3])
    print(metrics.confusion_matrix(y_test, y_pred))

In [12]:
svm_model(x_train, x_test, y_train, y_test)

SVM Model
Accuracy Score:  0.9647172547124214
Precision =  0.965454 Recall = 0.964717 F1 Score = 0.964922
[[1414   49]
 [  24  582]]


#### Building a K-Nearest Neighbours (KNN) model

In [13]:
def knn_model(x_train, x_test, y_train, y_test):
    print("KNN Model")
    clf = KNeighborsClassifier(n_neighbors=5)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print("Accuracy Score: ", metrics.accuracy_score(y_test, y_pred))
    final_scores = precision_recall_fscore_support(y_test, y_pred, average='weighted')
    print("Precision =  %f Recall = %f F1 Score = %f" % final_scores[:3])
    print(metrics.confusion_matrix(y_test, y_pred))

In [14]:
knn_model(x_train, x_test, y_train, y_test)

KNN Model
Accuracy Score:  0.8032866118898019
Precision =  0.871162 Recall = 0.803287 F1 Score = 0.812049
[[1075  388]
 [  19  587]]


#### Building a Logistic Regression (LR) model

In [15]:
def lr_model(x_train, x_test, y_train, y_test):
    print("LR Model")
    clf = LogisticRegression()
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print("Accuracy Score: ", metrics.accuracy_score(y_test, y_pred))
    final_scores = precision_recall_fscore_support(y_test, y_pred, average='weighted')
    print("Precision =  %f Recall = %f F1 Score = %f" % final_scores[:3])
    print(metrics.confusion_matrix(y_test, y_pred))

In [16]:
lr_model(x_train, x_test, y_train, y_test)

LR Model
Accuracy Score:  0.9777670372160464
Precision =  0.978323 Recall = 0.977767 F1 Score = 0.977891
[[1428   35]
 [  11  595]]


#### Building a Deep Neural Network (DNN) model

In [17]:
def dnn_model(x_train, x_test, y_train, y_test):
    print("DNN Model")
    # Building deep neural network
    clf = MLPClassifier(solver='lbfgs',
                        alpha=1e-5,
                        hidden_layer_sizes=(5, 2),
                        random_state=1)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print("Accuracy Score: ", metrics.accuracy_score(y_test, y_pred))
    final_scores = precision_recall_fscore_support(y_test, y_pred, average='weighted')
    print("Precision =  %f Recall = %f F1 Score = %f" % final_scores[:3])
    print(metrics.confusion_matrix(y_test, y_pred))

In [18]:
dnn_model(x_train, x_test, y_train, y_test)

DNN Model
Accuracy Score:  0.9792170130497825
Precision =  0.979188 Recall = 0.979217 F1 Score = 0.979140
[[1449   14]
 [  29  577]]


#### Apply TF-IDF to refines the representation by weighing terms based on their significance in the document relative to the entire corpus

In [19]:
def get_features_by_wordbag_tfidf(ham, spam):
    x = ham + spam
    y = [0] * len(ham) + [1] * len(spam)
    vectorizer = CountVectorizer(binary=False,
                                 decode_error='ignore',
                                 strip_accents='ascii',
                                 max_features=5000,
                                 stop_words='english',
                                 max_df=1.0,
                                 min_df=1)
    x = vectorizer.fit_transform(x)
    x = x.toarray()
    transformer = TfidfTransformer(smooth_idf=False)
    tfidf = transformer.fit_transform(x)
    x = tfidf.toarray()
    return x, y

In [20]:
# reset the word bag and get a new train and test sets
x, y = get_features_by_wordbag_tfidf(ham, spam)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=0)

In [22]:
nb_model(x_train, x_test, y_train, y_test)
svm_model(x_train, x_test, y_train, y_test)
knn_model(x_train, x_test, y_train, y_test)
lr_model(x_train, x_test, y_train, y_test)
dnn_model(x_train, x_test, y_train, y_test)

Naive Bayes Model
Accuracy Score:  0.958434026099565
Precision =  0.958245 Recall = 0.958434 F1 Score = 0.958183
[[1432   31]
 [  55  551]]
SVM Model
Accuracy Score:  0.9903334944417593
Precision =  0.990333 Recall = 0.990333 F1 Score = 0.990333
[[1453   10]
 [  10  596]]
KNN Model
Accuracy Score:  0.9647172547124214
Precision =  0.964662 Recall = 0.964717 F1 Score = 0.964458
[[1441   22]
 [  51  555]]
LR Model
Accuracy Score:  0.9840502658289029
Precision =  0.984023 Recall = 0.984050 F1 Score = 0.984023
[[1450   13]
 [  20  586]]
DNN Model
Accuracy Score:  0.9821169647172547
Precision =  0.982177 Recall = 0.982117 F1 Score = 0.982023
[[1455    8]
 [  29  577]]


#### Table I: PERFORMANCE METRICS OF DIFFERENT MODELS USING ONLY BoW

| Model              | Accuracy (%) | Precision (%) | Recall (%) | F1 Score (%) |
|--------------------|--------------|---------------|------------|--------------|
| Naive Bayes        | 95.46        | 95.53         | 95.46      | 95.48        |
| SVM                | 96.47        | 96.55         | 96.47      | 96.49        |
| KNN                | 80.33        | 87.12         | 80.33      | 81.20        |
| Logistic Regression| 97.78        | 97.83         | 97.78      | 97.79        |
| DNN                | 97.92        | 97.92         | 97.92      | 97.91     | 98.20        |


#### Table II: PERFORMANCE METRICS OF DIFFERENT MODELS USING BOW INTEGRATED WITH TF-IDF

| Model              | Accuracy (%) | Precision (%) | Recall (%) | F1 Score (%) |
|--------------------|--------------|---------------|------------|--------------|
| Naive Bayes        | 95.84        | 95.82         | 95.84      | 95.82        |
| SVM                | 99.03        | 99.03         | 99.03      | 99.03        |
| KNN                | 96.47        | 96.47         | 96.47      | 96.45        |
| Logistic Regression| 98.41        | 98.40         | 98.41      | 98.40        |
| DNN                | 98.21        | 98.22         | 98.21      | 98.20        |