In [1]:
# Import libraries
import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
%matplotlib inline

In [2]:
# Load the train / validation / test splits from disk.
# NOTE: joblib files are pickle-based, so only load files from a trusted source.
train, validate, test = (
    joblib.load(split_path)
    for split_path in (
        'data//train.pkl',
        'data//validate.pkl',
        'data//test.pkl',
    )
)

In [3]:
# Cast every column label to str on each split, in place.
# (Presumably so downstream estimators see uniform string feature names — confirm.)
for _frame in (train, validate, test):
    _frame.columns = _frame.columns.astype(str)

In [4]:
# Separate each split into its feature matrix (X_*) and its 'label' target (y_*).
X_train = train.drop(columns='label')
y_train = train['label']

X_validate = validate.drop(columns='label')
y_validate = validate['label']

X_test = test.drop(columns='label')
y_test = test['label']

In [5]:
def evaluate(model, X, y):
    """Print accuracy, the confusion matrix, and Spam precision/recall for a model.

    Args:
        model: Fitted estimator; only its ``predict(X)`` method is used.
        X: Feature data handed to ``model.predict``.
        y: True labels that the predictions are scored against.

    Returns:
        None. Every result is written to stdout.
    """
    def as_percent(fraction):
        # All report lines show a percentage rounded to 3 decimals.
        return round(fraction * 100, 3)

    y_pred = model.predict(X)

    accuracy = accuracy_score(y, y_pred)
    print(f'Model Accuracy: {as_percent(accuracy)} %\n')

    matrix = confusion_matrix(y, y_pred)
    print(f'Confusion Matrix:\n{matrix}\n')

    # Ask for the report as a dict so the "Spam" row can be read directly.
    # NOTE(review): target_names assumes exactly two classes with Ham ordered
    # first — confirm against how the labels are encoded.
    spam_metrics = classification_report(
        y,
        y_pred,
        target_names=["Ham", "Spam"],
        output_dict=True,
        zero_division=np.nan,
    )["Spam"]
    spam_precision = spam_metrics["precision"]
    spam_recall = spam_metrics["recall"]
    print(f'Precision (Spam): {as_percent(spam_precision)} %')
    print(f'Recall (Spam): {as_percent(spam_recall)} %')

In [6]:
def compare_models(models, X, y):
    """Display a table of accuracy and Spam precision/recall for several models.

    Args:
        models: Mapping of display name -> fitted estimator exposing ``predict``.
        X: Feature data passed to each estimator's ``predict``.
        y: True labels used for scoring.

    Returns:
        None. A DataFrame with one row per model (in mapping order) is shown
        via ``display``; each metric is rounded to 3 decimals.
    """
    def summarise(name, estimator):
        # One table row: (model name, accuracy, Spam precision, Spam recall).
        y_pred = estimator.predict(X)
        spam_metrics = classification_report(
            y,
            y_pred,
            target_names=["Ham", "Spam"],
            output_dict=True,
            zero_division=np.nan,
        )["Spam"]
        spam_precision = spam_metrics["precision"]
        spam_recall = spam_metrics["recall"]
        accuracy = accuracy_score(y, y_pred)
        return (
            name,
            round(accuracy, 3),
            round(spam_precision, 3),
            round(spam_recall, 3),
        )

    rows = [summarise(name, estimator) for name, estimator in models.items()]
    table = pd.DataFrame(
        rows, columns=["Model", "Accuracy", "Precision", "Recall"]
    )
    display(table)

### Support Vector Classifier

In [7]:
svc = SVC(random_state=42).fit(X_train, y_train)

In [8]:
evaluate(svc, X_train, y_train)

Model Accuracy: 91.357 %

Confusion Matrix:
[[3482  127]
 [ 227  260]]

Precision (Spam): 67.183 %
Recall (Spam): 53.388 %


In [9]:
evaluate(svc, X_validate, y_validate)

Model Accuracy: 90.625 %

Confusion Matrix:
[[435  16]
 [ 32  29]]

Precision (Spam): 64.444 %
Recall (Spam): 47.541 %


### Gradient Boosted Trees

In [10]:
clf = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)

In [11]:
evaluate(clf, X_train, y_train)

Model Accuracy: 99.17 %

Confusion Matrix:
[[3602    7]
 [  27  460]]

Precision (Spam): 98.501 %
Recall (Spam): 94.456 %


In [12]:
evaluate(clf, X_validate, y_validate)

Model Accuracy: 96.875 %

Confusion Matrix:
[[450   1]
 [ 15  46]]

Precision (Spam): 97.872 %
Recall (Spam): 75.41 %


### Logistic Regression

In [13]:
reg = LogisticRegression(random_state=42).fit(X_train, y_train)

In [14]:
evaluate(reg, X_train, y_train)

Model Accuracy: 97.632 %

Confusion Matrix:
[[3594   15]
 [  82  405]]

Precision (Spam): 96.429 %
Recall (Spam): 83.162 %


In [15]:
evaluate(reg, X_validate, y_validate)

Model Accuracy: 96.484 %

Confusion Matrix:
[[448   3]
 [ 15  46]]

Precision (Spam): 93.878 %
Recall (Spam): 75.41 %


In [16]:
model_list = {
    "Gradient Boosted Tree" : clf,
    "Logistic Regression" : reg,
    "Support Vector Classifier": svc
}
compare_models(model_list, X_test, y_test)

Unnamed: 0,Model,Accuracy,Precision,Recall
0,Gradient Boosted Tree,0.975,0.929,0.852
1,Logistic Regression,0.973,0.943,0.82
2,Support Vector Classifier,0.906,0.627,0.525


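#### From the test-set results above, the best overall model is Gradient Boosted Trees, with an accuracy of 0.975, a precision of 0.929, and a recall of 0.852 (Logistic Regression has slightly higher precision, 0.943, but lower accuracy and recall).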