In [25]:
# Import libaries
import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
%matplotlib inline

In [2]:
# Load the data
train = joblib.load('data//train.pkl')
validate = joblib.load('data//validate.pkl')
test = joblib.load('data//test.pkl')

In [3]:
train.columns = train.columns.astype(str)
validate.columns = validate.columns.astype(str)
test.columns = test.columns.astype(str)

In [4]:
# Split the data into X and y
X_train = train.drop('label', axis=1)
y_train = train.label

X_validate = validate.drop('label', axis=1)
y_validate = validate.label

X_test = test.drop('label', axis=1)
y_test = test.label

In [38]:
def evaluate(model, X, y):
    predictions = model.predict(X)
    score = accuracy_score(y, predictions)
    print(f'Model Accuracy: {round(score*100,3)} %\n')
    cm = confusion_matrix(y, predictions)
    print(f'Confusion Matrix:\n{cm}\n')
    report = classification_report(y, predictions,
            target_names=["Spam", "Ham"],output_dict=True,
            zero_division=np.nan)
    precision = report["Spam"]["precision"]
    recall = report["Spam"]["recall"]
    print(f'Precision (Spam): {round(precision*100,3)} %')
    print(f'Recall (Spam): {round(recall*100,3)} %')

In [47]:
def compare_models(models, X, y):
    model_scores = []
    for (key, value) in models.items():
        predictions = value.predict(X)
        clf_report = classification_report(y, predictions,
                    target_names=["Spam", "Ham"], output_dict=True,
                    zero_division=np.nan)
        precision = clf_report["Spam"]["precision"]
        recall =clf_report["Spam"]["recall"]
        score = accuracy_score(y, predictions)
        model_scores.append((key, round(precision,3),
                             round(recall,3), round(score,3)))
    display(pd.DataFrame(model_scores,
    columns = ["Model", "Precision", "Recall", "Accuracy"]))

### Support Vector Classifier

In [7]:
svc = SVC().fit(X_train, y_train)

In [8]:
evaluate(svc, X_train, y_train)

Model Accuracy: 91.138 %
Precision (Spam): 67.318 %
Recall (Spam): 49.487 %


In [9]:
evaluate(svc, X_validate, y_validate)

Model Accuracy: 93.164 %
Precision (Spam): 79.545 %
Recall (Spam): 57.377 %


### Gradient Boosted Trees

In [10]:
clf = GradientBoostingClassifier().fit(X_train, y_train)

In [11]:
evaluate(clf, X_train, y_train)

Model Accuracy: 98.999 %
Precision (Spam): 97.447 %
Recall (Spam): 94.045 %


In [12]:
evaluate(clf, X_validate, y_validate)

Model Accuracy: 97.461 %
Precision (Spam): 98.0 %
Recall (Spam): 80.328 %


### Multinomial NB

In [35]:
nb_clf = MultinomialNB().fit(X_train, y_train)

In [40]:
evaluate(nb_clf, X_train, y_train)

Model Accuracy: 88.33 %

Confusion Matrix:
[[   9  478]
 [   0 3609]]

Precision (Spam): 100.0 %
Recall (Spam): 1.848 %


In [39]:
evaluate(nb_clf, X_test, y_test)

Model Accuracy: 88.086 %

Confusion Matrix:
[[  0  61]
 [  0 451]]

Precision (Spam): nan %
Recall (Spam): 0.0 %


### Hyperparameter Tuning

In [None]:
param_grid = {
    'learning_rate': [0.1, 0.01, 0.001],
}
gridCV = GridSearchCV(GradientBoostingClassifier(), param_grid)
gridCV.fit(X_train, y_train)
print(gridCV.best_estimator_)
print(gridCV.best_params_)

GradientBoostingClassifier()
{'learning_rate': 0.1}


In [45]:
clf = gridCV.best_estimator_
evaluate(clf, X_validate, y_validate)

Model Accuracy: 97.266 %

Confusion Matrix:
[[ 48  13]
 [  1 450]]

Precision (Spam): 97.959 %
Recall (Spam): 78.689 %


In [48]:
model_list = {
    "Multinomial Naive Bayes" : nb_clf,
    "Gradient Boosted Tree" : clf,
    "Support Vector Classifier": svc
}
compare_models(model_list, X_test, y_test)

Unnamed: 0,Model,Precision,Recall,Accuracy
0,Multinomial Naive Bayes,,0.0,0.881
1,Gradient Boosted Tree,0.957,0.721,0.963
2,Support Vector Classifier,0.596,0.459,0.898


#### From the above data, the best model is Gradient Boosting Trees with an accuracy of 0.963 and precision of 0.957