In [1]:
# Import libraries
import numpy as np
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
import joblib

In [2]:
# Concat all data
train = pd.read_csv('split-data/train.csv')
validate = pd.read_csv('split-data/validate.csv')
test = pd.read_csv('split-data/test.csv')
df = pd.concat([train, validate, test], ignore_index=True)

In [3]:
train, test = train_test_split(df, train_size=0.8,stratify=df.label)

In [4]:
pipeline = Pipeline([
        ('vectorizer', CountVectorizer(strip_accents='unicode'))])

In [5]:
def transform_df(df, pipeline=pipeline, train=False):
    """Pipeline to implement vectorization and tf-idf transformation"""
    
    corpus = df.text.values
    if train:
        corpus_tfidf = pipeline.fit_transform(corpus)
    else:
        corpus_tfidf = pipeline.transform(corpus)

    print(f'Shape of transformed matrix: {corpus_tfidf.shape}')
    print(f'Number of non-zero: {corpus_tfidf.nnz}')
    print(f'Sparsity: {round(100.0*corpus_tfidf.nnz/(corpus_tfidf.shape[0]*corpus_tfidf.shape[1]),3)}%\n')

    df.reset_index(drop=True, inplace=True)
    df_transform = pd.concat([df, pd.DataFrame(corpus_tfidf.toarray())], axis=1)
    df_transform.drop('text', axis=1, inplace=True)

    return df_transform

In [6]:
# Transform and split the data
df_train = transform_df(train, train=True)

Shape of transformed matrix: (4096, 7027)
Number of non-zero: 53720
Sparsity: 0.187%



In [7]:
df_test = transform_df(test)

Shape of transformed matrix: (1024, 7027)
Number of non-zero: 12651
Sparsity: 0.176%



In [8]:
# Make column names string
df_train.columns = df_train.columns.astype(str)
df_test.columns = df_test.columns.astype(str)

In [9]:
# Split the data into X and y
X_train = df_train.drop('label', axis=1)
y_train = df_train.label

X_test = df_test.drop('label', axis=1)
y_test = df_test.label

In [10]:
def compare_models(models, X, y):
    model_scores = []
    for (key, value) in models.items():
        predictions = value.predict_proba(X)[:,1]
        predictions = np.where(predictions < 0.5, 0, 1)
        clf_report = classification_report(y, predictions,
                    target_names=["Ham", "Spam"], output_dict=True,
                    zero_division=np.nan)
        precision = clf_report["Spam"]["precision"]
        recall =clf_report["Spam"]["recall"]
        score = accuracy_score(y, predictions)
        model_scores.append((key, round(score,3), round(precision,3),
                             round(recall,3), ))
    display(pd.DataFrame(model_scores,
    columns = ["Model", "Accuracy", "Precision", "Recall"]))

In [11]:
nb = MultinomialNB(alpha=0.2)
gbt = GradientBoostingClassifier(random_state=42)
svc = SVC(probability=True, random_state=42)

In [13]:
# Log the metrics and register the models
models = {
    "Naive Bayes Classifier": nb,
    "Gradient Boosting Classifier": gbt,
    "Support Vector Classifier": svc
}


In [14]:
# Train and log each model
for name, model in models.items():
        model.fit(X_train, y_train)

In [15]:
compare_models(models, X_test, y_test)

Unnamed: 0,Model,Accuracy,Precision,Recall
0,Naive Bayes Classifier,0.982,0.948,0.902
1,Gradient Boosting Classifier,0.979,0.972,0.844
2,Support Vector Classifier,0.982,0.933,0.918


In [16]:
# Best performance by SVC
joblib.dump(svc, 'svc.pkl')
joblib.dump(pipeline, 'pipeline.pkl')

['pipeline.pkl']