### 0. Imports

In [2]:
import gzip
import json
from collections import Counter
import re
import itertools
import joblib
from pathlib import Path

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

### 1. Load the data

In [None]:
# Read the gzip-compressed JSON-lines dump: each line is one JSON record
# providing 'title', 'description' and 'spam' entries.
data_file = Path('datasets/zenodo_open_metadata_2020-06-23.jsonl.gz')

# Strips HTML comments (<!-- ... -->) and any other <...> tag from descriptions.
tag_removal_regex = re.compile(r'(<!--.*?-->|<[^>]*>)')

data = []    # one text per record: title + ' ' + description without markup
labels = []  # the record's 'spam' value, index-aligned with `data`
with gzip.open(data_file, 'rb') as fp:
    for raw_line in fp:
        record = json.loads(raw_line)
        clean_description = tag_removal_regex.sub('', record['description'])
        data.append(' '.join((record['title'], clean_description)))
        labels.append(record['spam'])

### 2. Train the model

In [None]:
# One seed shared by the train/test split and the classifier, so that
# Restart Kernel -> Run All reproduces the same model and the same metrics.
RANDOM_STATE = 422

# Split dataset: 33% of the records are held out for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    data, labels, test_size=0.33, random_state=RANDOM_STATE
)

# Random forest model on TF-IDF unigram features (at most 8000 terms).
# The forest is seeded explicitly: without `random_state` every fit builds
# different trees and the reported numbers drift between runs.
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=8000, ngram_range=(1, 1))),
    ("clf", RandomForestClassifier(n_estimators=100, n_jobs=4, random_state=RANDOM_STATE)),
    # ("clf", ExtraTreesClassifier(n_estimators=100, n_jobs=4, random_state=RANDOM_STATE)),
])

# NB model
# pipeline = Pipeline([
#     ('tfidf', TfidfVectorizer(max_features=8000, ngram_range=(1, 1))),
#     ('clf', MultinomialNB()),
#     # ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)),
# ])

# Trains the model. This might take some time (~30min)
text_clf = pipeline.fit(X_train, y_train)

### 3. Calculate accuracy

In [None]:
# Predict on the held-out split and tally (reference, prediction) pairs.
y_pred = text_clf.predict(X_test)
acc = list(zip(y_test, y_pred))
c = Counter(acc)
print(c)

# Name the four confusion-matrix cells; keys are (reference, prediction).
spam_as_spam, spam_as_ham = c[(True, True)], c[(True, False)]
ham_as_ham, ham_as_spam = c[(False, False)], c[(False, True)]

# Share of spam records predicted as spam, share of ham records predicted
# as ham, and overall share of correct predictions.
print("Spam->Spam: {0:.4f}".format(spam_as_spam / (spam_as_spam + spam_as_ham)))
print("Ham -> Ham: {0:.4f}".format(ham_as_ham / (ham_as_ham + ham_as_spam)))
print("Accuracy: {0:.4f}".format((ham_as_ham + spam_as_spam) / len(acc)))

### 4. Dump model

In [None]:

# Persist the fitted pipeline (vectorizer + classifier) to disk.
models_dir = Path('models')
# joblib.dump opens the target path for writing and does not create missing
# parent directories, so make sure the output directory exists first.
models_dir.mkdir(parents=True, exist_ok=True)
joblib.dump(text_clf, models_dir / '2020_06_23_record_spam.pkl')

### 5. Compare accuracy of old models

In [None]:
# File names (inside `models_dir`) of the stored pipelines to evaluate on the
# current held-out split.
models = [
    '2017_06_18_record_spam.pkl',
    '2020_06_23_record_spam.pkl',
]

for model_path in models:
    # NOTE: joblib.load unpickles arbitrary Python objects; only load model
    # files that come from a trusted source.
    model = joblib.load(models_dir / model_path)
    # Evaluate the model that was just loaded (the previous code referenced an
    # undefined `old_model` name and failed with NameError on a fresh kernel).
    y_pred = model.predict(X_test)
    # Tally (reference, prediction) pairs into a confusion-matrix Counter.
    acc = [(ref, pred) for ref, pred in zip(y_test, y_pred)]
    c = Counter(acc)
    print(f'### Accuracy of model {model_path} for {data_file}')
    print(c)
    print("Spam->Spam: {0:.4f}".format(c[(True, True)] / (c[(True, True)] + c[(True, False)])))
    print("Ham -> Ham: {0:.4f}".format(c[(False, False)] / (c[(False, False)] + c[(False, True)])))
    print("Accuracy: {0:.4f}".format((c[(False, False)] + c[(True, True)] ) / (len(acc))))

#### Results

##### Accuracy of model for data up to 06/03/2017

Counter({(False, False): 56495, (True, True): 1652, (False, True): 140, (True, False): 38})

- Spam->Spam: 0.9775
- Ham -> Ham: 0.9975
- Accuracy: 0.9969

##### Accuracy of model for data up to 06/04/2017

Counter({(False, False): 58746, (True, True): 1855, (True, False): 65, (False, True): 54})

- Spam->Spam: 0.9661
- Ham -> Ham: 0.9991
- Accuracy: 0.9980

##### Accuracy of model for data up to 17/05/2018

Counter({(False, False): 133426, (True, True): 2627, (True, False): 149, (False, True): 128})

- Spam->Spam: 0.9463
- Ham -> Ham: 0.9990
- Accuracy: 0.9980

##### Accuracy of model for data up to 04/11/2019

Counter({(False, False): 482380, (True, True): 3526, (False, True): 970, (True, False): 515})

- Spam->Spam: 0.8726
- Ham -> Ham: 0.9980
- Accuracy: 0.9970

##### Accuracy of model for data up to 04/11/2019 with RandomForest Model

Counter({(False, False): 483307, (True, True): 3580, (True, False): 461, (False, True): 43})

- Spam->Spam: 0.8859
- Ham -> Ham: 0.9999
- Accuracy: 0.9990

##### Accuracy of model for data up to 23/06/2020 with RandomForest Model

Counter({(False, False): 530887, (True, True): 4623, (True, False): 918, (False, True): 65})

- Spam->Spam: 0.8343
- Ham -> Ham: 0.9999
- Accuracy: 0.9982

##### Accuracy of model for data up to 23/06/2020 with RandomForest Model (n_estimators=100)

Counter({(False, False): 530898, (True, True): 4691, (True, False): 850, (False, True): 54})

- Spam->Spam: 0.8466
- Ham -> Ham: 0.9999
- Accuracy: 0.9983

##### Accuracy of model for data up to 23/06/2020 with ExtraTreesClassifier (n_estimators=100)

Counter({(False, False): 530872, (True, True): 4705, (True, False): 836, (False, True): 80})

- Spam->Spam: 0.8491
- Ham -> Ham: 0.9998
- Accuracy: 0.9983



In [None]:
### 6. Plot results

In [None]:
import plotly.graph_objects as go

class CurrentModel:
    """Recorded evaluation metrics of one trained model, kept for plotting.

    Attributes:
        acc: overall accuracy.
        spam_spam: "Spam->Spam" score (spam records predicted as spam).
        ham_ham: "Ham -> Ham" score (ham records predicted as ham).
        name: label used for the model's trace in a figure.
    """

    def __init__(self, acc, spam_spam, ham_ham, name):
        self.acc = acc
        self.spam_spam = spam_spam
        self.ham_ham = ham_ham
        self.name = name

    def plot(self):
        """Show a bar chart with this model's three metrics."""
        x = ['Acc', 'Spam', 'Ham']
        y = [self.acc, self.spam_spam, self.ham_ham]
        fig = go.Figure()
        # A Bar trace draws the metric values themselves; a Histogram trace
        # would by default count occurrences of each x label and ignore `y`.
        # The trace label comes from the instance (`self.name`); a bare `name`
        # is not defined in this scope.
        fig.add_trace(go.Bar(
            x=x,
            y=y,
            name=self.name
        ))
        fig.show()

def plot_all(models):
    """Draw one figure with a bar trace per model.

    Each item of ``models`` must expose ``acc``, ``spam_spam``, ``ham_ham``
    and ``name``. Every trace holds the three metrics in that order under the
    labels 'Acc', 'Spam' and 'Ham'. The y axis is limited to [0.85, 1].
    """
    metric_labels = ['Acc', 'Spam', 'Ham']
    figure = go.Figure()
    for entry in models:
        scores = [entry.acc, entry.spam_spam, entry.ham_ham]
        bar = go.Bar(x=metric_labels, y=scores, name=entry.name)
        figure.add_trace(bar)
    # Fixed y range so the small differences near 1.0 remain visible.
    figure.update_yaxes(range=[0.85, 1])
    figure.show()
    
# Metrics transcribed from the "Results" section above. Keyword arguments are
# used so that each value is visibly tied to its metric; the previous
# positional version had mistranscribed values (06/03/2017 spam/ham,
# 06/04/2017 ham, 04/11/2019 accuracy).
model_06_03_2017 = CurrentModel(
    acc=0.9969, spam_spam=0.9775, ham_ham=0.9975, name='data up to 06/03/2017')
model_06_04_2017 = CurrentModel(
    acc=0.9980, spam_spam=0.9661, ham_ham=0.9991, name='data up to 06/04/2017')
model_17_05_2018 = CurrentModel(
    acc=0.9980, spam_spam=0.9463, ham_ham=0.9990, name='data up to 17/05/2018')
model_04_11_2019 = CurrentModel(
    acc=0.9970, spam_spam=0.8726, ham_ham=0.9980, name='data up to 04/11/2019')
model_04_11_2019_RandomForest = CurrentModel(
    acc=0.9990, spam_spam=0.8859, ham_ham=0.9999, name='data up to 04/11/2019 <RandomForest>')

# Models shown in the comparison figure, in chronological order.
models = [
    model_06_03_2017,
    model_06_04_2017,
    model_17_05_2018,
    model_04_11_2019,
    model_04_11_2019_RandomForest
]

plot_all(models)