### 0. Imports

In [1]:
import datetime
import gzip

import json
from collections import Counter
import re
import itertools
import joblib
from pathlib import Path

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

### 1. Load the data

In [2]:
data_file = Path('datasets/zenodo_community_metadata_2020-09-08.jsonl.gz')
data = []
labels = []
with gzip.open(data_file, 'rb') as fp:
    tag_removal_regex = re.compile(r'(<!--.*?-->|<[^>]*>)')
    for l in fp:
        r = json.loads(l)
        clean_description = tag_removal_regex.sub('', r['description'])
        clean_description = clean_description.replace('\n','').replace('\r','').replace('\t','')
        data.append(r['title'] + ' ' + r['curation_policy'] + r['page'] + ' ' + clean_description)
        labels.append(r['spam'])

In [3]:
data[1]

'Harmonic Radar  Radar using a non-linear element with the radar receiver at a multiple of the transmitter frequency.'

In [4]:
# Split dataset
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.33, random_state=422)

# Random forest model
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=8000, ngram_range=(1, 1))),
    ("clf", RandomForestClassifier(n_estimators=100, n_jobs=4)),
    # ("clf", ExtraTreesClassifier(n_estimators=100, n_jobs=4)),
])

# Trains the model. This might take some time (~30min)
text_clf = pipeline.fit(X_train, y_train)

### 3. Calculate accuracy

In [5]:
y_pred = text_clf.predict(X_test)
acc = [(ref, pred) for ref, pred in zip(y_test, y_pred)]
c = Counter(acc)
print(c)
print("Spam->Spam: {0:.4f}".format(c[(True, True)] / (c[(True, True)] + c[(True, False)])))
print("Ham -> Ham: {0:.4f}".format(c[(False, False)] / (c[(False, False)] + c[(False, True)])))
print("Accuracy: {0:.4f}".format((c[(False, False)] + c[(True, True)] ) / (len(acc))))

Counter({(True, True): 9641, (False, False): 3837, (False, True): 168, (True, False): 92})
Spam->Spam: 0.9905
Ham -> Ham: 0.9581
Accuracy: 0.9811


### 4. Dump model

In [6]:
models_dir = Path('models')
modelname = '{}_community_spam.pkl'.format(datetime.date.today().isoformat())
joblib.dump(text_clf, modelname)

['2020-09-08_community_spam.pkl']

### 5. Compare accuracy of old models

#### Results

##### Accuracy of model for data up to 18/06/2017
- NonSpam -> Spam 99.4% accuracy
- Spam -> NonSpam # 97.1% accuracy

##### Accuracy of model for data up to 09/07/2020 with RandomForest Model

Counter({(True, True): 2746, (False, False): 1762, (False, True): 24, (True, False): 13})

- Spam->Spam: 0.9953
- Ham -> Ham: 0.9866
- Accuracy: 0.9919

##### Accuracy of model for data up to 26/08/2020 with RandomForest Model

Counter({(True, True): 8482, (False, False): 2657, (False, True): 145, (True, False): 44})
- Spam->Spam: 0.9948
- Ham -> Ham: 0.9483
- Accuracy: 0.9833

##### Accuracy of model for data up to 08/09/2020 with RandomForest Model

Counter({(True, True): 9641, (False, False): 3837, (False, True): 168, (True, False): 92})
- Spam->Spam: 0.9905
- Ham -> Ham: 0.9581
- Accuracy: 0.9811