### 0. Imports

In [13]:
import datetime
import gzip
import json
from collections import Counter
import re
import itertools
import joblib
from pathlib import Path

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

### 1. Load the data

In [17]:
data_file = Path('datasets/zenodo_community_metadata_2020-07-07.jsonl.gz')
data = []
labels = []
with gzip.open(data_file, 'rb') as fp:
    tag_removal_regex = re.compile(r'(<!--.*?-->|<[^>]*>)')
    for l in fp:
        r = json.loads(l)
        clean_description = tag_removal_regex.sub('', r['description'])
        clean_description = clean_description.replace('\n','').replace('\r','').replace('\t','')
        data.append(r['title'] + ' ' + r['curation_policy'] + r['page'] + ' ' + clean_description)
        labels.append(r['spam'])

In [9]:
data[0]

'European Middleware Initiative  &nbsp;The European Middleware Initiative (EMI) is a close collaboration of the three major middleware providers, ARC, gLite and UNICORE, and other specialized software providers like dCache.The&nbsp;project&#39;s mission&nbsp;is todeliver a consolidated set of middleware components for deployment in EGI (as part of the Unified Middleware Distribution - UMD), PRACE and other DCIs,extend the interoperability and integration with emerging computing models,strengthen the reliability and manageability of the services and establish a sustainable model to support,harmonise and evolve the middleware, ensuring it responds effectively to the requirements of the scientific communities relying on it.'

In [10]:
# Split dataset
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.33, random_state=422)

# Random forest model
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=8000, ngram_range=(1, 1))),
    ("clf", RandomForestClassifier(n_estimators=100, n_jobs=4)),
    # ("clf", ExtraTreesClassifier(n_estimators=100, n_jobs=4)),
])

# Trains the model. This might take some time (~30min)
text_clf = pipeline.fit(X_train, y_train)

### 3. Calculate accuracy

In [11]:
y_pred = text_clf.predict(X_test)
acc = [(ref, pred) for ref, pred in zip(y_test, y_pred)]
c = Counter(acc)
print(c)
print("Spam->Spam: {0:.4f}".format(c[(True, True)] / (c[(True, True)] + c[(True, False)])))
print("Ham -> Ham: {0:.4f}".format(c[(False, False)] / (c[(False, False)] + c[(False, True)])))
print("Accuracy: {0:.4f}".format((c[(False, False)] + c[(True, True)] ) / (len(acc))))

Counter({(True, True): 2746, (False, False): 1762, (False, True): 24, (True, False): 13})
Spam->Spam: 0.9953
Ham -> Ham: 0.9866
Accuracy: 0.9919


### 4. Dump model

In [15]:
models_dir = Path('models')
modelname = '{}_community_spam.pkl'.format(datetime.date.today().isoformat())
joblib.dump(text_clf, modelname)

['2020-07-09_community_spam.pkl']

### 5. Compare accuracy of old models

#### Results

##### Accuracy of model for data up to 18/06/2017
- NonSpam -> Spam 99.4% accuracy
- Spam -> NonSpam # 97.1% accuracy

##### Accuracy of model for data up to 09/07/2020 with RandomForest Model

Counter({(True, True): 2746, (False, False): 1762, (False, True): 24, (True, False): 13})

- Spam->Spam: 0.9953
- Ham -> Ham: 0.9866
- Accuracy: 0.9919