In [None]:
import re
import gzip
import json
import joblib
import itertools

In [None]:
# Load and clean/format the data
# Matches either an HTML comment (<!-- ... -->) or any other <...> tag.
tag_re = re.compile(r'(<!--.*?-->|<[^>]*>)')
def remove_tags(text):
    """Return ``text`` with every substring matched by ``tag_re`` removed."""
    stripped = re.sub(tag_re, '', text)
    return stripped

FILENAME = 'datasets/zenodo_open_metadata_2020-06-23.jsonl.gz'

# data:   one ((recid, title), "<title> <description without tags>") pair per line
# labels: the 'spam' field of the same line, aligned with `data` by position
data, labels = [], []
with gzip.open(FILENAME, 'rb') as fp:
    # The archive is read line by line; each line is parsed as one JSON object.
    for raw_line in fp:
        record = json.loads(raw_line)
        key = (record['recid'], record['title'])
        text = record['title'] + ' ' + remove_tags(record['description'])
        data.append((key, text))
        labels.append(record['spam'])

In [None]:
# Load the trained classifier from disk.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code --
# only load model files that come from a trusted source.
MODEL_FILENAME = 'models/2020_06_23_reports_spam.pkl'
model = joblib.load(MODEL_FILENAME)

In [None]:
# Split data into chunks and run classifier over them
def chunkify(n, iterable):
    """Yield successive chunks of at most ``n`` items from ``iterable``.

    Args:
        n: Maximum number of items per chunk. ``0`` produces no chunks; a
            negative value makes ``itertools.islice`` raise ``ValueError``
            once the generator is first advanced.
        iterable: Any iterable; it is consumed lazily, one chunk at a time.

    Yields:
        Tuples holding up to ``n`` consecutive items. Only the last chunk can
        be shorter than ``n``; an empty iterable yields nothing.
    """
    i = iter(iterable)
    piece = tuple(itertools.islice(i, n))
    while piece:
        yield piece
        # Use the qualified name (only ``itertools`` is imported, so a bare
        # ``islice`` is undefined) and keep every chunk a tuple like the first.
        piece = tuple(itertools.islice(i, n))

# Number of records handed to model.predict per call.
CHUNK_SIZE = 10000

# Collects the (recid, title) pair of every record the model flags as spam.
results = []
# chunkify expects the chunk size first and the iterable second.
for chunk in chunkify(CHUNK_SIZE, data):
    # Each item of `data` is ((recid, title), text); separate the two halves.
    records = [record for record, _ in chunk]
    values = [value for _, value in chunk]
    spam_results = model.predict(values)
    # A truthy prediction is treated as spam; predictions are matched to
    # records by position.
    spam = [record for record, is_spam in zip(records, spam_results) if is_spam]
    if spam:
        # Progress output: number of spam records found in this chunk.
        print(len(spam))
    results.extend(spam)
    

In [None]:

# Print one Markdown link per flagged record, pointing at its Zenodo page
for spam_record in results:
    recid, title = spam_record
    print(f'[{title}](https://zenodo.org/record/{recid})')