## This notebook trains a SPAM classifier for Zenodo records.

Run the cells in sequence to train the model on the data. Some steps are optional or used for experimentation.

In [None]:
import json
import sklearn
import requests
import numpy as np
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV

## Load the data

Loads the previously dumped data (TXT). Each line in the file holds a single record's metadata as a JSON document.

In [None]:
# Update the filename accordingly
FILENAME = "./data/zenodo_open_metadata_17_05_2018.txt"

# The dump is line-delimited JSON: one record's metadata per line.
# Iterate over the file object directly instead of materialising every line
# with readlines(), decode it as UTF-8 regardless of the platform default, and
# skip blank lines, which would otherwise make json.loads() raise.
with open(FILENAME, "r", encoding="utf-8") as fp:
    data = [json.loads(line) for line in fp if line.strip()]
print("Loaded metadata of {} records".format(len(data)))

## Optional step: Manually mark some spammers

The next cell allows you to manually mark some users as spammers. Provide the User IDs (int) of the record owners whose records should be marked as SPAM.

In [None]:
# User IDs to treat as spammers; fill in by hand (empty by default).
spam_owners = {}

# Flag every not-yet-flagged record whose first owner is a listed spammer.
for d in data:
    if d['owners']:
        owner = d['owners'][0]
    else:
        owner = None
    if owner not in spam_owners or d['spam']:
        continue
    d['spam'] = True

# Report the resulting class balance.
spamcnt = Counter(d['spam'] for d in data)
print("SPAM: {0}, Non-SPAM: {1}".format(spamcnt[True], spamcnt[False]))

## Train the model

Train the model on the SPAM label. You can experiment with parameters here, and observe the accuracy on the test set (Spam->Spam, Ham->Ham values).

In [None]:
# Ground-truth SPAM flag of every record, aligned index-by-index with `data`.
labels = [record['spam'] for record in data]

# Hold out a third of the records for evaluation; the fixed seed keeps the
# split reproducible between runs.
X_train_full, X_test_full, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.33,
    random_state=422,
)
def feat_tr(d):
    """Build the text the classifier works on from one record.

    The description and the title are joined with a single space. Without a
    separator the last word of the description and the first word of the
    title are fused into one bogus token before vectorization.

    Args:
        d: Record metadata dict with 'description' and 'title' entries.

    Returns:
        str: "<description> <title>". A missing, ``None`` or empty field is
        left out instead of raising.
    """
    parts = (d.get('description'), d.get('title'))
    return " ".join(part for part in parts if part)

# Turn every record of both splits into the raw text fed to the vectorizer.
X_train = list(map(feat_tr, X_train_full))
X_test = list(map(feat_tr, X_test_full))

ngram_range = (1, 1)

## Alternatively you can experiment with building a spam vocabulary from the training dataset
# X_train_spam = [feat_tr(d) for d in X_train_full if d['spam']]
# count_vect = CountVectorizer(ngram_range=ngram_range, max_features=8000)
# count_vect.fit_transform(X_train_spam)
# vocabulary = count_vect.vocabulary_

# Token counts -> TF-IDF weights -> classifier. The commented-out steps are
# drop-in alternatives to experiment with.
text_clf = Pipeline([
    ('vect', CountVectorizer(max_features=8000, ngram_range=ngram_range)),
    # ('vect', CountVectorizer(vocabulary=vocabulary, ngram_range=ngram_range)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
    # ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)),
])

text_clf = text_clf.fit(X_train, y_train)
y_pred = text_clf.predict(X_test)

# Tally (label, prediction) pairs of the held-out split.
acc = list(zip(y_test, y_pred))
c = Counter(acc)
print(c)

spam_as_spam = c[(True, True)]
spam_as_ham = c[(True, False)]
ham_as_ham = c[(False, False)]
ham_as_spam = c[(False, True)]
print("Spam->Spam: {0:.4f}".format(spam_as_spam / (spam_as_spam + spam_as_ham)))
print("Ham -> Ham: {0:.4f}".format(ham_as_ham / (ham_as_ham + ham_as_spam)))
print("Accuracy: {0:.4f}".format((ham_as_ham + spam_as_spam) / len(acc)))

## Check accuracy again on the full data.

This estimate is biased (optimistic), because part of the data was used for training the model.

In [None]:
# Score every record, including the ones the model was fitted on.
data_tr = list(map(feat_tr, data))
y_pred = text_clf.predict(data_tr)

# Tally (label, prediction) pairs over the whole dataset.
acc = list(zip(labels, y_pred))
c = Counter(acc)
print(c)

spam_as_spam = c[(True, True)]
spam_as_ham = c[(True, False)]
ham_as_ham = c[(False, False)]
ham_as_spam = c[(False, True)]
print("Spam->Spam: {0:.4f}".format(spam_as_spam / (spam_as_spam + spam_as_ham)))
print("Ham -> Ham: {0:.4f}".format(ham_as_ham / (ham_as_ham + ham_as_spam)))
print("Accuracy: {0:.4f}".format((ham_as_ham + spam_as_spam) / len(acc)))


### Investigate the records

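The next cell allows you to take a peek at records that are labelled as non-SPAM but that the classifier predicts to be SPAM (false positives with respect to the labels). Some of them may be actual SPAM that has not been marked yet.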

In [None]:
# Positions of records labelled non-SPAM (ref False) that the classifier
# nevertheless predicts to be SPAM (pred True).
acc = [
    idx
    for idx, pair in enumerate(zip(labels, y_pred))
    if pair == (False, True)
]

# Keep only the record id and the description for manual inspection.
spammy_stuff = [
    (flagged['recid'], flagged['description'])
    for flagged in (data[idx] for idx in acc)
]

# Print the records one by one, separated by blank lines.
for rec in spammy_stuff:
    print(rec)
    print("\n\n")

In [None]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone `joblib` package and fall back to the bundled
# copy only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted pipeline (vectorizer + TF-IDF + classifier) to disk.
joblib.dump(text_clf, '2017_06_18_record_spam.pkl')

In [None]:
# When the cells are run in sequence, `y_pred` holds the predictions for the
# FULL dataset at this point, so pairing it with `y_test` would compare
# unrelated records. Predict on the held-out split explicitly so that labels,
# predictions and `X_test_full` stay aligned index-by-index.
y_pred_test = text_clf.predict(X_test)

# Held-out records labelled non-SPAM that the classifier predicts to be SPAM.
acc = [idx for idx, (ref, pred) in enumerate(zip(y_test, y_pred_test)) if (ref, pred) == (False, True)]
spammy_stuff = [(X_test_full[idx]['recid'], X_test_full[idx]['description']) for idx in acc]

In [None]:
# Display the (recid, description) pairs currently held in `spammy_stuff`.
spammy_stuff

In [None]:
# Request the public page of every flagged record and collect the ids that
# answer with HTTP 410 (Gone) -- presumably records that were removed as SPAM
# after the data was dumped; verify against Zenodo's behaviour.
spams = []
for recid, _ in spammy_stuff:
    url = 'https://zenodo.org/record/{0}'.format(recid)
    resp = requests.get(url)
    if resp.status_code != 410:
        continue
    spams.append(recid)
print(len(spams), spams)

In [None]:
def spam_ratio(ground, pred):
    """Return the fraction of true SPAM records that were predicted as ham.

    The ratio is computed from the arguments only, so the function can be
    used as a metric on any pair of label/prediction sequences (for example
    through ``make_scorer``).

    Args:
        ground: Iterable of reference labels (truthy means SPAM).
        pred: Iterable of predicted labels, aligned with ``ground``.

    Returns:
        float: missed SPAM / all SPAM, in [0, 1]. ``0.0`` when ``ground``
        contains no SPAM at all, since nothing could have been missed.
    """
    spam_total = 0
    spam_missed = 0
    for ref, guess in zip(ground, pred):
        if ref:
            spam_total += 1
            if not guess:
                spam_missed += 1
    if not spam_total:
        # Avoid dividing by zero on inputs without a single SPAM record.
        return 0.0
    return spam_missed / float(spam_total)
# A lower miss ratio is better, hence greater_is_better=False: the scorer
# sign-flips the metric so that model selection can still maximise it.
score_fun = make_scorer(spam_ratio, greater_is_better=False)

# Pipeline tuned by the grid search. The SGD classifier below is a drop-in
# alternative for the 'clf' step; it is kept as a comment because assigning it
# to `text_clf` first would be overwritten immediately, and its old `n_iter`
# argument was replaced by `max_iter` (with `tol`) in newer scikit-learn.
# ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
#                       max_iter=5, tol=None, random_state=42)),
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),])

# Hyper-parameters to explore; uncomment entries to widen the search.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    # 'tfidf__use_idf': (True, False),
    # 'clf__alpha': (1e-2, 1e-3),
}


In [None]:
# Run the grid search over `parameters` on all available cores, using the
# estimator's default scoring, and keep the fitted search object.
gs_clf = GridSearchCV(
    estimator=text_clf,
    param_grid=parameters,
    n_jobs=-1,
).fit(X_train, y_train)

In [None]:
# Predict the held-out texts with the fitted grid search object.
y_pred = gs_clf.predict(X_test)

In [None]:
# Display the parameter combination selected by the grid search.
gs_clf.best_params_

In [None]:
# A scorer built by make_scorer is called as scorer(estimator, X, y_true) and
# runs the prediction itself, so it needs the test texts and their labels, not
# labels and predictions. Because score_fun was created with
# greater_is_better=False, the displayed value is the negated spam_ratio.
score_fun(gs_clf, X_test, y_test)

In [None]:
# Pair each held-out label with its prediction and display how often every
# (label, prediction) combination occurs.
acc = list(zip(y_test, y_pred))
Counter(acc)

In [None]:
# Repeat the grid search, this time selecting the parameters with the custom
# SPAM-miss scorer instead of the estimator's default score.
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1, scoring=score_fun)
gs_clf = gs_clf.fit(X_train, y_train)

# Predict from the held-out texts (X_test); y_test holds the reference labels
# and must not be passed to predict().
y_pred = gs_clf.predict(X_test)
acc = [(ref, pred) for ref, pred in zip(y_test, y_pred)]
Counter(acc)