In [1]:
import json
import sklearn
import requests
import numpy as np
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV

In [2]:
with open("./data/zenodo_open_metadata_06_04_2017.json", "r") as fp:
    data = json.load(fp)

In [3]:
labels = [d['spam'] for d in data]

In [7]:
X_train_full, X_test_full, y_train, y_test = train_test_split(data, labels, test_size=0.33, random_state=42)
def feat_tr(d):
    return d['description'] + d['title']
    #return d['title']
    #return d['description']

X_train = [feat_tr(d) for d in X_train_full]
X_test = [feat_tr(d) for d in X_test_full]

ngram_range=(1, 1)

# Build spam vocabulary from the training dataset
#X_train_spam = [feat_tr(d) for d in X_train_full if d['spam']]
#count_vect = CountVectorizer(ngram_range=ngram_range, max_features=8000)
#count_vect.fit_transform(X_train_spam)
#vocabulary = count_vect.vocabulary_
#vocabulary = None

text_clf = Pipeline([
                    ('vect', CountVectorizer(max_features=8000, ngram_range=ngram_range)),
                     #('vect', CountVectorizer(vocabulary=vocabulary, ngram_range=ngram_range)),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
                     #('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)),
                     #('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)),
                    ])
                     
text_clf = text_clf.fit(X_train, y_train)
y_pred = text_clf.predict(X_test)
acc = [(ref, pred) for ref, pred in zip(y_test, y_pred)]
c = Counter(acc)
print(c)
print("Spam->Spam: {0:.4f}".format(c[(True, True)] / (c[(True, True)] + c[(True, False)])))
print("Ham -> Ham: {0:.4f}".format(c[(False, False)] / (c[(False, False)] + c[(False, True)])))
print("Accuracy: {0:.4f}".format((c[(False, False)] + c[(True, True)] ) / (len(acc))))

Counter({(False, False): 58790, (True, True): 1808, (True, False): 63, (False, True): 59})
Spam->Spam: 0.9663
Ham -> Ham: 0.9990
Accuracy: 0.9980


In [10]:
from sklearn.externals import joblib
joblib.dump(text_clf, '2017_06_18_record_spam.pkl') 

['2017_06_18_record_spam.pkl']

In [8]:
acc = [idx for idx, (ref, pred) in enumerate(zip(y_test, y_pred)) if (ref, pred) == (False, True)]
spammy_stuff = [(X_test_full[idx]['recid'], X_test_full[idx]['description']) for idx in acc]

In [9]:
spammy_stuff

[(345647, '<p>freud, vigotski</p>'),
 (268709,
  'Many businesses today recognize the increased significance of service and the transition toward service orientation. Nonetheless, organizational practitioners frequently encounter problems managing this shift and seizing service-related business opportunities. This practical relevance, together with many still-unanswered service research questions, has inspired the preparation of this special section that advances the extant literatures on business services. We finish by providing a research agenda. First, more research is needed on the buyer perspective. Second, researchers need to keep in mind financial issues related to business services. Third, more researchers could tap into management, leadership, and decision-making in business service companies. Finally, sustainability, social responsibility, and environmental considerations are important topics for further exploration.'),
 (45840,
  '<p><strong>Radomska Szkoła Wyższa w Radomiu<

In [None]:
spams = []
for recid, _ in spammy_stuff:
    resp = requests.get('https://zenodo.org/record/{0}'.format(recid))
    if resp.status_code == 410:
        spams.append(recid)
print(len(spams), spams)

In [None]:
def spam_ratio(ground, pred):
    tf = [(ref, pred) for ref, pred in zip(y_test, y_pred) if (ref, pred) == (True, False)]
    t = [(ref, pred) for ref, pred in zip(y_test, y_pred) if ref == True]
    return float(tf) / float(t)
score_fun = make_scorer(spam_ratio, greater_is_better=False)

text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-3, n_iter=5, random_state=42)),])
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),])
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    # 'tfidf__use_idf': (True, False),
    # 'clf__alpha': (1e-2, 1e-3),
}


In [None]:
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(X_train, y_train)

In [None]:
y_pred = gs_clf.predict(X_test)

In [None]:
gs_clf.best_params_

In [None]:
score_fun(gs_clf, y_test, y_pred)

In [None]:
acc = [(ref, pred) for ref, pred in zip(y_test, y_pred)]
Counter(acc)

In [None]:
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1, scoring=score_fun)
gs_clf = gs_clf.fit(X_train, y_train)
y_pred = gs_clf.predict(y_test)
acc = [(ref, pred) for ref, pred in zip(y_test, y_pred)]
Counter(acc)