In [21]:
import json
import sklearn
import requests
import numpy as np
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV

In [2]:
with open("./zenodo_open_metadata_06_04_2017.json", "r") as fp:
    data = json.load(fp)

In [44]:
spams = [33787,166754,32807,17229, 400865, 19031, 285825, 440002, 292995, 221828, 18818, 160518, 32519, 437895, 22211, 290571, 438157, 44942, 33550, 246800, 51728, 398866, 400531, 60051, 50704, 32918, 164886, 164888, 495001, 165784, 34331, 321818, 60828, 346531, 399268, 18045, 27942, 35364, 30632, 202291, 47155, 293045, 47286, 290359, 224947, 29113, 375738, 20539, 17596, 293053, 439998, 439997, 290237, 49089, 440001, 440003, 440004, 440005, 440006, 440007, 154696, 440009, 440008, 290123, 35276, 284358, 398797, 32076, 33365, 249942, 290134, 344537, 166752, 291812, 18281, 46445, 51054, 290685, 34287, 28271, 439999, 32439, 232944, 47174, 290157, 23155, 290166, 440000, 249981]
maybe_spams = [153959, 12846, 13138, 13385, 398764, 400012]

In [45]:
for d in data:
    if d['recid'] in (spams + maybe_spams):
        d['spam'] = True

In [46]:
labels = [d['spam'] for d in data]

In [41]:
X_train_full, X_test_full, y_train, y_test = train_test_split(data, labels, test_size=0.33)
def feat_tr(d):
    return d['description'] + d['title']
    #return d['title']
    #return d['description']

X_train = [feat_tr(d) for d in X_train_full]
X_test = [feat_tr(d) for d in X_test_full]

ngram_range=(1, 1)

# Build spam vocabulary from the training dataset
#X_train_spam = [feat_tr(d) for d in X_train_full if d['spam']]
#count_vect = CountVectorizer(ngram_range=ngram_range, max_features=10000)
#count_vect.fit_transform(X_train_spam)
#vocabulary = count_vect.vocabulary_
#vocabulary = None

text_clf = Pipeline([('vect', CountVectorizer(max_features=8000, ngram_range=ngram_range)),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
                     #('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)),
                     #('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)),
                    ])
                     
text_clf = text_clf.fit(X_train, y_train)
y_pred = text_clf.predict(X_test)
acc = [(ref, pred) for ref, pred in zip(y_test, y_pred)]
c = Counter(acc)
print(c)
print("Spam->Spam: {0:.4f}".format(c[(True, True)] / (c[(True, True)] + c[(True, False)])))
print("Ham -> Ham: {0:.4f}".format(c[(False, False)] / (c[(False, False)] + c[(False, True)])))

Counter({(False, False): 58800, (True, True): 1800, (False, True): 73, (True, False): 47})
Spam->Spam: 0.9746
Ham -> Ham: 0.9988


In [10]:
from sklearn.externals import joblib
joblib.dump(text_clf, '2017_06_18_record_spam.pkl') 

['2017_06_18_record_spam.pkl']

In [42]:
acc = [idx for idx, (ref, pred) in enumerate(zip(y_test, y_pred)) if (ref, pred) == (False, True)]
spammy_stuff = [(X_test_full[idx]['recid'], X_test_full[idx]['description']) for idx in acc]

In [43]:
spammy_stuff

[(47174,
  '<p>&ldquo;How could I download or record videos from YouTube? Is there any video downloader or screen recorder that enables me to download or record YouTube videos on my PC?&rdquo; You probably are familiar with these questions and are looking for solutions to record YouTube videos. YouTube, the biggest video sharing website, indeed does not allow users to download videos from the site. YouTube owns a great number of videos updated every day. It lets users watch video or upload their own videos to share with others. However, users can not directly download YouTube videos from the site. So, when you have viewed an amazing video and want to view them with your mobile device, what will you do? In this case, you can use YouTube Recorder, a third-party tool, to capture YouTube video.</p>\n\n<p>YouTube Recorder is the best video recorder to capture YouTube videos. It allows users to record videos from YouTube, Dailymotion, Yahoo, Facebook, Hulu and almost online videos streamed o

In [None]:
spams = []
for recid, _ in spammy_stuff:
    resp = requests.get('https://zenodo.org/record/{0}'.format(recid))
    if resp.status_code == 410:
        spams.append(recid)
print(len(spams), spams)

In [None]:
def spam_ratio(ground, pred):
    tf = [(ref, pred) for ref, pred in zip(y_test, y_pred) if (ref, pred) == (True, False)]
    t = [(ref, pred) for ref, pred in zip(y_test, y_pred) if ref == True]
    return float(tf) / float(t)
score_fun = make_scorer(spam_ratio, greater_is_better=False)

text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-3, n_iter=5, random_state=42)),])
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),])
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    # 'tfidf__use_idf': (True, False),
    # 'clf__alpha': (1e-2, 1e-3),
}


In [None]:
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(X_train, y_train)

In [None]:
y_pred = gs_clf.predict(X_test)

In [None]:
gs_clf.best_params_

In [None]:
score_fun(gs_clf, y_test, y_pred)

In [None]:
acc = [(ref, pred) for ref, pred in zip(y_test, y_pred)]
Counter(acc)

In [None]:
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1, scoring=score_fun)
gs_clf = gs_clf.fit(X_train, y_train)
y_pred = gs_clf.predict(y_test)
acc = [(ref, pred) for ref, pred in zip(y_test, y_pred)]
Counter(acc)