In [1]:
import json
import sklearn
import requests
import numpy as np
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV

In [2]:
with open("./data/zenodo_open_metadata_17_05_2018.txt", "r") as fp:
    data = [json.loads(l) for l in fp.readlines()]

In [3]:
len(data)

413079

In [7]:
spam_owners = {}

In [5]:
for d in data:
    owner = d['owners'][0] if d['owners'] else None
    if owner in spam_owners and not d['spam']:
        d['spam'] = True

In [8]:
Counter([d['spam'] for d in data])

Counter({False: 404648, True: 8472})

In [18]:
labels = [d['spam'] for d in data]
X_train_full, X_test_full, y_train, y_test = train_test_split(data, labels, test_size=0.33, random_state=422)
def feat_tr(d):
    return d['description'] + d['title']

X_train = [feat_tr(d) for d in X_train_full]
X_test = [feat_tr(d) for d in X_test_full]

ngram_range=(1, 1)

# Build spam vocabulary from the training dataset
X_train_spam = [feat_tr(d) for d in X_train_full if d['spam']]
count_vect = CountVectorizer(ngram_range=ngram_range, max_features=8000)
count_vect.fit_transform(X_train_spam)
vocabulary = count_vect.vocabulary_
vocabulary = None

text_clf = Pipeline([
                     ('vect', CountVectorizer(max_features=8000, ngram_range=ngram_range)),
                     #('vect', CountVectorizer(vocabulary=vocabulary, ngram_range=ngram_range)),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
                     #('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)),
                    ])
                     
text_clf = text_clf.fit(X_train, y_train)
y_pred = text_clf.predict(X_test)
acc = [(ref, pred) for ref, pred in zip(y_test, y_pred)]
c = Counter(acc)
print(c)
print("Spam->Spam: {0:.4f}".format(c[(True, True)] / (c[(True, True)] + c[(True, False)])))
print("Ham -> Ham: {0:.4f}".format(c[(False, False)] / (c[(False, False)] + c[(False, True)])))
print("Accuracy: {0:.4f}".format((c[(False, False)] + c[(True, True)] ) / (len(acc))))

Counter({(False, False): 133426, (True, True): 2627, (True, False): 149, (False, True): 128})
Spam->Spam: 0.9463
Ham -> Ham: 0.9990
Accuracy: 0.9980


In [71]:
data_tr = [feat_tr(d) for d in data]
y_pred = text_clf.predict(data_tr)
acc = [(ref, pred) for ref, pred in zip(labels, y_pred)]
c = Counter(acc)
print(c)
print("Spam->Spam: {0:.4f}".format(c[(True, True)] / (c[(True, True)] + c[(True, False)])))
print("Ham -> Ham: {0:.4f}".format(c[(False, False)] / (c[(False, False)] + c[(False, True)])))
print("Accuracy: {0:.4f}".format((c[(False, False)] + c[(True, True)] ) / (len(acc))))
acc = [idx for idx, (ref, pred) in enumerate(zip(labels, y_pred)) if (ref, pred) == (False, True)]
spammy_stuff = [(data[idx]['recid'], data[idx]['description']) for idx in acc]

Counter({(False, False): 404185, (True, True): 8009, (True, False): 458, (False, True): 427})
Spam->Spam: 0.9459
Ham -> Ham: 0.9989
Accuracy: 0.9979


In [72]:
for rec in spammy_stuff:
    print(rec)
    print("\n\n")

(49404, '<p>&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;&nbsp;Ekonomi global saat ini sedang pada titik puncak perubahan besar yang sebanding besarnya dengan munculnya revolusi industri pertama atau perkembangan perakitan produksi, atau bahkan penemuan mikrocip. Kemajuan teknologi memungkinkan terjadinya otomatisasi hampir di semua bidang. Sementara itu, kepemilikan perangkat pintar di berbagai bagian dunia mengarah pada tingkat keterkaitan satu sama yang lain yang tak terbayangkan sebelumnya. Di antara berbagai tantangan yang sedang dihadapi dunia saat&nbsp; ini, mungkin yang paling besar adalah bagaimana membentuk Revolusi Industri keempat&nbsp; (disebut juga sebagai Industri 4.0) yang dimulai pada permulaan abad ini. Teknologi dan pendekatan baru yang menggabungkan dunia fisik, digital, dan biologi dengan cara yang fundamental akan mengubah umat manusia. Ada banyak pendapat bahwa sektor kesehatan dan bioteknologi sangat diuntungkan oleh transformasi ini. Sejauh mana transformasi ini ak

In [None]:
Counter({(False, False): 133452, (True, True): 2603, (True, False): 158, (False, True): 104})
Spam->Spam: 0.9428
Ham -> Ham: 0.9992
Accuracy: 0.9981

In [None]:
Counter({(False, False): 58790, (True, True): 1808, (True, False): 63, (False, True): 59})
Spam->Spam: 0.9663
Ham -> Ham: 0.9990
Accuracy: 0.9980

In [10]:
from sklearn.externals import joblib
joblib.dump(text_clf, '2017_06_18_record_spam.pkl') 

['2017_06_18_record_spam.pkl']

In [32]:
acc = [idx for idx, (ref, pred) in enumerate(zip(y_test, y_pred)) if (ref, pred) == (False, True)]
spammy_stuff = [(X_test_full[idx]['recid'], X_test_full[idx]['description']) for idx in acc]

In [33]:
spammy_stuff

[(1154298,
  '<p>Penahanan merupakan salah satu mekanisme yang berlaku dalam proses penegakkan hukum dalam perkara pidana di Indonesia. Masyarakat harus memahami mekanisme ini supaya masyarakat tidak mendapat perlakuan yang semena-mena dari aparat penegak hukum serta untuk tetap dapat menjamin kepastian hukum bagi seorang yang diduga melakukan tindak pidana atau disebut tersangka. Kurangnya pemahaman masyarakat khususnya yang memiliki status sebagai tersangka sering kali tidak memahami aturan yang berlaku mengenai penahanan. Dalam hukum acara pidana mengatur syarat-syarat yang harus dipenuhi untuk dapat dilakukan penahanan terhadap seseorang tersangka. Akan tetapi perlu disadari bahwa syarat-syarat penahanan yang diatur di dalam KUHAP masih belum dapat menjamin kepastian hukum bagi seorang tersangka karena dalam KUHAP tidak diatur secara detail mengenai syarat-syarat penahanan. Dalam faktamya, ketidakpastian hukum dalam syarat-syarat penahanan dapat menjadi celah bagi aparat penegak hu

In [None]:
spams = []
for recid, _ in spammy_stuff:
    resp = requests.get('https://zenodo.org/record/{0}'.format(recid))
    if resp.status_code == 410:
        spams.append(recid)
print(len(spams), spams)

In [None]:
def spam_ratio(ground, pred):
    tf = [(ref, pred) for ref, pred in zip(y_test, y_pred) if (ref, pred) == (True, False)]
    t = [(ref, pred) for ref, pred in zip(y_test, y_pred) if ref == True]
    return float(tf) / float(t)
score_fun = make_scorer(spam_ratio, greater_is_better=False)

text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-3, n_iter=5, random_state=42)),])
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),])
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    # 'tfidf__use_idf': (True, False),
    # 'clf__alpha': (1e-2, 1e-3),
}


In [None]:
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(X_train, y_train)

In [None]:
y_pred = gs_clf.predict(X_test)

In [None]:
gs_clf.best_params_

In [None]:
score_fun(gs_clf, y_test, y_pred)

In [None]:
acc = [(ref, pred) for ref, pred in zip(y_test, y_pred)]
Counter(acc)

In [None]:
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1, scoring=score_fun)
gs_clf = gs_clf.fit(X_train, y_train)
y_pred = gs_clf.predict(y_test)
acc = [(ref, pred) for ref, pred in zip(y_test, y_pred)]
Counter(acc)