In [32]:
import random

import joblib
import numpy as np
import pandas as pd
from sklearn import discriminant_analysis, ensemble, metrics, model_selection, naive_bayes, svm
from tqdm.notebook import tqdm
import xgboost as xgb

from pan20 import auth
from pan20.util import topkfreqs, text
from pan20.fake import models

In [2]:
X, y = auth.load_small()

In [3]:
data = list(zip(X, y))

In [4]:
random.shuffle(data)

In [5]:
data = data[0:10000]

In [6]:
X = [x[0] for x in data]
y = [x[1] for x in data]

In [7]:
y = np.array([y_['same'] for y_ in y])

In [8]:
vz1 = topkfreqs.Vectorizer(k=1024, n=1)
vz2 = topkfreqs.Vectorizer(k=2048, n=2)
vz3 = topkfreqs.Vectorizer(k=2048, n=3)

In [16]:
def get_feats1(x):
    d0 = x['pair'][0]
    d1 = x['pair'][1]
    d0 = text.simple_tokenize(d0)
    d1 = text.simple_tokenize(d1)
    v0 = vz1(d0)
    v1 = vz1(d1)
    d = np.abs(v0 - v1)
    d = np.expand_dims(d, 0)
    return d

def get_feats2(x):
    d0 = x['pair'][0]
    d1 = x['pair'][1]
    d0 = text.simple_tokenize(d0, n=2)
    d1 = text.simple_tokenize(d1, n=2)
    v0 = vz2(d0)
    v1 = vz2(d1)
    d = np.abs(v0 - v1)
    d = np.expand_dims(d, 0)
    return d

def get_feats3(x):
    d0 = x['pair'][0]
    d1 = x['pair'][1]
    v0 = vz3(d0)
    v1 = vz3(d1)
    d = np.abs(v0 - v1)
    d = np.expand_dims(d, 0)
    return d

## Unigrams

In [10]:
X1 = np.concatenate([get_feats1(x) for x in X])

In [11]:
svc1 = svm.SVC(C=1., kernel='rbf', probability=True)
svc1.fit(X1, y)
metrics.accuracy_score(y, svc1.predict(X1))

0.8799

In [12]:
rf1 = ensemble.RandomForestClassifier(n_estimators=200, max_depth=15)
rf1.fit(X1, y)
metrics.accuracy_score(y, rf1.predict(X1))

1.0

In [13]:
nb1 = naive_bayes.GaussianNB()
nb1.fit(X1, y)
metrics.accuracy_score(y, nb1.predict(X1))

0.7057

In [None]:
lda1 = discriminant_analysis.LinearDiscriminantAnalysis(solver='eigen')
lda1.fit(X1, y)
metrics.accuracy_score(y, lda1.predict(X1))

In [None]:
joblib.dump(svc1, 'pan20/auth/svc1.model')
joblib.dump(rf1, 'pan20/auth/rf1.model')
joblib.dump(nb1, 'pan20/auth/nb1.model')
#joblib.dump(lda1, 'pan20/auth/lda1.model')

In [15]:
p_svc1 = models.get_preds(svc1, X1)
p_rf1 = models.get_preds(rf1, X1)
p_nb1 = models.get_preds(nb1, X1)

In [None]:
#p_lda1 = models.get_preds(lda1, X1)

## Bigrams

In [17]:
X2 = np.concatenate([get_feats2(x) for x in X])

In [18]:
svc2 = svm.SVC(C=1., kernel='rbf', probability=True)
svc2.fit(X2, y)
metrics.accuracy_score(y, svc2.predict(X2))

0.8843

In [19]:
rf2 = ensemble.RandomForestClassifier(n_estimators=200, max_depth=15)
rf2.fit(X2, y)
metrics.accuracy_score(y, rf2.predict(X2))

1.0

In [20]:
nb2 = naive_bayes.GaussianNB()
nb2.fit(X2, y)
metrics.accuracy_score(y, nb2.predict(X2))

0.6023

In [None]:
lda2 = discriminant_analysis.LinearDiscriminantAnalysis(solver='eigen')
lda2.fit(X2, y)
metrics.accuracy_score(y, lda2.predict(X2))

In [None]:
joblib.dump(svc2, 'pan20/auth/svc2.model')
joblib.dump(rf2, 'pan20/auth/rf2.model')
joblib.dump(nb2, 'pan20/auth/nb2.model')
#joblib.dump(lda2, 'pan20/auth/lda2.model')

In [None]:
p_svc2 = models.get_preds(svc2, X2)
p_rf2 = models.get_preds(rf2, X2)
p_nb2 = models.get_preds(nb2, X2)
#p_lda2 = models.get_preds(lda2, X2)

### XGBoost

In [29]:
x_in = pd.DataFrame({
    'label': y,
    'svc1': p_svc1,
    'rf1': p_rf1,
    'nb1': p_nb1,
    'svc2': p_svc2,
    'rf2': p_rf2,
    'nb2': p_nb2,
})

In [38]:
x_in.head()

Unnamed: 0,label,svc1,rf1,nb1,svc2,rf2,nb2
0,True,0.573122,0.822132,0.9999971,0.575577,0.746626,1.0
1,False,0.044934,0.09968,4.64637e-24,0.063139,0.181055,1.0
2,True,0.981972,0.827597,1.0,0.888151,0.800154,1.0
3,True,0.926003,0.836402,1.0,0.887176,0.773269,1.0
4,False,0.1581,0.230936,4.934542e-26,0.108087,0.224703,3.247435e-108


In [52]:
def to_txt(feats, file_path, test=False):
    with open(file_path, 'w+') as f:
        for _, x in feats.iterrows():
            if not test:
                row = f'{int(x.label)} 0:{x.svc1} 1:{x.rf1} 2:{x.nb1} 3:{x.svc2} 4:{x.rf2} 5:{x.nb2}\n'
            else:
                row = f'0:{x.svc1} 1:{x.rf1} 2:{x.nb1} 3:{x.svc2} 4:{x.rf2} 5:{x.nb2}\n'
            f.write(row)

In [53]:
accs = []
skf = model_selection.StratifiedKFold(n_splits=5)
for tr_ix, ts_ix in skf.split(range(len(x_in)), y):
    tr = x_in.iloc[tr_ix]
    ts = x_in.iloc[ts_ix]
    to_txt(tr, 'tmp/xgb_train.txt')
    to_txt(ts, 'tmp/xgb_test.txt', test=True)
    dtrain = xgb.DMatrix('tmp/xgb_train.txt')
    dtest = xgb.DMatrix('tmp/xgb_test.txt')
    params = {
        'max_depth': 3,
        'eta': 0.3,
        'objective': 'binary:logistic',
    }
    num_round = 2
    bst = xgb.train(params, dtrain, num_round)
    preds = bst.predict(dtest)
    preds = [p > 0.5 for p in preds]
    accs.append(metrics.accuracy_score(ts.label.values, preds))
print(accs)
print(np.mean(accs))

[19:05:10] 8000x6 matrix with 48000 entries loaded from tmp/xgb_train.txt
[19:05:10] 2000x6 matrix with 10000 entries loaded from tmp/xgb_test.txt
[0.74339575 0.25662777 0.74339575 ... 0.74339575 0.25662777 0.25662777]
[19:05:11] 8000x6 matrix with 48000 entries loaded from tmp/xgb_train.txt
[19:05:11] 2000x6 matrix with 10000 entries loaded from tmp/xgb_test.txt
[0.74339575 0.74339575 0.74339575 ... 0.74339575 0.74339575 0.74339575]
[19:05:13] 8000x6 matrix with 48000 entries loaded from tmp/xgb_train.txt
[19:05:13] 2000x6 matrix with 10000 entries loaded from tmp/xgb_test.txt
[0.25662777 0.25662777 0.25662777 ... 0.74339575 0.74339575 0.74339575]
[19:05:14] 8000x6 matrix with 48000 entries loaded from tmp/xgb_train.txt
[19:05:14] 2000x6 matrix with 10000 entries loaded from tmp/xgb_test.txt
[0.25662777 0.25662777 0.25662777 ... 0.74339575 0.74339575 0.74339575]
[19:05:16] 8000x6 matrix with 48000 entries loaded from tmp/xgb_train.txt
[19:05:16] 2000x6 matrix with 10000 entries loaded

In [64]:
to_txt(x_in, 'tmp/xgb_train.txt', test=False)
to_txt(x_in, 'tmp/xgb_test.txt', test=True)
dtrain = xgb.DMatrix('tmp/xgb_train.txt')
dtest = xgb.DMatrix('tmp/xgb_test.txt')

[19:09:40] 10000x6 matrix with 60000 entries loaded from tmp/xgb_train.txt
[19:09:40] 10000x6 matrix with 50000 entries loaded from tmp/xgb_test.txt


In [65]:
params = {
    'max_depth': 3,
    'eta': 0.3,
    'objective': 'binary:logistic',
}
num_round = 2
bst = xgb.train(params, dtrain, num_round)

In [66]:
preds = bst.predict(dtest)
preds = [p > 0.5 for p in preds]
metrics.accuracy_score(x_in.label.values, preds)

1.0

In [67]:
joblib.dump(bst, 'pan20/auth/bst.model')

['pan20/auth/bst.model']

## Trigrams

In [4]:
def get_feats(x):
    d0 = x['pair'][0]
    d1 = x['pair'][1]
    v01 = vz1(d0)
    v02 = vz2(d0)
    v11 = vz1(d1)
    v12 = vz2(d1)
    d1 = np.abs(v01 - v11)
    d2 = np.abs(v02 - v12)
    d = np.concatenate([d1, d2], axis=0)
    d = np.expand_dims(d, 0)
    return d

In [5]:
# get vector for X
feats = []
with tqdm(total=len(X)) as pbar:
    for x in X:
        feats.append(get_feats(x))
        pbar.update()
X = np.concatenate(feats, axis=0)
y = np.array([y_['same'] for y_ in y])

HBox(children=(FloatProgress(value=0.0, max=52601.0), HTML(value='')))




In [6]:
np.save('data/auth/X.npy', X)

In [7]:
np.save('data/auth/y.npy', y)

In [8]:
X.shape

(52601, 3072)

In [9]:
svc = svm.SVC(C=1., kernel='rbf', probability=True)
rf = ensemble.RandomForestClassifier(n_estimators=200, max_depth=15)
nb = naive_bayes.GaussianNB()
lda = discriminant_analysis.LinearDiscriminantAnalysis(solver='eigen')

In [None]:
svc.fit(X, y)  # 16:32

In [None]:
metrics.accuracy_score(y, svc.predict(X))

In [None]:
rf.fit(X, y)
metrics.accuracy_score(y, )

In [10]:
nb.fit(X, y)

GaussianNB(priors=None, var_smoothing=1e-09)

In [11]:
metrics.accuracy_score(y, nb.predict(X))

0.6308435200851695

In [12]:
lda.fit(X, y)

LinAlgError: the leading minor of order 9 of 'b' is not positive definite. The factorization of 'b' could not be completed and no eigenvalues or eigenvectors were computed.

In [None]:
metrics.accuracy_score(y, lda.predict(X))