In [1]:
import random

import joblib
import numpy as np
from sklearn import discriminant_analysis, ensemble, metrics, naive_bayes, svm
from tqdm.notebook import tqdm
import xgboost as xgb

from pan20 import auth
from pan20.util import topkfreqs, text
from pan20.fake import models

In [2]:
X, y = auth.load_small()

In [3]:
data = list(zip(X, y))

In [4]:
random.shuffle(data)

In [5]:
data = data[0:10000]

In [6]:
X = [x[0] for x in data]
y = [x[1] for x in data]

In [7]:
y = np.array([y_['same'] for y_ in y])

In [8]:
vz1 = topkfreqs.Vectorizer(k=1024, n=1)
vz2 = topkfreqs.Vectorizer(k=2048, n=2)
vz3 = topkfreqs.Vectorizer(k=2048, n=3)

In [9]:
def get_feats1(x):
    d0 = x['pair'][0]
    d1 = x['pair'][1]
    d0 = text.simple_tokenize(d0)
    d1 = text.simple_tokenize(d1)
    v0 = vz1(d0)
    v1 = vz1(d1)
    d = np.abs(v0 - v1)
    d = np.expand_dims(d, 0)
    return d

def get_feats2(x):
    d0 = x['pair'][0]
    d1 = x['pair'][1]
    v0 = vz2(d0)
    v1 = vz2(d1)
    d = np.abs(v0 - v1)
    d = np.expand_dims(d, 0)
    return d

def get_feats3(x):
    d0 = x['pair'][0]
    d1 = x['pair'][1]
    v0 = vz3(d0)
    v1 = vz3(d1)
    d = np.abs(v0 - v1)
    d = np.expand_dims(d, 0)
    return d

## Unigrams

In [10]:
X1 = np.concatenate([get_feats1(x) for x in X])

In [None]:
svc1 = svm.SVC(C=1., kernel='rbf', probability=True)
svc1.fit(X1, y)
metrics.accuracy_score(y, svc1.predict(X1))

In [None]:
rf1 = ensemble.RandomForestClassifier(n_estimators=200, max_depth=15)
rf1.fit(X1, y)
metrics.accuracy_score(y, rf1.predict(X1))

In [None]:
nb1 = naive_bayes.GaussianNB()
nb1.fit(X1, y)
metrics.accuracy_score(y, nb1.predict(X1))

In [None]:
joblib.dump(svc1, 'pan20/auth/svc1.model')
joblib.dump(rf1, 'pan20/auth/rf1.model')
joblib.dump(nb1, 'pan20/auth/nb1.model')

In [None]:
p_svc1 = models.get_preds(svc1, X1)
p_rf1 = models.get_preds(rf1, X1)
p_nb1 = models.get_preds(nb1, X1)

## Bigrams

In [42]:
X2 = np.concatenate([get_feats2(x) for x in X])

In [None]:
svc2 = svm.SVC(C=1., kernel='rbf', probability=True)
svc2.fit(X2, y)
metrics.accuracy_score(y, svc2.predict(X2))

In [44]:
rf2 = ensemble.RandomForestClassifier(n_estimators=200, max_depth=15)
rf2.fit(X2, y)
metrics.accuracy_score(y, rf2.predict(X2))

0.5233

In [43]:
nb2 = naive_bayes.GaussianNB()
nb2.fit(X2, y)
metrics.accuracy_score(y, nb2.predict(X2))

  n_ij = - 0.5 * np.sum(np.log(2. * np.pi * self.sigma_[i, :]))
  (self.sigma_[i, :]), 1)


0.4767

In [None]:
joblib.dump(svc2, 'pan20/auth/svc2.model')
joblib.dump(rf2, 'pan20/auth/rf2.model')
joblib.dump(nb2, 'pan20/auth/nb2.model')

## Trigrams

In [4]:
def get_feats(x):
    d0 = x['pair'][0]
    d1 = x['pair'][1]
    v01 = vz1(d0)
    v02 = vz2(d0)
    v11 = vz1(d1)
    v12 = vz2(d1)
    d1 = np.abs(v01 - v11)
    d2 = np.abs(v02 - v12)
    d = np.concatenate([d1, d2], axis=0)
    d = np.expand_dims(d, 0)
    return d

In [5]:
# get vector for X
feats = []
with tqdm(total=len(X)) as pbar:
    for x in X:
        feats.append(get_feats(x))
        pbar.update()
X = np.concatenate(feats, axis=0)
y = np.array([y_['same'] for y_ in y])

HBox(children=(FloatProgress(value=0.0, max=52601.0), HTML(value='')))




In [6]:
np.save('data/auth/X.npy', X)

In [7]:
np.save('data/auth/y.npy', y)

In [8]:
X.shape

(52601, 3072)

In [9]:
svc = svm.SVC(C=1., kernel='rbf', probability=True)
rf = ensemble.RandomForestClassifier(n_estimators=200, max_depth=15)
nb = naive_bayes.GaussianNB()
lda = discriminant_analysis.LinearDiscriminantAnalysis(solver='eigen')

In [None]:
svc.fit(X, y)  # 16:32

In [None]:
metrics.accuracy_score(y, svc.predict(X))

In [None]:
rf.fit(X, y)
metrics.accuracy_score(y, )

In [10]:
nb.fit(X, y)

GaussianNB(priors=None, var_smoothing=1e-09)

In [11]:
metrics.accuracy_score(y, nb.predict(X))

0.6308435200851695

In [12]:
lda.fit(X, y)

LinAlgError: the leading minor of order 9 of 'b' is not positive definite. The factorization of 'b' could not be completed and no eigenvalues or eigenvectors were computed.

In [None]:
metrics.accuracy_score(y, lda.predict(X))