In [34]:
import joblib
import numpy as np
import pandas as pd
from sklearn import metrics, model_selection, preprocessing
import xgboost as xgb

from pan20 import util

In [2]:
df = pd.read_csv('data/fake/feats.csv')

In [50]:
def get_X_y(cols=None, no_cols=None):
    X = df.groupby(['author', 'label']).mean().reset_index()
    y = X.label.values
    a = list(X.author.values)
    X.drop(columns=['author', 'n'], inplace=True)
    if cols:
        X = X.loc[:, [c in cols for c in X.columns]]
    elif no_cols:
        X = X.loc[:, [c not in no_cols for c in X.columns]]
    else:
        X = X.loc[:, [c != 'label' for c in X.columns]]
    feat_dict = util.IxDict(X.columns)
    # normalize features to be between 0 and 1
    x = X.values
    min_max_scaler = preprocessing.MinMaxScaler()
    X = min_max_scaler.fit_transform(x)
    return X, y, feat_dict, a

In [51]:
feats = [
    'adverbs', 
    'impersonal_pronouns',
    'personal_pronouns', 
    'function_words',
    'avg_bf', 
    'max_np_height', 
    'max_vp_height',
    'senti', 
    'senti_neg', 
    'anger', 
    'distrust'
]

In [52]:
X, y, feat_dict, a = get_X_y(feats)

In [53]:
svc = joblib.load('pan20/fake/svc.model')
rf = joblib.load('pan20/fake/rf.model')
nb = joblib.load('pan20/fake/nb.model')

In [54]:
p_svc = list(np.exp(svc.predict_log_proba(X))[:, 1])
p_rf = list(np.exp(rf.predict_log_proba(X))[:, 1])
p_nb = list(np.exp(nb.predict_log_proba(X))[:, 1])

In [135]:
p_bert = pd.read_csv('tmp/bert_encoded.csv')

In [56]:
p_bert.head()

Unnamed: 0,author,label,probability
0,h0icm6kusqcb4tajgiafnkw6490426e9,1,0.44648
1,8vp74g6kssomu1a6akix6y3hqy6552t7,1,0.422064
2,z9sjtv730uwrqxjtr4gc6l9w0j1oczgq,0,0.408713
3,4fl04aqn59zudhd6f1908ydqlrzewjr0,0,0.545989
4,06ct0t68y1acizh9eow3g5rhancrppr8,1,0.848502


In [119]:
rob = pd.read_csv('tmp/yingjia/roberta_svm_072.csv')

In [120]:
rob.head()

Unnamed: 0,author,label,probability
0,h0icm6kusqcb4tajgiafnkw6490426e9,1,0.472721
1,8vp74g6kssomu1a6akix6y3hqy6552t7,1,0.511446
2,z9sjtv730uwrqxjtr4gc6l9w0j1oczgq,0,0.406618
3,4fl04aqn59zudhd6f1908ydqlrzewjr0,0,0.59023
4,06ct0t68y1acizh9eow3g5rhancrppr8,1,0.737762


In [140]:
xgb_in = pd.DataFrame({
    'author': a,
    'label': y,
    'svc': p_svc,
    'rf': p_rf,
    'nb': p_nb,
})

In [127]:
xgb_in.set_index('author')
rob.set_index('author')
xgb_in = xgb_in.merge(rob, on='author')

In [137]:
xgb_in.set_index('author')
p_bert.set_index('author')
xgb_in = xgb_in.merge(p_bert, on='author')

In [138]:
xgb_in.drop(columns=['label_y'], inplace=True)
xgb_in.rename(columns={'label_x': 'label', 'probability': 'bert'}, inplace=True)

In [141]:
xgb_in.head()

Unnamed: 0,author,label,svc,rf,nb
0,06ct0t68y1acizh9eow3g5rhancrppr8,1,0.714161,0.86771,0.706297
1,071nxc49ihpd0jlfmvn2lghtayy3b5n9,0,0.595813,0.445694,0.591051
2,09py5qescynpnnckmzueqzr2y49moh1o,0,0.511316,0.361854,0.509568
3,0dwovd7nj6yg9m795ng2c629me0ccmrh,0,0.47575,0.260155,0.475279
4,0ibi364m7i7l01xi4xqafyathrmrrnll,1,0.680197,0.752722,0.6731


In [62]:
xgb_in.to_csv('tmp/xgb_in.csv', index=False)

In [133]:
def to_txt(feats, file_path, test=False):
    with open(file_path, 'w+') as f:
        for _, x in feats.iterrows():
            if not test:
                row = f'{x.label} 0:{x.svc} 1:{x.rf} 2:{x.nb} 3:{x.roberta}\n'
            else:
                row = f'0:{x.svc} 1:{x.rf} 2:{x.nb} 3:{x.roberta}\n'
            f.write(row)

In [134]:
accs = []
skf = model_selection.StratifiedKFold(n_splits=5)
for tr_ix, ts_ix in skf.split(X, y):
    tr = xgb_in.iloc[tr_ix]
    ts = xgb_in.iloc[ts_ix]
    to_txt(tr, 'tmp/xgb_train.txt')
    to_txt(ts, 'tmp/xgb_test.txt', test=True)
    dtrain = xgb.DMatrix('tmp/xgb_train.txt')
    dtest = xgb.DMatrix('tmp/xgb_test.txt')
    params = {
        'max_depth': 2,
        'eta': 1,
        'objective': 'binary:logistic',
    }
    num_round = 2
    bst = xgb.train(params, dtrain, num_round)
    preds = bst.predict(dtest)
    preds = [p > 0.5 for p in preds]
    accs.append(metrics.accuracy_score(ts.label.values, preds))
print(accs)
print(np.mean(accs))

[14:50:41] 240x3 matrix with 720 entries loaded from tmp/xgb_train.txt
[14:50:41] 60x3 matrix with 120 entries loaded from tmp/xgb_test.txt
[14:50:41] 240x3 matrix with 720 entries loaded from tmp/xgb_train.txt
[14:50:41] 60x3 matrix with 120 entries loaded from tmp/xgb_test.txt
[14:50:41] 240x3 matrix with 720 entries loaded from tmp/xgb_train.txt
[14:50:41] 60x3 matrix with 120 entries loaded from tmp/xgb_test.txt
[14:50:41] 240x3 matrix with 720 entries loaded from tmp/xgb_train.txt
[14:50:41] 60x3 matrix with 120 entries loaded from tmp/xgb_test.txt
[14:50:41] 240x3 matrix with 720 entries loaded from tmp/xgb_train.txt
[14:50:41] 60x3 matrix with 120 entries loaded from tmp/xgb_test.txt
[0.9333333333333333, 0.9, 0.9, 0.8333333333333334, 0.9]
0.8933333333333333


In [108]:
xgbtr = xgb_in[['label', 'svc', 'rf', 'nb']]

In [111]:
to_txt(xgbtr, 'tmp/xgb_train.txt', test=False)
to_txt(xgbtr, 'tmp/xgb_test.txt', test=True)
dtrain = xgb.DMatrix('tmp/xgb_train.txt')
dtest = xgb.DMatrix('tmp/xgb_test.txt')

[14:41:25] 300x3 matrix with 900 entries loaded from tmp/xgb_train.txt
[14:41:25] 300x3 matrix with 600 entries loaded from tmp/xgb_test.txt


In [112]:
params = {
    'max_depth': 3,
    'eta': 1,
    'objective': 'binary:logistic',
}
num_round = 2
bst = xgb.train(params, dtrain, num_round)

In [113]:
preds = bst.predict(dtrain)
preds = [p > 0.5 for p in preds]
metrics.accuracy_score(xgb_in.label.values, preds)

0.9533333333333334

In [118]:
joblib.dump(bst, 'pan20/fake/bst.model')

['pan20/fake/bst.model']

In [116]:
bst2 = joblib.load('bst.model')

In [117]:
preds = bst2.predict(dtrain)
preds = [p > 0.5 for p in preds]
metrics.accuracy_score(xgb_in.label.values, preds)

0.9533333333333334