In [146]:
import functools
import pickle

import numpy as np
import pandas as pd
from sklearn import ensemble, linear_model, metrics, model_selection, naive_bayes, preprocessing, svm
from tqdm.notebook import tqdm

from pan20 import fake, util
from pan20.util import ctree, text
from pan20.util.lexicons import noble, sentiwordnet, trust, tweet_anger

In [2]:
# Load the dataset through the project loader; later cells read its
# `tweet`, `author` and `label` columns.
df = fake.load()

In [5]:
# Tokenise every tweet once; the lexicon features below all read `toks`.
df['toks'] = df.tweet.apply(text.tokenize)

In [8]:
# Length of each tokenised tweet.
df['n'] = df.toks.apply(len)

### Features

#### Noble's Function Words

Previous investigation indicated a good feature set to be:
- impersonal_pronouns
- personal_pronouns
- prepositions
- function_words

In [9]:
# Score every tokenised tweet with the Noble lexicon. Each call returns a
# mapping that is indexed by category name below.
nd = noble.NobleDict()
# Iterate the column directly: iterrows() builds a whole Series per row just
# to read a single field. tqdm still shows one tick per tweet.
freqs = [nd(toks) for toks in tqdm(df.toks, total=len(df))]
for cat in noble.cats:
    df[cat] = [f[cat] for f in freqs]
# Read from the same result but assigned outside the loop
# (presumably 'function_words' is not listed in noble.cats — confirm).
df['function_words'] = [f['function_words'] for f in freqs]

HBox(children=(FloatProgress(value=0.0, max=30000.0), HTML(value='')))




#### Constituency Tree Features

Previous investigation indicated:
- avg_bf
- max_np_height
- max_vp_height

In [13]:
# Parse each raw tweet into a constituency tree, then derive per-tree shape
# statistics as feature columns.
get_tree = ctree.GetTree()
# Iterate the column directly instead of iterrows(), which materialises a
# Series per row only to read `tweet`.
trees = [get_tree(tweet) for tweet in tqdm(df.tweet, total=len(df))]
df['avg_bf'] = [ctree.avg_branch_factor(t) for t in trees]
df['height'] = [t.height() for t in trees]
# Maximum height of each phrase type (noun / prepositional / verb phrase labels).
df['max_np_height'] = [ctree.max_const_height(t, 'NP') for t in trees]
df['max_pp_height'] = [ctree.max_const_height(t, 'PP') for t in trees]
df['max_vp_height'] = [ctree.max_const_height(t, 'VP') for t in trees]





HBox(children=(FloatProgress(value=0.0, max=30000.0), HTML(value='')))




#### Emotions

LIWC analysis indicated:
- negemo
- anx
- anger

In fact just these three get us to 70%.

For **SentiWordNet**, the indication is:
- senti
- senti_neg

with senti_pos being more or less useless.

In [17]:
# One column per SentiWordNet scorer, each applied to the token list.
swn = sentiwordnet.SentiWordNet()
for column, scorer in (
    ('senti', swn.score),
    ('senti_neg', swn.score_neg),
    ('senti_pos', swn.score_pos),
):
    df[column] = df.toks.apply(scorer)

In [19]:
# my learned anger dict
ta = tweet_anger.Lexicon()
fn = functools.partial(ta.cat_freq, cat='anger')
df['anger'] = df.toks.apply(fn)

#### Trust and Distrust

In [123]:
# Trust lexicon: per-tweet frequency of the 'distrust' category.
dt = trust.Lexicon()
df['distrust'] = df.toks.apply(lambda toks: dt.cat_freq(toks, cat='distrust'))

#### SAVE AND RESUME

In [124]:
# Checkpoint the feature frame to disk (row index omitted) so work can
# resume from the reload cell below.
df.to_csv('data/fake/feats.csv', index=False)

In [222]:
# Resume point: reload the checkpointed features.
# NOTE(review): non-scalar columns (presumably `toks`) come back as plain
# strings after a CSV round trip — confirm nothing below needs them as lists.
df = pd.read_csv('data/fake/feats.csv')

In [223]:
def get_X_y(cols=None, no_cols=None):
    """Build the per-author feature matrix and label vector from the global ``df``.

    Tweet-level rows are averaged per ``(author, label)`` pair, the
    bookkeeping columns ``author`` and ``n`` are dropped, and the remaining
    features are min-max scaled to ``[0, 1]``.

    Args:
        cols: Optional collection of feature names to keep. Takes precedence
            over ``no_cols`` when truthy.
        no_cols: Optional collection of feature names to exclude. Only
            consulted when ``cols`` is falsy.

    Returns:
        Tuple ``(X, y, feat_dict)``: the scaled feature array, the label
        array, and a ``util.IxDict`` built from the kept column names (later
        cells index it by column position). Columns keep the frame's order,
        not the order given in ``cols``.

    Note:
        The scaler is fit on every row, so cross-validation run on the
        returned ``X`` sees test-fold minima/maxima during scaling.
    """
    # numeric_only=True: text columns such as the raw tweet cannot be
    # averaged, and pandas >= 2.0 raises instead of silently dropping them.
    per_author = (
        df.groupby(['author', 'label'])
        .mean(numeric_only=True)
        .reset_index()
        .drop(columns=['author', 'n'])
    )
    y = per_author.label.values
    # Remove the target before any filtering so it can never leak into X
    # (previously it survived the `no_cols` branch unless listed explicitly).
    features = per_author.drop(columns=['label'])
    if cols:
        features = features.loc[:, [c in cols for c in features.columns]]
    elif no_cols:
        features = features.loc[:, [c not in no_cols for c in features.columns]]
    feat_dict = util.IxDict(features.columns)
    # normalize features to be between 0 and 1
    min_max_scaler = preprocessing.MinMaxScaler()
    X = min_max_scaler.fit_transform(features.values)
    return X, y, feat_dict

In [224]:
# No filter arguments: keep every feature column.
X, y, feat_dict = get_X_y()

### Feature Selection

Looking for variability of features across cross-validation folds.

In [136]:
# Baseline: RBF-kernel SVM on the current X, mean score over the default
# cross-validation folds.
svc = svm.SVC(C=1., kernel='rbf')
model_selection.cross_val_score(svc, X, y).mean()

0.6733333333333333

In [137]:
# Fit a linear SVM on each stratified fold, print its held-out accuracy and
# collect the learned weight vector for the feature ranking below.
coef = []
skf = model_selection.StratifiedKFold(n_splits=5)
for train_index, test_index in skf.split(X, y):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    svc = svm.SVC(C=1., kernel='linear').fit(X_train, y_train)
    # For classifiers, score() is the accuracy of predict() on the given data.
    print(svc.score(X_test, y_test))
    coef.append(svc.coef_)

0.7166666666666667
0.7
0.6
0.5833333333333334
0.6


In [138]:
# Sanity check: each fold contributes a single row of per-feature weights.
coef[0].shape

(1, 19)

In [139]:
# Long-format table: one row per (fold, feature) holding the absolute
# linear-SVM weight, with the column position mapped to a name via feat_dict.
cf = pd.DataFrame([
    {
        'feature': feat_dict[ix],
        'coef': np.abs(fold_coef[0, ix]),
    }
    for fold_coef in coef
    for ix in range(fold_coef.shape[1])
])

In [140]:
# Average absolute weight per feature across folds, largest first.
cfmu = (
    cf.groupby('feature')
    .mean()
    .reset_index()
    .sort_values(by='coef', ascending=False)
)
cfmu

Unnamed: 0,feature,coef
13,personal_pronouns,1.812791
15,quantifiers,1.608437
9,impersonal_pronouns,1.531036
10,max_np_height,1.462371
0,adverbs,1.421656
1,anger,1.267959
14,prepositions,1.215212
11,max_pp_height,1.21472
3,auxiliary_verbs,1.128011
4,avg_bf,0.989712


In [141]:
# Keep the features whose mean absolute weight exceeds 0.9.
feats = cfmu.loc[cfmu.coef > 0.9, 'feature'].tolist()

In [142]:
# Rebuild the matrix restricted to the selected features.
X, y, feat_dict = get_X_y(feats)

In [143]:
# Same RBF SVM as the baseline, now scored on the reduced feature set.
svc = svm.SVC(C=1., kernel='rbf')
model_selection.cross_val_score(svc, X, y).mean()

0.6466666666666667

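Well, that sucked: cross-validated accuracy fell from 0.673 with all features to 0.647 with the coefficient-selected subset.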

In [225]:
# Hand-picked feature set; commented-out names are candidates that were
# tried and left out of this run.
feats = [
    'adverbs', 
    #'articles',
    #'auxiliary_verbs', 
    #'conjunctions', 
    'impersonal_pronouns',
    'personal_pronouns', 
    #'prepositions', 
    #'quantifiers', 
    'function_words',
    'avg_bf', 
    #'height', 
    'max_np_height', 
    #'max_pp_height', 
    'max_vp_height',
    'senti', 
    'senti_neg', 
    #'senti_pos', 
    'anger', 
    'distrust'
]

In [226]:
# Cross-validate the RBF SVM on the hand-picked features.
X, y, feat_dict = get_X_y(feats)
# probability=True adds an internally cross-validated calibration step that
# shuffles the data; pin random_state so the probability estimates (and the
# model saved below) are reproducible across runs.
svc = svm.SVC(C=1., kernel='rbf', probability=True, random_state=0)
s = model_selection.cross_val_score(svc, X, y)
print(s.mean())
print(s.std())

0.6866666666666666
0.06944222218666553


In [227]:
# Refit on every row; the value displayed is accuracy on the training data
# itself, not a held-out estimate.
svc.fit(X, y)
metrics.accuracy_score(y, svc.predict(X))

0.6966666666666667

In [181]:
import joblib

In [189]:
# Persist the fitted SVM inside the package directory.
joblib.dump(svc, 'pan20/fake/svc.model')

['pan20/fake/svc.model']

In [190]:
# Round-trip check: load the file that was just written.
svc2 = joblib.load('pan20/fake/svc.model')

In [191]:
# The reloaded model should reproduce the training accuracy shown for `svc`.
metrics.accuracy_score(y, svc2.predict(X))

0.6966666666666667

In [199]:
# Probability of the second class (column 1) for every row, obtained by
# exponentiating the model's log-probabilities.
list(np.exp(svc2.predict_log_proba(X))[:, 1])

[0.7141606192115727,
 0.5958132500202161,
 0.5113159992433761,
 0.47574987128918966,
 0.6801969141453642,
 0.7658636571624955,
 0.5824570899609552,
 0.2599739791714044,
 0.6315693467491665,
 0.6502529671660975,
 0.7707186242521762,
 0.2599313475047878,
 0.7351846314988525,
 0.5387497535733041,
 0.7622874424698669,
 0.2109278557868544,
 0.761439744576779,
 0.7524165150077915,
 0.569834052392983,
 0.521888597652202,
 0.4403361886204582,
 0.6179294574523457,
 0.678875279522349,
 0.6737241554128963,
 0.7141380070086942,
 0.7141606192115723,
 0.34084064054116997,
 0.5195464532960465,
 0.49489924133641244,
 0.29968990214851515,
 0.73336090694533,
 0.6489672108589887,
 0.8334573457573052,
 0.25652539859750934,
 0.668617661175339,
 0.6910650894528386,
 0.3564189307095654,
 0.5535944716954,
 0.23641103351090698,
 0.46110020643706157,
 0.853586153019991,
 0.16014893099629507,
 0.20316842281887798,
 0.25992889742583325,
 0.39729658661424194,
 0.46867850400987504,
 0.39123779448575036,
 0.64816088

In [None]:
# Serialise the model in memory and discard the bytes.
# NOTE(review): looks like a picklability smoke test — confirm or remove.
_ = pickle.dumps(svc)

Best set so far.

In [265]:
# Shallow random forest on the same features. random_state is pinned because
# an unseeded forest gives different CV scores, and a different saved model,
# on every run.
rf = ensemble.RandomForestClassifier(n_estimators=40, max_depth=5, random_state=0)
s = model_selection.cross_val_score(rf, X, y)
print(s.mean())
print(s.std())
rf.fit(X, y)
# Accuracy on the training rows, then its ratio to the cross-validated mean;
# a ratio well above 1 signals overfitting.
acc = metrics.accuracy_score(y, rf.predict(X))
print(acc)
print(acc / s.mean())

0.6833333333333333
0.06236095644623236
0.9033333333333333
1.3219512195121952


In [266]:
# Persist the fitted random forest inside the package directory.
joblib.dump(rf, 'pan20/fake/rf.model')

['pan20/fake/rf.model']

In [230]:
# Gaussian naive Bayes: cross-validated mean/std first, then refit on all
# rows and show accuracy on the training data.
nb = naive_bayes.GaussianNB()
s = model_selection.cross_val_score(nb, X, y)
print(s.mean(), s.std(), sep='\n')
nb.fit(X, y)
metrics.accuracy_score(y, nb.predict(X))

0.6433333333333333
0.05120763831912404


0.67

In [258]:
# Persist the fitted naive Bayes model. This previously dumped `svc`, so
# nb.model silently contained the SVM instead.
joblib.dump(nb, 'pan20/fake/nb.model')

['pan20/fake/nb.model']

In [260]:
# Per-row class probabilities from the naive Bayes model, one column per class.
nb.predict_proba(X)

array([[1.32359717e-01, 8.67640283e-01],
       [3.03792683e-01, 6.96207317e-01],
       [2.85036843e-01, 7.14963157e-01],
       [6.25985149e-01, 3.74014851e-01],
       [2.36190777e-01, 7.63809223e-01],
       [1.17641356e-01, 8.82358644e-01],
       [2.19788778e-01, 7.80211222e-01],
       [9.28839448e-01, 7.11605515e-02],
       [1.51962332e-01, 8.48037668e-01],
       [1.26260944e-01, 8.73739056e-01],
       [7.76252773e-02, 9.22374723e-01],
       [9.99186912e-01, 8.13088087e-04],
       [1.27144580e-01, 8.72855420e-01],
       [5.51176865e-01, 4.48823135e-01],
       [6.73416364e-02, 9.32658364e-01],
       [9.32901309e-01, 6.70986915e-02],
       [8.49505923e-02, 9.15049408e-01],
       [1.03691433e-01, 8.96308567e-01],
       [2.25176568e-01, 7.74823432e-01],
       [3.43206983e-01, 6.56793017e-01],
       [4.53011563e-01, 5.46988437e-01],
       [2.64963492e-01, 7.35036508e-01],
       [1.67348921e-01, 8.32651079e-01],
       [1.80301810e-01, 8.19698190e-01],
       [2.588729

In [106]:
# Ablation of the hand-picked set: same active features as before except
# that 'distrust' is commented out.
feats = [
    'adverbs', 
    #'articles',
    #'auxiliary_verbs', 
    #'conjunctions', 
    'impersonal_pronouns',
    'personal_pronouns', 
    #'prepositions', 
    #'quantifiers', 
    'function_words',
    'avg_bf', 
    #'height', 
    'max_np_height', 
    #'max_pp_height', 
    'max_vp_height',
    'senti', 
    'senti_neg', 
    #'senti_pos', 
    'anger', 
    #'distrust'
]

In [None]:
# Cross-validate the RBF SVM on the feature set selected above.
X, y, feat_dict = get_X_y(feats)
svc = svm.SVC(C=1., kernel='rbf')
s = model_selection.cross_val_score(svc, X, y)
print(s.mean(), s.std(), sep='\n')

### Test out code going up

In [200]:
from pan20.fake import models

In [209]:
# Instantiate the packaged model from pan20.fake.models.
# NOTE(review): presumably it loads one of the artifacts saved above — confirm.
model = models.LexicalSVM()

In [212]:
# Reload the dataset from the loader; this rebinds `df`, replacing the
# feature frame used earlier in the notebook.
df = fake.load()

In [213]:
# Keep the gold labels aside, then give the model a frame without them.
y = df.label.values
# Rebind instead of dropping in place so the object handed back by
# fake.load() is never mutated.
df = df.drop(columns=['label'])

In [217]:
# Predict on a copy of the first 200 rows, so anything the model adds to its
# input stays out of `df`.
preds = model.predict(df.iloc[0:200].copy())

Getting features...


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))




In [218]:
# Display the predictions returned by the model.
preds

Unnamed: 0,author,probability
0,0dwovd7nj6yg9m795ng2c629me0ccmrh,0.264851
1,f4zgi7ym7829iqld6x77q6mh30s0rf86,0.274757
