In [108]:
import functools

import numpy as np
import pandas as pd
from sklearn import ensemble, linear_model, metrics, model_selection, naive_bayes, preprocessing, svm
from tqdm.notebook import tqdm

from pan20 import fake, util
from pan20.util import ctree, text
from pan20.util.lexicons import noble, sentiwordnet, trust, tweet_anger

In [2]:
# Load the dataset through the project's `fake` module; later cells read its
# `tweet` column directly and group by `author` / `label`.
df = fake.load()

In [5]:
# Tokenize every tweet once; the lexicon-based feature cells below all read `toks`.
df['toks'] = df.tweet.apply(text.tokenize)

In [8]:
# Token count per tweet; `len` is applied to each `toks` entry directly.
df['n'] = df['toks'].apply(len)

### Features

#### Noble's Function Words

Previous investigation indicated a good feature set to be:
- impersonal_pronouns
- personal_pronouns
- prepositions
- function_words

In [9]:
# Score every tweet's tokens with Noble's dictionary.
# The `toks` column is iterated directly: `iterrows` would build a full row
# Series per tweet only to read this one field from it.
nd = noble.NobleDict()
freqs = [nd(toks) for toks in tqdm(df['toks'], total=len(df))]

# One column per Noble category, plus the `function_words` entry that is read
# from the same per-tweet results.
for cat in noble.cats:
    df[cat] = [f[cat] for f in freqs]
df['function_words'] = [f['function_words'] for f in freqs]

HBox(children=(FloatProgress(value=0.0, max=30000.0), HTML(value='')))




#### Constituency Tree Features

Previous investigation indicated:
- avg_bf
- max_np_height
- max_vp_height

In [13]:
# Parse each raw tweet into a constituency tree.
# The `tweet` column is iterated directly: `iterrows` would build a full row
# Series per tweet only to read this one field from it.
get_tree = ctree.GetTree()
trees = [get_tree(tweet) for tweet in tqdm(df['tweet'], total=len(df))]

# Per-tree structural features: average branching factor, overall height and
# the maximum height of NP / PP / VP constituents.
df['avg_bf'] = [ctree.avg_branch_factor(t) for t in trees]
df['height'] = [t.height() for t in trees]
df['max_np_height'] = [ctree.max_const_height(t, 'NP') for t in trees]
df['max_pp_height'] = [ctree.max_const_height(t, 'PP') for t in trees]
df['max_vp_height'] = [ctree.max_const_height(t, 'VP') for t in trees]





HBox(children=(FloatProgress(value=0.0, max=30000.0), HTML(value='')))




#### Emotions

LIWC analysis indicated:
- negemo
- anx
- anger

In fact just these three get us to 70%.

For **SentiWordNet**, the indication is:
- senti
- senti_neg

with senti_pos being more or less useless.

In [17]:
# One feature column per SentiWordNet scorer, each applied to the token lists.
swn = sentiwordnet.SentiWordNet()
for column, scorer in (
    ('senti', swn.score),
    ('senti_neg', swn.score_neg),
    ('senti_pos', swn.score_pos),
):
    df[column] = df.toks.apply(scorer)

In [19]:
# my learned anger dict
ta = tweet_anger.Lexicon()
# Pre-bind the category so the callable passed to `apply` takes only the tokens.
fn = functools.partial(ta.cat_freq, cat='anger')
df['anger'] = df.toks.apply(fn)

#### Trust and Distrust

In [123]:
# Trust lexicon; only its 'distrust' category is turned into a feature here.
dt = trust.Lexicon()
# Pre-bind the category so the callable passed to `apply` takes only the tokens.
fn = functools.partial(dt.cat_freq, cat='distrust')
df['distrust'] = df.toks.apply(fn)

#### SAVE AND RESUME

In [124]:
# Checkpoint the feature table so the feature-extraction cells above need not
# be re-run; the row index is not written.
df.to_csv('data/fake/feats.csv', index=False)

In [3]:
# Resume from the checkpoint written above.
# NOTE(review): non-scalar columns such as `toks` presumably come back as plain
# strings after the CSV round trip — confirm nothing downstream needs them.
df = pd.read_csv('data/fake/feats.csv')

In [65]:
def get_X_y(cols=None, no_cols=None, data=None):
    """Build an author-level feature matrix and label vector.

    Tweet-level rows are averaged per ``(author, label)`` pair, the requested
    feature columns are selected, and every feature is min-max scaled.

    Args:
        cols: optional collection of feature names to keep. Takes precedence
            over ``no_cols``.
        no_cols: optional collection of feature names to drop; only consulted
            when ``cols`` is empty or None.
        data: tweet-level frame to aggregate. Defaults to the notebook-level
            ``df`` so existing calls keep working unchanged.

    Returns:
        Tuple ``(X, y, feat_dict)``: the scaled feature array, the label
        values, and a ``util.IxDict`` built from the selected column names.
    """
    frame = df if data is None else data
    # numeric_only=True: text columns (e.g. `tweet`) cannot be averaged, and
    # recent pandas raises on them instead of silently dropping them.
    per_author = (
        frame.groupby(['author', 'label'])
        .mean(numeric_only=True)
        .reset_index()
        .drop(columns=['author', 'n'])
    )
    y = per_author['label'].values
    # The label is removed up front so that no selection path (in particular
    # `no_cols`) can leak the target into the feature matrix.
    features = per_author.drop(columns=['label'])
    if cols:
        features = features.loc[:, [c in cols for c in features.columns]]
    elif no_cols:
        features = features.loc[:, [c not in no_cols for c in features.columns]]
    feat_dict = util.IxDict(features.columns)
    # normalize features to be between 0 and 1
    scaled = preprocessing.MinMaxScaler().fit_transform(features.values)
    return scaled, y, feat_dict

In [71]:
# No column filter: use every available feature.
X, y, feat_dict = get_X_y()

### Feature Selection

Looking for variability of feature weights across cross-validation folds.

In [72]:
# Baseline: RBF-kernel SVM on the current X, mean score over
# cross_val_score's default folds.
svc = svm.SVC(C=1., kernel='rbf')
model_selection.cross_val_score(svc, X, y).mean()

0.6566666666666667

In [58]:
# Fit a linear SVM on each stratified fold and keep that fold's coefficient
# array, so feature weights can be compared across folds afterwards.
coef = []
skf = model_selection.StratifiedKFold(n_splits=5)
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # A linear kernel is used because `coef_` is read from the fitted model below.
    svc = svm.SVC(C=1., kernel='linear')
    svc.fit(X_train, y_train)
    # Held-out accuracy for this fold.
    print(metrics.accuracy_score(y_test, svc.predict(X_test)))
    coef.append(svc.coef_)

0.65
0.6833333333333333
0.5833333333333334
0.5666666666666667
0.6


In [39]:
# Sanity check: shape of the first fold's coefficient array.
coef[0].shape

(1, 19)

In [49]:
# Long-format table: one row per (fold, feature index) holding the feature
# name looked up in `feat_dict` and the absolute value of its weight.
cf = pd.DataFrame([
    {'feature': feat_dict[ix], 'coef': np.abs(fold_coef[0, ix])}
    for fold_coef in coef
    for ix in range(fold_coef.shape[1])
])

In [61]:
# Mean absolute weight per feature across folds, largest first.
cfmu = (
    cf.groupby('feature', as_index=False)
    .mean()
    .sort_values(by='coef', ascending=False)
)
cfmu

Unnamed: 0,feature,coef
13,personal_pronouns,1.82033
15,quantifiers,1.635655
9,impersonal_pronouns,1.524259
10,max_np_height,1.472452
0,adverbs,1.435989
1,anger,1.259601
14,prepositions,1.227429
11,max_pp_height,1.197082
3,auxiliary_verbs,1.120938
4,avg_bf,1.026686


In [64]:
# Keep only the features whose mean absolute weight exceeds 0.9.
feats = cfmu.loc[cfmu['coef'] > 0.9, 'feature'].tolist()

In [67]:
# Rebuild X restricted to the features selected by the weight threshold.
X, y, feat_dict = get_X_y(feats)

In [70]:
# Same RBF-kernel SVM as the baseline, now scored on the reduced feature set.
svc = svm.SVC(C=1., kernel='rbf')
model_selection.cross_val_score(svc, X, y).mean()

0.6466666666666667

Well, that sucked: the thresholded feature set scores slightly worse than using all features (0.647 vs. 0.657).

In [129]:
# Hand-picked feature set. Commented-out entries are available columns that
# are deliberately excluded from this run.
feats = [
    'adverbs', 
    #'articles',
    #'auxiliary_verbs', 
    #'conjunctions', 
    'impersonal_pronouns',
    'personal_pronouns', 
    #'prepositions', 
    #'quantifiers', 
    'function_words',
    'avg_bf', 
    #'height', 
    'max_np_height', 
    #'max_pp_height', 
    'max_vp_height',
    'senti', 
    'senti_neg', 
    #'senti_pos', 
    'anger', 
    'distrust'
]

In [130]:
# RBF-kernel SVM on the hand-picked features.
X, y, feat_dict = get_X_y(feats)
svc = svm.SVC(C=1., kernel='rbf')
s = model_selection.cross_val_score(svc, X, y)
# Mean and standard deviation of the per-fold scores, one per line.
print(s.mean(), s.std(), sep='\n')

0.6866666666666666
0.06944222218666553


Best set so far.

In [120]:
# Random forest on the current X / y.
# The forest is seeded so the cross-validation scores are reproducible; an
# unseeded forest can give different numbers on every execution.
rf = ensemble.RandomForestClassifier(n_estimators=200, max_depth=5, random_state=0)
s = model_selection.cross_val_score(rf, X, y)
# Mean and standard deviation of the per-fold scores.
print(s.mean())
print(s.std())

0.6766666666666665
0.0442216638714053


In [122]:
# Gaussian naive Bayes on the current X / y.
nb = naive_bayes.GaussianNB()
s = model_selection.cross_val_score(nb, X, y)
# Mean, then standard deviation, of the per-fold scores.
for stat in (s.mean(), s.std()):
    print(stat)

0.6433333333333333
0.05120763831912404


In [106]:
# Hand-picked feature set with 'distrust' commented out; every other active
# entry matches the list that includes it.
feats = [
    'adverbs', 
    #'articles',
    #'auxiliary_verbs', 
    #'conjunctions', 
    'impersonal_pronouns',
    'personal_pronouns', 
    #'prepositions', 
    #'quantifiers', 
    'function_words',
    'avg_bf', 
    #'height', 
    'max_np_height', 
    #'max_pp_height', 
    'max_vp_height',
    'senti', 
    'senti_neg', 
    #'senti_pos', 
    'anger', 
    #'distrust'
]

In [None]:
# RBF-kernel SVM on the feature set currently held in `feats`.
X, y, feat_dict = get_X_y(feats)
svc = svm.SVC(C=1., kernel='rbf')
s = model_selection.cross_val_score(svc, X, y)
# Mean and standard deviation of the per-fold scores, one per line.
print(s.mean(), s.std(), sep='\n')