In [24]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

from pan20 import fake, util
from pan20.util import text
from pan20.util.lexicons import wordnet_affect

In [4]:
# Instantiate the WordNet-Affect helper from pan20's lexicon utilities; the
# cells that follow query it with `wna.words_in(<category>)`.
wna = wordnet_affect.WordNetAffect()

In [5]:
# Number of entries the lexicon returns for the 'anger' category.
len(wna.words_in('anger'))

318

Note: for comparison, the LIWC 2007 anger category contains only [184 words](http://citeseerx.ist.psu.edu/viewdoc/download?rep=rep1&type=pdf&doi=10.1.1.216.2064).

### Calculate PMI with class labels

In [17]:
# Load the feature table; later cells read its `tweet` and `label` columns.
df = pd.read_csv('data/fake/feats.csv')

In [19]:
# Run the project tokenizer over every tweet once and keep the result in a
# new `toks` column so later cells can iterate over tokens directly.
df['toks'] = df['tweet'].apply(text.tokenize)

I think the first step is to build a word-label count matrix.

In [27]:
# Build the word-by-label count matrix for the 'anger' lexicon:
# counts[i, c] = number of times lexicon word i occurs in tweets with label c.
# Fetch the word list once so the matrix height and the index mapping are
# guaranteed to come from the same lookup.
anger_words = wna.words_in('anger')
word_dict = util.IxDict(anger_words)
# Two columns: `label` is used directly as the column index, so it is assumed
# to be an integer 0/1 class id -- TODO confirm against the dataset.
counts = np.zeros((len(anger_words), 2))

# Iterate over just the two columns we need instead of `df.iterrows()`, which
# materialises a full Series per row; tqdm wraps the iterable directly so the
# progress bar advances once per tweet without a manual `update()` call.
for toks, label in tqdm(zip(df['toks'], df['label']), total=len(df)):
    for tok in toks:
        if tok in word_dict:
            counts[word_dict[tok], label] += 1

HBox(children=(FloatProgress(value=0.0, max=30000.0), HTML(value='')))




Using the PMI formula from [this Kaggle notebook](https://www.kaggle.com/gabrielaltay/word-vectors-from-pmi-matrix), $\mathrm{PMI}(w, c) = \log \frac{p(w, c)}{p(w)\,p(c)}$:

In [37]:
# Pointwise mutual information between each lexicon word w and class label c:
#     PMI(w, c) = log( p(w, c) / (p(w) * p(c)) )
# All probabilities are relative to the total number of counted occurrences.
total = counts.sum()                      # grand total, computed once
p_w_c = counts / total                    # joint p(w, c), same shape as counts
c_w = counts.sum(axis=1, keepdims=True)   # per-word totals, shape (n_words, 1)
p_w = c_w / total                         # marginal p(w)
c_c = counts.sum(axis=0, keepdims=True)   # per-label totals, shape (1, n_labels)
p_c = c_c / total                         # marginal p(c)
denom = p_w * p_c                         # broadcasts to the shape of counts
# Sparse counts make two cases unavoidable: a word seen in only one class
# gives log(0) = -inf for the other class, and a word never seen at all gives
# 0/0 = nan. Both are intended results here, so silence the corresponding
# RuntimeWarnings explicitly instead of letting them clutter the output.
with np.errstate(divide='ignore', invalid='ignore'):
    pmi = np.log(p_w_c / denom)

  import sys
  import sys


In [47]:
# Tabulate one row per lexicon word: its raw count under each label, the
# combined count, and the PMI with each label. `word_dict.items()` is consumed
# as (row index, word) pairs, matching the row layout of `counts` and `pmi`.
pmis = pd.DataFrame([
    {
        'word': word,
        'count0': counts[ix, 0],
        'count1': counts[ix, 1],
        'n': counts[ix].sum(),
        'pmi0': pmi[ix, 0],
        'pmi1': pmi[ix, 1],
    }
    for ix, word in word_dict.items()
])

In [52]:
# Raise the display cap so the slice below is rendered in full.
pd.set_option('display.max_rows', 500)
# First 80 lexicon words that occur at least once in the corpus.
pmis.loc[pmis['n'] > 0].head(80)

Unnamed: 0,word,count0,count1,n,pmi0,pmi1
3,abomination,0.0,1.0,1.0,-inf,0.620827
5,aggravated,0.0,1.0,1.0,-inf,0.620827
7,aggression,0.0,1.0,1.0,-inf,0.620827
8,aggressive,2.0,4.0,6.0,-0.327504,0.215361
10,amok,1.0,1.0,2.0,0.077962,-0.072321
12,anger,4.0,3.0,7.0,0.211493,-0.226471
14,angrily,0.0,2.0,2.0,-inf,0.620827
15,angry,13.0,8.0,21.0,0.291536,-0.344254
16,animosity,0.0,1.0,1.0,-inf,0.620827
17,animus,1.0,0.0,1.0,0.771109,-inf


In [56]:
# Total lexicon-word occurrences counted under label 0.
pmis['count0'].sum()

222.0

In [55]:
# Total lexicon-word occurrences counted under label 1.
pmis['count1'].sum()

258.0

In [53]:
# Load per-file anger scores from a scratch CSV (the preview shows columns
# `Filename` and `anger`). Presumably exported from LIWC for comparison with
# the lexicon counts -- TODO confirm provenance.
ldf = pd.read_csv('tmp/anger.csv')

In [54]:
# Preview the first rows to check the loaded columns.
ldf.head()

Unnamed: 0,Filename,anger
0,06ct0t68y1acizh9eow3g5rhancrppr8.txt,1.02
1,071nxc49ihpd0jlfmvn2lghtayy3b5n9.txt,0.52
2,09py5qescynpnnckmzueqzr2y49moh1o.txt,0.39
3,0dwovd7nj6yg9m795ng2c629me0ccmrh.txt,1.06
4,0ibi364m7i7l01xi4xqafyathrmrrnll.txt,2.47
