In [8]:
from data import wiki
from data import corpus
import wham

import nltk
import pandas as pd
import pickle
from tqdm import tqdm_notebook as tqdm

# Hook tqdm into pandas so that `progress_apply` (used on df.pos_tags below)
# is available. NOTE(review): the bare tqdm() instance is presumably what
# renders the empty progress widget in this cell's output.
tqdm().pandas()

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [9]:
# Load the posts; everything else in this cell is derived from them.
posts = wiki.load_posts()
# Reply pairs are only used here to build the network.
pairs = corpus.get_reply_pairs(posts)
users = wiki.create_users(posts=posts)
network = corpus.create_network(pairs)
# `users` is rebound to the return value of compute_centrality. The
# `centrality` column read later presumably comes from this call -- TODO confirm.
users = corpus.compute_centrality(users, network, normalize=True, overwrite=True)

Reading users file.: 100%|██████████| 26397/26397 [00:00<00:00, 339974.28it/s]
Reading admins file.: 100%|██████████| 1949/1949 [00:00<00:00, 69111.36it/s]


Computed eigenvector centrality for 25633 users. Eigenvalue was 488.93710835096704.


In [10]:
# POS-tag the posts; `posts` is rebound to the returned value. The `pos_tags`
# column used below presumably originates here -- TODO confirm in corpus.pos_tag_posts.
posts = corpus.pos_tag_posts(posts)



In [11]:
# Persist the tagged posts. overwrite=True is passed, so an existing save is
# presumably replaced -- verify against wiki.save_posts.
wiki.save_posts(posts, overwrite=True)

In [12]:
# Attach author attributes to every post: posts.user is matched against the
# index of `users`. pd.merge defaults to an inner join, so posts whose user is
# missing from `users` are dropped.
df = pd.merge(posts, users, left_on='user', right_index=True)
# Post counts for admins and non-admins (df.admin is used as a boolean mask).
print(len(df[df.admin]))
print(len(df[~df.admin]))

111589
242860


In [13]:
from collections import Counter
def get_ngrams(n, tokens):
    """Return an iterator over every run of ``n`` consecutive items in ``tokens``.

    Args:
        n: n-gram length.
        tokens: a sliceable sequence (list, tuple, str, ...).

    Returns:
        A ``zip`` iterator yielding n-tuples; it is empty when ``tokens`` has
        fewer than ``n`` items or when ``n`` is 0.
    """
    # The k-th shifted copy starts k items in; zipping the copies lines up
    # each item with its n-1 successors and stops at the shortest copy.
    shifted = [tokens[offset:] for offset in range(n)]
    return zip(*shifted)

# One Counter of POS-tag trigrams per post, kept both as a standalone Series
# and as a new column of df.
pos_trigrams = df.pos_tags.progress_apply(
    lambda tags: Counter(get_ngrams(3, tags))
)
df['pos_trigrams'] = pos_trigrams

HBox(children=(IntProgress(value=0, max=354449), HTML(value='')))




In [14]:
# Corpus-wide trigram counts. 'trigram_counts.pickle' is a cache of the
# aggregation in the except-branch; when the file is missing it is rebuilt from
# df.pos_trigrams and written back, so the notebook also runs from scratch.
# NOTE(review): the cache is not invalidated when df changes -- delete the file
# to force a rebuild. pickle.load can execute arbitrary code, so only load a
# cache file you produced yourself.
try:
    with open('trigram_counts.pickle', 'rb') as f:
        trigram_counts = pickle.load(f)
except FileNotFoundError:
    trigram_counts = Counter()
    for c in tqdm(df.pos_trigrams):
        # In-place merge; per-post counts are all positive, so this equals `+=`.
        trigram_counts.update(c)
    with open('trigram_counts.pickle', 'wb') as f:
        pickle.dump(trigram_counts, f)

In [15]:
# Sum the per-post trigram Counters over all admin-authored posts.
trigram_counts_admin = Counter()
for c in tqdm(df[df.admin].pos_trigrams):
    # update() adds the counts in place, touching only the keys of `c`.
    # `+=` also re-scans the whole accumulator for non-positive counts on every
    # iteration; the per-post counts are all positive, so the result is the same.
    trigram_counts_admin.update(c)

HBox(children=(IntProgress(value=0, max=111589), HTML(value='')))




In [16]:
# Cut-off: mean plus one standard deviation of centrality, computed over the
# rows of `users` (not over the post-level frame df).
threshold = users.centrality.mean() + users.centrality.std()
# Flag every post (row of df) whose centrality value exceeds the cut-off.
df['highly_central'] = (df['centrality'] > threshold)
df.highly_central.value_counts()

False    316591
True      37858
Name: highly_central, dtype: int64

In [17]:
# Sum the per-post trigram Counters over all posts flagged highly_central.
trigram_counts_central = Counter()
for c in tqdm(df[df.highly_central].pos_trigrams):
    # update() adds the counts in place, touching only the keys of `c`.
    # `+=` also re-scans the whole accumulator for non-positive counts on every
    # iteration; the per-post counts are all positive, so the result is the same.
    trigram_counts_central.update(c)

HBox(children=(IntProgress(value=0, max=37858), HTML(value='')))




In [18]:
import math

def KLD(P, Q):
    """Per-item terms of the Kullback-Leibler divergence D(P || Q), in bits.

    Args:
        P: mapping item -> probability under the distribution of interest.
        Q: mapping item -> probability under the reference distribution.

    Returns:
        A dict with one entry per key of ``Q`` holding
        ``P[i] * log2(P[i] / Q[i])``. The term is 0 when ``i`` is missing from
        ``P`` or ``P[i]`` is 0 (the usual 0 * log 0 = 0 convention). Keys that
        occur only in ``P`` are ignored, so the sum of the values equals
        D(P || Q) only when every key of ``P`` is also in ``Q``.

    Raises:
        ZeroDivisionError: if ``Q[i]`` is 0 while ``P[i]`` is non-zero.
    """
    terms = {}
    for item, q in Q.items():
        p = P[item] if item in P else 0
        # A zero probability contributes nothing; evaluating log(0) would raise.
        terms[item] = 0 if p == 0 else p * math.log2(p / q)
    return terms

def count_to_freq(counter):
    """Turn raw counts into relative frequencies.

    Args:
        counter: mapping item -> count (e.g. a ``collections.Counter``).

    Returns:
        A plain dict mapping each item to ``count / sum of all counts``.
        An empty mapping yields an empty dict.
    """
    denominator = sum(counter.values())
    freqs = {}
    for key in counter:
        freqs[key] = counter[key] / denominator
    return freqs

In [19]:
# Relative trigram frequencies for the three populations: the corpus-wide
# counts, admin posts only, and highly-central posts only.
trigram_freq = count_to_freq(trigram_counts)
trigram_freq_admin = count_to_freq(trigram_counts_admin)
trigram_freq_central = count_to_freq(trigram_counts_central)

In [20]:
# Per-trigram KLD terms of each subgroup (P) against the corpus-wide
# distribution (Q); each result has one entry per key of trigram_freq.
admin_diverg = KLD(trigram_freq_admin, trigram_freq)
central_diverg = KLD(trigram_freq_central, trigram_freq)

In [21]:
def rank_divergences(diverg, k=20):
    """Split per-n-gram KLD terms by sign and rank the extremes of each side.

    Args:
        diverg: mapping n-gram -> KLD term (as returned by ``KLD``).
        k: how many n-grams to keep at each end of the ranking.

    Returns:
        ``(pos, neg, top_pos, top_neg)`` where ``pos`` holds the terms >= 0,
        ``neg`` the terms < 0, ``top_pos`` the ``k`` largest ``(ngram, term)``
        pairs of ``pos`` (descending) and ``top_neg`` the ``k`` most negative
        pairs of ``neg`` (ascending).
    """
    pos = {ngram: term for (ngram, term) in diverg.items() if term >= 0}
    neg = {ngram: term for (ngram, term) in diverg.items() if term < 0}
    top_pos = sorted(pos.items(), key=lambda x: x[1], reverse=True)[:k]
    top_neg = sorted(neg.items(), key=lambda x: x[1], reverse=False)[:k]
    return pos, neg, top_pos, top_neg


# Over-represented (term >= 0) and under-represented (term < 0) trigrams for
# each subgroup, plus the 20 most extreme of each.
pos_admin_diverg, neg_admin_diverg, top_pos_admin, top_neg_admin = rank_divergences(admin_diverg)
pos_central_diverg, neg_central_diverg, top_pos_central, top_neg_central = rank_divergences(central_diverg)

In [169]:
def find_sub_list(sl,l):
    """Locate every occurrence of the sequence ``sl`` inside ``l``.

    Args:
        sl: the sub-sequence to look for.
        l: the sequence to search. Matching uses ``l[i:i+len(sl)] == sl``, so
            ``sl`` must be the same sequence type as a slice of ``l`` (a list
            slice never compares equal to a tuple).

    Returns:
        A list of ``(start, end)`` index pairs, ``end`` inclusive, one per
        occurrence in order of appearance. Overlapping occurrences are all
        reported. An empty ``sl`` yields an empty list.
    """
    sll = len(sl)
    if sll == 0:
        # Nothing to match; without this guard sl[0] raises IndexError.
        return []
    first = sl[0]
    return [
        (ind, ind + sll - 1)
        for ind, e in enumerate(l)
        # Cheap first-item check before comparing the full slice.
        if e == first and l[ind:ind + sll] == sl
    ]

def get_top_instances(top, ngram_row='pos_trigrams', data=None):
    """Count the token sequences that realise each POS n-gram in ``top``.

    Args:
        top: iterable of ``(pos_ngram, weight)`` pairs; only the n-gram is used.
        ngram_row: name of the column holding each post's n-gram Counter.
        data: frame with ``ngram_row``, ``'pos_tags'`` and ``'tokens'`` columns.
            Defaults to the notebook-level ``df``.

    Returns:
        dict mapping each POS n-gram to a Counter of the token tuples found at
        the positions where that n-gram occurs in a post's ``pos_tags``.
    """
    frame = df if data is None else data
    top_instances = {pos[0]: Counter() for pos in top}
    targets = [pos_ngram for pos_ngram, _ in top]
    # Iterate only the three needed columns; iterrows() would build a Series
    # for every row.
    rows = zip(frame[ngram_row], frame['pos_tags'], frame['tokens'])
    for ngram_counts, pos_tags, tokens in tqdm(rows, total=len(frame)):
        for pos_ngram in targets:
            # The per-post Counter is a cheap membership test before scanning.
            if pos_ngram in ngram_counts:
                locs = find_sub_list(list(pos_ngram), pos_tags)
                ngrams = [tuple(tokens[start:end + 1]) for start, end in locs]
                top_instances[pos_ngram].update(ngrams)
    return top_instances

In [None]:
# Token-level realisations of the top/bottom trigrams for both subgroups.
# get_top_instances iterates over every row of df, so this cell makes four
# full passes over the frame.
top_pos_instances_admin = get_top_instances(top_pos_admin)
top_neg_instances_admin = get_top_instances(top_neg_admin)
top_pos_instances_central = get_top_instances(top_pos_central)
top_neg_instances_central = get_top_instances(top_neg_central)

In [29]:
# Apply the same `threshold` used for the post-level flag, this time per row
# of `users`, and show how many users fall on each side.
users['highly_central'] = (users['centrality'] > threshold)
users.highly_central.value_counts()

False    30762
True       138
Name: highly_central, dtype: int64

In [94]:
def print_top_pos(top_pos):
    """Print a two-column table of POS n-grams and their KLD weights.

    Args:
        top_pos: iterable of ``(pos_seq, kld)`` pairs, where ``pos_seq`` is a
            sequence of tag strings and ``kld`` a number.

    Each tag is left-aligned and padded to 3 characters, followed by a space;
    the weight is printed with six decimals.
    """
    header = '{:^12} {:^8}'.format('n-gram', 'KLD weight')
    print(header)
    for entry in top_pos:
        tags, weight = entry
        # One '{:3} ' slot per tag, so the table works for any n-gram length.
        tag_template = '{:3} ' * len(tags)
        print("{} {:.6f}".format(tag_template.format(*tags), weight))

In [132]:
def print_top_instances(top, instances, freqs, counts, n_display=5):
    """Print, for each POS n-gram in ``top``, its most frequent token instances.

    Args:
        top: iterable of ``(pos_ngram, kld)`` pairs.
        instances: mapping pos_ngram -> Counter of token tuples (as produced
            by ``get_top_instances``).
        freqs: mapping pos_ngram -> frequency, printed in the header line.
        counts: mapping pos_ngram -> raw count, printed in the header line.
        n_display: how many instances to list per n-gram (default 5).

    Raises:
        KeyError: if a pos_ngram of ``top`` is missing from ``instances``,
            ``freqs`` or ``counts``.
    """
    for pos, kld in top:
        freq = freqs[pos]
        count = counts[pos]
        pos_str = ('{:3} '*len(pos)).format(*pos)
        print(pos_str, 'KLD:', kld, 'freq:', freq, 'count:', count)
        # Frequencies are relative to all instances of this n-gram, not to the corpus.
        inst_freqs = count_to_freq(instances[pos])
        ranked = sorted(inst_freqs.items(), key=lambda x: x[1], reverse=True)[:n_display]
        print('{:20} {:8}'.format('instance', 'rel. freq.'))
        # Distinct loop name so the n-gram-level `freq` above is not overwritten.
        for inst, inst_freq in ranked:
            print('{:20} {:.6f}'.format(' '.join(inst), inst_freq))
        print()

In [30]:
# Group sizes (users and posts) next to each group's total trigram KLD, i.e.
# the sum of the per-trigram terms returned by KLD().
# users.highly_central must already have been assigned for this cell to run.
print("Admin pop\t", len(users[users.admin]))
print("Admin posts\t", len(df[df.admin]))
print("Admin KLD\t", sum(admin_diverg.values()))
print()
print("Central pop\t", len(users[users.highly_central]))
print("Central posts\t", len(df[df.highly_central]))
print("Central KLD\t", sum(central_diverg.values()))

Admin pop	 1754
Admin posts	 111589
Admin KLD	 0.011643954774979318

Central pop	 138
Central posts	 37858
Central KLD	 0.03923107604227925


In [104]:
# Trigrams with the largest non-negative KLD terms for each subgroup.
print("Top indicative trigrams (admins)")
print_top_pos(top_pos_admin)
print()
print("Top indicative trigrams (highly central)")
print_top_pos(top_pos_central)

Top indicative trigrams (admins)
   n-gram    KLD weight
NNP NNP :    0.001029
NNP :   NNP  0.000908
PRP MD  VB   0.000699
:   NNP NNP  0.000689
IN  PRP VBP  0.000574
JJ  NNP :    0.000557
,   CC  PRP  0.000495
NN  .   :    0.000475
NN  ,   CC   0.000435
DT  NN  ,    0.000412
JJ  TO  VB   0.000349
MD  RB  VB   0.000344
.   :   )    0.000340
VB  DT  NN   0.000334
,   PRP MD   0.000324
''  RB  :    0.000301
CC  ''  NN   0.000296
PRP VBP JJ   0.000289
VB  TO  VB   0.000283
NN  TO  VB   0.000279

Top indicative trigrams (highly central)
   n-gram    KLD weight
.   :   )    0.001921
PRP VBP RB   0.001061
IN  DT  NN   0.000947
PRP VBP JJ   0.000798
DT  NN  ,    0.000746
.   PRP VBP  0.000677
PRP MD  VB   0.000641
TO  VB  IN   0.000632
,   CC  PRP  0.000603
''  VBZ NN   0.000567
NN  ,   CC   0.000555
VBZ NN  NN   0.000551
VBP RB  VB   0.000515
.   ''  ''   0.000508
:   PRP VBP  0.000497
IN  PRP VBP  0.000455
VB  TO  VB   0.000435
MD  RB  VB   0.000421
.   NNP ,    0.000419
CC  PRP VBP  0.0004

In [149]:
# trigram_freq / trigram_counts are the corpus-wide tables, so the "freq" and
# "count" shown per trigram are corpus-wide, not admin-specific.
print_top_instances(top_pos_admin, top_pos_instances_admin, trigram_freq, trigram_counts)

NNP NNP :    KLD: 0.0010285035822041127 freq: 0.002995493612019941 count: 61288
instance             rel. freq.
[ WP :               0.295294
[ User :             0.230665
[ Wikipedia :        0.218526
[ Special :          0.034689
] ] -                0.021162

NNP :   NNP  KLD: 0.0009084737661793094 freq: 0.0028204208715445554 count: 57706
instance             rel. freq.
Wikipedia : Criteria 0.012962
WP : CSD             0.009982
WP : RS              0.009652
Wikipedia : Media    0.009514
WP : V               0.008595

PRP MD  VB   KLD: 0.0006989988217194979 freq: 0.0074848239744278455 count: 153140
instance             rel. freq.
it would be          0.024755
I 'd like            0.014183
I would like         0.012949
it should be         0.012831
I 'll be             0.011865

:   NNP NNP  KLD: 0.0006894492687301764 freq: 0.0027750642263292685 count: 56778
instance             rel. freq.
: RS ]               0.008930
: V ]                0.007397
: Monotype Corsiva   0.007256
: BLP

In [123]:
# As for admins: the "freq" and "count" columns come from the corpus-wide tables.
print_top_instances(top_pos_central, top_pos_instances_central, trigram_freq, trigram_counts)

.   :   )    KLD: 0.001921269043920664 freq: 0.0004652477433236167 count: 9519
instance             rel. freq.
. : )                0.641769
! : )                0.168715
. ; )                0.085198
? : )                0.059565
! ; )                0.025108

PRP VBP RB   KLD: 0.0010614612088188896 freq: 0.0048782342697118244 count: 99809
instance             rel. freq.
I do n't             0.226743
I 'm not             0.109259
you do n't           0.041630
I am not             0.036630
I have n't           0.028825

IN  DT  NN   KLD: 0.0009465187591187016 freq: 0.01461432164420576 count: 299010
instance             rel. freq.
in the article       0.015224
of the article       0.014324
on the talk          0.012337
on the article       0.009863
in the future        0.006378

PRP VBP JJ   KLD: 0.0007976618972286392 freq: 0.0019944704970907474 count: 40807
instance             rel. freq.
I 'm sure            0.056608
I 'm sorry           0.043228
You 're welcome      0.037224
I 'm gla

In [115]:
# Trigrams with the most negative KLD terms for each subgroup.
print("Top anti-indicative trigrams (admins)")
print_top_pos(top_neg_admin)
print()
print("Top anti-indicative trigrams (highly central)")
print_top_pos(top_neg_central)

Top anti-indicative trigrams (admins)
   n-gram    KLD weight
NNP NNP NNP  -0.001652
IN  DT  NNP  -0.000582
NN  IN  DT   -0.000466
DT  NNP NNP  -0.000459
NNP NNP NN   -0.000293
NN  NN  NN   -0.000286
NN  IN  NNP  -0.000279
NNP ,   NNP  -0.000267
DT  NNP NN   -0.000253
NNP IN  NNP  -0.000247
JJ  NN  IN   -0.000239
NNP NNP ''   -0.000237
DT  NN  IN   -0.000220
.   PRP VBD  -0.000213
NN  NNP NN   -0.000212
NN  ''  NN   -0.000205
NNP NN  NN   -0.000198
MD  PRP VB   -0.000194
NN  .   PRP  -0.000182
NN  CC  NN   -0.000175

Top anti-indicative trigrams (highly central)
   n-gram    KLD weight
NNP NNP NNP  -0.006749
NNP NNP NN   -0.001049
NN  NNP NNP  -0.001028
NN  ''  NN   -0.000985
NNP NNP :    -0.000978
:   NNP NNP  -0.000978
NNP NNP .    -0.000846
NNP :   NNP  -0.000814
NN  :   NN   -0.000793
IN  NNP NNP  -0.000776
:   NN  :    -0.000696
JJ  NNP NNP  -0.000642
IN  DT  NNP  -0.000631
DT  NNP NNP  -0.000627
NN  IN  NNP  -0.000561
NNP NNP ,    -0.000553
JJ  NN  ''   -0.000525
NN  :   NNP  -0.

In [131]:
# Token instances behind the admins' most negative trigrams; "freq" and "count"
# again come from the corpus-wide tables.
print_top_instances(top_neg_admin, top_neg_instances_admin, trigram_freq, trigram_counts)

NNP NNP NNP  KLD: -0.001651809956463843 freq: 0.017511379605941662 count: 358284
instance             rel. freq.
[ [ WP               0.044448
[ [ Wikipedia        0.040635
[ [ User             0.039536
< \/font >           0.014890
] ] <                0.012066

IN  DT  NNP  KLD: -0.0005821594907861032 freq: 0.003026138672957425 count: 61915
instance             rel. freq.
on the [             0.035403
at the [             0.025955
of the [             0.024291
in the [             0.022127
for the [            0.008269

NN  IN  DT   KLD: -0.0004659622837395458 freq: 0.008467469827933604 count: 173245
instance             rel. freq.
part of the          0.012635
look at the          0.010049
top of the           0.005149
article on the       0.005016
end of the           0.004733

DT  NNP NNP  KLD: -0.00045851082751109694 freq: 0.00188875236821072 count: 38644
instance             rel. freq.
the [ [              0.389401
The [ [              0.006883
the Arbitration Committee 0.004399

In [125]:
# Token instances behind the highly-central group's most negative trigrams;
# "freq" and "count" again come from the corpus-wide tables.
print_top_instances(top_neg_central, top_neg_instances_central, trigram_freq, trigram_counts)

NNP NNP NNP  KLD: -0.0067488391135532904 freq: 0.017511379605941662 count: 358284
instance             rel. freq.
[ [ WP               0.044448
[ [ Wikipedia        0.040635
[ [ User             0.039536
< \/font >           0.014890
] ] <                0.012066

NNP NNP NN   KLD: -0.001048698914744547 freq: 0.0026356218677094263 count: 53925
instance             rel. freq.
[ User talk          0.111637
[ [ Talk             0.064942
] ] article          0.057024
> ] ]                0.033102
[ Wikipedia talk     0.028132

NN  NNP NNP  KLD: -0.0010279029825061787 freq: 0.00246006037028129 count: 50333
instance             rel. freq.
deletion ] ]         0.037629
article [ [          0.026484
page ] ]             0.025252
> [ [                0.024338
b > <                0.017881

NN  ''  NN   KLD: -0.0009852507643142276 freq: 0.0026623079973296273 count: 54471
instance             rel. freq.
style=\ '' color     0.045143
style=\ '' border:1px 0.039489
color=\ '' green\    0.032843
sty

In [37]:
# One Counter of POS-tag 4-grams per post, kept both as a standalone Series
# and as a new column of df.
pos_fourgrams = df.pos_tags.progress_apply(
    lambda tags: Counter(get_ngrams(4, tags))
)
df['pos_fourgrams'] = pos_fourgrams

HBox(children=(IntProgress(value=0, max=354449), HTML(value='')))




In [38]:
# Corpus-wide 4-gram counts. 'fourgram_counts.pickle' is a cache of the
# aggregation in the except-branch; when the file is missing it is rebuilt from
# df.pos_fourgrams and written back, so the notebook also runs from scratch.
# NOTE(review): the cache is not invalidated when df changes -- delete the file
# to force a rebuild. pickle.load can execute arbitrary code, so only load a
# cache file you produced yourself.
try:
    with open('fourgram_counts.pickle', 'rb') as f:
        fourgram_counts = pickle.load(f)
except FileNotFoundError:
    fourgram_counts = Counter()
    for c in tqdm(df.pos_fourgrams):
        # In-place merge; per-post counts are all positive, so this equals `+=`.
        fourgram_counts.update(c)
    with open('fourgram_counts.pickle', 'wb') as f:
        pickle.dump(fourgram_counts, f)

In [39]:
# Sum the per-post 4-gram Counters for each subgroup. update() merges in place
# and touches only the keys of `c`; `+=` also re-scans the whole accumulator on
# every iteration. Per-post counts are all positive, so the totals are identical.
fourgram_counts_admin = Counter()
for c in tqdm(df[df.admin].pos_fourgrams):
    fourgram_counts_admin.update(c)

fourgram_counts_central = Counter()
for c in tqdm(df[df.highly_central].pos_fourgrams):
    fourgram_counts_central.update(c)

HBox(children=(IntProgress(value=0, max=111589), HTML(value='')))




HBox(children=(IntProgress(value=0, max=37858), HTML(value='')))




In [40]:
# Relative 4-gram frequencies for the three populations: the corpus-wide
# counts, admin posts only, and highly-central posts only.
fourgram_freq = count_to_freq(fourgram_counts)
fourgram_freq_admin = count_to_freq(fourgram_counts_admin)
fourgram_freq_central = count_to_freq(fourgram_counts_central)

In [41]:
# Per-4-gram KLD terms of each subgroup (P) against the corpus-wide
# distribution (Q); each result has one entry per key of fourgram_freq.
admin_diverg4 = KLD(fourgram_freq_admin, fourgram_freq)
central_diverg4 = KLD(fourgram_freq_central, fourgram_freq)

In [42]:
# Split the 4-gram KLD terms by sign (>= 0 vs < 0) for each subgroup ...
pos_admin_diverg4 = dict(item for item in admin_diverg4.items() if item[1] >= 0)
neg_admin_diverg4 = dict(item for item in admin_diverg4.items() if item[1] < 0)

pos_central_diverg4 = dict(item for item in central_diverg4.items() if item[1] >= 0)
neg_central_diverg4 = dict(item for item in central_diverg4.items() if item[1] < 0)

# ... then keep the 20 largest positive terms (descending) and the 20 most
# negative terms (ascending).
top_pos_admin4 = sorted(pos_admin_diverg4.items(), key=lambda kv: kv[1], reverse=True)[:20]
top_neg_admin4 = sorted(neg_admin_diverg4.items(), key=lambda kv: kv[1])[:20]

top_pos_central4 = sorted(pos_central_diverg4.items(), key=lambda kv: kv[1], reverse=True)[:20]
top_neg_central4 = sorted(neg_central_diverg4.items(), key=lambda kv: kv[1])[:20]

In [170]:
# Token-level realisations of the top/bottom 4-grams for both subgroups.
# ngram_row selects the per-post 4-gram Counters; each call iterates over
# every row of df, so this cell makes four full passes.
top_pos_instances_admin4 = get_top_instances(top_pos_admin4, ngram_row='pos_fourgrams')
top_neg_instances_admin4 = get_top_instances(top_neg_admin4, ngram_row='pos_fourgrams')
top_pos_instances_central4 = get_top_instances(top_pos_central4, ngram_row='pos_fourgrams')
top_neg_instances_central4 = get_top_instances(top_neg_central4, ngram_row='pos_fourgrams')

HBox(children=(IntProgress(value=0, max=354449), HTML(value='')))

HBox(children=(IntProgress(value=0, max=354449), HTML(value='')))

HBox(children=(IntProgress(value=0, max=354449), HTML(value='')))

HBox(children=(IntProgress(value=0, max=354449), HTML(value='')))

In [176]:
# Recomputes the 4-gram KLD terms (same calls as the cell that first defines
# admin_diverg4 / central_diverg4), then prints each subgroup's total.
admin_diverg4 = KLD(fourgram_freq_admin, fourgram_freq)
central_diverg4 = KLD(fourgram_freq_central, fourgram_freq)
print("Admin KLD\t", sum(admin_diverg4.values()))
print("Central KLD\t", sum(central_diverg4.values()))

Admin KLD	 0.05295665257277906
Central KLD	 0.15517900441686655


In [152]:
# 4-grams with the largest non-negative KLD terms for each subgroup.
# The headers say "4-grams" because the lists printed here are the *4 rankings.
print("Top indicative 4-grams (admins)")
print_top_pos(top_pos_admin4)
print()
print("Top indicative 4-grams (highly central)")
print_top_pos(top_pos_central4)

Top indicative trigrams (admins)
   n-gram    KLD weight
NNP NNP NNP :    0.000928
:   NNP NNP NNP  0.000646
NNP :   NNP NNP  0.000613
NNP NNP :   NNP  0.000576
NN  ''  RB  :    0.000318
:   CC  ''  NN   0.000303
IN  NNP NNP NNP  0.000298
JJ  JJ  NNP :    0.000269
JJ  NNP :   NNP  0.000261
IN  DT  NN  ,    0.000257
,   PRP MD  VB   0.000249
''  RB  :   NNP  0.000248
NN  :   CC  ''   0.000234
:   NN  :   CC   0.000232
PRP MD  RB  VB   0.000206
DT  NN  ,   CC   0.000204
JJ  NN  ''  RB   0.000199
#   CD  :   NN   0.000197
VBD #   CD  :    0.000194
DT  NN  .   :    0.000192

Top indicative trigrams (highly central)
   n-gram    KLD weight
.   ''  ''  VBZ  0.000691
NN  NN  ''  #    0.000658
''  ''  VBZ NN   0.000625
''  VBZ NN  NN   0.000606
VBZ NN  NN  ''   0.000594
IN  DT  NN  ,    0.000556
,   CC  NN  :    0.000515
NN  .   :   )    0.000495
.   NNP ,   CC   0.000441
NNP ,   CC  NN   0.000435
NNS IN  DT  NN   0.000386
PRP VBP JJ  .    0.000369
.   :   )   PRP  0.000368
.   PRP VBP RB   0.

In [172]:
# fourgram_freq / fourgram_counts are the corpus-wide tables, so the "freq" and
# "count" shown per 4-gram are corpus-wide, not admin-specific.
print_top_instances(top_pos_admin4, top_pos_instances_admin4, fourgram_freq, fourgram_counts)

NNP NNP NNP :    KLD: 0.0009275431459103214 freq: 0.0024572175894019242 count: 49420
instance             rel. freq.
[ [ WP :             0.322197
[ [ Wikipedia :      0.270538
[ [ User :           0.188810
[ [ Special :        0.041279
[ [ User_talk :      0.024727

:   NNP NNP NNP  KLD: 0.000646103013974396 freq: 0.002209755591375753 count: 44443
instance             rel. freq.
: RS ] ]             0.010778
: V ] ]              0.009045
: BLP ] ]            0.008753
: Consensus|consensus ] ] 0.007988
: NPOV ] ]           0.006885

NNP :   NNP NNP  KLD: 0.0006128775825352614 freq: 0.0019410129553838994 count: 39038
instance             rel. freq.
WP : RS ]            0.012680
WP : V ]             0.010631
WP : BLP ]           0.010093
Wikipedia : Consensus|consensus ] 0.008966
WP : NPOV ]          0.008223

NNP NNP :   NNP  KLD: 0.0005763059619276268 freq: 0.0018547965389528404 count: 37304
instance             rel. freq.
[ Wikipedia : Media  0.012706
[ WP : CSD           0.010160
[ W

In [173]:
# As for admins: the "freq" and "count" columns come from the corpus-wide tables.
print_top_instances(top_pos_central4, top_pos_instances_central4, fourgram_freq, fourgram_counts)

.   ''  ''  VBZ  KLD: 0.0006908220035745305 freq: 9.33762572419428e-05 count: 1878
instance             rel. freq.
. ' '' <             0.668797
! ' '' <             0.111821
? ' '' <             0.073482
. ' '' [             0.065495
! ' '' [             0.013312

NN  NN  ''  #    KLD: 0.0006578125837332128 freq: 9.606119754602423e-05 count: 1932
instance             rel. freq.
font color=\ '' #    0.958075
color =\ '' #        0.040373
| bgcolor=\ '' #     0.001553

''  ''  VBZ NN   KLD: 0.0006250993273154904 freq: 0.00010396685510804175 count: 2091
instance             rel. freq.
' '' < font          0.879005
'' '' < font         0.044476
'' '' < span         0.011000
' '' [ [             0.010521
' '' [ tk            0.010043

''  VBZ NN  NN   KLD: 0.0006062589885314171 freq: 0.00010481211409265997 count: 2108
instance             rel. freq.
'' < font color=\    0.728178
'' < font face=\     0.188330
'' < span style=\    0.044118
'' < font style=\    0.019450
'' [ tk ]            0

In [175]:
# 4-grams with the most negative KLD terms for each subgroup.
# The headers say "4-grams" because the lists printed here are the *4 rankings.
print("Top anti-indicative 4-grams (admins)")
print_top_pos(top_neg_admin4)
print()
print("Top anti-indicative 4-grams (highly central)")
print_top_pos(top_neg_central4)

Top anti-indicative trigrams (admins)
   n-gram    KLD weight
NNP NNP NNP NNP  -0.001838
IN  DT  NNP NNP  -0.000263
NN  IN  DT  NNP  -0.000232
DT  NNP NNP NNP  -0.000178
JJ  NNP NNP NNP  -0.000156
IN  DT  NNP NN   -0.000155
DT  NN  IN  DT   -0.000149
NN  NN  NN  NN   -0.000147
IN  DT  NN  IN   -0.000143
NNP NNP IN  NNP  -0.000121
JJ  NN  IN  DT   -0.000118
NNP NNP ,   NNP  -0.000116
NN  CC  NN  CC   -0.000115
DT  NNP NNP NN   -0.000112
NNP ,   NNP NNP  -0.000112
NN  ''  NN  ''   -0.000110
DT  JJ  NN  IN   -0.000106
NNP IN  NNP NNP  -0.000103
NN  NNP NN  NNP  -0.000101
NNP NNP NNP (    -0.000100

Top anti-indicative trigrams (highly central)
   n-gram    KLD weight
NNP NNP NNP NNP  -0.003494
NNP NNP NNP :    -0.000788
:   NNP NNP NNP  -0.000659
NNP :   NNP NNP  -0.000580
NNP NNP :   NNP  -0.000569
NNP NNP NNP NN   -0.000523
NNP NNP NNP .    -0.000460
IN  NNP NNP NNP  -0.000423
NN  :   NN  :    -0.000404
NN  NNP NNP NNP  -0.000397
IN  DT  NNP NNP  -0.000378
DT  NNP NNP NNP  -0.000375
NN 

In [None]:
# Token instances behind the admins' most negative 4-grams; "freq" and "count"
# come from the corpus-wide tables.
print_top_instances(top_neg_admin4, top_neg_instances_admin4, fourgram_freq, fourgram_counts)