In [1]:
import pystan
import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import datetime
import nltk
import os

# Enable tqdm's pandas integration; the `progress_apply` calls on `df` columns
# later in this notebook depend on it.
tqdm.pandas()

# Field separator used when splitting lines of the user-info file.
DELIM = " +++$+++ "

# Raw corpus text files (joined onto CORPUS_DIR when opened).
CONVO_FILE = "wikipedia.talkpages.conversations.txt"
USERS_FILE = "wikipedia.talkpages.userinfo.txt"
ADMINS_FILE = "wikipedia.talkpages.admins.txt"

# Derived artefacts: pickled frames, CSV exports and the user network.
POSTS_DF_FILE = "posts_df.pickle"
USERS_DF_FILE = "users_df.pickle"
POSTS_CSV = "posts.csv"
USERS_CSV = "users.csv"
NETWORK_FILE = "users_network.pickle"

# Data directories, relative to the notebook's working directory.
CORPUS_DIR = "../data/wiki/"
FWORDS_DIR = '../data/function words/'

# Function-word category names (presumably file stems under FWORDS_DIR -- TODO confirm).
function_words = [
    'conjunctions',
    'articles',
    'prepositions',
    'adverbs',
    'quantifiers',
    'impersonal_pronouns',
    'personal_pronouns',
    'auxiliary_verbs',
]

In [2]:
# Load the pre-built posts frame. NOTE: unpickling executes arbitrary code, so
# this file must come from a trusted source.
posts_path = os.path.join(CORPUS_DIR, POSTS_DF_FILE)
posts = pd.read_pickle(posts_path)

In [3]:
# Parse the user-info file: one record per line, fields separated by DELIM,
# in the order given by `columns`. All values are kept as strings.
columns = ['user', 'edit_count', 'gender', 'numerical_id']

records = []
with open(CORPUS_DIR + USERS_FILE) as f:
    for raw in f:
        fields = raw.rstrip('\n').split(DELIM)
        # Every line must supply exactly one value per column.
        assert len(fields) == len(columns)
        records.append(fields)

# Column-wise view of the parsed rows; the frame is indexed by user name and
# also keeps `user` as a regular column.
users = pd.DataFrame(
    data={name: [fields[col_idx] for fields in records]
          for col_idx, name in enumerate(columns)},
    index=[fields[0] for fields in records],
    columns=columns,
)

# Parse the admins file and flag which users are admins.
columns = ['user', 'admin_ascension']

admin_names = []
admin_dates = []
with open(CORPUS_DIR + ADMINS_FILE) as f:
    for raw in f:
        # Everything before the final space is taken as the user name and the
        # last space-separated token as the ascension date.
        parts = raw.rstrip('\n').rpartition(' ')
        name, stamp = parts[0], parts[2]

        # convert timestamps to datetime objects; unparseable ones become None
        try:
            ascension = datetime.strptime(stamp, "%Y-%m-%d")
        except ValueError:
            ascension = None

        admin_names.append(name)
        admin_dates.append(ascension)

admins = pd.DataFrame(
    data={'user': admin_names, 'admin_ascension': admin_dates},
    index=admin_names,
    columns=columns,
)

# Left join keeps every user; a user counts as an admin only when an ascension
# date was parsed for them.
users = pd.merge(left=users, right=admins, on='user', how='left').set_index('user')
users['admin'] = users['admin_ascension'].notna()

In [4]:
# Join each post with its author's user record (pandas' default inner join),
# then report how many posts were written by admins and by non-admins.
df = pd.merge(left=posts, right=users, left_on='user', right_index=True)
is_admin = df['admin']
print(len(df.loc[is_admin]))
print(len(df.loc[~is_admin]))

112452
248259


In [5]:
# POS-tag each post's token list, then keep only the second element (the tag)
# of every tagged pair.
pos = df.tokens.progress_apply(nltk.pos_tag)
df['pos'] = pos.apply(lambda tagged: [pair[1] for pair in tagged])

100%|██████████| 360711/360711 [13:41<00:00, 439.04it/s]


In [8]:
from collections import Counter
def get_ngrams(n, tokens):
    """Return an iterator over the contiguous n-grams of ``tokens`` as tuples.

    ``tokens`` must support slicing. The result is a ``zip`` object, so it can
    only be consumed once; it is empty when ``tokens`` has fewer than ``n``
    items or when ``n`` is 0.
    """
    # The k-th shifted copy starts k items in; zipping them lines up each
    # position with its n-1 successors and stops at the shortest copy.
    shifted = [tokens[offset:] for offset in range(n)]
    return zip(*shifted)

# Count the POS-tag trigrams of every post; each row gets its own Counter.
pos_trigrams = df.pos.progress_apply(lambda tags: Counter(get_ngrams(3, tags)))
df['pos_trigrams'] = pos_trigrams

100%|██████████| 360711/360711 [00:15<00:00, 23666.97it/s]


In [23]:
# Sum the per-post trigram Counters into one corpus-wide Counter.
# `Counter.update` is used instead of `+=`: in-place addition also strips
# non-positive counts, which means scanning the whole accumulator on every
# iteration. Every per-post count is at least 1, so the totals are the same.
trigram_counts = Counter()
for c in tqdm(df.pos_trigrams):
    trigram_counts.update(c)

100%|██████████| 360711/360711 [17:47<00:00, 337.75it/s]


In [24]:
# Sum the per-post trigram Counters over admin-authored posts only.
# `Counter.update` is used instead of `+=`: in-place addition also strips
# non-positive counts, which means scanning the whole accumulator on every
# iteration. Every per-post count is at least 1, so the totals are the same.
trigram_counts_admin = Counter()
for c in tqdm(df[df.admin].pos_trigrams):
    trigram_counts_admin.update(c)

100%|██████████| 112452/112452 [03:19<00:00, 562.58it/s]


In [25]:
import math

def KLD(P, Q):
    """Return each outcome's term of the KL divergence D(P || Q), in bits.

    Parameters
    ----------
    P, Q : mapping
        Outcome -> probability. Only the keys of ``Q`` are visited, so keys
        that appear in ``P`` alone are not part of the result.

    Returns
    -------
    dict
        Maps every key ``i`` of ``Q`` to ``P[i] * log2(P[i] / Q[i])``. A key
        missing from ``P``, or whose probability in ``P`` is exactly zero,
        maps to ``0`` (the ``0 * log 0 = 0`` convention) rather than raising a
        math domain error. Individual terms can be negative; the result is
        per-outcome terms, not a single summed divergence.

    Raises
    ------
    ZeroDivisionError
        If ``Q[i]`` is zero for a key whose probability in ``P`` is non-zero.
    """
    contributions = {}
    for i in Q:
        # Membership test first so a defaultdict P is not mutated by the lookup.
        p = P[i] if i in P else 0
        contributions[i] = p * math.log(p / Q[i], 2) if p != 0 else 0
    return contributions

In [26]:
def count_to_freq(counter):
    """Convert a mapping of counts into relative frequencies.

    Each value is divided by the sum of all values. An empty mapping yields an
    empty dict; a non-empty mapping whose values sum to zero raises
    ``ZeroDivisionError``.
    """
    total = sum(counter.values())
    freqs = {}
    for key, count in counter.items():
        freqs[key] = count / total
    return freqs

# Turn raw trigram counts into relative frequencies, for all posts and for
# admin-authored posts only.
trigram_freq, trigram_freq_admin = map(
    count_to_freq, (trigram_counts, trigram_counts_admin)
)

In [27]:
# Per-trigram divergence terms of the admin distribution (P) measured against
# the distribution over all posts (Q).
admin_diverg = KLD(P=trigram_freq_admin, Q=trigram_freq)

In [42]:
# Split the divergence terms by sign (zero counts as non-negative).
pos_admin_diverg = {}
neg_admin_diverg = {}
for trigram, term in admin_diverg.items():
    if term >= 0:
        pos_admin_diverg[trigram] = term
    elif term < 0:
        neg_admin_diverg[trigram] = term

# Keep the 20 most extreme trigrams on each side: largest terms for the
# non-negative group, most negative terms for the negative group.
n_top = 20
top_pos = sorted(pos_admin_diverg.items(), key=lambda item: item[1], reverse=True)[:n_top]
top_neg = sorted(neg_admin_diverg.items(), key=lambda item: item[1], reverse=False)[:n_top]

In [58]:
# Display the (trigram, divergence term) pairs with the largest non-negative terms.
top_pos

[(('NNP', 'NNP', ':'), 0.0010285035822041127),
 (('NNP', ':', 'NNP'), 0.0009084737661793094),
 (('PRP', 'MD', 'VB'), 0.0006989988217194979),
 ((':', 'NNP', 'NNP'), 0.0006894492687301764),
 (('IN', 'PRP', 'VBP'), 0.0005741059322565479),
 (('JJ', 'NNP', ':'), 0.0005573336781763727),
 ((',', 'CC', 'PRP'), 0.0004951565213662961),
 (('NN', '.', ':'), 0.00047544730252520935),
 (('NN', ',', 'CC'), 0.00043464868379632993),
 (('DT', 'NN', ','), 0.0004117405408922502),
 (('JJ', 'TO', 'VB'), 0.0003487158631102547),
 (('MD', 'RB', 'VB'), 0.000343727597046823),
 (('.', ':', ')'), 0.0003398520849684353),
 (('VB', 'DT', 'NN'), 0.00033411335315110887),
 ((',', 'PRP', 'MD'), 0.0003243493511141481),
 (("''", 'RB', ':'), 0.00030062116186188923),
 (('CC', "''", 'NN'), 0.0002957467715850177),
 (('PRP', 'VBP', 'JJ'), 0.00028854697393355396),
 (('VB', 'TO', 'VB'), 0.0002828513435158768),
 (('NN', 'TO', 'VB'), 0.00027865728932818963)]

In [81]:
def find_sub_list(sl,l):
    """Find every occurrence of ``sl`` as a contiguous run inside ``l``.

    Parameters
    ----------
    sl : list
        The sub-list to search for.
    l : list
        The list to search in. Slices of ``l`` are compared to ``sl`` with
        ``==``, so both should be the same sequence type.

    Returns
    -------
    list of (int, int)
        One ``(start, end)`` pair per match, with ``end`` inclusive.
        Overlapping matches are all reported. Empty when ``sl`` is empty or
        does not occur in ``l``.
    """
    sll = len(sl)
    if sll == 0:
        # An empty pattern has no first element to anchor on; report no matches
        # instead of failing on sl[0].
        return []

    first = sl[0]
    return [
        (start, start + sll - 1)
        for start, item in enumerate(l)
        # Cheap first-element check before the slice comparison.
        if item == first and l[start:start + sll] == sl
    ]

# One Counter of concrete token trigrams per selected POS trigram.
top_pos_instances = {trigram: Counter() for trigram, _term in top_pos}
top_neg_instances = {trigram: Counter() for trigram, _term in top_neg}

# For every post, collect the token triples that realise each top positive
# POS trigram.
for _, row in tqdm(df.iterrows()):
    row_trigrams, row_tags, row_tokens = row['pos_trigrams'], row['pos'], row['tokens']
    for pos_trigram, _ in top_pos:
        # The per-post Counter gives a cheap membership test before scanning.
        if pos_trigram not in row_trigrams:
            continue
        for start, end in find_sub_list(list(pos_trigram), row_tags):
            top_pos_instances[pos_trigram][tuple(row_tokens[start:end + 1])] += 1
            

360711it [03:07, 1920.43it/s]


In [99]:
# Start from empty counters so that re-running this cell does not add to the
# counts left over from a previous run.
top_neg_instances = {trigram: Counter() for trigram, _ in top_neg}

# For every post, collect the token triples that realise each top negative
# POS trigram.
for _, row in tqdm(df.iterrows()):
    for pos_trigram, _ in top_neg:
        # The per-post Counter gives a cheap membership test before scanning.
        if pos_trigram in row['pos_trigrams']:
            locs = find_sub_list(list(pos_trigram), row['pos'])
            # find_sub_list returns inclusive end indices, hence the +1.
            trigrams = [tuple(row['tokens'][loc[0]:loc[1]+1]) for loc in locs]
            top_neg_instances[pos_trigram].update(trigrams)

360711it [02:38, 2273.93it/s]


In [73]:
# Display the (trigram, divergence term) pairs with the largest non-negative terms.
top_pos

[(('NNP', 'NNP', ':'), 0.0010285035822041127),
 (('NNP', ':', 'NNP'), 0.0009084737661793094),
 (('PRP', 'MD', 'VB'), 0.0006989988217194979),
 ((':', 'NNP', 'NNP'), 0.0006894492687301764),
 (('IN', 'PRP', 'VBP'), 0.0005741059322565479),
 (('JJ', 'NNP', ':'), 0.0005573336781763727),
 ((',', 'CC', 'PRP'), 0.0004951565213662961),
 (('NN', '.', ':'), 0.00047544730252520935),
 (('NN', ',', 'CC'), 0.00043464868379632993),
 (('DT', 'NN', ','), 0.0004117405408922502),
 (('JJ', 'TO', 'VB'), 0.0003487158631102547),
 (('MD', 'RB', 'VB'), 0.000343727597046823),
 (('.', ':', ')'), 0.0003398520849684353),
 (('VB', 'DT', 'NN'), 0.00033411335315110887),
 ((',', 'PRP', 'MD'), 0.0003243493511141481),
 (("''", 'RB', ':'), 0.00030062116186188923),
 (('CC', "''", 'NN'), 0.0002957467715850177),
 (('PRP', 'VBP', 'JJ'), 0.00028854697393355396),
 (('VB', 'TO', 'VB'), 0.0002828513435158768),
 (('NN', 'TO', 'VB'), 0.00027865728932818963)]

In [100]:
# Display the (trigram, divergence term) pairs with the most negative terms.
top_neg

[(('NNP', 'NNP', 'NNP'), -0.001651809956463843),
 (('IN', 'DT', 'NNP'), -0.0005821594907861032),
 (('NN', 'IN', 'DT'), -0.0004659622837395458),
 (('DT', 'NNP', 'NNP'), -0.00045851082751109694),
 (('NNP', 'NNP', 'NN'), -0.0002932459541662096),
 (('NN', 'NN', 'NN'), -0.00028608374835630983),
 (('NN', 'IN', 'NNP'), -0.0002794303547490264),
 (('NNP', ',', 'NNP'), -0.00026695039716942426),
 (('DT', 'NNP', 'NN'), -0.0002532779692771846),
 (('NNP', 'IN', 'NNP'), -0.0002467470943616104),
 (('JJ', 'NN', 'IN'), -0.0002387135910615494),
 (('NNP', 'NNP', "''"), -0.00023710546938931676),
 (('DT', 'NN', 'IN'), -0.00021990033574099366),
 (('.', 'PRP', 'VBD'), -0.00021257057184801778),
 (('NN', 'NNP', 'NN'), -0.00021240816268725449),
 (('NN', "''", 'NN'), -0.00020503006122331288),
 (('NNP', 'NN', 'NN'), -0.00019832488982324953),
 (('MD', 'PRP', 'VB'), -0.00019395014865393237),
 (('NN', '.', 'PRP'), -0.00018226224980441553),
 (('NN', 'CC', 'NN'), -0.00017531880811265206)]

In [96]:
# For each top positive POS trigram, print its divergence term followed by its
# five most frequent token realisations with their relative frequencies.
for pos, kld in top_pos:
    print(pos, 'KLD:', kld)
    inst_freqs = count_to_freq(top_pos_instances[pos])
    top_inst = sorted(
        inst_freqs.items(),
        key=lambda item: item[1],
        reverse=True,
    )[:5]  # display top 5
    for inst, freq in top_inst:
        print('{}\t{}'.format(inst, freq))
    print()

('NNP', 'NNP', ':') KLD: 0.0010285035822041127
('[', 'WP', ':')	0.29529434799634513
('[', 'User', ':')	0.23066505678109908
('[', 'Wikipedia', ':')	0.21852564939302963
('[', 'Special', ':')	0.03468868293956402
(']', ']', '-')	0.02116238089022321

('NNP', ':', 'NNP') KLD: 0.0009084737661793094
('Wikipedia', ':', 'Criteria')	0.012962256957682043
('WP', ':', 'CSD')	0.00998163102623644
('WP', ':', 'RS')	0.00965237583613489
('Wikipedia', ':', 'Media')	0.009513742071881607
('WP', ':', 'V')	0.0085952933837036

('PRP', 'MD', 'VB') KLD: 0.0006989988217194979
('it', 'would', 'be')	0.02475512602847068
('I', "'d", 'like')	0.01418310043097819
('I', 'would', 'like')	0.012948935614470419
('it', 'should', 'be')	0.012831396108136345
('I', "'ll", 'be')	0.011864960167167298

(':', 'NNP', 'NNP') KLD: 0.0006894492687301764
(':', 'RS', ']')	0.008929514952974744
(':', 'V', ']')	0.007397231321990912
(':', 'Monotype', 'Corsiva')	0.007256331677762514
(':', 'BLP', ']')	0.007168269400119765
(':', 'Consensus|consen

In [101]:
# For each top negative POS trigram, print its divergence term followed by its
# five most frequent token realisations with their relative frequencies.
for pos, kld in top_neg:
    print(pos, 'KLD:', kld)
    inst_freqs = count_to_freq(top_neg_instances[pos])
    top_inst = sorted(
        inst_freqs.items(),
        key=lambda item: item[1],
        reverse=True,
    )[:5]  # display top 5
    for inst, freq in top_inst:
        print('{}\t{}'.format(inst, freq))
    print()

('NNP', 'NNP', 'NNP') KLD: -0.001651809956463843
('[', '[', 'WP')	0.04444797981489545
('[', '[', 'Wikipedia')	0.040635361891683694
('[', '[', 'User')	0.039535675609293185
('<', '\\/font', '>')	0.01489042212323185
(']', ']', '<')	0.01206584720501055

('IN', 'DT', 'NNP') KLD: -0.0005821594907861032
('on', 'the', '[')	0.03540337559557458
('at', 'the', '[')	0.02595493822175563
('of', 'the', '[')	0.02429136719696358
('in', 'the', '[')	0.02212710974723411
('for', 'the', '[')	0.008269401598966325

('NN', 'IN', 'DT') KLD: -0.0004659622837395458
('part', 'of', 'the')	0.012635285289618749
('look', 'at', 'the')	0.01004935207365292
('top', 'of', 'the')	0.0051487777425033915
('article', 'on', 'the')	0.00501601777829086
('end', 'of', 'the')	0.004733181332794597

('DT', 'NNP', 'NNP') KLD: -0.00045851082751109694
('the', '[', '[')	0.38940068315909326
('The', '[', '[')	0.006883345409377911
('the', 'Arbitration', 'Committee')	0.0043991305247903945
('the', 'United', 'Kingdom')	0.0040886036642169546
('the