In [24]:
from __future__ import print_function

This script assumes the corpus is located in `/data/corpus/`. The corpus can be downloaded using `get_corus.sh`.

In [25]:
import wiki
corpus = wiki.Corpus('../data/corpus/')
utts = corpus.get_utts()
users = {utt.user_id for utt in utts if utt.user_id}

Creating Users...
Loading conversations...


# User statistics:
- Eigenvector centrality (+ binarized centrality)
- avg. perplexity (of the user's utterances)
- total number of utterances
- Admin status

In [34]:
user_stats_list = ['user_id', 'eigen_central', 'eigen_central_bin', 'avg_perplexity', 'n_utts', 'admin_status']
user_stats = {user: {stat: None for stat in user_stats_list} for user in users}
for user in user_stats:
    user_stats[user]['user_id'] = user

### Eigenvector Centrality

In [35]:
import networkx as nx
import numpy

network = corpus.generate_network()
centrality = nx.eigenvector_centrality_numpy(network, weight=None)

mean = numpy.mean(centrality.values())
stddev = numpy.std(centrality.values())

for user in users:
    eigen = centrality[user] if user in centrality else 0
    user_stats[user]['eigen_central'] = eigen
    user_stats[user]['eigen_central_bin'] = eigen > mean + stddev

Generating network from 389121 utterances...
There were 111264 replies to unknown users.
The unpruned network has  25813 nodes.
Pruning network to its largest component...
	 removed 34 users from 15 disconnected components.
Normalizing edge weights...
max weight 767


### Average Perplexity & Number of Utterances

First, we set up the KenLM n-gram language model for the corpus: 
- [Download and install](https://kheafield.com/code/kenlm/)
- [Paper](https://kheafield.com/papers/avenue/kenlm.pdf)
- [Python module](https://github.com/kpu/kenlm)
KenLM needs to be run from the command line to generate a language model object. KenLM expects to receive a corpus in [this](https://kheafield.com/code/kenlm/estimation/) format. The language model object can then be loaded into Python.

In [44]:
import kenlm
import os

# Location of the compiled KenLM utility
lmplz = '~/kenlm/build/bin/lmplz'
# Location to store temporary input and output files for KenLM
corpus_file = '../data/lm_corpus.txt'
kenlm_file = '../data/lm.arpa'

# Format the corpus for KenLM. See corpus formatting notes: https://kheafield.com/code/kenlm/estimation/
kenlm_corpus = '\n'.join(' '.join(b.tokenized) for b in utts)
with open(corpus_file, 'w') as f:
    f.write(kenlm_corpus)
# use KenLM to create the n-gram language model
print('{0} -o 3 -S 20% <{1} >{2}'.format(lmplz, corpus_file, kenlm_file))
os.system('{0} -o 3 -S 20% <{1} >{2}'.format(lmplz, corpus_file, kenlm_file))

~/kenlm/build/bin/lmplz -o 3 -S 20% <../data/lm_corpus.txt >../data/lm.arpa


-1

In [56]:
from collections import defaultdict

lm = kenlm.Model(kenlm_file)
utt_perplexity = defaultdict(list)

for utt in utts:
    utt_perplexity[utt.user_id].append(lm.perplexity(utt.clean_text))
    
for user in users:
    user_stats[user]['avg_perplexity'] = numpy.mean(utt_perplexity[user])
    user_stats[user]['n_utts'] = len(utt_perplexity[user])

### Admin Status

In [46]:
for user in users:
    if user in corpus.users:
        user_stats[user]['admin_status'] = corpus.users[user].admin
    else:
        user_stats[user]['admin_status'] = False

### Write out collected data

In [57]:
import csv

with open('../data/user_stats.csv', 'w') as f:
    writer = csv.DictWriter(f, user_stats_list)
    writer.writeheader()
    writer.writerows(user_stats.values())

# Temporal Utterance Statistics
- perplexity of the utterance (aggregate LM / time period LM)
- user Admin status
- user network centrality
- date of utt
- months since first utt

In [None]:
import wiki
import csv

corpus = wiki.Corpus('../data/corpus/')
utts = corpus.get_utts()

with open ('../data/user_statistics.csv') as f: