In [12]:
from __future__ import print_function

This notebook assumes the corpus is located in `../data/corpus/` (relative to the notebook). The corpus can be downloaded using `get_corus.sh`.

In [13]:
import wiki
# Load the corpus from disk and flatten it into a sequence of utterances.
corpus = wiki.Corpus('../data/corpus/')
utts = corpus.get_utts()
# Distinct authors; utterances whose user_id is falsy are skipped.
users = set(u.user_id for u in utts if u.user_id)

Creating Users...
Loading conversations...


# User statistics:
- Eigenvector centrality (+ binarized centrality)
- avg. perplexity (of the user's utterances)
- total number of utterances
- Admin status
- Linguistic style counts: number of tokens, types, content words, and function words

In [14]:
# Column order of the per-user statistics table (reused as the CSV header).
user_stats_list = [
    'user_id',
    'eigen_central',
    'eigen_central_bin',
    'avg_perplexity',
    'n_utts',
    'admin_status',
    'n_content_words',
    'n_tokens',
    'n_types',
    'n_function_words',
]

# One record per user: every statistic starts as None except the user's own id.
user_stats = {}
for user in users:
    record = dict.fromkeys(user_stats_list)
    record['user_id'] = user
    user_stats[user] = record

### Eigenvector centrality

In [15]:
import networkx as nx
import numpy

# Build the network over the whole corpus and score every node in it.
network = corpus.generate_network()
centrality = nx.eigenvector_centrality_numpy(network, weight=None)

# Materialise the scores as a list before handing them to numpy: on Python 3
# ``dict.values()`` is a view object that numpy.mean/numpy.std cannot reduce.
centrality_scores = list(centrality.values())
mean = numpy.mean(centrality_scores)
stddev = numpy.std(centrality_scores)
# A user is flagged as "central" when more than one standard deviation above the mean.
central_threshold = mean + stddev

for user in users:
    # Users missing from the centrality result are given a centrality of 0.
    eigen = centrality.get(user, 0)
    user_stats[user]['eigen_central'] = eigen
    user_stats[user]['eigen_central_bin'] = eigen > central_threshold

Generating network from 389121 utterances...
There were 111264 replies to unknown users.
The unpruned network has  25813 nodes.
Pruning network to its largest component...
	 removed 34 users from 15 disconnected components.
Normalizing edge weights...


### Average Perplexity & Number of Utterances

First, we set up the KenLM n-gram language model for the corpus: 
- [Download and install](https://kheafield.com/code/kenlm/)
- [Paper](https://kheafield.com/papers/avenue/kenlm.pdf)
- [Python module](https://github.com/kpu/kenlm)
KenLM needs to be run from the command line to generate a language model object. KenLM expects to receive a corpus in [this](https://kheafield.com/code/kenlm/estimation/) format. The language model object can then be loaded into Python.

In [16]:
import kenlm
import os

# Path to the compiled KenLM estimation binary
lmplz = '~/kenlm/build/bin/lmplz'
# Scratch files: plain-text corpus fed to KenLM and the ARPA model it produces
corpus_file = '../data/lm_corpus.txt'
kenlm_file = '../data/lm.arpa'

# KenLM input: one utterance per line, tokens separated by single spaces.
# See corpus formatting notes: https://kheafield.com/code/kenlm/estimation/
utt_lines = (' '.join(utt.tokenized) for utt in utts)
kenlm_corpus = '\n'.join(utt_lines)
with open(corpus_file, 'w') as f:
    f.write(kenlm_corpus)

# Run KenLM through the shell (the shell handles `~` and the redirections).
# The command is echoed first; the exit status returned by os.system is the
# cell's displayed result and is not otherwise checked.
lmplz_command = '{0} -o 3 -S 20% <{1} >{2}'.format(lmplz, corpus_file, kenlm_file)
print(lmplz_command)
os.system(lmplz_command)

~/kenlm/build/bin/lmplz -o 3 -S 20% <../data/lm_corpus.txt >../data/lm.arpa


-1

In [17]:
from collections import defaultdict

# Load the language model that was estimated over the whole corpus.
lm = kenlm.Model(kenlm_file)

# user_id -> list with the perplexity of each of that user's utterances
# NOTE(review): the model file is written from the space-joined `tokenized`
# form, while scoring here uses `clean_text` - confirm the two are comparable.
utt_perplexity = defaultdict(list)
for utt in utts:
    score = lm.perplexity(utt.clean_text)
    utt_perplexity[utt.user_id].append(score)

for user in users:
    scores = utt_perplexity[user]
    user_stats[user]['avg_perplexity'] = numpy.mean(scores)
    user_stats[user]['n_utts'] = len(scores)

### Admin status

In [18]:
# Copy each user's admin flag; users with no entry in corpus.users are
# recorded as non-admins.
for user in users:
    is_admin = corpus.users[user].admin if user in corpus.users else False
    user_stats[user]['admin_status'] = is_admin

### Linguistic style features

In [19]:
import nltk
from nltk.stem.snowball import SnowballStemmer
from collections import Counter
from collections import defaultdict

stemmer = SnowballStemmer("english")

# marker name -> lines read from that marker's word file under wiki.FWORDS_DIR
markers = {}
for m in wiki.markers:
    markers[m] = wiki.get_lines(wiki.FWORDS_DIR + m + '.txt')


def _empty_marker_counts():
    """Return a fresh {marker: Counter} with every marker word preset to 0."""
    return {m: Counter({w: 0 for w in markers[m]}) for m in markers}


# user_id -> Counter of every token the user produced
type_counts = defaultdict(Counter)
# user_id -> marker -> Counter of that marker's words as used by the user
function_word_counts = defaultdict(_empty_marker_counts)
# user_id -> number of tokens that matched no marker list
content_word_count = Counter()

for idx, utt in enumerate(utts):
    # Lightweight progress indicator, overwritten in place every 10000 utterances.
    if idx % 10000 == 0:
        print(idx, end = '\r')
    author = utt.user_id
    type_counts[author].update(utt.tokenized)
    for token in utt.tokenized:
        # A token is counted once for every marker list that contains it.
        matching_markers = [m for m in markers if token in markers[m]]
        for m in matching_markers:
            function_word_counts[author][m][token] += 1
        if not matching_markers:
            content_word_count[author] += 1

# marker -> Counter summed over all users (Counter addition keeps only positive counts)
overall_function_word_counts = {}
for m in markers:
    per_user_counters = [function_word_counts[user][m] for user in users]
    overall_function_word_counts[m] = sum(per_user_counters, Counter())

for user in users:
    stats = user_stats[user]
    marker_counts = function_word_counts[user]
    stats['n_types'] = len(type_counts[user])
    stats['n_tokens'] = sum(type_counts[user].values())
    stats['n_content_words'] = content_word_count[user]
    stats['n_function_words'] = sum(sum(marker_counts[f].values()) for f in marker_counts)
    # TODO: calculate function word distribution typicality for each marker

380000

### Write out collected data

In [20]:
import csv

# Write one CSV row per user, with columns in user_stats_list order.
with open('../data/user_stats.csv', 'w') as out_file:
    writer = csv.DictWriter(out_file, fieldnames=user_stats_list)
    writer.writeheader()
    for row in user_stats.values():
        writer.writerow(row)

# Temporal utterance statistics
- perplexity based on LM from that year
- perplexity based on overall corpus LM
- user Admin status at time of utterance
- user network centrality at time of utterance
- date of utt
- days since the user's first utt

In [21]:
import wiki

# Reload the corpus so this section can be run on its own.
corpus = wiki.Corpus('../data/corpus/')
utts = corpus.get_utts()
# Distinct authors; utterances whose user_id is falsy are skipped.
users = set(u.user_id for u in utts if u.user_id)

Creating Users...
Loading conversations...


### Build timeboxed user networks

In [22]:
import networkx as nx
import numpy
import datetime

# year -> {user_id: eigenvector centrality in that year's network}
eigen_centr = {year: {} for year in range(2006, 2012)}
# year -> {user_id: whether that centrality exceeds mean + one standard deviation}
eigen_centr_bin = {year: {} for year in range(2006, 2012)}

for year in range(2006, 2012):

    start_date = datetime.datetime(year, 1, 1)
    # NOTE(review): this is midnight at the *start* of Dec 31. Whether utterances
    # made during Dec 31 are included depends on how generate_network compares
    # against end_date - confirm.
    end_date = datetime.datetime(year, 12, 31)

    network = corpus.generate_network(start_date=start_date, end_date=end_date)
    centrality = nx.eigenvector_centrality_numpy(network, weight=None)

    # Materialise the scores as a list before handing them to numpy: on Python 3
    # ``dict.values()`` is a view object that numpy.mean/numpy.std cannot reduce.
    centrality_scores = list(centrality.values())
    mean = numpy.mean(centrality_scores)
    stddev = numpy.std(centrality_scores)
    # Computed once per year instead of once per user.
    central_threshold = mean + stddev

    # Users missing from this year's centrality result are given a centrality of 0.
    eigen_centr[year] = {user: centrality.get(user, 0) for user in users}
    eigen_centr_bin[year] = {user: eigen_centr[year][user] > central_threshold for user in users}

Generating network from 46191 utterances...
There were 15474 replies to unknown users.
The unpruned network has  5883 nodes.
Pruning network to its largest component...
	 removed 143 users from 64 disconnected components.
Normalizing edge weights...
Generating network from 58586 utterances...
There were 19237 replies to unknown users.
The unpruned network has  7024 nodes.
Pruning network to its largest component...
	 removed 170 users from 71 disconnected components.
Normalizing edge weights...
Generating network from 56579 utterances...
There were 17440 replies to unknown users.
The unpruned network has  6962 nodes.
Pruning network to its largest component...
	 removed 213 users from 99 disconnected components.
Normalizing edge weights...
Generating network from 54344 utterances...
There were 15728 replies to unknown users.
The unpruned network has  6998 nodes.
Pruning network to its largest component...
	 removed 293 users from 128 disconnected components.
Normalizing edge weights...

### Building timeboxed language models

In [23]:
import kenlm
import os

# year -> KenLM model estimated from that year's utterances only
lm = {}

# Loop-invariant settings: the KenLM binary and the scratch corpus file
# (the corpus file is overwritten for every year).
lmplz = '~/kenlm/build/bin/lmplz'
corpus_file = '../data/lm_corpus.txt'

for year in range(2006, 2012):
    # Each year's model is written to its own ARPA file.
    kenlm_file = '../data/lm_{0}.arpa'.format(year)

    # Format the corpus for KenLM. See corpus formatting notes: https://kheafield.com/code/kenlm/estimation/
    # Utterances without a timestamp cannot be assigned to a year, so they are
    # skipped here (the same guard the per-utterance statistics loop applies)
    # instead of failing on `None.year`.
    year_utts = (utt for utt in utts if utt.timestamp and utt.timestamp.year == year)
    kenlm_corpus = '\n'.join(' '.join(utt.tokenized) for utt in year_utts)
    with open(corpus_file, 'w') as f:
        f.write(kenlm_corpus)

    # use KenLM to create the n-gram language model; the command is built once,
    # echoed, then run through the shell.
    lmplz_command = '{0} -o 3 -S 20% <{1} >{2}'.format(lmplz, corpus_file, kenlm_file)
    print(lmplz_command)
    os.system(lmplz_command)

    lm[year] = kenlm.Model(kenlm_file)

# Model over the whole corpus, read from the ARPA file at '../data/lm.arpa'.
lm_all_years = kenlm.Model('../data/lm.arpa')


~/kenlm/build/bin/lmplz -o 3 -S 20% <../data/lm_corpus.txt >../data/lm_2006.arpa
~/kenlm/build/bin/lmplz -o 3 -S 20% <../data/lm_corpus.txt >../data/lm_2007.arpa
~/kenlm/build/bin/lmplz -o 3 -S 20% <../data/lm_corpus.txt >../data/lm_2008.arpa
~/kenlm/build/bin/lmplz -o 3 -S 20% <../data/lm_corpus.txt >../data/lm_2009.arpa
~/kenlm/build/bin/lmplz -o 3 -S 20% <../data/lm_corpus.txt >../data/lm_2010.arpa
~/kenlm/build/bin/lmplz -o 3 -S 20% <../data/lm_corpus.txt >../data/lm_2011.arpa


In [24]:
import bisect

# Column order of the per-utterance statistics table (reused as the CSV header).
utt_stats_list = ['utt_id', 'timestamp', 'perplexity_overall', 'perplexity_year',
                  'admin_status', 'eigen_central', 'eigen_central_bin',
                  'utt_rank', 'days_since_first', 'utt_length']
# utt_id -> record with one value per column in utt_stats_list
utt_stats = {}

# Inclusive time window. max_date is the last representable instant of 2011 so
# that utterances made during Dec 31, 2011 are kept (a bare
# datetime(2011, 12, 31) is midnight at the start of that day and would drop
# them, even though the 2011 language model is built from them).
max_date = datetime.datetime(2011, 12, 31, 23, 59, 59, 999999)
min_date = datetime.datetime(2006, 1, 1)

# user_id -> sorted timestamps of that user's utterances, built on first use so
# rank / first-utterance lookups do not rescan the user's history every time.
user_timestamps = {}

for utt in utts:

    # Skip utterances with no timestamp or outside the 2006-2011 window.
    if not utt.timestamp or utt.timestamp < min_date or utt.timestamp > max_date:
        continue

    utt_id = utt.utt_id
    user_id = utt.user_id
    year = utt.timestamp.year
    known_user = user_id in corpus.users

    # Admin status at the time of the utterance: an admin with no recorded
    # ascension date is treated as having been an admin all along.
    if known_user and corpus.users[user_id].admin:
        ascention = corpus.users[user_id].admin_ascention
        admin_status = not ascention or ascention <= utt.timestamp
    else:
        admin_status = False

    if known_user:
        if user_id not in user_timestamps:
            user_timestamps[user_id] = sorted(
                other.timestamp for other in corpus.users[user_id].utts if other.timestamp)
        timestamps = user_timestamps[user_id]
        # Number of the user's utterances with a strictly earlier timestamp.
        utt_rank = bisect.bisect_left(timestamps, utt.timestamp)
        # Fall back to this utterance's own timestamp if the user has no
        # timestamped utterances on record.
        first_timestamp = timestamps[0] if timestamps else utt.timestamp
        days_since_first = (utt.timestamp - first_timestamp).days
    else:
        utt_rank = 0
        days_since_first = 0

    utt_stats[utt_id] = {
        'utt_id': utt_id,
        'timestamp': utt.timestamp.isoformat(),
        'perplexity_overall': lm_all_years.perplexity(utt.clean_text),
        'perplexity_year': lm[year].perplexity(utt.clean_text),
        'admin_status': admin_status,
        # Users without a centrality entry for this year default to 0 / False.
        'eigen_central': eigen_centr[year].get(user_id, 0),
        'eigen_central_bin': eigen_centr_bin[year].get(user_id, False),
        'utt_rank': utt_rank,
        'days_since_first': days_since_first,
        # Length in whitespace-separated tokens of the cleaned text.
        'utt_length': len(utt.clean_text.split()),
    }

In [25]:
import csv

# Write one CSV row per utterance, with columns in utt_stats_list order.
with open('../data/utt_stats.csv', 'w') as out_file:
    writer = csv.DictWriter(out_file, fieldnames=utt_stats_list)
    writer.writeheader()
    for row in utt_stats.values():
        writer.writerow(row)