In [None]:
import pystan
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
from datetime import datetime
import nltk
import os


# Separator between the fields of one record in the corpus files.
DELIM         = " +++$+++ "

# Corpus input files. The admins file name previously contained a stray "~";
# it now follows the same "wikipedia.talkpages.*" pattern as its siblings.
CONVO_FILE    = "wikipedia.talkpages.conversations.txt"
USERS_FILE    = "wikipedia.talkpages.userinfo.txt"
ADMINS_FILE   = "wikipedia.talkpages.admins.txt"

# Names of derived artifacts.
POSTS_DF_FILE = "posts_df.pickle"
USERS_DF_FILE = "users_df.pickle"
POSTS_CSV     = "posts.csv"
USERS_CSV     = "users.csv"
NETWORK_FILE  = "users_network.pickle"

# Directories keep their trailing slash: file names are appended to them by
# plain string concatenation.
CORPUS_DIR    = "../data/wiki/"
FWORDS_DIR    = '../data/function words/'

# Function-word categories used as markers; each one is read from
# FWORDS_DIR + <category> + '.txt'.
function_words = ['conjunctions', 'articles', 'prepositions', 'adverbs', 'quantifiers',
                  'impersonal_pronouns', 'personal_pronouns', 'auxiliary_verbs']

## Create the `posts` dataframe

In [None]:
# Names of the fields in one record of the conversations file, in the order
# they appear between DELIM separators.
columns = ['utterance_id', 'user', 'talkpage_user', 'conversation_root', 'reply_to',
           'timestamp', 'timestamp_unixtime', 'clean_text', 'raw_text']

# Collect the parsed records column-wise so the dataframe is built in one step.
post_columns = {column: [] for column in columns}
with open(CORPUS_DIR + CONVO_FILE) as f:
    for raw_line in tqdm(f.readlines()):

        # skip blank lines and the "could not match" lines present in the file
        if raw_line.startswith("could not match") or raw_line.strip() == "":
            continue
        fields = raw_line.rstrip('\n').split(DELIM)
        assert len(fields) == len(columns), \
            "expected %d fields but found %d in %r" % (len(columns), len(fields), raw_line)
        record = dict(zip(columns, fields))

        # convert timestamps to datetime objects; unparseable ones become None
        try:
            record['timestamp'] = datetime.strptime(record['timestamp'], "%Y-%m-%d %H:%M:%S")
        except ValueError:
            record['timestamp'] = None

        for column, value in record.items():
            post_columns[column].append(value)

# The frame is indexed by utterance id.
# NOTE(review): dtype=str is requested for every column, including the parsed
# timestamps -- confirm the resulting 'timestamp' values are what is wanted.
posts = pd.DataFrame(data=post_columns, index=post_columns['utterance_id'], columns=columns, dtype=str)

# tokenize the post content
if 'tokens' not in posts.columns:
    tokens = [nltk.tokenize.word_tokenize(text) for text in tqdm(posts['clean_text'])]
    posts = posts.assign(tokens=tokens)

# look for markers: load one word list per function-word category
markers = {}
for feature in function_words:
    with open(FWORDS_DIR + feature + '.txt') as f:
        markers[feature] = [word.rstrip('\n') for word in f.readlines()]

# Set copies of the word lists: membership tests against a list rescan the
# whole list for every token of every post.
marker_sets = {feature: set(words) for feature, words in markers.items()}

# One boolean column per category: True when at least one lower-cased token of
# the post appears in that category's word list.
feature_columns = {m: [False] * len(posts) for m in function_words}
for i, post_tokens in enumerate(tqdm(posts['tokens'])):
    lowered = {t.lower() for t in post_tokens}  # lower-case each token once per post
    for m in function_words:
        if not lowered.isdisjoint(marker_sets[m]):
            feature_columns[m][i] = True

posts = posts.assign(**feature_columns)

# save the dataframe

pd.to_pickle(posts, CORPUS_DIR + POSTS_DF_FILE)

## ... or load already-saved posts

In [None]:
# Alternative to the cell above: reload the dataframe that cell pickled to
# CORPUS_DIR + POSTS_DF_FILE. Unpickling can execute arbitrary code, so only
# load a pickle file you produced yourself.
posts = pd.read_pickle(CORPUS_DIR + POSTS_DF_FILE)

## Merge posts into reply pairs

In [None]:
# Self-join posts into reply pairs: the `_a` columns come from the post whose
# index label equals the `reply_to` value of the `_b` post, i.e. `_b` is the
# reply and `_a` is the post being replied to.
# NOTE(review): assumes `posts` is indexed by utterance id, as in the cell that builds it.
pairs = pd.merge(posts, posts, how='inner', left_index=True, right_on='reply_to', suffixes=['_a', '_b'])

# filter out empty users & self-replies
# The non-empty checks are spelled out rather than AND-ing the raw string
# columns into the mask, which depended on implicit string-to-bool coercion.
has_user_a = pairs.user_a.notna() & (pairs.user_a != "")
has_user_b = pairs.user_b.notna() & (pairs.user_b != "")
pairs = pairs[(pairs.user_a != pairs.user_b) & has_user_a & has_user_b]

## Format the input data for Stan

In [None]:
# merge the marker usage columns for the reply pair
for m in function_words:
    pairs[m] = list(zip(pairs[m+'_a'], pairs[m+'_b']))

# reshape
df = pd.melt(pairs, id_vars = ['user_a', 'user_b', 'utterance_id_b'], value_vars=function_words, var_name='marker')

# change the marker labels to indices Stan will like
marker_idx = {m:i+1 for i,m in enumerate(function_words)}
df['marker'] = df['marker'].apply(lambda x: marker_idx[x])

# reshape again
df = df.pivot_table(index=['user_a', 'user_b', 'marker'], columns='value', aggfunc='size', fill_value=0)
df = df.reset_index()

# df = df.sample(50000)
print(len(df))

In [None]:

# Per-row counts of reply pairs, keyed by the flag tuple
# (marker present in post a, marker present in reply b).
n_a_and_b     = df[(True, True)]
n_a_not_b     = df[(True, False)]
n_b_not_a     = df[(False, True)]
n_neither     = df[(False, False)]

# Input dictionary handed to the Stan model.
data = dict(
    NumMarkers=len(function_words),
    NumObservations=len(df),
    MarkerType=df.marker.values,
    # pairs in which post a contains the marker / does not contain it
    NumUtterancesAB=(n_a_and_b + n_a_not_b).values,
    NumUtterancesNotAB=(n_b_not_a + n_neither).values,
    # of those, the pairs in which reply b contains the marker
    CountsAB=n_a_and_b.values,
    CountsNotAB=n_b_not_a.values,
    StdDev=.25,
)

### Compile the Stan model

In [None]:
# Build the model object from the Stan program file, with verbose output enabled.
sm = pystan.StanModel(
    file='alignment.cauchy.nosubpop.stan',
    verbose=True,
)

### Fit the Stan model to the data
Save the parameter `eta_ab_pop`.

In [None]:
import time

# Fixed seed so that repeated runs of this cell draw the same samples.
SEED = 42

# perf_counter is a monotonic clock, so the measured duration is not affected
# by system clock adjustments.
start = time.perf_counter()

# Sample from the compiled model; only `eta_ab_pop` is kept in the fit.
fit = sm.sampling(data=data, iter=200, pars=['eta_ab_pop'], chains=4, seed=SEED)

end = time.perf_counter()
# elapsed sampling time in seconds
print(end - start)

In [None]:
# Render matplotlib figures inside the notebook, then plot the fit.
%matplotlib inline
fit.plot()

In [None]:
# Show the fit's summary as this cell's output.
fit.summary()