In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
from datetime import datetime
import nltk
import os


# Field separator used when splitting each line of the conversations file.
DELIM         = " +++$+++ "

# Raw corpus file names; they are concatenated onto CORPUS_DIR when opened.
CONVO_FILE    = "wikipedia.talkpages.conversations.txt"
USERS_FILE    = "wikipedia.talkpages.userinfo.txt"
# Fixed: this name used to contain a stray "~" ("wikipedia.ta~lkpages..."),
# unlike the two sibling corpus file names above.
ADMINS_FILE   = "wikipedia.talkpages.admins.txt"

# File names for derived artifacts (posts_df.pickle is written and re-read
# further down; the others are presumably produced elsewhere - verify).
POSTS_DF_FILE = "posts_df.pickle"
USERS_DF_FILE = "users_df.pickle"
POSTS_CSV     = "posts.csv"
USERS_CSV     = "users.csv"
NETWORK_FILE  = "users_network.pickle"

# Directories. Paths are built by plain string concatenation, so the trailing
# slash is required.
CORPUS_DIR    = "../data/wiki/"
FWORDS_DIR    = '../data/function words/'

# Function-word categories. Each name doubles as the stem of the word-list
# file `<FWORDS_DIR><name>.txt` and as a boolean column added to `posts`.
function_words = ['conjunctions', 'articles', 'prepositions', 'adverbs', 'quantifiers',
                  'impersonal_pronouns', 'personal_pronouns', 'auxiliary_verbs']

## Create the `posts` dataframe

In [None]:
# Field names, in the order the fields appear on each DELIM-separated line of
# the conversations file.
columns = ['utterance_id', 'user', 'talkpage_user', 'conversation_root', 'reply_to',
           'timestamp', 'timestamp_unixtime', 'clean_text', 'raw_text']

# Collect one list per column and build the DataFrame once at the end.
post_columns = {column: [] for column in columns}
with open(CORPUS_DIR + CONVO_FILE) as f:
    # readlines() materialises the file so tqdm can show a total.
    for raw_line in tqdm(f.readlines()):

        # skip blank lines and the "could not match ..." lines in the file
        if raw_line.startswith("could not match") or raw_line.strip() == "":
            continue
        fields = raw_line.rstrip('\n').split(DELIM)
        assert len(fields) == len(columns), \
            "expected %d fields but found %d in line %r" % (len(columns), len(fields), raw_line)
        record = dict(zip(columns, fields))

        # convert timestamps to datetime objects; unparseable values become None
        try:
            record['timestamp'] = datetime.strptime(record['timestamp'], "%Y-%m-%d %H:%M:%S")
        except ValueError:
            record['timestamp'] = None

        for column, value in record.items():
            post_columns[column].append(value)

# No `dtype=str` here: depending on the pandas version it casts every column,
# including the datetimes parsed above, back to strings, which silently undoes
# the timestamp conversion. The text fields are already Python strings.
posts = pd.DataFrame(data=post_columns, index=post_columns['utterance_id'], columns=columns)
# Make the dtype explicit (missing timestamps become NaT) even when no
# timestamp could be parsed at all.
posts['timestamp'] = pd.to_datetime(posts['timestamp'])

# Add a `tokens` column holding the NLTK word tokens of each post's
# `clean_text`, unless the frame already has one.
if 'tokens' not in posts.columns:
    tokenized = []
    for text in tqdm(posts['clean_text']):
        tokenized.append(nltk.tokenize.word_tokenize(text))
    posts = posts.assign(tokens=tokenized)
    
# Flag, for every post, which function-word categories it uses.

# Load one word list per category from `<FWORDS_DIR><category>.txt`
# (one entry per line, trailing newline removed).
markers = {}
for feature in function_words:
    with open(FWORDS_DIR + feature + '.txt') as f:
        markers[feature] = [word.rstrip('\n') for word in f.readlines()]

# Set copies of the word lists: the membership test below becomes a hash
# lookup instead of a scan of the whole list for every token.
marker_sets = {feature: set(words) for feature, words in markers.items()}

# feature_columns[m][i] is True iff post i has at least one token whose
# lower-cased form is in category m's word list.
feature_columns = {m: [False] * len(posts) for m in function_words}
for i, tokens in enumerate(tqdm(posts['tokens'])):
    # Lower-case each post once rather than once per category.
    lowered = {t.lower() for t in tokens}
    for m in function_words:
        if not lowered.isdisjoint(marker_sets[m]):
            feature_columns[m][i] = True

# One boolean column per category, named after the category.
posts = posts.assign(**feature_columns)

# Persist the finished `posts` frame so it can be reloaded later instead of
# being rebuilt from the raw corpus.
posts_pickle_path = CORPUS_DIR + POSTS_DF_FILE
posts.to_pickle(posts_pickle_path)

## ... or load already-saved posts

In [None]:
# Reload the `posts` frame from the pickle saved by the previous section.
# Unpickling can execute arbitrary code: only load files you produced yourself.
posts_pickle_path = CORPUS_DIR + POSTS_DF_FILE
posts = pd.read_pickle(posts_pickle_path)

## Merge posts into reply pairs

In [None]:
# Self-join of `posts`: the left copy (`_a` columns) is matched through its
# index against the right copy's `reply_to` column (`_b` columns), so each row
# pairs a post with one post that replies to it. The inner join drops posts
# that have no match.
pairs = posts.merge(
    posts,
    how='inner',
    left_index=True,
    right_on='reply_to',
    suffixes=('_a', '_b'),
)
# TODO: filter out empty users & self-replies

In [None]:
# For every function-word category add a column holding, per reply pair, the
# tuple (flag of post a, flag of post b).
pairs = pairs.assign(**{
    m: list(zip(pairs[m + '_a'], pairs[m + '_b']))
    for m in function_words
})

In [None]:
# Index every reply pair by (id of the replying post, user a, user b) and keep
# only the per-category tuple columns.
pair_index = ['utterance_id_b', 'user_a', 'user_b']
pairs = pairs.set_index(pair_index).loc[:, function_words]

In [None]:
# Pivot the innermost index level (`user_b`) into the columns.
df = pairs.unstack(level=-1)

In [None]:
# Preview the first 30 rows of the pivoted frame.
df.head(n=30)

In [None]:
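# Disabled alternative way of building `df`: group sizes per (user_a, user_b, marker usage).
# NOTE: `marker_usage` is not defined in any cell above, so this line cannot run as written.
#df = pairs.groupby(['user_a', 'user_b'] + marker_usage).size().unstack(fill_value=0)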

In [None]:
# Assemble the input dictionary passed to the Stan model.
#
# NOTE(review): the lookups below require `df` to have columns labelled by
# (a, b) boolean tuples. Confirm that the `df` built earlier in the notebook
# really has that layout before trusting these numbers.
both_use     = df[(True, True)]
only_a_uses  = df[(True, False)]
only_b_uses  = df[(False, True)]
neither_uses = df[(False, False)]

num_markers = len(function_words)
num_observations = len(df)
# Every observation is assigned marker type 1.
marker_type = np.array([1] * num_observations)

# Totals split by the first tuple element (a's flag) ...
num_utterances_ab = (both_use + only_a_uses).values
num_utterances_not_ab = (only_b_uses + neither_uses).values
# ... and, within each total, the part where the second element (b's flag) is True.
counts_ab = both_use.values
counts_not_ab = only_b_uses.values

stddev = .25

data = dict(
    NumMarkers=num_markers,
    NumObservations=num_observations,
    MarkerType=marker_type,
    NumUtterancesAB=num_utterances_ab,
    NumUtterancesNotAB=num_utterances_not_ab,
    CountsAB=counts_ab,
    CountsNotAB=counts_not_ab,
    StdDev=stddev,
)

In [None]:
import pystan

# Fixed seed so that re-running the notebook draws the same samples; without
# it every run of this cell produces a different fit.
SAMPLING_SEED = 42

# Build the model from the Stan program on disk (verbose output enabled).
sm = pystan.StanModel(file='../../disc_align/models/alignment.cauchy.nosubpop.stan', verbose=True)
# Short single-chain run: 200 iterations on the `data` dict assembled above.
fit = sm.sampling(data=data, iter=200, chains=1, seed=SAMPLING_SEED)

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Draw the fit object's default plot.
fit.plot()