# Imports 


In [3]:
import os
import sys
import numpy as np
import csv as csv
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import math
import json
import time
from datetime import datetime
from termcolor import colored
from sklearn.model_selection import train_test_split
import random
import seaborn as sns
import uuid


from nltk.util import bigrams, ngrams
from nltk.lm.preprocessing import pad_both_ends, flatten, padded_everygram_pipeline
from nltk.tokenize import word_tokenize


# Every relative path used further down (data/..., ../data/srilm_data/, ../srilms_LMs/)
# is resolved against this working directory.
os.chdir('/homes/gws/taugust/ARK/community_guidelines/')

# Echo the working directory so the notebook output confirms the chdir took effect.
print(os.getcwd())


/homes/gws/taugust/ARK/community_guidelines


In [None]:
##################################################################
# redefining alt functions that don't rely on month
##################################################################



    






    
    

# Utility functions for SLM building 
Taken from lib/SLM_building.py and added here so that all the functions for SLM building are in the same place;
the lib/ file is in the process of being phased out.


In [7]:
# Calendar months 1-12. The functions below stringify these values to address
# the per-month count columns of the author-count dataframes.
MONTHS = list(range(1,13))

##################################################################
# randomly sampling 200 users  
# defined as users with at least 5 in the respective community and month.
# Kind defines if you are looking at comments or posts, if None then using both

# TODO: might not want to reset the author_df, maybe make a copy?
##################################################################
# def get_active_users(author_df, month, author_col, threshold=5, num_authors=200, kind=None):
#     if kind:
#         author_df = author_df[author_df['kind'] == kind]
#     if num_authors: 
#         print(len(author_df[author_df[month] > threshold]))
#         return author_df[author_df[month] > threshold].drop_duplicates().sample(num_authors)[author_col]
#     else: 
#         return author_df[author_df[month] > threshold].drop_duplicates()[author_col]


# if no month, assume you want the entire df and set threshold for total counts
def get_active_users(author_df, month, author_col, threshold=5, num_authors=200, kind=None):
    """Select (and optionally sample) authors considered active.

    Args:
        author_df: dataframe of per-author counts; must contain a 'kind'
            column when `kind` is given, and one column per month named by
            the month number as a string.
        month: month whose count column is inspected, or None to use the
            sum over all months in MONTHS.
        author_col: name of the column holding the author identifier.
        threshold: activity cut-off. With a month the count must be strictly
            greater than this value; without a month the yearly total must be
            greater than or equal to it.
        num_authors: how many authors to sample at random; a falsy value
            returns every qualifying author.
        kind: optional value of the 'kind' column to restrict to.

    Returns:
        The `author_col` column of the qualifying (optionally sampled) rows.
    """
    candidates = author_df if not kind else author_df[author_df['kind'] == kind]

    if month is None:
        print('No month...taking all author counts')
        month_cols = [str(m) for m in MONTHS]
        yearly_totals = candidates[month_cols].sum(axis=1)
        candidates = candidates[yearly_totals >= threshold]
    else:
        monthly_counts = candidates[str(month)]
        # Duplicate rows are only dropped on the per-month path.
        candidates = candidates[monthly_counts > threshold].drop_duplicates()

    if not num_authors:
        return candidates[author_col]
    return candidates.sample(num_authors)[author_col]
    
##################################################################
# getting outsiders -- users who only ever posted once in a community --
# but are still active in Reddit in general  (TODO)
##################################################################
# def get_outside_users(author_df, month, author_col, threshold=1, num_authors=None, kind=None):
#     if kind:
#         author_df = author_df[author_df['kind'] == kind]
#     outside_users = author_df[author_df[[str(m) for m in MONTHS]].sum(axis=1) <= threshold]
#     if num_authors:
#         return outside_users[outside_users[month] == threshold][author_col].sample(num_authors)
#     else: 
#         return outside_users[outside_users[month] == threshold][author_col]

def get_outside_users(author_df, month, author_col, threshold=1, num_authors=None, kind=None):
    """Select (and optionally sample) authors considered outsiders.

    An outsider is an author whose count summed over all months in MONTHS is
    at most `threshold`. When `month` is given, the author's count in that
    month must additionally equal `threshold` exactly.

    Args:
        author_df: dataframe of per-author counts with one string-named
            column per month, plus a 'kind' column when `kind` is given.
        month: month to restrict to, or None for no per-month restriction.
        author_col: name of the column holding the author identifier.
        threshold: maximum yearly total (and exact monthly count).
        num_authors: how many authors to sample at random; a falsy value
            returns every qualifying author.
        kind: optional value of the 'kind' column to restrict to.

    Returns:
        The `author_col` values of the qualifying (optionally sampled) rows.
    """
    pool = author_df[author_df['kind'] == kind] if kind else author_df

    yearly_totals = pool[[str(m) for m in MONTHS]].sum(axis=1)
    outsiders = pool[yearly_totals <= threshold]

    if month is None:
        print('No month...taking all author counts')    
    else:
        outsiders = outsiders[outsiders[str(month)] == threshold]

    names = outsiders[author_col]
    return names.sample(num_authors) if num_authors else names
    
##################################################################   
# For each of these 200 active users
# select 5 random 10-word spans from 5 unique comments
# ASSUMING: from that month
# ASSUMING: this is one 10 word span from each comment, not 5 for each comment
# NOTE: Not using this currently, mostly because using later spans actually might not be good 
# -- people tend to get more esoteric the longer they go on
##################################################################
def get_random_span(text, length):
    """Return a random contiguous span of `length` tokens from `text`.

    Args:
        text: raw string; it is tokenized with nltk's word_tokenize.
        length: number of consecutive tokens to return.

    Returns:
        A list of `length` tokens starting at a uniformly random offset.

    Raises:
        IndexError: if `text` tokenizes to fewer than `length` tokens.
    """
    tokens = list(word_tokenize(text))
    if len(tokens) < length:
        # Checked up front instead of wrapping everything in a bare except,
        # so unrelated failures are no longer disguised as IndexError.
        raise IndexError(
            "Error: index out of range, probably happened if you didn't clean "
            "comments to be at least {} words long".format(length)
        )
    beg = random.randint(0, len(tokens) - length)
    # The span length follows the `length` argument; it was previously
    # hard-coded to 10 regardless of what the caller asked for.
    return tokens[beg:beg + length]
        
        
##################################################################
# get 50 comments by active users (10 comments from 5 randomly sampled active users, 
# who were not used to construct the SLM -TODO) and 50 comments from randomly-sampled outsiders
# same length controlling effects used here - select random 10 word span from each comment (see ASSUMING above)
##################################################################
# def get_user_comments(df, authors, month, num_posts, month_col='created_month'):
#     df_month = df[df[month_col] == month]
#     df_month_author = df_month[df_month['author'].apply(lambda x: x in authors)]
#     df_grouped = df_month_author.groupby('author')
#     sampled_comments = []
#     for a, g in df_grouped:
#         if num_posts:
#             sample = g.sample(num_posts)['body'].apply(lambda x: [w for w in word_tokenize(x)][:10]) 
#         else: 
#             sample = g['body'].apply(lambda x: [w for w in word_tokenize(x)][:10]) 
#         sampled_comments.extend(sample)
#     return sampled_comments

def get_user_comments(df, authors, month, num_posts, month_col='created_month'):
    """Collect tokenized, length-controlled comments for the given authors.

    Args:
        df: comments dataframe with 'author' and 'body' columns, plus
            `month_col` when `month` is given.
        authors: collection of author names to keep.
        month: month to restrict to (compared as int against `month_col`),
            or None to use comments from every month.
        num_posts: number of comments to sample per author; a falsy value
            keeps every comment of each author.
        month_col: name of the column holding the comment's month.

    Returns:
        A list of token lists, one per selected comment, each truncated to
        the comment's first 10 tokens.
    """
    if month is not None:
        df = df[df[month_col] == int(month)]
    else:
        print('No month...taking all author counts')

    # Vectorized membership test: the previous per-row `x in authors` against
    # a list cost O(rows * authors).
    df_author = df[df['author'].isin(authors)]

    sampled_comments = []
    for _, group in df_author.groupby('author'):
        bodies = group.sample(num_posts)['body'] if num_posts else group['body']
        # Only the leading 10 tokens are kept so every comment contributes a
        # span of comparable length.
        sampled_comments.extend(list(word_tokenize(body))[:10] for body in bodies)
    return sampled_comments




###################################################################
# Importing data
###################################################################
def import_csvs(sub, path='data/cleaned/train/2017/', ext='_train_2017.csv', comment_pre_path='data/cleaned/sub_comments/', comment_ext='_comments_2017.csv'):
    """Load the comments file and the author-counts file for one community.

    Args:
        sub: community name used to build both file names.
        path: directory prefix of the train/test split; the author counts are
            read from its 'author_counts/' subdirectory.
        ext: suffix of the author-counts file name.
        comment_pre_path: directory prefix of the comments file.
        comment_ext: suffix of the comments file name.

    Returns:
        Tuple (comments dataframe, author-counts dataframe). In the
        author-counts dataframe every column except the first two and the
        last is renamed from a float-like string to an integer string
        (e.g. '1.0' -> '1'), and the index is converted to strings.
    """
    # The same comments file is read for both test and train: authors have
    # been separated between the two sets, so no message should be shared
    # even though the text comes from one file.
    comment_path = comment_pre_path+sub+comment_ext
    author_path = path+'author_counts/'+sub+'_author_counts'+ext

    def _load(csv_path):
        # Announce, read, confirm -- one progress line per file.
        print('Importing ', colored(csv_path, 'magenta'),'.....', end=' ')
        frame = pd.read_csv(csv_path, quoting=csv.QUOTE_ALL, escapechar='\\')
        print('Done')
        return frame

    df_sub_comments = _load(comment_path)
    df_author_counts = _load(author_path)

    # Month columns arrive as string floats; normalize them to integer strings.
    cols = df_author_counts.columns.tolist()
    month_renames = {c: str(int(float(c))) for c in cols[2:-1]}
    df_author_counts = df_author_counts.rename(index=str, columns=month_renames)

    return df_sub_comments, df_author_counts

# simpler function for just importing a csv
def import_csv(sub, path='data/cleaned/train/2017/', ext='_train_2017.csv', kind=None):
    """Load a single csv for one community.

    Args:
        sub: community name; the file read is `path + sub + ext`.
        path: directory prefix of the file.
        ext: suffix of the file name.
        kind: accepted for signature compatibility; currently unused.

    Returns:
        The dataframe read from the file.
    """
    csv_path = path+sub+ext

    print('Importing ', colored(csv_path, 'magenta'),'.....', end=' ')
    # Read the path built above; the previous code referenced an undefined
    # `comment_path` and raised NameError on every call.
    df = pd.read_csv(csv_path, quoting=csv.QUOTE_ALL, escapechar='\\')
    print('Done')

    return df


# SLM building functions based on SRILM
These have to stay in a notebook since they use Jupyter magic to run the SRILM commands. There are also many path dependencies, so make sure you know 1) where SRILM lives, 2) where you store SRILM's LMs, count files, and corpora, and 3) where this file exists. Note that this file changes the working directory to ...ARK/community_guidelines for simplicity's sake.

These functions also require some of the functions from SLM_building.py in the lib folder, so those are run here as well. Ideally, future iterations will remove this dependency.

In [5]:
#TODO: Change to absolute paths

In [6]:
###################################################################
# export text for SRILM to train models on
# takes the form of a list of strings, tokenized
def export_text(text, name, corpus_path='../data/srilm_data/'):
    """Write `text` to `<corpus_path>/<name>.txt`, one item per line, for SRILM.

    Args:
        text: sequence of items to write; each becomes one line.
        name: file name without the '.txt' extension.
        corpus_path: directory to write into, with or without a trailing
            separator.

    Returns:
        The path of the file that was written.
    """
    # Join with the path separator: plain concatenation silently turned a
    # corpus_path lacking a trailing slash into a file-name prefix.
    out_path = os.path.join(corpus_path, name + '.txt')
    if corpus_path:
        os.makedirs(corpus_path, exist_ok=True)
    # header=False is pinned because newer pandas versions default to writing
    # a header line for Series, which would add a bogus first corpus line.
    pd.Series(text).to_csv(out_path, sep='\n', index=False, header=False, quoting=csv.QUOTE_NONE)
    return out_path
    
    
###################################################################
def train_SRILM(ngram_count_command, corpus, count_file, lm):
    """Train an order-2 SRILM language model from a text corpus in two steps.

    Must run inside IPython/Jupyter: both steps shell out through `!` magic.

    Args:
        ngram_count_command: path to SRILM's ngram-count executable.
        corpus: path of the text corpus to count.
        count_file: path the n-gram counts are written to and then read from.
        lm: path the trained language model is written to.
    """
    print('Reading text corpus at', colored(corpus, 'green'), ' and writing to count file ', colored(count_file, 'magenta'), '.....', end='')
    # Step 1: count n-grams up to order 2 in the corpus and write them to count_file.
    # NOTE(review): -unk presumably enables an open-vocabulary <unk> token -- confirm against the SRILM manual.
    ! {ngram_count_command} -text {corpus} -order 2 -write {count_file} -unk 
    print('Done')
    print('Training LM from count file', colored(count_file, 'magenta'), ' to ', colored(lm, 'red'), '....', end='')
    # Step 2: estimate the LM from the saved counts.
    # NOTE(review): the -gt{1,2}min/-gt{1,2}max values look like Good-Turing count cut-offs for unigrams and bigrams -- confirm against the SRILM manual.
    ! {ngram_count_command} -read {count_file} -order 2 -lm {lm} -gt1min 3 -gt1max 7 -gt2min 3 -gt2max 7
    print('Done')
    
    
###################################################################    
# Calculate entropy for SRILM LM path
# requires Jupyter magic
def get_SRILM_entropy(ngram_command, lm_path, test_text_path):
    ppl_output = ! {ngram_command} -ppl {test_text_path} -lm {lm_path}
    try:
        ppl = float(ppl_output[1].split(' ')[5])
    except IndexError:
        print('Index out of range, probably due to there being no model where you pointed')
        print('SRILM output: ', ppl_output)
    return math.log(ppl,2)

###################################################################
# def construct_LM_SRILM(active_user_text, sub_name, month, index_num, kind, vocab=None, ngram_count_command='./../../tools/SRILM/bin/i686-m64/ngram-count'):
#     corpus_path = export_text(active_user_text, name=sub_name+'_'+str(index_num)+'_month_'+str(month))
    
#     count_file = '../srilms_LMs/counts/' + kind + '_' + sub_name +'_'+str(index_num)+'_month_'+str(month)+'.count'
#     lm_path = '../srilms_LMs/' + kind + '/' + sub_name + '/' +sub_name+'_'+str(index_num)+'_month_'+str(month)+'.lm'
    
#     train_SRILM(ngram_count_command, corpus_path, count_file, lm_path)
#     return lm_path

# CHANGED TO 2018
def construct_LM_SRILM(active_user_text, sub_name, month, index_num, kind, vocab=None, ngram_count_command='./../../tools/SRILM/bin/i686-m64/ngram-count'):
    """Export a corpus, train one SRILM model on it and return the model path.

    Args:
        active_user_text: text items to export as the training corpus.
        sub_name: community name, used in every generated file name.
        month: month the model belongs to, or None for the 'total' model.
        index_num: index of this model among the models built for the month.
        kind: kind of text; used as a path component of the count file and
            the model file.
        vocab: accepted but currently unused.
        ngram_count_command: path to SRILM's ngram-count executable.

    Returns:
        Path of the trained language model under '../srilms_LMs/2018/'.
    """
    month_str = 'total' if month is None else str(month)
    # Shared file-name stem: <sub>_<index>_month_<month>
    stem = sub_name + '_' + str(index_num) + '_month_' + month_str

    # The corpus is written first; count and model paths are derived afterwards.
    corpus_path = export_text(active_user_text, name=stem)
    count_file = '../srilms_LMs/counts/2018/' + kind + '_' + stem + '.count'
    lm_path = '../srilms_LMs/2018/' + kind + '/' + sub_name + '/' + stem + '.lm'

    train_SRILM(ngram_count_command, corpus_path, count_file, lm_path)
    return lm_path

###################################################################
# build SLMs for a single month
def build_SLMs_SRILM(df, author_counts, slm_count, month, name, num_authors, kind, threshold_count):
    """Train `slm_count` SRILM models for one month (or for all months).

    Each model is trained on a fresh random sample of active authors and of
    their comments.

    Args:
        df: comments dataframe.
        author_counts: per-author count dataframe.
        slm_count: number of models to train.
        month: month to build for, or None for the whole dataframe.
        name: community name used in the generated file names.
        num_authors: number of active authors sampled per model.
        kind: kind of text to restrict authors to.
        threshold_count: activity threshold for authors, also used as the
            number of comments sampled per author.

    Returns:
        List of paths of the trained models, in training order.
    """
    print('Creating ', colored(str(slm_count) + ' SLMs ', 'red'), 'for', colored(' month ' + str(month), 'green'), '.....')

    def _train_one(index_num):
        # sample authors -> sample their comments -> train and save the model
        sampled_authors = get_active_users(author_counts, month, 'author', threshold=threshold_count, num_authors=num_authors, kind=kind)
        corpus = get_user_comments(df, list(sampled_authors), month=month, num_posts=threshold_count)
        return construct_LM_SRILM(corpus, name, month, index_num, kind=kind)

    return [_train_one(index_num) for index_num in range(slm_count)]


# returns dict of {month:SLM}
def build_monthly_SLM_SRILM(df, author_counts, slm_count, name, use_saved_lms=False, kind=None, num_authors=200, threshold_count=5):
    """Return a dict mapping each month in MONTHS to its list of model paths.

    Args:
        df: comments dataframe.
        author_counts: per-author count dataframe.
        slm_count: number of models per month.
        name: community name used in the file names.
        use_saved_lms: when truthy, nothing is trained; the paths of
            previously saved models are returned instead.
        kind: kind of text, either posts or comments; also a path component.
        num_authors: number of authors to sample per model.
        threshold_count: threshold of how many posts to use.

    Returns:
        {month: [model path, ...]} with `slm_count` paths per month.
    """
    if not use_saved_lms:
        # Train everything from scratch, month by month.
        return {
            m: build_SLMs_SRILM(df,
                                author_counts,
                                slm_count,
                                month=m,
                                name=name,
                                num_authors=num_authors,
                                kind=kind,
                                threshold_count=threshold_count)
            for m in MONTHS
        }

    # Reuse models already on disk instead of retraining.
    # NOTE(review): these paths carry no '2018/' component, whereas
    # construct_LM_SRILM writes under '../srilms_LMs/2018/' -- confirm which
    # location is intended.
    return {
        m: ['../srilms_LMs/'+ kind + '/' + name + '/' + name +'_'+str(i)+'_month_'+str(m)+'.lm'
            for i in range(slm_count)]
        for m in MONTHS
    }

###################################################################
# returns LM for entire df, not sampled by month
def build_total_SLM_SRILM(df, author_counts, slm_count, name, use_saved_lms=False, kind=None, num_authors=200, threshold_count=5):
    """Return model paths for the whole dataframe, not sampled by month.

    Args:
        df: comments dataframe.
        author_counts: per-author count dataframe.
        slm_count: number of models.
        name: community name used in the file names.
        use_saved_lms: when truthy, nothing is trained; the paths of
            previously saved 'total' models are returned instead.
        kind: kind of text, either posts or comments; also a path component.
        num_authors: number of authors to sample per model.
        threshold_count: threshold of how many posts to use.

    Returns:
        List of `slm_count` model paths.
    """
    if use_saved_lms:
        # Reuse models already on disk instead of retraining.
        return ['../srilms_LMs/'+ kind + '/' + name + '/' + name +'_'+str(i)+'_month_total.lm'
                for i in range(slm_count)]

    # month=None makes the builder use every month at once.
    return build_SLMs_SRILM(df,
                            author_counts,
                            slm_count,
                            month=None,
                            name=name,
                            num_authors=num_authors,
                            kind=kind,
                            threshold_count=threshold_count)
    
###################################################################  
def calc_month_entropy_SRILM(slms, text_path):
    """Score one text file against every model in `slms`.

    Args:
        slms: iterable of language-model paths.
        text_path: path of the text file to score.

    Returns:
        List with one entropy value per model, in the order of `slms`.
    """
    ngram_bin = '/homes/gws/taugust/tools/SRILM/bin/i686-m64/ngram'
    # Pretty sure this works: SRILM treats each line of the text file as a
    # separate sentence and reports an average over the file.
    return [get_SRILM_entropy(ngram_bin, lm_path, text_path) for lm_path in slms]


###################################################################    
def calc_acc_gap_SRILM(slms, author_counts, comments, sub_name, kind=None, num_active_authors=5, num_active_posts=10, num_outside_authors=50, num_outside_posts=1):
    """Compute the acculturation gap for every month in `slms`.

    Args:
        slms: dict {month: list of model paths}.
        author_counts: per-author count dataframe.
        comments: comments dataframe.
        sub_name: community name used in the exported file names.
        kind: kind of text to restrict authors to.
        num_active_authors: active authors sampled per month.
        num_active_posts: comments sampled per active author.
        num_outside_authors: outside authors sampled per month.
        num_outside_posts: comments sampled per outside author.

    Returns:
        Tuple (gap per month as {month: acc_gap},
               {'inside': [...], 'outside': [...]} holding one list of
               entropies per month, in iteration order of `slms`).
    """
    gap_by_month = {}
    inside_by_month = []
    outside_by_month = []

    for month, month_slms in slms.items():
        gap, ent_pair = calc_single_acc_gap_SRILM(
            month_slms, author_counts, comments, sub_name,
            month=month, kind=kind,
            num_active_authors=num_active_authors,
            num_active_posts=num_active_posts,
            num_outside_authors=num_outside_authors,
            num_outside_posts=num_outside_posts,
        )
        inside_ent, outside_ent = ent_pair

        gap_by_month[month] = gap
        inside_by_month.append(inside_ent)
        outside_by_month.append(outside_ent)

        print('Saving acc gap for', colored('month ' + str(month), 'green'))

    return gap_by_month, {'inside': inside_by_month, 'outside': outside_by_month}


###################################################################
# TODO: merge with above
# returns difference of the means of the cross entropy of outside vs. inside text divided by inside posts
def calc_single_acc_gap_SRILM(slms, author_counts, comments, sub_name, month=None, kind=None, num_active_authors=5, num_active_posts=10, num_outside_authors=50, num_outside_posts=1):
    """Compute the acculturation gap for one month (or for all months).

    The gap is (mean outside entropy - mean inside entropy) / mean inside
    entropy, with both means taken over the models in `slms`.

    Args:
        slms: list of model paths to score against.
        author_counts: per-author count dataframe.
        comments: comments dataframe.
        sub_name: community name used in the exported file names.
        month: month to evaluate, or None for the 'total' evaluation.
        kind: kind of text to restrict authors to.
        num_active_authors: active authors to sample (activity threshold is
            fixed at 10 here).
        num_active_posts: comments sampled per active author.
        num_outside_authors: outside authors to sample (threshold fixed at 1).
        num_outside_posts: comments sampled per outside author.

    Returns:
        Tuple (acc_gap, (inside entropies, outside entropies)), where each
        entropy list has one value per model.
    """
    month_string = 'total' if month is None else str(month)
    print('Calculating cross entropy for', colored('month ' + month_string, 'green'), '.....')

    # Sample the two author populations.
    insiders = get_active_users(author_counts, month, 'author', threshold=10, num_authors=num_active_authors, kind=kind)
    outsiders = get_outside_users(author_counts, month, 'author', threshold=1, num_authors=num_outside_authors, kind=kind)
    print('sampled active users:', len(insiders), 'sampled outside users:', len(outsiders))

    # Sample length-controlled comments from each population.
    inside_text = get_user_comments(comments, list(insiders), month=month, num_posts=num_active_posts)
    outside_text = get_user_comments(comments, list(outsiders), month=month, num_posts=num_outside_posts)
    print('sampled active comments:', len(inside_text), 'sampled outside comments:', len(outside_text))

    # Write both samples out as text files for SRILM to score.
    inside_file = export_text(inside_text, name=sub_name+'_inside_month_'+month_string, corpus_path='../data/srilm_data/test')
    outside_file = export_text(outside_text, name=sub_name+'_outside_month_'+month_string, corpus_path='../data/srilm_data/test')

    # One entropy per model, for each of the two files.
    active_ent = calc_month_entropy_SRILM(slms, inside_file)
    outside_ent = calc_month_entropy_SRILM(slms, outside_file)

    # Relative difference of the expected entropies.
    mean_inside = np.mean(active_ent)
    mean_outside = np.mean(outside_ent)
    acc_gap = (mean_outside - mean_inside) / mean_inside

    # The variance of the gap is not computed for now; see
    # https://www.kean.edu/~fosborne/bstat/05b2means.html for the
    # difference-of-two-means approach that was considered.
    return acc_gap, (active_ent, outside_ent)
       