# LM building functions
This notebook only holds the functions used to build the language models. Because we used SRILM to build the models,
it has to be a notebook rather than a module I could import (we use Jupyter magic in it to access SRILM).

This notebook is run in pretty much every other notebook, since it has all the core functions in it. 
The functions have grown a little out of control, with lots of parameters. You can go into the other 
notebooks that import this to see example usage. 

# Imports 


In [1]:
import os
import sys
import numpy as np
import csv as csv
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import math
import json
import time
from datetime import datetime
from termcolor import colored
from sklearn.model_selection import train_test_split
import random
import seaborn as sns
import uuid
from scipy import stats
import scikit_posthocs as sp
from statsmodels import stats as sm




from nltk.util import bigrams, ngrams
from nltk.lm.preprocessing import pad_both_ends, flatten, padded_everygram_pipeline
from nltk.tokenize import word_tokenize


# Switch the working directory so that the relative paths used by the
# functions below (e.g. 'data/cleaned/...', '../srilms_LMs/...') resolve
# against this location. The path is machine-specific.
os.chdir('/homes/gws/taugust/Projects/ARK/community_guidelines/')

# Echo the resulting working directory as a sanity check.
print(os.getcwd())


/homes/gws/taugust/ARK/community_guidelines


# Utility functions for LM building 
Taken from lib/SLM_building.py and added here so that all the functions for SLM building are in the same place;
the lib/ file is in the process of being phased out.


In [1]:
# Month numbers 1..12. Their string forms ('1'..'12') are used below as the
# per-month column names of the author-count DataFrames.
MONTHS = list(range(1,13))

# subs = ['AskHistorians', 'EverythingScience', 'Futurology', 'science', 'TrueReddit', 'dataisbeautiful', 'askscience']

def print_row(row):
    """Print ``row`` preceded by a red ``----`` separator."""
    separator = colored('----', 'red')
    print(separator, row)
    
def print_df_rows(df, sample=100, col='body'):
    """Print a random sample of one column of ``df``.

    ``sample`` rows are drawn with ``df.sample`` and every value of ``col``
    in that sample is handed to ``print_row``.
    """
    sampled_rows = df.sample(sample)
    sampled_values = sampled_rows[col]
    sampled_values.apply(print_row)

def get_entropy(text, slms):
    """Return the entropy of ``text`` under each language model in ``slms``.

    The text is exported to ``data/tmp/`` under a freshly generated UUID
    file name, and that file is scored with ``calc_month_entropy_SRILM``.
    """
    corpus_file = export_text(text, name=str(uuid.uuid4()), corpus_path='data/tmp/')
    entropies = calc_month_entropy_SRILM(slms, corpus_file)
    return entropies



def get_active_users(author_df, month, author_col, threshold=5, num_authors=200, kind=None):
    """Select authors whose activity count reaches ``threshold``.

    With a ``month``, the count in the column named ``str(month)`` must be at
    least ``threshold``. With ``month=None`` the counts over all ``MONTHS``
    columns are summed and that total is compared instead.

    Rows are first restricted to ``kind`` when it is truthy. When
    ``num_authors`` is truthy, that many rows are sampled WITH replacement
    (the result may contain repeats); otherwise every qualifying row is kept.

    Returns the ``author_col`` column of the selected rows.
    """
    candidates = author_df
    if kind:
        candidates = candidates[candidates['kind'] == kind]

    if month is None:
        print('No month...taking all author counts')
        month_columns = [str(m) for m in MONTHS]
        counts = candidates[month_columns].sum(axis=1)
    else:
        counts = candidates[str(month)]
    qualifying = candidates[counts >= threshold].drop_duplicates()

    if not num_authors:
        return qualifying[author_col]
    return qualifying.sample(num_authors, replace=True)[author_col]


def get_first_active_users(author_df, month, author_col, threshold=5, num_authors=200, kind=None):
    """Select authors who became active for the first time in ``month``.

    With a ``month``, an author qualifies when the summed counts of all
    earlier months (columns '1' .. str(month - 1)) are zero AND the count in
    the ``str(month)`` column is at least ``threshold``. In other words,
    authors who already wrote in an earlier month are excluded.

    With ``month=None`` there is no "earlier" period: the counts over all
    ``MONTHS`` columns are summed and compared to ``threshold``.

    Rows are first restricted to ``kind`` when it is truthy. When
    ``num_authors`` is truthy, that many rows are sampled WITH replacement;
    otherwise every qualifying row is kept.

    Returns the ``author_col`` column of the selected rows.
    """
    pool = author_df
    if kind:
        pool = pool[pool['kind'] == kind]

    if month is None:
        print('No month...taking all author counts')
        yearly_total = pool[[str(m) for m in MONTHS]].sum(axis=1)
        pool = pool[yearly_total >= threshold].drop_duplicates()
    else:
        # keep only authors with no activity in any month before this one
        earlier_columns = [str(prev_m) for prev_m in range(1, month)]
        earlier_total = pool[earlier_columns].sum(axis=1)
        pool = pool[earlier_total == 0]
        pool = pool[pool[str(month)] >= threshold].drop_duplicates()

    if not num_authors:
        return pool[author_col]
    return pool.sample(num_authors, replace=True)[author_col]
    
def get_outside_users(author_df, month, author_col, threshold=1, num_authors=None, kind=None):
    """Select "outside" authors: those with at most ``threshold`` total posts.

    The total is the sum over all ``MONTHS`` columns. When ``month`` is
    given, the author's count in that month must additionally equal
    ``threshold``.

    Rows are first restricted to ``kind`` when it is truthy. When
    ``num_authors`` is truthy, that many authors are sampled WITH
    replacement; otherwise all outside authors are returned.

    Returns the ``author_col`` column of the selected rows.
    """
    pool = author_df
    if kind:
        pool = pool[pool['kind'] == kind]

    yearly_total = pool[[str(m) for m in MONTHS]].sum(axis=1)
    outside_users = pool[yearly_total <= threshold]

    if month is None:
        print('No month...taking all author counts')
    else:
        outside_users = outside_users[outside_users[str(month)] == threshold]

    authors = outside_users[author_col]
    if not num_authors:
        return authors
    return authors.sample(num_authors, replace=True)
    
##################################################################
# For each of these 200 active users
# select 5 random 10-word spans from 5 unique comments
# ASSUMING: from that month
# ASSUMING: this is one 10 word span from each comment, not 5 for each comment
# NOTE: not using this currently, mostly because using later spans actually might not be good
# -- people tend to get more esoteric the longer they go on
##################################################################
def get_random_span(text, length):
    """Return a random run of ``length`` consecutive tokens from ``text``.

    ``text`` is tokenized with ``word_tokenize`` and a start offset is drawn
    uniformly so that the whole span fits inside the token list.

    Raises:
        IndexError: if ``text`` has fewer than ``length`` tokens.
    """
    tokens = word_tokenize(text)
    try:
        # randint raises ValueError when the range is empty, i.e. when the
        # text is shorter than the requested span
        beg = random.randint(0, len(tokens) - length)
    except ValueError as err:
        raise IndexError(
            "Error: index out of range, probably happened if you didn't clean "
            "comments to be at least %d words long" % length
        ) from err
    # the span width follows ``length`` (it used to be hard-coded to 10)
    return tokens[beg:beg + length]
        
        
# pulls 10 word spans from 50 top posts of a subreddit
def get_top_posts(s, df_top_posts, num_posts=50):
    """Sample top posts of subreddit ``s`` and return their first 10 tokens.

    ``num_posts`` rows whose ``subreddit_name`` equals ``s`` are sampled WITH
    replacement; each ``body`` is tokenized with ``word_tokenize`` and cut to
    its first 10 tokens. Returns a Series of token lists.
    """
    in_subreddit = df_top_posts['subreddit_name'] == s
    sampled_bodies = df_top_posts[in_subreddit].sample(num_posts, replace=True)['body']
    return sampled_bodies.apply(lambda body: word_tokenize(body)[:10])

# not using
def get_guideline_text(s, df_subreddits, num_spans=50, text_col='cleaned'):
    """Draw ``num_spans`` random 10-token spans from a subreddit's text.

    The text is the ``text_col`` value of the first row whose
    ``subreddit_name`` equals ``s``; each span comes from ``get_random_span``.
    Returns a list of token lists.
    """
    matching_rows = df_subreddits[df_subreddits['subreddit_name'] == s]
    full_descr = list(matching_rows[text_col])[0]
    return [get_random_span(full_descr, 10) for _ in range(num_spans)]

##################################################################
# get 50 comments by active users (10 comments from 5 randomly sampled active users,
# who were not used to construct the SLM) and 50 comments from randomly-sampled outsiders
# same length controlling effects used here - select 10 (text_len) word span from each comment
##################################################################
def get_user_comments(df, authors, month, num_posts, month_col='created_month', text_len=10):
    """Collect tokenized comments written by ``authors``.

    Args:
        df: comment DataFrame with ``author`` and ``body`` columns (and
            ``month_col`` when ``month`` is given).
        authors: author names to keep; repeats count towards the sample size.
        month: if not None, only rows whose ``month_col`` equals
            ``int(month)`` are used.
        num_posts: comments to sample (WITH replacement) per author. When
            falsy, every comment of each author is collected instead.
        month_col: name of the month column.
        text_len: keep only the first ``text_len`` tokens of each comment;
            ``None`` keeps the full token list.

    Returns:
        A list of token lists. When ``num_posts`` is an integer,
        ``num_posts * len(authors)`` entries are drawn (with replacement)
        from the collected comments. When ``num_posts`` is None the
        collected comments are returned as they are; previously this case
        crashed with a TypeError while computing the sample size.
    """
    if month is not None:
        df = df[df[month_col] == int(month)]
    else:
        print('No month...taking all author counts')

    def tokenize(body):
        tokens = word_tokenize(body)
        return tokens if text_len is None else tokens[:text_len]

    sampled_comments = []
    for _, group in df[df['author'].isin(authors)].groupby('author'):
        if num_posts:
            bodies = group.sample(num_posts, replace=True)['body']
        else:
            bodies = group['body']
        sampled_comments.extend(bodies.apply(tokenize))

    if num_posts is None:
        # "take everything" mode: there is no sample size to draw
        return sampled_comments
    return random.choices(sampled_comments, k=num_posts * len(authors))


# same as above but for the first user comment
def get_user_first_comments(df, authors, month, month_col='created_month', total_num=250, text_len=10):
    """Sample from each author's earliest comment.

    For every author in ``authors`` the comment with the smallest
    ``created_utc`` (within ``month`` when it is given) is tokenized and,
    unless ``text_len`` is None, cut to its first ``text_len`` tokens.
    ``total_num`` entries are then drawn WITH replacement from those first
    comments and returned as a list of token lists.
    """
    if month is None:
        print('No month...taking all author counts')
    else:
        df = df[df[month_col] == int(month)]

    by_author = df[df['author'].isin(authors)].groupby('author')
    first_comments = []
    for _, group in by_author:
        earliest_body = group.sort_values(by='created_utc').iloc[0]['body']
        tokens = word_tokenize(earliest_body)
        if text_len is not None:
            tokens = tokens[:text_len]
        first_comments.append(tokens)
    return random.choices(first_comments, k=total_num)



###################################################################
# Importing data
###################################################################
def import_csvs(sub, path='data/cleaned/train/2017/', ext='_train_2017.csv', comment_pre_path='data/cleaned/sub_comments/', comment_ext='_comments_2017.csv'):
    """Load the comment file and the author-count file of subreddit ``sub``.

    The comment file is ``comment_pre_path + sub + comment_ext``; the
    author-count file is ``path + 'author_counts/' + sub + '_author_counts' + ext``.

    The same comment file is used for train and test: the authors have been
    separated, so the two sets should not share messages even though they
    pull text from the same file.

    Returns:
        ``(df_sub_comments, df_author_counts)``. In the author counts, every
        column from the third up to (but excluding) the last is renamed from
        a float-like label (e.g. '1.0') to its integer string ('1'), and the
        index labels are converted to strings.
        NOTE(review): this presumes those columns are exactly the month
        columns -- confirm against the author-count file layout.
    """
    comment_path = comment_pre_path + sub + comment_ext
    author_path = path + 'author_counts/' + sub + '_author_counts' + ext

    loaded = []
    for csv_path in (comment_path, author_path):
        print('Importing ', colored(csv_path, 'magenta'), '.....', end=' ')
        loaded.append(pd.read_csv(csv_path, quoting=csv.QUOTE_ALL, escapechar='\\'))
        print('Done')
    df_sub_comments, df_author_counts = loaded

    # month column labels arrive as string floats; normalize them to ints
    month_columns = df_author_counts.columns.tolist()[2:-1]
    renamed = {c: str(int(float(c))) for c in month_columns}
    df_author_counts = df_author_counts.rename(index=str, columns=renamed)

    return df_sub_comments, df_author_counts

# simpler function for just importing a csv
def import_csv(sub, path='data/cleaned/train/2017/', ext='_train_2017.csv', kind=None):
    """Load the single CSV file ``path + sub + ext`` into a DataFrame.

    Args:
        sub: subreddit name, used as the middle part of the file name.
        path: directory prefix (must end with a path separator).
        ext: file-name suffix, including the extension.
        kind: unused; kept so existing callers that pass it keep working.

    Returns:
        The DataFrame read with ``quoting=csv.QUOTE_ALL`` and a backslash
        escape character.
    """
    csv_path = path + sub + ext

    print('Importing ', colored(csv_path, 'magenta'), '.....', end=' ')
    # read the path built above (the old code referenced an undefined name)
    df = pd.read_csv(csv_path, quoting=csv.QUOTE_ALL, escapechar='\\')
    print('Done')

    return df


def calc_acc_gap(active_ent, outside_ent):
    """Return the gap between outside and active entropy, relative to active.

    Computed as ``(mean(outside_ent) - mean(active_ent)) / mean(active_ent)``.
    """
    active_mean = np.mean(active_ent)
    outside_mean = np.mean(outside_ent)
    difference = outside_mean - active_mean
    return difference / active_mean


# https://stackoverflow.com/questions/21532471/how-to-calculate-cohens-d-in-python
def cohens_d(x, y):
    """Return Cohen's d of ``x`` versus ``y``.

    The difference of the means is divided by the square root of the average
    of the two variances. ``x`` and ``y`` must provide ``.mean()`` and
    ``.std()``; the variance is whatever ``.std() ** 2`` yields for them.
    """
    mean_difference = x.mean() - y.mean()
    average_variance = (x.std() ** 2 + y.std() ** 2) / 2
    return mean_difference / math.sqrt(average_variance)


# SLM building functions based on SRILM
These have to stay in a notebook since they use Jupyter magic to run the SRILM commands. They also have many path dependencies, so make sure you know 1) where SRILM lives, 2) where you store SRILM's LMs, count files, and corpora, and 3) where this file exists. Note that this file changes the working directory to ...ARK/community_guidelines for simplicity's sake.

These functions also require some of the functions from SLM_building.py in the lib folder, so also running that there. Though ideally in future iterations this will be taken out

In [6]:
# export text for SRILM to train models on
# takes the form of a list of strings, tokenized
def export_text(text, name, corpus_path='../data/srilm_data/'):
    """Write tokenized sentences to a plain-text corpus file.

    Args:
        text: iterable of token lists; each becomes one line with the tokens
            joined by single spaces.
        name: file name without extension.
        corpus_path: prefix prepended to ``name`` (a directory ending in a
            separator, or a directory plus file-name prefix).

    Returns:
        The path of the written file, ``corpus_path + name + '.txt'``.

    The file is written directly rather than through the pandas CSV writer,
    so the output never contains a header line and tokens containing quote
    characters cannot trigger a CSV escaping error.
    """
    out_path = corpus_path + name + '.txt'
    lines = [' '.join(tokens) for tokens in text]
    with open(out_path, 'w', encoding='utf-8') as corpus_file:
        corpus_file.writelines(line + '\n' for line in lines)
    return out_path
    
    
def train_SRILM(ngram_count_command, corpus, count_file, lm, vocab=None):
    """Train a bigram LM by running ``ngram_count_command`` twice through the shell.

    Requires IPython/Jupyter: the commands are issued with ``!`` shell magic.

    Args:
        ngram_count_command: path of the SRILM ``ngram-count`` executable.
        corpus: path of the text corpus to count n-grams from.
        count_file: path the n-gram counts are written to, then read back from.
        lm: path the language model is written to.
        vocab: optional vocabulary file passed via ``-vocab`` in the counting step.
    """
    print('Reading text corpus at', colored(corpus, 'green'), ' and writing to count file ', colored(count_file, 'magenta'), '.....', end='')
    # Step 1: count n-grams up to order 2 from the corpus (with -unk).
    if vocab is not None:
        print('Training with vocab file...', colored(vocab, 'blue'), '...', end='')
        ! {ngram_count_command} -text {corpus} -order 2 -write {count_file} -unk -vocab {vocab}
    else:
        ! {ngram_count_command} -text {corpus} -order 2 -write {count_file} -unk
    print('Done')
    print('Training LM from count file', colored(count_file, 'magenta'), ' to ', colored(lm, 'red'), '....', end='')
    # Step 2: estimate the LM from the counts; the -gt* options are count
    # cutoff / discounting settings (see the ngram-count manual page).
    ! {ngram_count_command} -read {count_file} -order 2 -lm {lm} -gt1min 3 -gt1max 7 -gt2min 3 -gt2max 7
    print('Done')
    

    
    
# Calculate entropy for SRILM LM path
# requrires Jupyter magic
def get_SRILM_entropy(ngram_command, lm_path, test_text_path):
    ppl_output = ! {ngram_command} -ppl {test_text_path} -lm {lm_path}
    try:
        ppl = float(ppl_output[1].split(' ')[5])
    except IndexError:
        print('Index out of range, probably due to there being no model where you pointed')
        print('SRILM output: ', ppl_output)
    return math.log(ppl,2)


# full refers to if the LM uses the entire sentence or just the first 10 words
def construct_LM_SRILM(active_user_text, sub_name, month, index_num, kind, vocab=None, year='2018', ngram_count_command='./../../../tools/SRILM/bin/i686-m64/ngram-count', full=False):
    """Export ``active_user_text`` and train one SRILM language model on it.

    Args:
        active_user_text: list of token lists used as training corpus.
        sub_name: subreddit name, used in every generated file name.
        month: month number, or None for a model over all months ('total').
        index_num: index of this model among the models built for the month.
        kind: kind of text (posts or comments); part of the output paths.
        vocab: optional vocabulary file handed to ``train_SRILM``.
        year: year sub-directory of the LM output path.
        ngram_count_command: path of the SRILM ``ngram-count`` executable.
        full: if True the model is stored under ``alt_full_text_lms`` and
            its count file gets a ``full_`` prefix.

    Returns:
        The path of the trained ``.lm`` file.

    The LM file name now matches what the ``use_saved_lms`` loaders expect:
    the vocab suffix carries its own leading underscore, so a model trained
    without a vocab is named ``..._month_X.lm`` instead of ``..._month_X_.lm``.
    Models saved earlier without a vocab keep the old trailing-underscore
    name and must be renamed to be found again.
    """
    month_str = str(month) if month is not None else 'total'
    if vocab is not None:
        vocab_str = '_' + vocab.replace('data/cleaned/', '').replace('.txt', '')
    else:
        vocab_str = ''

    stem = sub_name + '_' + str(index_num) + '_month_' + month_str
    corpus_path = export_text(active_user_text, name=stem)
    if full:
        count_file = '../srilms_LMs/counts/' + 'full_' + kind + '_' + stem + '.count'
        lm_path = '../srilms_LMs/' + year + '/alt_full_text_lms/' + stem + vocab_str + '.lm'
    else:
        count_file = '../srilms_LMs/counts/' + kind + '_' + stem + '.count'
        lm_path = '../srilms_LMs/' + year + '/' + kind + '/' + sub_name + '/' + stem + vocab_str + '.lm'
    train_SRILM(ngram_count_command, corpus_path, count_file, lm_path, vocab=vocab)
    return lm_path

# TODO not needed, added new param above
def construct_full_LM_SRILM(active_user_text, sub_name, month, index_num, kind, vocab=None, year='2018', ngram_count_command='./../../../tools/SRILM/bin/i686-m64/ngram-count'):
    """Export ``active_user_text`` and train one full-text SRILM model on it.

    Superseded by ``construct_LM_SRILM(..., full=True)``. The model is
    written under ``alt_full_text_lms`` for ``year`` and its path returned.
    ``vocab`` is accepted but not used: training always runs without a
    vocabulary file.
    """
    month_str = 'total' if month is None else str(month)
    stem = sub_name + '_' + str(index_num) + '_month_' + month_str

    corpus_path = export_text(active_user_text, name=stem)
    count_file = '../srilms_LMs/counts/' + 'full_' + kind + '_' + stem + '.count'
    lm_path = '../srilms_LMs/' + year + '/alt_full_text_lms/' + stem + '.lm'

    train_SRILM(ngram_count_command, corpus_path, count_file, lm_path, vocab=None)
    return lm_path

# build SLMs for a single month
def build_SLMs_SRILM(df, author_counts, slm_count, month, name, num_authors, kind, threshold_count, year, full=False, vocab=None):
    """Train ``slm_count`` language models for one month and return their paths.

    Each model gets its own random draw of active users
    (``get_active_users``) and of their comments (``get_user_comments``).
    Comments are cut to their first 10 tokens unless ``full`` is truthy.
    """
    print('Creating ', colored(str(slm_count) + ' SLMs ', 'red'), 'for', colored(' month ' + str(month), 'green'), '.....')
    span_len = None if full else 10
    lm_paths = []
    for index in range(slm_count):
        active_users = get_active_users(author_counts, month, 'author', threshold=threshold_count, num_authors=num_authors, kind=kind)
        training_text = get_user_comments(df, list(active_users), month=month, num_posts=threshold_count, text_len=span_len)
        lm_paths.append(construct_LM_SRILM(training_text, name, month, index, kind=kind, year=year, full=full, vocab=vocab))
    return lm_paths

# build full-text SLMs for a single month
# TODO not needed, added new param above
def build_full_SLMs_SRILM(df, author_counts, slm_count, month, name, num_authors, kind, threshold_count, year):
    """Train ``slm_count`` full-text language models for one month.

    Superseded by ``build_SLMs_SRILM(..., full=True)``. Each model uses a
    fresh draw of active users and their untruncated comments, and is
    trained through ``construct_full_LM_SRILM``. Returns the LM paths.
    """
    print('Creating ', colored(str(slm_count) + ' Full SLMs ', 'red'), 'for', colored(' month ' + str(month), 'green'), '.....')
    lm_paths = []
    for index in range(slm_count):
        active_users = get_active_users(author_counts, month, 'author', threshold=threshold_count, num_authors=num_authors, kind=kind)
        training_text = get_user_comments(df, list(active_users), month=month, num_posts=threshold_count, text_len=None)
        lm_paths.append(construct_full_LM_SRILM(training_text, name, month, index, kind=kind, year=year))
    return lm_paths


# returns dict of {month:SLM}
def build_monthly_SLM_SRILM(df, author_counts, slm_count, name, use_saved_lms=False, kind=None, num_authors=200, threshold_count=5, year='2017', full=False, vocab=None):
    """Return ``{month: [lm_path, ...]}`` with ``slm_count`` models per month.

    With ``use_saved_lms`` nothing is trained: the paths where previously
    built models are expected are reconstructed from ``year``, ``kind``,
    ``name``, the model index, the month and the vocab name. Otherwise the
    models of every month in ``MONTHS`` are built with ``build_SLMs_SRILM``.
    """
    if vocab is None:
        vocab_str = ''
    else:
        vocab_str = '_'+vocab.replace('data/cleaned/', '').replace('.txt', '')

    # otherwise just remake them
    if not use_saved_lms:
        return {
            m: build_SLMs_SRILM(df,
                                author_counts,
                                slm_count,
                                month=m,
                                name=name,
                                num_authors=num_authors,  # num authors to sample
                                kind=kind,  # kind of text, either posts or comments
                                threshold_count=threshold_count,  # threshold of how many posts to use
                                year=year,
                                full=full,
                                vocab=vocab)
            for m in MONTHS
        }

    # if we don't want to remake the LMs, can just load our old ones
    def saved_path(index, m):
        if full:
            return '../srilms_LMs/'+ year + '/alt_full_text_lms/' + name +'_'+str(index)+'_month_'+str(m)+vocab_str+'.lm'
        return '../srilms_LMs/'+ year +'/' + kind + '/' + name + '/' + name +'_'+str(index)+'_month_'+str(m)+vocab_str+'.lm'

    return {m: [saved_path(i, m) for i in range(slm_count)] for m in MONTHS}

# returns LM for entire df, not sampled by month
def build_total_SLM_SRILM(df, author_counts, slm_count, name, use_saved_lms=False, kind=None, num_authors=200, threshold_count=5, year='2018', full=False, vocab=None):
    """Return the paths of ``slm_count`` models covering all months.

    With ``use_saved_lms`` nothing is trained: the paths where previously
    built 'total' models are expected are reconstructed and returned.
    Otherwise the models are built with ``build_SLMs_SRILM`` and ``month=None``.
    """
    if vocab is None:
        vocab_str = ''
    else:
        vocab_str = '_'+vocab.replace('data/cleaned/', '').replace('.txt', '')

    # otherwise just remake them
    if not use_saved_lms:
        return build_SLMs_SRILM(df,
                                author_counts,
                                slm_count,
                                month=None,
                                name=name,
                                num_authors=num_authors,  # num authors to sample
                                kind=kind,  # kind of text, either posts or comments
                                threshold_count=threshold_count,  # threshold of how many posts to use
                                year=year,
                                full=full,
                                vocab=vocab)

    # if we don't want to remake the LMs, can just load our old ones
    def saved_path(index):
        if full:
            return '../srilms_LMs/'+ year + '/alt_full_text_lms/' + name +'_'+str(index)+'_month_total'+vocab_str+'.lm'
        return '../srilms_LMs/'+ year + '/' + kind + '/' + name + '/' + name +'_'+str(index)+'_month_total'+vocab_str+'.lm'

    return [saved_path(i) for i in range(slm_count)]
    
def calc_month_entropy_SRILM(slms, text_path):
    """Score ``text_path`` with every model in ``slms``.

    Returns one entropy per model path, in the order of ``slms``, computed
    by ``get_SRILM_entropy`` with the hard-coded SRILM ``ngram`` executable.
    """
    return [
        get_SRILM_entropy('/homes/gws/taugust/tools/SRILM/bin/i686-m64/ngram', slm, text_path)
        for slm in slms
    ]


    
def calc_acc_gap_SRILM(slms, author_counts, comments, sub_name, kind=None, num_active_authors=10, num_active_posts=5, num_outside_authors=50, num_outside_posts=1, first=False):
    """Compute the acculturation gap for every month in ``slms``.

    ``slms`` maps a month to the list of LM paths of that month. Each month
    is evaluated with ``calc_single_acc_gap_SRILM``.

    Returns:
        ``(monthly_acc_gap, entropies)``: a dict ``{month: gap}`` and a dict
        with keys 'inside' and 'outside', each holding one list of
        per-model entropies per month.
    """
    monthly_acc_gap = {}
    inside_entropies = []
    outside_entropies = []
    for month, month_slms in slms.items():
        acc_gap, (active_ent, outside_ent) = calc_single_acc_gap_SRILM(
            month_slms, author_counts, comments, sub_name,
            month=month, kind=kind,
            num_active_authors=num_active_authors,
            num_active_posts=num_active_posts,
            num_outside_authors=num_outside_authors,
            num_outside_posts=num_outside_posts,
            first=first)

        monthly_acc_gap[month] = acc_gap
        inside_entropies.append(active_ent)
        outside_entropies.append(outside_ent)

        print('Saving acc gap for', colored('month ' + str(month), 'green'))
    return monthly_acc_gap, {'inside': inside_entropies, 'outside': outside_entropies}


# returns difference of the means of the cross entropy of outside vs. inside text divided by inside posts
def calc_single_acc_gap_SRILM(slms, author_counts, comments, sub_name, month=None, kind=None, num_active_authors=10, num_active_posts=5, num_outside_authors=50, num_outside_posts=1, active_threshold=5, first=False):
    """Compute the acculturation gap of one set of language models.

    Active and outside authors are sampled from ``author_counts`` and their
    comments from ``comments``. Both comment sets are exported as SRILM test
    files and scored with every model in ``slms``.

    With ``first`` truthy, the active side uses authors who first became
    active in ``month`` and only their first comment.

    Returns:
        ``(acc_gap, (active_ent, outside_ent))`` where ``acc_gap`` is
        ``(mean(outside_ent) - mean(active_ent)) / mean(active_ent)`` and the
        two lists hold one entropy per model.
    """
    month_string = 'total' if month is None else str(month)
    print('Calculating cross entropy for', colored('month ' + month_string, 'green'), '.....')

    # inside (active) authors and their text
    if first:
        active_authors = get_first_active_users(author_counts, month, 'author', threshold=active_threshold, num_authors=num_active_authors, kind=kind)
        active_comments = get_user_first_comments(comments, list(active_authors), month=month)
    else:
        active_authors = get_active_users(author_counts, month, 'author', threshold=active_threshold, num_authors=num_active_authors, kind=kind)
        active_comments = get_user_comments(comments, list(active_authors), month=month, num_posts=num_active_posts)

    # outside authors and their text
    outside_authors = get_outside_users(author_counts, month, 'author', threshold=1, num_authors=num_outside_authors, kind=kind)
    print('sampled active users:', len(active_authors), 'sampled outside users:', len(outside_authors))
    outside_comments = get_user_comments(comments, list(outside_authors), month=month, num_posts=num_outside_posts)
    print('sampled active comments:', len(active_comments), 'sampled outside comments:', len(outside_comments))

    # export to SRILM test directory
    test_prefix = '../data/srilm_data/test_'
    active_corpus_file = export_text(active_comments, name=sub_name+'_inside_month_'+month_string, corpus_path=test_prefix)
    outside_corpus_file = export_text(outside_comments, name=sub_name+'_outside_month_'+month_string, corpus_path=test_prefix)

    # run through SRILM LMs for the month and get the entropy
    active_ent = calc_month_entropy_SRILM(slms, active_corpus_file)
    outside_ent = calc_month_entropy_SRILM(slms, outside_corpus_file)

    # calculate the acc gap
    mean_active = np.mean(active_ent)
    mean_outside = np.mean(outside_ent)
    acc_gap = (mean_outside - mean_active) / mean_active

    return acc_gap, (active_ent, outside_ent)
       

# Utility funcs for calculating acc gaps

These functions are pretty weird and highly specialized to what I was doing. Honestly, using the smaller functions above is probably a safer bet if you can't remember exactly what these are for.

In [None]:

def get_df_posts_and_comments(s, year):
    """Load the test-set comments, posts and author counts of subreddit ``s``.

    ``import_csvs`` is called twice with the same test author-count path,
    once for the comment file and once for the post file. The posts'
    ``fulltext`` column is renamed to ``body`` so posts and comments share
    one text column name.

    Returns ``(df_comments, df_posts, df_author_counts_test)``; the author
    counts come from the second (posts) call.
    """
    test_path = 'data/cleaned/test/'+year+'/'
    test_ext = '_test_'+year+'.csv'

    df_comments, _ = import_csvs(s, path=test_path, ext=test_ext, comment_pre_path='data/cleaned/sub_comments/', comment_ext='_comments_'+year+'.csv')
    df_posts, df_author_counts_test = import_csvs(s, path=test_path, ext=test_ext, comment_pre_path='data/cleaned/sub_posts/', comment_ext='_posts_'+year+'.csv')

    df_posts = df_posts.rename(index=str, columns={'fulltext': 'body'})
    return df_comments, df_posts, df_author_counts_test
    


# Acculturation gap of a subreddit's comments and posts based on POST-BASED LMs
def get_acc_gap_post(post_slms, s, year='2018'):
    """Evaluate post-based LMs on comments and posts of subreddit ``s``.

    Post-based LMs are not month specific, so the same models are evaluated
    12 times on fresh samples (``month=None``) to get as many observations
    as the comment-based SLMs, which sample by month.

    Returns:
        ``((total_acc_gap_comments, entropies_comments),
        (total_acc_gap_posts, entropies_posts))``. Each gap list holds one
        gap per round; each entropies dict has keys 'inside' and 'outside'
        with the per-round entropy lists the gaps were computed from.
    """
    df_comments, df_posts, df_author_counts_test = get_df_posts_and_comments(s, year)

    gaps = {'comment': [], 'post': []}
    entropies = {
        'comment': {'inside': [], 'outside': []},
        'post': {'inside': [], 'outside': []},
    }
    # within every round comments are evaluated first, then posts
    sources = (('comment', df_comments), ('post', df_posts))
    for _ in range(1, 13):
        for text_kind, frame in sources:
            gap, (active_ent, outside_ent) = calc_single_acc_gap_SRILM(
                post_slms, df_author_counts_test, frame, s, kind=text_kind, month=None,
                num_active_authors=10, num_active_posts=5, num_outside_authors=50, num_outside_posts=1)
            gaps[text_kind].append(gap)
            entropies[text_kind]['inside'].append(active_ent)
            entropies[text_kind]['outside'].append(outside_ent)

    return (gaps['comment'], entropies['comment']), (gaps['post'], entropies['post'])

# Same function but for comment-based slms
def get_acc_gap_comment(comment_slms, s, year='2018'):
    """Evaluate comment-based (monthly) LMs on comments and posts of ``s``.

    Comments are evaluated month by month with ``calc_acc_gap_SRILM``. For
    posts, each month's models are evaluated on a sample drawn from the
    whole year (``month=None``).

    Returns:
        ``((gap_comments, entropies_comments),
        (total_acc_gap_posts, entropies_posts))``. Each gap list holds one
        gap per month of ``comment_slms``; each entropies dict has keys
        'inside' and 'outside'.
    """
    df_comments, df_posts, df_author_counts_test = get_df_posts_and_comments(s, year)

    # Comments: just use the full acc gap function to loop through all the months
    print('Getting avg acc gap for', colored('comments', 'green'), 'with passed slms')
    dict_gap_comments, entropies_comments = calc_acc_gap_SRILM(
        comment_slms, df_author_counts_test, df_comments, s, kind='comment',
        num_active_authors=10, num_active_posts=5, num_outside_authors=50, num_outside_posts=1)
    gap_comments = list(dict_gap_comments.values())

    # Posts: loop through the SLMs and sample from the full population each time
    print('Getting average acc gap for', colored('posts', 'green'), 'with passed slms')
    total_acc_gap_posts = []
    inside_posts = []
    outside_posts = []
    for month_slms in comment_slms.values():
        gap_posts, (active_ent_posts, outside_ent_posts) = calc_single_acc_gap_SRILM(
            month_slms, df_author_counts_test, df_posts, s, kind='post', month=None,
            num_active_authors=10, num_active_posts=5, num_outside_authors=50, num_outside_posts=1, active_threshold=5)
        total_acc_gap_posts.append(gap_posts)
        inside_posts.append(active_ent_posts)
        outside_posts.append(outside_ent_posts)

    entropies_posts = {'inside': inside_posts, 'outside': outside_posts}
    return (gap_comments, entropies_comments), (total_acc_gap_posts, entropies_posts)



# specialized function for flattening the nested entropy lists
def flatten_entropies(entropy_posts, entropy_comments):
    """Flatten the 'inside' and 'outside' lists of both dicts by one level.

    Every entry is treated as a separate observation. Both dicts are
    modified IN PLACE and returned as ``(entropy_posts, entropy_comments)``.
    """
    for entropies in (entropy_comments, entropy_posts):
        for side in ('inside', 'outside'):
            entropies[side] = list(flatten(entropies[side]))
    return entropy_posts, entropy_comments

# function for grouping inside and outside entropies
def group_inside_outside(entropy_posts, entropy_comments):
    """Regroup entropies by inside/outside.

    Returns two DataFrames, each with columns 'posts' and 'comments': the
    first holds the 'inside' entropies, the second the 'outside' ones. The
    DataFrame form is only there to make plotting easier.
    """
    inside, outside = (
        {'posts': entropy_posts[side], 'comments': entropy_comments[side]}
        for side in ('inside', 'outside')
    )
    return pd.DataFrame(inside), pd.DataFrame(outside)


# function for grouping comment and post entropies
def group_comments_posts(entropy_posts, entropy_comments):
    """Regroup entropies by posts/comments.

    Returns two DataFrames, each with columns 'inside' and 'outside': the
    first built from ``entropy_posts``, the second from ``entropy_comments``.
    The DataFrame form is only there to make plotting easier.
    """
    sides = ('inside', 'outside')
    posts = {side: entropy_posts[side] for side in sides}
    comments = {side: entropy_comments[side] for side in sides}
    return pd.DataFrame(posts), pd.DataFrame(comments)



def plot_entropy_dist(df, labels, ax, title):
    """Draw the distribution of each ``labels`` column of ``df`` onto ``ax``.

    One ``sns.distplot`` is drawn per label. The axes then get the title,
    a 'cross entropy' x-label, a legend and enlarged tick labels.
    """
    for column in labels:
        series = df[column]
        sns.distplot(series, ax=ax, label=column)

    ax.set_title(title, fontsize=40)
    ax.set_xlabel('cross entropy', fontsize=25)
    ax.legend(fontsize=25)
    ax.tick_params(axis='both', which='major', labelsize=35)
    
    


# General utility functions 

In [None]:
def run_anova(values, non_param):
    """Run a one-way test across the groups in ``values`` and print it.

    Args:
        values: sequence of groups, each a sequence of observations.
        non_param: if truthy use ``stats.kruskal``, otherwise
            ``stats.f_oneway``.

    Prints the statistic, the p-value and the between/within degrees of
    freedom (groups - 1, observations - groups). Returns None.
    """
    test = stats.kruskal if non_param else stats.f_oneway
    f, p = test(*values)

    # degrees of freedom for ANOVA
    group_count = len(values)
    observation_count = sum(len(group) for group in values)
    anova_btwn = group_count - 1
    anova_wthn = observation_count - group_count
    print('F ( ', anova_btwn, ', ', anova_wthn, ') =', ('%.3f' % f), ' p =', ('%.10f' % p))
