# Setup environment

In [None]:
# Mount Google Drive at /content/gdrive; every dataset path used later in this
# notebook lives under /content/gdrive/MyDrive.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

In [None]:
# Clone `CreateDebateScraper` library from github
!git clone https://github.com/utkarsh512/CreateDebateScraper.git
# Work from the scraper's source directory; presumably this is what makes the
# `thread` module (imported further down) importable -- TODO confirm.
%cd CreateDebateScraper/src/nested/

In [None]:
!pip install shifterator

In [None]:
from   copy                     import deepcopy
from   itertools                import accumulate
import json
from   matplotlib               import pyplot as plt
import networkx as nx
import nltk
import numpy as np
import pandas as pd
import pickle
import re
from   scipy                    import stats
import textwrap
from   thread                   import Comment, Thread
from   tqdm                     import tqdm
nltk.download('punkt') # For tokenizers
nltk.download('stopwords')
import matplotlib
from   nltk.tokenize            import TweetTokenizer
from   nltk.corpus              import stopwords
from   pprint                   import pprint
import shifterator as sh
import wordcloud
# import skbio
matplotlib.rcParams.update({'font.size': 18})
matplotlib.rcParams["figure.figsize"] = (12, 5)
STOP_WORDS = list(stopwords.words('english'))

In [None]:
tknz = TweetTokenizer()

def clean_text(text):
    """
    Normalise a piece of text and split it into word tokens.

    Steps: lowercase, drop URLs (``http...`` / ``www...``), turn hyphens into
    spaces, collapse whitespace runs, tokenize with ``TweetTokenizer`` and
    strip every character that is neither alphanumeric nor an apostrophe
    from each token.

    :param text: raw text (str)
    :return: list of non-empty cleaned tokens
    """
    text = text.lower()
    # Raw strings keep the regex escapes from being parsed as (invalid)
    # Python string escapes.
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"www\S+", "", text)
    text = text.replace("-", " ")
    text = re.sub(r"\s+", " ", text)
    # Apostrophes (curly and ASCII) are temporarily encoded as 'X' so that they
    # survive the alphanumeric filter below. This is unambiguous because the
    # text has already been lowercased, so no genuine 'X' can be present.
    text = re.sub("[\u2018\u2019']", "X", text)

    wordTokens = list()
    for raw_token in tknz.tokenize(text):
        token = ''.join(ch for ch in raw_token if ch.isalnum() or ch == ' ')
        # Skip tokens that became empty or that were a lone apostrophe.
        if token and token != 'X':
            wordTokens.append(token.replace('X', '\''))
    return wordTokens

# Load CreateDebate dataset

In [None]:
# Topical forums on CreateDebate. We have scraped comments for all of the
# following forums.
categories = [
    'business', 'comedy', 'entertainment', 'health', 'law', 'nsfw',
    'politics2', 'religion', 'science', 'shopping', 'sports',
    'technology', 'travel', 'world',
]

# However, only the forums below are analysed.
# These forums have at least 10k comments each.
categories_selected = [
    'politics2', 'religion', 'world',
    'science', 'law', 'technology',
]

# key: forum name
# value: list of comment records for that forum (starts out empty)
comments = {forum: list() for forum in categories_selected}

In [None]:
# Loading comments from select forums

for cat in tqdm(categories_selected):
    # Get all the `Thread` objects pickled while scraping. The log holds one
    # pickle after another, so keep loading until the end of file is reached.
    # NOTE: pickle can execute arbitrary code -- only load trusted files.
    threads = list()
    with open('/content/gdrive/MyDrive/DL/CreateDebate/' + cat + '/threads.log', 'rb') as fp:
        try:
            while True:
                threads.append(pickle.load(fp))
        except EOFError:
            pass  # normal termination: every pickled thread has been read

    # While classifying CreateDebate comments, we used comments as per author mode.
    # Hence, using the same mode to attach classification score with the comments.
    #
    # score < 0.5 -> ad hominem comment
    #       > 0.5 -> non ad hominem comment
    # NOTE(review): the `ah_score` pickle loaded later in this notebook is
    # documented with the opposite convention (> 0.5 is ad hominem) -- confirm.
    authors = dict()
    for thread in threads:
        for v in thread.comments.values():
            authors.setdefault(v.author, list()).append(v)

    # Load the classification score of the comments.
    with open('/content/gdrive/MyDrive/DL/CreateDebate/' + cat + '/comments_with_score.log', 'rb') as fp:
        cws = pickle.load(fp)

    # Attach classification score with the comments. `cws` is consumed
    # positionally, in the same author-grouped order as built above.
    ctr = 0
    for author_comments in authors.values():
        for comment in author_comments:
            foo = deepcopy(comment.__dict__)
            foo['tag'] = cat
            foo['score'] = cws[ctr][0]
            foo['validation'] = cws[ctr][1][0]
            comments[cat].append(foo)
            ctr += 1

In [None]:
def parse_tstring(tstring):
    """
    Turn a comment's time string into an integer so that posting times can be
    compared numerically.

    The string is split on 'T', ':', '+' and '-'; the last two fields
    (presumably the UTC offset) are dropped and the remaining fields are
    concatenated and parsed as one integer.

    Raises ValueError when the time is the placeholder 'Not Available'.
    """
    if tstring == 'Not Available':
        raise ValueError('Invalid posting time for parse_tstring')
    # Map every separator onto '-' so a single split yields all fields.
    unified = tstring.translate(str.maketrans('T:+', '---'))
    fields = unified.split('-')
    return int(''.join(fields[:-2]))

In [None]:
# Loading AH score
# NOTE: pickle can execute arbitrary code -- only load trusted files.

with open('/content/gdrive/MyDrive/Temp/47-ah-score.pkl', 'rb') as fp:
    ah_score = pickle.load(fp)

# `ah_score` is a dictionary that contains the ah score of the comments written
# by all the users

# key: category -> user
# value: list of ah_score for given user for given category

# value > 0.5 --> ad hominem
# value < 0.5 --> non ad hominem

In [None]:
# Loading CreateDebate profile characteristics into dataframe
df = pd.read_json('/content/gdrive/MyDrive/DL/CreateDebate/profile/results.json', lines=True)

# Extract useful characteristics, each as a dict keyed by username.
# The allies / enemies / hostiles maps store the number of entries per user.
_usernames = df['username'].tolist()

reward_points_map = dict(zip(_usernames, df['reward_points'].tolist()))
efficiency_map    = dict(zip(_usernames, df['efficiency'].tolist()))
allies_map        = {name: len(entries) for name, entries in zip(_usernames, df['allies'].tolist())}
enemies_map       = {name: len(entries) for name, entries in zip(_usernames, df['enemies'].tolist())}
hostiles_map      = {name: len(entries) for name, entries in zip(_usernames, df['hostiles'].tolist())}

In [None]:
def profile_characteristics_stats(user_subset):
    """
    Returns average and standard deviation of characteristics for given subset
    of users

    :param user_subset: iterable of usernames
    :return: (avgs, stds), two lists ordered as
             [reward points, efficiency, #allies, #enemies, #hostiles]

    A user that is missing from a profile map is skipped for that
    characteristic only. If no user of the subset has a given characteristic,
    numpy reports nan for it (with a RuntimeWarning).
    """
    users = list(user_subset)  # allow one-shot iterables; we scan five times
    profile_maps = [reward_points_map, efficiency_map, allies_map,
                    enemies_map, hostiles_map]

    # Explicit membership tests instead of blanket `except: pass`, so that only
    # "user has no profile entry" is tolerated.
    grpd_data = [[mapping[user] for user in users if user in mapping]
                 for mapping in profile_maps]

    avgs = [np.average(x) for x in grpd_data]
    stds = [np.std(x) for x in grpd_data]

    return avgs, stds

In [None]:
# Median ah score per category per author
#   key: category -> author
#   value: median ah score
#
# The aggregate is the median, matching this dictionary's name and the
# `median_score_*` variables of its consumers (the code previously took the
# maximum, which marks a user as ad hominem after a single such comment).

ah_score_median = dict()

for category, author_data in ah_score.items():
    ah_score_median[category] = dict()
    for author, ah_scores in author_data.items():
        ah_score_median[category][author] = np.median(ah_scores)

In [None]:
# key: category -> author
# value: number of comments written by author in the given forum
comment_count = dict()

for category in categories_selected:
    per_author = dict()
    for comment in comments[category]:
        author = comment['author']
        per_author[author] = per_author.get(author, 0) + 1
    comment_count[category] = per_author

In [None]:
# Every distinct author with at least one comment in the selected forums.
user_list = list({
    comment['author']
    for category in categories_selected
    for comment in comments[category]
})

In [None]:
# key: user -> category -> year (in string)
# value: number of comments posted by the user for that category in the given year
#
# Every (user, category, year) cell for 2008..2021 starts at zero.
comment_count_user = {
    user: {
        category: {str(year): 0 for year in range(2008, 2022)}
        for category in categories_selected
    }
    for user in user_list
}

for category in categories_selected:
    for comment in comments[category]:
        posted = comment['time']
        if posted == 'Not Available':
            # undated comments cannot be attributed to a year
            continue
        year = posted[:4]
        assert(2008 <= int(year) < 2022)
        comment_count_user[comment['author']][category][year] += 1

In [None]:
# key: category -> user
# value: post time of the first comment by given user in the given category
#        It is an integer as returned by parse_tstring routine
first_post_time = dict()

for category in categories_selected:
    earliest = dict()
    for comment in comments[category]:
        if comment['time'] == 'Not Available':
            continue
        author = comment['author']
        stamp = parse_tstring(comment['time'])
        # keep the smallest timestamp seen so far for this author
        if author not in earliest or stamp < earliest[author]:
            earliest[author] = stamp
    first_post_time[category] = earliest

In [None]:
def get_migrated_users(category1, category2):
    """
    Returns a list of <user_name, year_o, year_m> tuples, one for every user
    that has dated comments in both category1 and category2.

    year_o: Year in which user first posted in category1
    year_m: Year in which user first posted in category2

    Years come from the per-year counts in `comment_count_user` (2008-2021),
    which only include comments with a known posting time. A user whose
    comments in either category are all undated is skipped: no year can be
    determined for them (such users used to be reported with the bogus year
    2007, which later wrapped into the wrong bin of the year plots).

    NOTE: the stricter criterion "the user's very first CreateDebate post must
    be in category1" is NOT enforced here. To enforce it, additionally require
    year_o to precede the user's first active year in every other category.
    """
    years = range(2008, 2022)

    def first_active_year(user, category):
        """First year with at least one dated comment, or None."""
        per_year = comment_count_user[user][category]
        return next((year for year in years if per_year[str(year)] != 0), None)

    resultant_list = []
    for user in user_list:
        year_o = first_active_year(user, category1)
        year_m = first_active_year(user, category2)
        # A dated comment in a category implies the user has posted there, so
        # this also covers "at least one comment in each category".
        if year_o is None or year_m is None:
            continue
        resultant_list.append((user, year_o, year_m))

    return resultant_list

In [None]:
def partition_migrated_users(migration_list, category1, category2):
    """
    Split migrated users into four groups according to their aggregated AH
    score (looked up in `ah_score_median`) in category1 and category2:
        AH-AH
        AH-NonAH
        NonAH-AH
        NonAH-NonAH

    A score above 0.5 counts as AH, below 0.5 as NonAH. An entry for which
    either score is neither above nor below 0.5 is printed and left out.

    migration_list should be obtained using get_migrated_users method.
    Returns the four lists in the order given above.
    """
    buckets = {
        ('ah', 'ah'): [],
        ('ah', 'nonah'): [],
        ('nonah', 'ah'): [],
        ('nonah', 'nonah'): [],
    }

    def side(score):
        if score > 0.5:
            return 'ah'
        if score < 0.5:
            return 'nonah'
        return None  # undecidable, e.g. exactly 0.5

    for entry in migration_list:
        user = entry[0]
        score_1 = ah_score_median[category1][user]
        score_2 = ah_score_median[category2][user]
        bucket = buckets.get((side(score_1), side(score_2)))
        if bucket is None:
            print(entry)
        else:
            bucket.append(entry)

    return (buckets[('ah', 'ah')], buckets[('ah', 'nonah')],
            buckets[('nonah', 'ah')], buckets[('nonah', 'nonah')])

In [None]:
def plot_origin_year(ah_ah, ah_nonah, nonah_ah, nonah_nonah):
    """
    Grouped bar chart: per year (2008-2021), the number of users of each
    partition whose first post in forum 1 falls in that year.

    Each argument is a list of (user, year_o, year_m) tuples; only year_o is
    used. Entries whose year_o lies outside 2008-2021 are ignored instead of
    being counted in a wrong bin through negative-index wrap-around.
    """
    first_year = 2008
    labels = list(range(first_year, 2022))
    groups = [('AH -> AH', ah_ah), ('AH -> NonAH', ah_nonah),
              ('NonAH -> AH', nonah_ah), ('NonAH -> NonAH', nonah_nonah)]

    def tally(entries):
        counts = [0] * len(labels)
        for _user, year_o, _year_m in entries:
            idx = year_o - first_year
            if 0 <= idx < len(counts):
                counts[idx] += 1
        return counts

    ticks = np.arange(len(labels))
    width = 0.20

    fig, ax = plt.subplots()
    for pos, (label, entries) in enumerate(groups):
        # four bars per year, centred on the tick
        ax.bar(ticks + (pos - 1.5) * width, tally(entries), width, label=label)

    ax.set_ylabel('#users')
    ax.set_xlabel('Year of joining forum 1')
    ax.set_xticks(ticks)
    ax.set_xticklabels(labels, rotation=45, ha='right')
    ax.legend()
    plt.show()

In [None]:
def plot_migration_year(ah_ah, ah_nonah, nonah_ah, nonah_nonah):
    """
    Grouped bar chart: per year (2008-2021), the number of users of each
    partition whose first post in forum 2 falls in that year.

    Each argument is a list of (user, year_o, year_m) tuples; only year_m is
    used. Entries whose year_m lies outside 2008-2021 are ignored instead of
    being counted in a wrong bin through negative-index wrap-around.
    """
    first_year = 2008
    labels = list(range(first_year, 2022))
    groups = [('AH -> AH', ah_ah), ('AH -> NonAH', ah_nonah),
              ('NonAH -> AH', nonah_ah), ('NonAH -> NonAH', nonah_nonah)]

    def tally(entries):
        counts = [0] * len(labels)
        for _user, _year_o, year_m in entries:
            idx = year_m - first_year
            if 0 <= idx < len(counts):
                counts[idx] += 1
        return counts

    ticks = np.arange(len(labels))
    width = 0.20

    fig, ax = plt.subplots()
    for pos, (label, entries) in enumerate(groups):
        # four bars per year, centred on the tick
        ax.bar(ticks + (pos - 1.5) * width, tally(entries), width, label=label)

    ax.set_ylabel('#users')
    ax.set_xlabel('Year of joining forum 2')
    ax.set_xticks(ticks)
    ax.set_xticklabels(labels, rotation=45, ha='right')
    ax.legend()
    plt.show()

In [None]:
def plot_delta_year(ah_ah, ah_nonah, nonah_ah, nonah_nonah):
    """
    Grouped bar chart of (migration year - origin year) per partition.

    Each argument is a list of (user, year_o, year_m) tuples. The x axis
    covers 0..13 years and is widened when needed so that every observed
    difference has its own bin. In particular a negative difference (first
    post in forum 2 precedes the first post in forum 1) is shown as such
    instead of wrapping around into the high "years later" bins.
    """
    groups = [('AH -> AH', ah_ah), ('AH -> NonAH', ah_nonah),
              ('NonAH -> AH', nonah_ah), ('NonAH -> NonAH', nonah_nonah)]
    deltas = [[year_m - year_o for _user, year_o, year_m in entries]
              for _label, entries in groups]

    observed = [d for group_deltas in deltas for d in group_deltas]
    lowest = min(observed + [0])
    highest = max(observed + [13])
    labels = list(range(lowest, highest + 1))

    ticks = np.arange(len(labels))
    width = 0.20

    fig, ax = plt.subplots()
    for pos, ((label, _entries), group_deltas) in enumerate(zip(groups, deltas)):
        counts = [0] * len(labels)
        for d in group_deltas:
            counts[d - lowest] += 1
        # four bars per delta, centred on the tick
        ax.bar(ticks + (pos - 1.5) * width, counts, width, label=label)

    ax.set_ylabel('#users')
    ax.set_xlabel('Time difference in years')
    ax.set_xticks(ticks)
    ax.set_xticklabels(labels, rotation=45, ha='right')
    ax.legend()
    plt.show()

In [None]:
# Threads used by `build_graph`. This rebinds the global `threads` that the
# comment-loading loop above also assigns.
# NOTE(review): this reads the 'Politics' directory whereas the comments are
# loaded from 'politics2' -- confirm both refer to the intended forum.
# NOTE: pickle can execute arbitrary code -- only load trusted files.
reader_addr = '/content/gdrive/MyDrive/DL/CreateDebate/Politics/threads.log'
threads = []
with open(reader_addr, 'rb') as reader:
    try:
        while True:
            threads.append(pickle.load(reader))
    except EOFError:
        # Normal termination: the log is a sequence of pickles read until EOF.
        # Any other error (corrupt file, interrupt) is no longer swallowed.
        pass

In [None]:
def build_graph(user_subset, n1 = 0, n2 = 0):
    """Builds support graph and dispute graph from hyper-parameters n1 and n2

    Reads the module-level `threads` list. An author becomes a node when they
    are in `user_subset`, wrote at least one level-1 comment (and at least n1
    of them), and received at least n2 direct replies.

    inputs
    :param user_subset: collection of (hashable) author names eligible as nodes
    :param n1: threshold on number of level-1 comments
    :param n2: threshold on number of direct replies

    output
    (author_map : dict, reverse_map : list, author_count : int, support_graph : nx.DiGraph, support_matrix: list, dispute_graph : nx.DiGraph, dispute_matrix : list)

    matrix[i][j] counts replies written by author i to comments of author j;
    a reply is "support" when both comments share a polarity and "dispute"
    otherwise. Comments with polarity 'Not Available' are not counted. The
    graphs contain one weighted edge i -> j per non-zero matrix entry.
    """

    def comment_trees(thread):
        """Yield the thread's comment trees (metaL, metaR) that have a root."""
        for tree in (thread.metaL, thread.metaR):
            if 'root' in tree:
                yield tree

    # --- authors with enough level-1 comments -------------------------------
    level1_count = dict()
    for thread in threads:
        for tree in comment_trees(thread):
            for key in tree['root'].keys():
                author = thread.comments[key].author
                level1_count[author] = level1_count.get(author, 0) + 1

    level1_authors = {author for author, count in level1_count.items()
                      if count >= n1}

    # --- number of direct replies received per author -----------------------
    reply_count = dict()

    def count_replies(tree, comment_map, cid='root'):
        children = tree[cid]
        if cid != 'root':
            author = comment_map[cid].author
            reply_count[author] = reply_count.get(author, 0) + len(children)
        for key in children.keys():
            count_replies(children, comment_map, key)

    for thread in threads:
        for tree in comment_trees(thread):
            count_replies(tree, thread.comments)

    # --- node selection ------------------------------------------------------
    eligible = set(user_subset)  # O(1) membership instead of scanning a list
    A = [author for author, replies in reply_count.items()
         if author in eligible and replies >= n2 and author in level1_authors]

    author_count = len(A)
    author_map = {author: i for i, author in enumerate(A)}
    reverse_map = list(A)

    support_matrix = [[0 for j in range(author_count)] for i in range(author_count)]
    dispute_matrix = [[0 for j in range(author_count)] for i in range(author_count)]

    # --- fill the reply matrices --------------------------------------------
    def record_replies(tree, comment_map, cid='root'):
        children = tree[cid]
        if cid != 'root':
            cur_author = comment_map[cid].author
            cur_pol = comment_map[cid].polarity

            if cur_author in author_map and cur_pol != 'Not Available':
                cur_author_id = author_map[cur_author]
                for key in children.keys():
                    nxt_author = comment_map[key].author
                    nxt_pol = comment_map[key].polarity
                    if nxt_author in author_map and nxt_pol != 'Not Available':
                        nxt_author_id = author_map[nxt_author]
                        if cur_pol == nxt_pol:
                            support_matrix[nxt_author_id][cur_author_id] += 1
                        else:
                            dispute_matrix[nxt_author_id][cur_author_id] += 1

        for key in children.keys():
            record_replies(children, comment_map, key)

    for thread in threads:
        for tree in comment_trees(thread):
            record_replies(tree, thread.comments)

    # --- graphs --------------------------------------------------------------
    def to_graph(matrix):
        graph = nx.DiGraph()
        graph.add_weighted_edges_from(
            (i, j, matrix[i][j])
            for i in range(author_count)
            for j in range(author_count)
            if matrix[i][j] != 0
        )
        return graph

    support_graph = to_graph(support_matrix)
    dispute_graph = to_graph(dispute_matrix)

    return (author_map, reverse_map, author_count, support_graph, support_matrix, dispute_graph, dispute_matrix)

In [None]:
# Support / dispute graphs over every known user, with the default thresholds
# (n1 = 0, n2 = 0).
user_map, user_reverse_map, user_count, support_graph, support_matrix, dispute_graph, dispute_matrix = build_graph(user_list)

In [None]:
# Degree centrality per node of the all-user graphs. Nodes are the integer ids
# of `user_map`; a user without any edge is not a node and hence has no entry.
support_centrality_dict = nx.algorithms.centrality.degree_centrality(support_graph)
dispute_centrality_dict = nx.algorithms.centrality.degree_centrality(dispute_graph)

In [None]:
def get_centrality_stats(user_subset):
    """
    Mean and standard deviation of the support / dispute degree centrality
    (taken from `support_centrality_dict` / `dispute_centrality_dict`) over
    the given users.

    Users that are not in `user_map`, or whose node has no entry in the
    respective centrality dict, are skipped for that statistic.

    :param user_subset: iterable of usernames
    :return: (support mean, support std, dispute mean, dispute std);
             numpy reports nan where no user contributed a value
    """
    node_ids = [user_map[user] for user in user_subset if user in user_map]

    s_c = [support_centrality_dict[node] for node in node_ids
           if node in support_centrality_dict]
    d_c = [dispute_centrality_dict[node] for node in node_ids
           if node in dispute_centrality_dict]

    return np.average(s_c), np.std(s_c), np.average(d_c), np.std(d_c)

In [None]:
def plot_wordcloud(user_name, categories, time_1, time_2):
    """
    Show a word cloud of everything `user_name` posted in `categories` within
    the half-open date window [time_1, time_2).

    time_1 / time_2 are 'YYYY-MM-DD' strings and are compared as strings with
    the first ten characters of each comment's time. Comments without a
    posting time are ignored.
    """
    def in_window(comment):
        if comment['author'] != user_name:
            return False
        if comment['time'] == 'Not Available':
            return False
        return time_1 <= comment['time'][:10] < time_2

    texts = ' '.join(
        ' '.join(clean_text(comment['body']))
        for category in categories
        for comment in comments[category]
        if in_window(comment)
    )

    cloud = wordcloud.WordCloud(stopwords=STOP_WORDS, collocations=False)
    word_cloud = cloud.generate(texts)

    plt.imshow(word_cloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()

# plot_wordcloud('excon', ['politics2'], '2018-01-01', '2019-01-01')

In [None]:
# First day of every month from 2008-01 through 2021-12, as 'YYYY-MM-01'.
time_list = [
    f'{year}-{month:02d}-01'
    for year in range(2008, 2022)
    for month in range(1, 13)
]

In [None]:
def find_time_idx(time_):
    """Return the position of `time_` in `time_list`, or -1 if it is absent."""
    try:
        return time_list.index(time_)
    except ValueError:
        return -1

In [None]:
# Jensen Shannon Divergence

def plot_js_div(username, categories1, categories2):
    """
    Draw a Jensen-Shannon divergence word-shift graph comparing the words
    `username` used in `categories1` with the words they used in
    `categories2`.

    For each group of forums the user's comment bodies are joined, tokenised
    with `clean_text`, and turned into a token -> frequency dict.
    """
    def collect_text(categories):
        bodies = [
            comment['body']
            for category in categories
            for comment in comments[category]
            if comment['author'] == username
        ]
        return ' '.join(bodies)

    def frequencies(tokens):
        freq = dict()
        for token in tokens:
            freq[token] = freq.get(token, 0) + 1
        return freq

    text1 = collect_text(categories1)
    text2 = collect_text(categories2)

    dict1 = frequencies(clean_text(text1))
    dict2 = frequencies(clean_text(text2))

    sh_instance = sh.JSDivergenceShift(type2freq_1=dict1,
                                       type2freq_2=dict2,
                                       weight_1=0.5,
                                       weight_2=0.5,
                                       base=2,
                                       alpha=1)

    sh_instance.get_shift_graph(title='Jensen-Shannon Divergence Shifts')

# Analysis

In [None]:
# Forum pair under study: users "migrate" from category1 to category2.
category1 = 'politics2'
category2 = 'religion'

############################
# Second origin forum for the (currently commented-out) merged analysis below.
# NOTE(review): this is identical to category1 right now, and the word-cloud
# cells further down compare ['religion', 'politics2'] against 'world' --
# confirm which forum pair is intended.
category1_ = 'politics2'

In [None]:
migration_list = get_migrated_users(category1, category2)
#migration_list_ = get_migrated_users(category1_, category2)

In [None]:
partitions = partition_migrated_users(migration_list, category1, category2)
#partitions_ = partition_migrated_users(migration_list_, category1_, category2)

In [None]:
# for i in range(4):
#     partitions[i].extend(partitions_[i])

In [None]:
for i in range(4):
    print(len(partitions[i]))

In [None]:
plot_origin_year(*partitions)

In [None]:
plot_migration_year(*partitions)

In [None]:
plot_delta_year(*partitions)

In [None]:
ah_ah_list, ah_nonah_list, nonah_ah_list, nonah_nonah_list = partitions

In [None]:
ah_list = ah_ah_list + ah_nonah_list 

nonah_list = nonah_ah_list + nonah_nonah_list

## Wordcloud Generation

### NonAH - AH wordcloud

In [None]:
# For every NonAH -> AH user: the migration entry, the combined number of
# comments in politics2 + religion, and the number of comments in world.
for x in nonah_ah_list:
    print(x)
    user = x[0]
    print(comment_count['politics2'].get(user, 0) + comment_count['religion'].get(user, 0))
    # .get: a user need not have posted in 'world' at all
    print(comment_count['world'].get(user, 0))

In [None]:
# Monthly word clouds for 'Brylos' in religion + politics2. Each window runs
# from one entry of `time_list` to the next, so the last entry only serves as
# an upper bound.
idx = find_time_idx('2008-01-01')
for idx in range(idx, len(time_list) - 1):
    try:
        plot_wordcloud(user_name='Brylos',
                       categories=['religion', 'politics2'],
                       time_1=time_list[idx],
                       time_2=time_list[idx + 1])
    except ValueError:
        # WordCloud raises ValueError when there is no word to plot, i.e. the
        # user wrote nothing usable in this month -- skip it.
        continue
    print(time_list[idx])
    print()

In [None]:
# Monthly word clouds for 'Brylos' in world. Each window runs from one entry
# of `time_list` to the next, so the last entry only serves as an upper bound.
idx = find_time_idx('2008-01-01')
for idx in range(idx, len(time_list) - 1):
    try:
        plot_wordcloud(user_name='Brylos',
                       categories=['world'],
                       time_1=time_list[idx],
                       time_2=time_list[idx + 1])
    except ValueError:
        # WordCloud raises ValueError when there is no word to plot, i.e. the
        # user wrote nothing usable in this month -- skip it.
        continue
    print(time_list[idx])
    print()

In [None]:
plot_js_div('Brylos', ['religion', 'politics2'], ['world'])

### AH - NonAH wordcloud

In [None]:
# For every AH -> NonAH user: the migration entry, the combined number of
# comments in politics2 + religion, and the number of comments in world.
for x in ah_nonah_list:
    print(x)
    user = x[0]
    print(comment_count['politics2'].get(user, 0) + comment_count['religion'].get(user, 0))
    # .get: a user need not have posted in 'world' at all
    print(comment_count['world'].get(user, 0))

In [None]:
# Monthly word clouds for 'Cinder000' in religion + politics2. Each window
# runs from one entry of `time_list` to the next, so the last entry only
# serves as an upper bound.
idx = find_time_idx('2008-01-01')
for idx in range(idx, len(time_list) - 1):
    try:
        plot_wordcloud(user_name='Cinder000',
                       categories=['religion', 'politics2'],
                       time_1=time_list[idx],
                       time_2=time_list[idx + 1])
    except ValueError:
        # WordCloud raises ValueError when there is no word to plot, i.e. the
        # user wrote nothing usable in this month -- skip it.
        continue
    print(time_list[idx])
    print()

In [None]:
# Monthly word clouds for 'Cinder000' in world. Each window runs from one
# entry of `time_list` to the next, so the last entry only serves as an upper
# bound.
idx = find_time_idx('2008-01-01')
for idx in range(idx, len(time_list) - 1):
    try:
        plot_wordcloud(user_name='Cinder000',
                       categories=['world'],
                       time_1=time_list[idx],
                       time_2=time_list[idx + 1])
    except ValueError:
        # WordCloud raises ValueError when there is no word to plot, i.e. the
        # user wrote nothing usable in this month -- skip it.
        continue
    print(time_list[idx])
    print()

In [None]:
plot_js_div('Cinder000', ['religion', 'politics2'], ['world'])

### AH-AH wordcloud

In [None]:
# For every AH -> AH user: name, combined number of comments in
# religion + politics2, and number of comments in world (0 if none).
for x in ah_ah_list:
    print(x[0],
          comment_count['religion'].get(x[0], 0) + comment_count['politics2'].get(x[0], 0),
          comment_count['world'].get(x[0], 0))

In [None]:
plot_js_div('BrontoLite', ['religion', 'politics2'], ['world'])

In [None]:
# Monthly word clouds for 'BrontoLite' in religion + politics2. Each window
# runs from one entry of `time_list` to the next, so the last entry only
# serves as an upper bound.
idx = find_time_idx('2008-01-01')
for idx in range(idx, len(time_list) - 1):
    try:
        plot_wordcloud(user_name='BrontoLite',
                       categories=['religion', 'politics2'],
                       time_1=time_list[idx],
                       time_2=time_list[idx + 1])
    except ValueError:
        # WordCloud raises ValueError when there is no word to plot, i.e. the
        # user wrote nothing usable in this month -- skip it.
        continue
    print(time_list[idx])
    print()

In [None]:
# Monthly word clouds for 'BrontoLite' in world. Each window runs from one
# entry of `time_list` to the next, so the last entry only serves as an upper
# bound.
idx = find_time_idx('2008-01-01')
for idx in range(idx, len(time_list) - 1):
    try:
        plot_wordcloud(user_name='BrontoLite',
                       categories=['world'],
                       time_1=time_list[idx],
                       time_2=time_list[idx + 1])
    except ValueError:
        # WordCloud raises ValueError when there is no word to plot, i.e. the
        # user wrote nothing usable in this month -- skip it.
        continue
    print(time_list[idx])
    print()

## Average and Std. Deviation of characteristics across different groups

In [None]:
def get_users(migration_list):
    """Return the user names of a list of (user, year_o, year_m) tuples, in order."""
    return [user for user, _year_o, _year_m in migration_list]

In [None]:
ah_ah_users = get_users(ah_ah_list)
ah_nonah_users = get_users(ah_nonah_list)
nonah_ah_users = get_users(nonah_ah_list)
nonah_nonah_users = get_users(nonah_nonah_list)

ah_users = get_users(ah_list)
nonah_users = get_users(nonah_list)

In [None]:
# ah in forum 1 (ah -> ah and ah -> nonah users combined)
avgs, stds = profile_characteristics_stats(ah_users)
print(avgs)
print(stds)

In [None]:
# nonah in forum 1 (nonah -> ah and nonah -> nonah users combined)
avgs, stds = profile_characteristics_stats(nonah_users)
print(avgs)
print(stds)

In [None]:
# ah -> ah
avgs, stds = profile_characteristics_stats(ah_ah_users)
print(avgs)
print(stds)

In [None]:
# ah -> nonah
avgs, stds = profile_characteristics_stats(ah_nonah_users)
print(avgs)
print(stds)

In [None]:
# nonah -> ah
avgs, stds = profile_characteristics_stats(nonah_ah_users)
print(avgs)
print(stds)

In [None]:
# nonah -> nonah
avgs, stds = profile_characteristics_stats(nonah_nonah_users)
print(avgs)
print(stds)

In [None]:
# Interaction graphs restricted to users that are AH in forum 1
# (ah -> ah and ah -> nonah combined).
_1, _2, _3, support_graph_, _4, dispute_graph_, _6 = build_graph(ah_users)

# Reciprocity is undefined for a graph without edges (networkx raises
# NetworkXError); -1 is reported in that case.
try:
    support_graph_r = nx.reciprocity(support_graph_)
except nx.NetworkXError:
    support_graph_r = -1

try:
    dispute_graph_r = nx.reciprocity(dispute_graph_)
except nx.NetworkXError:
    dispute_graph_r = -1

print('Support graph reciprocity', support_graph_r)
print('Dispute graph reciprocity', dispute_graph_r)

print(get_centrality_stats(ah_users))

In [None]:
# Interaction graphs restricted to users that are NonAH in forum 1
# (nonah -> ah and nonah -> nonah combined).
_1, _2, _3, support_graph_, _4, dispute_graph_, _6 = build_graph(nonah_users)

# Reciprocity is undefined for a graph without edges (networkx raises
# NetworkXError); -1 is reported in that case.
try:
    support_graph_r = nx.reciprocity(support_graph_)
except nx.NetworkXError:
    support_graph_r = -1

try:
    dispute_graph_r = nx.reciprocity(dispute_graph_)
except nx.NetworkXError:
    dispute_graph_r = -1

print('Support graph reciprocity', support_graph_r)
print('Dispute graph reciprocity', dispute_graph_r)

print(get_centrality_stats(nonah_users))

In [None]:
# Interaction graphs restricted to the ah -> ah users.
_1, _2, _3, support_graph_, _4, dispute_graph_, _6 = build_graph(ah_ah_users)

# Reciprocity is undefined for a graph without edges (networkx raises
# NetworkXError); -1 is reported in that case.
try:
    support_graph_r = nx.reciprocity(support_graph_)
except nx.NetworkXError:
    support_graph_r = -1

try:
    dispute_graph_r = nx.reciprocity(dispute_graph_)
except nx.NetworkXError:
    dispute_graph_r = -1

print('Support graph reciprocity', support_graph_r)
print('Dispute graph reciprocity', dispute_graph_r)

print(get_centrality_stats(ah_ah_users))

In [None]:
# Interaction graphs restricted to the ah -> nonah users.
_1, _2, _3, support_graph_, _4, dispute_graph_, _6 = build_graph(ah_nonah_users)

# Reciprocity is undefined for a graph without edges (networkx raises
# NetworkXError); -1 is reported in that case.
try:
    support_graph_r = nx.reciprocity(support_graph_)
except nx.NetworkXError:
    support_graph_r = -1

try:
    dispute_graph_r = nx.reciprocity(dispute_graph_)
except nx.NetworkXError:
    dispute_graph_r = -1

print('Support graph reciprocity', support_graph_r)
print('Dispute graph reciprocity', dispute_graph_r)

print(get_centrality_stats(ah_nonah_users))

In [None]:
# Interaction graphs restricted to the nonah -> ah users.
_1, _2, _3, support_graph_, _4, dispute_graph_, _6 = build_graph(nonah_ah_users)

# Reciprocity is undefined for a graph without edges (networkx raises
# NetworkXError); -1 is reported in that case.
try:
    support_graph_r = nx.reciprocity(support_graph_)
except nx.NetworkXError:
    support_graph_r = -1

try:
    dispute_graph_r = nx.reciprocity(dispute_graph_)
except nx.NetworkXError:
    dispute_graph_r = -1

print('Support graph reciprocity', support_graph_r)
print('Dispute graph reciprocity', dispute_graph_r)

print(get_centrality_stats(nonah_ah_users))

In [None]:
# Interaction graphs restricted to the nonah -> nonah users.
_1, _2, _3, support_graph_, _4, dispute_graph_, _6 = build_graph(nonah_nonah_users)

# Reciprocity is undefined for a graph without edges (networkx raises
# NetworkXError); -1 is reported in that case.
try:
    support_graph_r = nx.reciprocity(support_graph_)
except nx.NetworkXError:
    support_graph_r = -1

try:
    dispute_graph_r = nx.reciprocity(dispute_graph_)
except nx.NetworkXError:
    dispute_graph_r = -1

print('Support graph reciprocity', support_graph_r)
print('Dispute graph reciprocity', dispute_graph_r)

print(get_centrality_stats(nonah_nonah_users))