In [None]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

In [None]:
!pip install scikit-bio

In [None]:
# Clone `CreateDebateScraper` library from github
!git clone https://github.com/utkarsh512/CreateDebateScraper.git
%cd CreateDebateScraper/src/nested/

In [None]:
from   copy                     import deepcopy
from   itertools                import accumulate
import json
from   matplotlib               import pyplot as plt
import networkx as nx
import nltk
import numpy as np
import pandas as pd
import pickle
import re
from   scipy                    import stats
import textwrap
from   thread                   import Comment, Thread
from   tqdm                     import tqdm
nltk.download('punkt') # For tokenizers
import matplotlib
from   nltk.tokenize            import TweetTokenizer
from   pprint                   import pprint
import skbio
matplotlib.rcParams.update({'font.size': 18})
matplotlib.rcParams["figure.figsize"] = (12, 5)

# Helper function

In [None]:
tknz = TweetTokenizer()

def clean_text(text):
    """
    Lowercase ``text``, strip URLs and punctuation, and return its word tokens.

    :param text: raw comment text (str)
    :return: list of tokens containing only alphanumeric characters and
        apostrophes; tokens that end up empty are dropped

    Tokenization is delegated to the module-level ``tknz`` tokenizer.
    """
    text = text.lower()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"www\S+", "", text)
    text = re.sub("-", " ", text)
    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    text = re.sub(r"\s+", " ", text)
    # Protect apostrophes (curly or straight) with an upper-case 'X' so they
    # survive the alphanumeric filter below. This is safe because the text was
    # lower-cased above, so no other 'X' can be present.
    text = re.sub("[\u2018\u2019']", "X", text)
    wordTokens = list()
    for raw_token in tknz.tokenize(text):
        kept = ''.join(ch for ch in raw_token if ch.isalnum() or ch == ' ')
        # Skip tokens that were pure punctuation or a lone apostrophe.
        if len(kept) > 0 and kept != 'X':
            wordTokens.append(kept.replace('X', '\''))
    return wordTokens

In [None]:
SLUR_WORDS = {
  "jews": [
    "jews",
    "oven dodger",
    "nazi",
    "dirty jew",
    "holocaust",
    "kikesucker",
    "hook nose",
    "kike"
  ],
  "homosexual": [
    "faggots usually",
    "fucking queer",
    "the biggest faggot",
    "dyke",
    "you fucking faggot",
    "hate faggots",
    "queer",
    "homosexual",
    "the faggots",
    "faggot",
    "faggots usually have",
    "gay",
    "faggots",
    "dykey",
    "ugly dyke",
    "faggots like you",
    "you a fag",
    "lesbian",
    "homo",
    "is a faggot",
    "like a faggot",
    "dykes",
    "faggots like",
    "faggot if you ever"
  ],
  "women": [
    "ugly dyke",
    "woman terrorist",
    "nigress",
    "bitch",
    "slut",
    "women",
    "sheeboon",
    "negress",
    "mud shark",
    "women threat",
    "you a lame bitch",
    "your a cunt",
    "white bitch",
    "niggeress",
    "hoe",
    "dykes",
    "niggress",
    "sheboon",
    "feminazi"
  ],
  "blacks": [
    "pavement ape",
    "the niggers",
    "negress",
    "porch monkey",
    "that nigger",
    "this nigger",
    "sheboon",
    "all niggers",
    "eurafrica",
    "shut up nigger",
    "picaninny",
    "african attack",
    "spearchucker",
    "how many niggers",
    "nigger",
    "africa",
    "niggers are in my",
    "dindu nuffin",
    "stupid nigger",
    "moolie",
    "niggers",
    "bluegum",
    "nigger ass",
    "you niggers",
    "fucking nigger",
    "nigger music",
    "niggress",
    "you a nigger",
    "many niggers are",
    "nigress",
    "blacks",
    "teenaper",
    "sheeboon",
    "dumb nigger",
    "niggeress",
    "pickaninny",
    "nigga"
  ],
  "muslim": [
    "muslim immigrant",
    "islam",
    "mudslime",
    "mooslem",
    "muslim refugee",
    "musslime",
    "shitlam",
    "muslim invasion",
    "moslime",
    "mooslamic",
    "muzzie",
    "allah akbar",
    "mooslime",
    "musloid",
    "mudslimes",
    "muslim",
    "muslimes",
    "moslum",
    "mussie",
    "muzrat",
    "muslim countries",
    "muzzy",
    "moslim",
    "jihadi",
    "muslim country",
    "moslem",
    "muzzrat",
    "mooslim"
  ],
  "arabs": [
    "towel head",
    "goatfucker",
    "arabs",
    "goathumper",
    "raghead",
    "rag head",
    "goathumping",
    "towelhead",
    "camel jockey",
    "sandnigger",
    "camel fucker",
    "sand nigger"
  ],
  "generic": [
    "to rape",
    "raped and",
    "shithole country",
    "get raped",
    "raped",
    "is a fucking",
    "shit skin",
    "raped by",
    "hate you",
    "fake empowerment",
    "abusive women",
    "fuck you too",
    "violence",
    "wit a lame nigga",
    "they all look",
    "alllivesmatter",
    "shithole countries",
    "fucking hate",
    "trailer trash",
    "kill all",
    "terrorist threat",
    "harassment",
    "kill yourself",
    "shitskin",
    "okay to be white",
    "fucking hate you"
  ],
  "white": [
    "full of white",
    "white trash",
    "white devil",
    "white",
    "are all white",
    "white boy",
    "white ass",
    "white bitch",
    "hillbilly",
    "whigger",
    "white christian",
    "white person",
    "all white",
    "white nigger",
    "redneck",
    "white honky",
    "wigger",
    "them white"
  ],
  "economy": [
    "ghetto"
  ],
  "immigrant": [
    "illegal immigrants",
    "immigrant not welcome",
    "immigrant terror",
    "mexcrement",
    "go back to where you come from",
    "muslim refugee",
    "illegal aliens",
    "refugee",
    "protect from immigrants",
    "negro",
    "refugees",
    "immigrant",
    "refugee invasion",
    "go back to where they come from",
    "refugees impact",
    "bring ebola",
    "immigrants",
    "illegal alien",
    "immigrant invasion",
    "bring disease"
  ],
  "mental": [
    "retard",
    "mongoloid",
    "retarded"
  ],
  "asians": [
    "asians",
    "ching chong",
    "chinaman"
  ]
}

# Loading CreateDebate dataset

In [None]:
# Every topical forum that was scraped from CreateDebate.
categories = [
    'business', 'comedy', 'entertainment', 'health', 'law', 'nsfw',
    'politics2', 'religion', 'science', 'shopping', 'sports',
    'technology', 'travel', 'world',
]

# Only these forums are analysed; each of them has at least 10k comments.
categories_selected = [
    'politics2', 'religion', 'world',
    'science', 'law', 'technology',
]

# One (initially empty) list of comment records per selected forum.
comments = {forum: list() for forum in categories_selected}

In [None]:
# Loading comments from select forums

for cat in tqdm(categories_selected):
    forum_dir = '/content/gdrive/MyDrive/DL/CreateDebate/' + cat

    # Get all the `Thread` objects pickled while scraping. The log holds one
    # pickle after another, so keep loading until the end of file is hit.
    # NOTE: pickle can execute arbitrary code; only load files you produced.
    threads = list()
    with open(forum_dir + '/threads.log', 'rb') as threads_fp:
        try:
            while True:
                threads.append(pickle.load(threads_fp))
        except EOFError:
            pass

    # While classifying CreateDebate comments, we used comments as per author mode.
    # Hence, using the same mode to attach classification score with the comments.
    #
    # score < 0.5 -> ad hominem comment
    #       > 0.5 -> non ad hominem comment
    authors = dict()
    for thread in threads:
        for comment in thread.comments.values():
            authors.setdefault(comment.author, []).append(comment)

    # Load the classification score of the comments.
    with open(forum_dir + '/comments_with_score.log', 'rb') as scores_fp:
        cws = pickle.load(scores_fp)

    # Attach classification score with the comments. `cws` is consumed
    # positionally, in the same author-grouped order that was built above.
    # NOTE(review): nothing checks that len(cws) equals the number of comments;
    # a longer `cws` would go unnoticed - confirm the two always match.
    ctr = 0
    for author_comments in authors.values():
        for comment in author_comments:
            foo = deepcopy(comment.__dict__)
            foo['tag'] = cat
            foo['score'] = cws[ctr][0]
            foo['validation'] = cws[ctr][1][0]
            comments[cat].append(foo)
            ctr += 1

# Loading slur count data for CreateDebate

In [None]:
# Restore previously computed results from their pickle caches.

# `ah_score` is iterated further down as category -> author -> list of scores.
with open('/content/gdrive/MyDrive/Temp/47-ah-score.pkl', 'rb') as ah_score_file:
    ah_score = pickle.load(ah_score_file)

with open('/content/gdrive/MyDrive/Temp/47-slur-count.pkl', 'rb') as slur_count_file:
    slur_count = pickle.load(slur_count_file)

# Loading CreateDebate user profile data

In [None]:
# CreateDebate user profile records, one JSON object per line (lines=True).
# Columns read later in this notebook: username, reward_points, efficiency,
# allies, enemies, hostiles.
df = pd.read_json('/content/gdrive/MyDrive/DL/CreateDebate/profile/results.json', lines=True)

In [None]:
df

In [None]:
# Lookup tables keyed by username, built column by column from the profile frame.
usernames = df['username'].tolist()

# Scalar attributes are stored as they are.
reward_points_map = dict(zip(usernames, df['reward_points'].tolist()))
efficiency_map    = dict(zip(usernames, df['efficiency'].tolist()))

# Relationship columns hold collections; only their sizes are kept.
allies_map   = {name: len(members) for name, members in zip(usernames, df['allies'].tolist())}
enemies_map  = {name: len(members) for name, members in zip(usernames, df['enemies'].tolist())}
hostiles_map = {name: len(members) for name, members in zip(usernames, df['hostiles'].tolist())}

In [None]:
def get_stats_from_profile_data(user_subset):
    """Summarise profile attributes over a group of users.

    :param user_subset: iterable of usernames; each one must be a key of the
        module-level profile maps (a missing user raises KeyError)
    :return: ``(avgs, stds)`` - two lists holding the mean and the standard
        deviation of, in order: reward points, efficiency, number of allies,
        number of enemies, number of hostiles
    """
    members = list(user_subset)
    lookups = (reward_points_map, efficiency_map, allies_map, enemies_map, hostiles_map)

    # One column of values per attribute, in the order documented above.
    columns = [[lookup[member] for member in members] for lookup in lookups]

    avgs = [np.average(column) for column in columns]
    stds = [np.std(column) for column in columns]
    return avgs, stds

# Analysis

In [None]:
# Collapse each author's list of ah scores into its median.
#   key:   category -> author
#   value: median ah score of that author in that category
ah_score_median = {
    category: {
        author: np.median(ah_scores)
        for author, ah_scores in author_data.items()
    }
    for category, author_data in ah_score.items()
}

In [None]:
# Distinct authors seen in any of the selected forums. The list gives each of
# them a fixed position, used below as a row index.
user_list = list({
    comment['author']
    for category in categories_selected
    for comment in comments[category]
})

In [None]:
len(user_list)

In [None]:
# One row per entry of `user_list`, four integer counters per row, all zero.
ah_activity_matrix = [[0] * 4 for _user in user_list]

In [None]:
# Inclusive [LOWER, UPPER] bands applied to a median ah score when users are
# bucketed further down. The bands are not contiguous: a median that falls
# between two bands, or below LOW_AH_LOWER, ends up in the catch-all bucket 0.
EXTREME_AH_UPPER = 1
EXTREME_AH_LOWER = 0.95

MODERATE_AH_UPPER = 0.8
MODERATE_AH_LOWER = 0.7

LOW_AH_UPPER = 0.6
LOW_AH_LOWER = 0.5

In [None]:
# Count, for every user, in how many selected forums their median ah score
# falls into each bucket:
#   column 1 -> EXTREME band, column 2 -> MODERATE band, column 3 -> LOW band,
#   column 0 -> everything else, including forums with no score for the user.
#
# The matrix is rebuilt here so that re-running this cell does not add the
# same counts a second time.
ah_activity_matrix = [[0] * 4 for _user in user_list]

for i, user in enumerate(user_list):
    for category in categories_selected:
        current_score = ah_score_median.get(category, {}).get(user)
        if current_score is None:
            bucket = 0
        elif EXTREME_AH_LOWER <= current_score <= EXTREME_AH_UPPER:
            bucket = 1
        elif MODERATE_AH_LOWER <= current_score <= MODERATE_AH_UPPER:
            bucket = 2
        elif LOW_AH_LOWER <= current_score <= LOW_AH_UPPER:
            bucket = 3
        else:
            bucket = 0
        ah_activity_matrix[i][bucket] += 1

In [None]:
diversity_map = dict()
# key: author
# value: normalized Shannon diversity of the author's bucket counts, in [0, 1]

# Note that we include only those users who are present in the following categories for at least one topical forum:
# * EXTREME_AH
# * MODERATE_AH
# * LOW_AH

for i, user in enumerate(user_list):
    bucket_counts = ah_activity_matrix[i]

    # Skip users whose forums all fell into the catch-all bucket 0.
    if not any(bucket_counts[1:]):
        continue

    # With 4 buckets the base-2 entropy is at most log2(4) = 2, so dividing by
    # 2 maps it onto [0, 1]. The base is passed explicitly because the
    # normalization is only valid for base 2 and must not depend on the
    # default of whichever scikit-bio release gets installed.
    diversity_map[user] = skbio.diversity.alpha.shannon(bucket_counts, base=2) / 2

In [None]:
len(diversity_map)

In [None]:
# Distribution of the diversity index over the retained users, in ten
# equal-width bins on [0, 1]; user counts are shown on a log axis.
decile_edges = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
plt.hist(diversity_map.values(), bins=decile_edges)
plt.yscale('log')
plt.title('Normalized Shannon diversity index for CreateDebate users')
plt.xlabel('Normalized Shannon diversity index')
plt.ylabel('#Users (in log-scale)')

In [None]:
# Diversity cut-offs used to split users into two groups:
#   highly diverse     : diversity >= HIGH_DIVERSITY_LOWER
#   moderately diverse : MODERATE_DIVERSITY_LOWER <= diversity <= MODERATE_DIVERSITY_UPPER

HIGH_DIVERSITY_LOWER = 0.6

MODERATE_DIVERSITY_LOWER, MODERATE_DIVERSITY_UPPER = 0.3, 0.4

# Both groups start out empty.
high_diversity_users, moderate_diversity_users = set(), set()

In [None]:
# Assign users to the two groups; users whose diversity lies outside both
# ranges are left out of the study.
high_diversity_users.update(
    user for user, d in diversity_map.items()
    if d >= HIGH_DIVERSITY_LOWER
)
moderate_diversity_users.update(
    user for user, d in diversity_map.items()
    if not (d >= HIGH_DIVERSITY_LOWER)
    and MODERATE_DIVERSITY_LOWER <= d <= MODERATE_DIVERSITY_UPPER
)

In [None]:
print("Number of highly diverse users: ", len(high_diversity_users))
print("Number of moderately diverse users: ", len(moderate_diversity_users))

## Network study for highly diverse / moderately diverse groups

**Ref**: Notebook #12

In [None]:
# Load every pickled `Thread` object from the log; the file holds one pickle
# after another, so read until the end of file is reached.
# NOTE(review): this path uses 'Politics' while the forum loader above reads
# from 'politics2' - confirm this is the intended directory.
reader_addr = '/content/gdrive/MyDrive/DL/CreateDebate/Politics/threads.log'
threads = []
with open(reader_addr, 'rb') as reader:
    try:
        while True:
            threads.append(pickle.load(reader))
    except EOFError:
        # Normal termination. Any other error (e.g. a corrupt pickle) is no
        # longer swallowed, so a truncated load cannot go unnoticed.
        pass

In [None]:
def build_graph(user_subset, n1 = 0, n2 = 0):
    """Builds support graph and dispute graph from hyper-parameters n1 and n2

    Reads the module-level ``threads`` list. An author becomes a node when it
    is in ``user_subset``, wrote at least ``n1`` level-1 comments (comments
    directly under a thread's 'root') and its comments received at least
    ``n2`` direct replies in total. For every reply whose author and whose
    parent's author are both nodes, an edge replier -> parent author is
    counted: in the support graph when both comments carry the same polarity,
    in the dispute graph otherwise. Comments whose polarity is
    'Not Available' are ignored for edges.

    inputs
    :param user_subset: container of author names that may become nodes
    :param n1: threshold on number of level-1 comments
    :param n2: threshold on number of direct replies

    output
    (author_map : dict, reverse_map : list, author_count : int, support_graph : nx.DiGraph, support_matrix: list, dispute_graph : nx.DiGraph, dispute_matrix : list)
    """

    def reply_trees():
        # Yield (reply tree, comment lookup) for each side of every thread
        # that has a 'root' entry; left side first, then right side.
        for thread in threads:
            for tree in (thread.metaL, thread.metaR):
                if 'root' in tree.keys():
                    yield tree, thread.comments

    def walk(children):
        # Pre-order traversal: yield (comment id, dict of its direct replies)
        # for every comment below `children`.
        for cid in children.keys():
            replies = children[cid]
            yield cid, replies
            yield from walk(replies)

    def to_graph(matrix):
        # Directed graph with one weighted edge per non-zero matrix cell.
        graph = nx.DiGraph()
        graph.add_weighted_edges_from(
            (i, j, matrix[i][j])
            for i in range(author_count)
            for j in range(author_count)
            if matrix[i][j] != 0
        )
        return graph

    # Number of level-1 comments per author.
    level1_counts = dict()
    for tree, comment_map in reply_trees():
        for cid in tree['root'].keys():
            author = comment_map[cid].author
            level1_counts[author] = level1_counts.get(author, 0) + 1
    level1_authors = {a for a, c in level1_counts.items() if c >= n1}

    # Number of direct replies received per author. Authors without any reply
    # still get an entry (with 0), and insertion order follows the traversal;
    # that order fixes the node ids below.
    reply_counts = dict()
    for tree, comment_map in reply_trees():
        for cid, replies in walk(tree['root']):
            author = comment_map[cid].author
            reply_counts[author] = reply_counts.get(author, 0) + len(replies.keys())

    selected = [
        author for author, n_replies in reply_counts.items()
        if author in user_subset and n_replies >= n2 and author in level1_authors
    ]

    author_count = len(selected)
    author_map = {author: idx for idx, author in enumerate(selected)}
    reverse_map = list(selected)

    support_matrix = [[0 for j in range(author_count)] for i in range(author_count)]
    dispute_matrix = [[0 for j in range(author_count)] for i in range(author_count)]

    # Count interactions; row = replying author, column = author replied to.
    for tree, comment_map in reply_trees():
        for cid, replies in walk(tree['root']):
            cur_author = comment_map[cid].author
            cur_pol = comment_map[cid].polarity
            if cur_author not in author_map or cur_pol == 'Not Available':
                continue
            cur_author_id = author_map[cur_author]
            for key in replies.keys():
                nxt_author = comment_map[key].author
                nxt_pol = comment_map[key].polarity
                if nxt_author not in author_map or nxt_pol == 'Not Available':
                    continue
                nxt_author_id = author_map[nxt_author]
                if cur_pol == nxt_pol:
                    support_matrix[nxt_author_id][cur_author_id] += 1
                else:
                    dispute_matrix[nxt_author_id][cur_author_id] += 1

    support_graph = to_graph(support_matrix)
    dispute_graph = to_graph(dispute_matrix)

    return (author_map, reverse_map, author_count, support_graph, support_matrix, dispute_graph, dispute_matrix)

### Highly diverse users

In [None]:
# Observing variation in properties of group G with variations in n1 and n2
#
# Every result table is indexed [i][j], where i selects the n1 threshold and
# j the n2 threshold passed to build_graph.

thresholds = [1, 2, 5, 10, 20, 50, 100, 200, 500, 1000]
thresholds_str = [str(x) for x in thresholds]
n = len(thresholds)
count = [[0 for j in range(n)] for i in range(n)]
support_graph_r = [[0 for j in range(n)] for i in range(n)]
dispute_graph_r = [[0 for j in range(n)] for i in range(n)]
s_scc = [[0 for j in range(n)] for i in range(n)]
d_scc = [[0 for j in range(n)] for i in range(n)]
for i in range(n):
    for j in range(n):
        graph_data = build_graph(high_diversity_users, thresholds[i], thresholds[j])
        cnt, support_graph, dispute_graph = graph_data[2], graph_data[3], graph_data[5]
        count[i][j] = cnt
        s_scc[i][j] = nx.number_strongly_connected_components(support_graph)
        d_scc[i][j] = nx.number_strongly_connected_components(dispute_graph)
        # Reciprocity is undefined for a graph without edges (networkx raises),
        # so such cells keep their 0. Each graph is checked on its own: an
        # edgeless support graph must not suppress the dispute-graph results.
        if support_graph.number_of_edges() > 0:
            support_graph_r[i][j] = nx.algorithms.reciprocity(support_graph)
        if dispute_graph.number_of_edges() > 0:
            dispute_graph_r[i][j] = nx.algorithms.reciprocity(dispute_graph)

In [None]:
# Heat map of the dispute-graph reciprocity table. Rows (y axis, lambda) follow
# the first index of dispute_graph_r and columns (x axis, rho) the second one;
# both axes are labelled with the threshold values.
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(dispute_graph_r, interpolation='nearest')
fig.colorbar(cax)
ax.set_xticks(np.arange(n))
ax.set_yticks(np.arange(n))
ax.set_xticklabels(thresholds_str)
ax.set_yticklabels(thresholds_str)
# Raw strings: "\l" in a plain literal is an invalid escape sequence.
ax.set_ylabel(r"$\lambda$", rotation='horizontal')
ax.set_xlabel(r"$\rho$")
plt.setp(ax.get_xticklabels(), rotation=90)

In [None]:
# Mean and standard deviation over the highly diverse users, each in the order
# [reward points, efficiency, #allies, #enemies, #hostiles].
avgs, stds = get_stats_from_profile_data(high_diversity_users)

In [None]:
avgs

In [None]:
stds

### Moderately diverse group

In [None]:
# Observing variation in properties of group G with variations in n1 and n2
#
# Every result table is indexed [i][j], where i selects the n1 threshold and
# j the n2 threshold passed to build_graph.

thresholds = [1, 2, 5, 10, 20, 50, 100, 200, 500, 1000]
thresholds_str = [str(x) for x in thresholds]
n = len(thresholds)
count = [[0 for j in range(n)] for i in range(n)]
support_graph_r = [[0 for j in range(n)] for i in range(n)]
dispute_graph_r = [[0 for j in range(n)] for i in range(n)]
s_scc = [[0 for j in range(n)] for i in range(n)]
d_scc = [[0 for j in range(n)] for i in range(n)]
for i in range(n):
    for j in range(n):
        graph_data = build_graph(moderate_diversity_users, thresholds[i], thresholds[j])
        cnt, support_graph, dispute_graph = graph_data[2], graph_data[3], graph_data[5]
        count[i][j] = cnt
        s_scc[i][j] = nx.number_strongly_connected_components(support_graph)
        d_scc[i][j] = nx.number_strongly_connected_components(dispute_graph)
        # Reciprocity is undefined for a graph without edges (networkx raises),
        # so such cells keep their 0. Each graph is checked on its own: an
        # edgeless support graph must not suppress the dispute-graph results.
        if support_graph.number_of_edges() > 0:
            support_graph_r[i][j] = nx.algorithms.reciprocity(support_graph)
        if dispute_graph.number_of_edges() > 0:
            dispute_graph_r[i][j] = nx.algorithms.reciprocity(dispute_graph)

In [None]:
# Heat map of the dispute-graph reciprocity table. Rows (y axis, lambda) follow
# the first index of dispute_graph_r and columns (x axis, rho) the second one;
# both axes are labelled with the threshold values.
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(dispute_graph_r, interpolation='nearest')
fig.colorbar(cax)
ax.set_xticks(np.arange(n))
ax.set_yticks(np.arange(n))
ax.set_xticklabels(thresholds_str)
ax.set_yticklabels(thresholds_str)
# Raw strings: "\l" in a plain literal is an invalid escape sequence.
ax.set_ylabel(r"$\lambda$", rotation='horizontal')
ax.set_xlabel(r"$\rho$")
plt.setp(ax.get_xticklabels(), rotation=90)

In [None]:
# Mean and standard deviation over the moderately diverse users, each in the
# order [reward points, efficiency, #allies, #enemies, #hostiles].
avgs, stds = get_stats_from_profile_data(moderate_diversity_users)

In [None]:
avgs

In [None]:
stds

# Temporal variation in diversity

In [None]:
# forum_ah
#   key:   author -> year
#   value: one bucket id per selected forum
forum_ah = {}

# ah_score_list
#   key:   author -> year -> category
#   value: ah scores of the author's comments
ah_score_list = {}

In [None]:
# Pre-allocate an empty score list for every (user, year, forum) combination,
# with years 2008..2021 stored under their string form.
year_keys = [str(year) for year in range(2008, 2022)]
ah_score_list.update({
    user: {
        syear: {category: list() for category in categories_selected}
        for syear in year_keys
    }
    for user in user_list
})

In [None]:
# File every comment's ah score (1 - classifier score) under its author, the
# year taken from the first four characters of its timestamp, and its forum.
for category in categories_selected:
    for comment in tqdm(comments[category]):
        year = comment['time'][:4]
        try:
            int(year)
        except ValueError:
            # Time is not available for given comment
            continue
        per_year = ah_score_list[comment['author']]
        if year not in per_year:
            # Outside the pre-allocated year range; the analysis below never
            # reads such years, so the comment is skipped rather than crashing.
            continue
        per_year[year][category].append(1 - comment['score'])

In [None]:
# For each studied user and year, record one bucket id per selected forum:
#   1 -> EXTREME band, 2 -> MODERATE band, 3 -> LOW band,
#   0 -> median outside every band, or no comment that year in that forum.

# All-zero pattern, sized to the number of selected forums.
T = (0,) * len(categories_selected)

for user in tqdm(user_list):
    if not (user in high_diversity_users or user in moderate_diversity_users):
        continue
    forum_ah[user] = dict()
    for year in range(2008, 2022):
        syear = str(year)
        which = []
        for category in categories_selected:
            scores = ah_score_list.get(user, {}).get(syear, {}).get(category, [])
            if len(scores) == 0:
                # Taking the median of an empty list only yields NaN plus a
                # RuntimeWarning; go straight to bucket 0.
                which.append(0)
                continue
            median_score = np.median(scores)
            if EXTREME_AH_LOWER <= median_score <= EXTREME_AH_UPPER:
                which.append(1)
            elif MODERATE_AH_LOWER <= median_score <= MODERATE_AH_UPPER:
                which.append(2)
            elif LOW_AH_LOWER <= median_score <= LOW_AH_UPPER:
                which.append(3)
            else:
                which.append(0)
        forum_ah[user][syear] = tuple(which)
        # Report every (user, year) with at least one non-zero bucket.
        if T != forum_ah[user][syear]:
            print(user, year)

In [None]:
# Tabular view of forum_ah: one column per user, one row per year, each cell
# holding that user's tuple of per-forum bucket ids.
pdf = pd.DataFrame(forum_ah)

In [None]:
pdf

In [None]:
def plot_change_direction(user_subset, year1, year2, initial_gid, final_gid):
    """Bar-plot, per forum, the share of users moving between two buckets.

    For every forum, counts the users of ``user_subset`` whose bucket id in
    ``forum_ah`` was ``initial_gid`` in ``year1`` and ``final_gid`` in
    ``year2``, and plots that count as a percentage of the subset size.

    :param user_subset: iterable of users; each must be a key of ``forum_ah``
    :param year1: key of the starting year in ``forum_ah[user]``
    :param year2: key of the ending year in ``forum_ah[user]``
    :param initial_gid: bucket id required in ``year1``
    :param final_gid: bucket id required in ``year2``

    An empty ``user_subset`` yields an all-zero chart instead of a division
    by zero.
    """
    x = list(categories_selected)
    matches = [0] * len(x)
    n_users = 0
    for user in user_subset:
        n_users += 1
        transitions = zip(forum_ah[user][year1], forum_ah[user][year2])
        for idx, (before, after) in enumerate(transitions):
            if before == initial_gid and after == final_gid:
                matches[idx] += 1
    y = [(m / n_users) * 100 if n_users else 0.0 for m in matches]
    plt.bar(x, y)
    plt.xlabel('Forums')
    plt.ylabel('% users')
    plt.title(f'{initial_gid}{final_gid}: {year1} -> {year2}')
In [None]:
plot_change_direction(high_diversity_users, '2017', '2018', 3, 3)

In [None]:
# diversity
#   key:   author -> year
#   value: diversity value
diversity = {}

# ah_score_list
#   key:   author -> year -> category
#   value: ah scores of the author's comments
ah_score_list = {}

In [None]:
# Pre-allocate an empty score list for every (user, year, forum) combination,
# with years 2008..2021 stored under their string form.
year_keys = [str(year) for year in range(2008, 2022)]
ah_score_list.update({
    user: {
        syear: {category: list() for category in categories_selected}
        for syear in year_keys
    }
    for user in user_list
})

In [None]:
# File every comment's ah score (1 - classifier score) under its author, the
# year taken from the first four characters of its timestamp, and its forum.
for category in categories_selected:
    for comment in tqdm(comments[category]):
        year = comment['time'][:4]
        try:
            int(year)
        except ValueError:
            # Time is not available for given comment
            continue
        per_year = ah_score_list[comment['author']]
        if year not in per_year:
            # Outside the pre-allocated year range; the analysis below never
            # reads such years, so the comment is skipped rather than crashing.
            continue
        per_year[year][category].append(1 - comment['score'])

In [None]:
# For each studied user and year, count how many selected forums fall into
# each bucket: freq[1] EXTREME band, freq[2] MODERATE band, freq[3] LOW band,
# freq[0] everything else (including forums without comments that year).
#
# Both groups are processed: plot_diversity_users() looks up `diversity` for
# the moderately diverse users as well and would raise KeyError otherwise.
for user in tqdm(user_list):
    if not (user in high_diversity_users or user in moderate_diversity_users):
        continue
    diversity[user] = dict()
    for year in range(2008, 2022):
        syear = str(year)
        freq = [0, 0, 0, 0]
        for category in categories_selected:
            scores = ah_score_list.get(user, {}).get(syear, {}).get(category, [])
            if len(scores) == 0:
                # Taking the median of an empty list only yields NaN plus a
                # RuntimeWarning; count it in bucket 0 directly.
                freq[0] += 1
                continue
            median_score = np.median(scores)
            if EXTREME_AH_LOWER <= median_score <= EXTREME_AH_UPPER:
                freq[1] += 1
            elif MODERATE_AH_LOWER <= median_score <= MODERATE_AH_UPPER:
                freq[2] += 1
            elif LOW_AH_LOWER <= median_score <= LOW_AH_UPPER:
                freq[3] += 1
            else:
                freq[0] += 1
        # The raw bucket counts are stored, not a diversity index.
        diversity[user][syear] = freq

In [None]:
# List every (user, year) whose bucket counts differ from the pattern in which
# all six forums sit in bucket 0.
all_in_bucket_zero = [6, 0, 0, 0]
for author, per_year in diversity.items():
    for year_key, bucket_counts in per_year.items():
        if bucket_counts == all_in_bucket_zero:
            continue
        print(author, year_key, bucket_counts)

In [None]:
def plot_diversity_users():
    """Plot the yearly average normalized Shannon diversity of both groups.

    Reads the module-level ``diversity`` mapping (author -> year -> value) for
    every user in ``high_diversity_users`` and ``moderate_diversity_users``
    and draws one line per group over the years 2008..2021. A user missing
    from ``diversity`` raises KeyError.
    """
    def as_index(value):
        # `diversity` holds 4-bin frequency vectors; turn one into a Shannon
        # index normalized to [0, 1] (base-2 entropy of 4 bins is at most 2).
        # Averaging the raw vectors would just give a constant. A value that
        # already is a scalar index is passed through unchanged.
        if np.ndim(value) == 0:
            return value
        return skbio.diversity.alpha.shannon(value, base=2) / 2

    x = [str(year) for year in range(2008, 2022)]
    y1 = [np.average([as_index(diversity[user][year]) for user in high_diversity_users]) for year in x]
    y2 = [np.average([as_index(diversity[user][year]) for user in moderate_diversity_users]) for year in x]
    plt.plot(x, y1, label='High')
    plt.plot(x, y2, label='Moderate')
    plt.xlabel('Year')
    plt.ylabel('Diversity')
    plt.legend()
    plt.title('Avg. Shannon diveristy index for highly and moderately diverse group')
In [None]:
plot_diversity_users()