In [None]:
# Mount Google Drive under /content/gdrive (remounting if it is already
# mounted) so files stored on the drive can be read from this Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

In [None]:
# Fetch the CreateDebate-Scraper repository into the current working directory.
!git clone https://github.com/utkarsh512/CreateDebate-Scraper.git

In [None]:
# Work from the repository's src/nested directory (presumably where the
# `thread` module imported in the next cell lives -- verify against the repo).
%cd CreateDebate-Scraper/src/nested/

In [None]:
from thread import Thread, Comment
import pickle
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as colors

In [None]:
# Location of the pickled thread log on the mounted Google Drive.
reader_addr = '/content/gdrive/MyDrive/DL/CreateDebate/Politics/threads.log'

# The log is read as a sequence of pickled objects (presumably Thread
# instances), one pickle.load per object, until the file is exhausted.
# NOTE(security): pickle.load can execute arbitrary code while loading;
# only read logs that come from a trusted source.
threads = []
with open(reader_addr, 'rb') as reader:
    while True:
        try:
            threads.append(pickle.load(reader))
        except EOFError:
            # Normal termination: no further pickled object in the file.
            break
        except pickle.UnpicklingError as err:
            # Keep what was read so far, but report the damaged tail instead
            # of hiding it the way a bare `except:` would.
            print(f'Stopped reading {reader_addr} after {len(threads)} threads: {err}')
            break

In [None]:
def build_graph(n1 = 0, n2 = 0):
    """Builds support graph and dispute graph from hyper-parameters n1 and n2

    Every thread of the module-level ``threads`` list is scanned. An author is
    selected when they wrote at least ``n1`` level-1 comments (direct children
    of a thread's ``'root'``) and their comments received at least ``n2``
    direct replies in total.

    inputs
    :param n1: threshold on number of level-1 comments
    :param n2: threshold on number of direct replies

    output
    (author_map : dict, reverse_map : list, author_count : int,
     support_graph : nx.DiGraph, support_matrix : list,
     dispute_graph : nx.DiGraph, dispute_matrix : list)

    ``author_map`` maps an author to an integer id and ``reverse_map`` is the
    inverse list. ``matrix[i][j]`` counts the replies written by author ``i``
    to comments of author ``j``; a reply is counted in the support matrix when
    both comments have the same polarity and in the dispute matrix otherwise.
    Comments whose polarity is 'Not Available' are not counted. Each graph gets
    one weighted edge ``i -> j`` per non-zero matrix entry, so selected authors
    without any edge do not appear as nodes.
    """

    def comment_trees(thread):
        """Yield the nested reply dicts (metaL, then metaR) that contain a 'root' entry."""
        for tree in (thread.metaL, thread.metaR):
            if 'root' in tree:
                yield tree

    # Pass 1: number of level-1 comments per author.
    level1_counts = dict()
    for thread in threads:
        for tree in comment_trees(thread):
            for cid in tree['root']:
                author = thread.comments[cid].author
                level1_counts[author] = level1_counts.get(author, 0) + 1
    level1_authors = {a for a, c in level1_counts.items() if c >= n1}

    # Pass 2: number of direct replies received per author, over all depths.
    # Authors are inserted in pre-order, which fixes the author ids below.
    reply_counts = dict()

    def count_replies(children, comments):
        """Credit each comment's author with its number of direct replies, recursively."""
        for cid, replies in children.items():
            author = comments[cid].author
            reply_counts[author] = reply_counts.get(author, 0) + len(replies)
            count_replies(replies, comments)

    for thread in threads:
        for tree in comment_trees(thread):
            count_replies(tree['root'], thread.comments)

    # Keep the authors that pass both thresholds and number them.
    authors = [a for a, c in reply_counts.items()
               if c >= n2 and a in level1_authors]
    author_count = len(authors)
    reverse_map = list(authors)
    author_map = {author: idx for idx, author in enumerate(authors)}

    support_matrix = [[0] * author_count for _ in range(author_count)]
    dispute_matrix = [[0] * author_count for _ in range(author_count)]

    def record_replies(children, comments):
        """Count every (reply -> parent) pair between selected authors, recursively."""
        for cid, replies in children.items():
            parent = comments[cid]
            if parent.author in author_map and parent.polarity != 'Not Available':
                parent_id = author_map[parent.author]
                for reply_cid in replies:
                    reply = comments[reply_cid]
                    if reply.author in author_map and reply.polarity != 'Not Available':
                        reply_id = author_map[reply.author]
                        if parent.polarity == reply.polarity:
                            support_matrix[reply_id][parent_id] += 1
                        else:
                            dispute_matrix[reply_id][parent_id] += 1
            # Descend even when the parent itself was skipped.
            record_replies(replies, comments)

    for thread in threads:
        for tree in comment_trees(thread):
            record_replies(tree['root'], thread.comments)

    def to_digraph(matrix):
        """Return a DiGraph with one weighted edge per non-zero entry, added row by row."""
        graph = nx.DiGraph()
        graph.add_weighted_edges_from(
            (i, j, weight)
            for i, row in enumerate(matrix)
            for j, weight in enumerate(row)
            if weight != 0
        )
        return graph

    support_graph = to_digraph(support_matrix)
    dispute_graph = to_digraph(dispute_matrix)

    return (author_map, reverse_map, author_count, support_graph, support_matrix, dispute_graph, dispute_matrix)

In [None]:
# Build both graphs with build_graph's default thresholds (n1 = n2 = 0).
author_map, reverse_map, author_count, support_graph, support_matrix, dispute_graph, dispute_matrix = build_graph()

In [None]:
# Observing variation in properties of group G with variations in n1 and n2
#
# Row index i selects n1 and column index j selects n2, both taken from
# `thresholds`. Each cell stores the author count, the reciprocity and the
# number of strongly connected components of the support / dispute graph.

thresholds = [1, 2, 5, 10, 20, 50, 100, 200, 500, 1000]
thresholds_str = [str(x) for x in thresholds]
n = len(thresholds)
count = [[0] * n for _ in range(n)]
support_graph_r = [[0] * n for _ in range(n)]
dispute_graph_r = [[0] * n for _ in range(n)]
s_scc = [[0] * n for _ in range(n)]
d_scc = [[0] * n for _ in range(n)]
for i, n1 in enumerate(thresholds):
    for j, n2 in enumerate(thresholds):
        result = build_graph(n1, n2)
        # Local names on purpose: the sweep must not overwrite the notebook's
        # global support_graph / dispute_graph.
        cnt, sweep_support, sweep_dispute = result[2], result[3], result[5]
        count[i][j] = cnt
        # networkx raises NetworkXError for the reciprocity of a graph without
        # edges (possible at high thresholds); record NaN for those cells
        # instead of aborting the whole sweep.
        support_graph_r[i][j] = (nx.algorithms.reciprocity(sweep_support)
                                 if sweep_support.number_of_edges() else float('nan'))
        dispute_graph_r[i][j] = (nx.algorithms.reciprocity(sweep_dispute)
                                 if sweep_dispute.number_of_edges() else float('nan'))
        s_scc[i][j] = nx.number_strongly_connected_components(sweep_support)
        d_scc[i][j] = nx.number_strongly_connected_components(sweep_dispute)

In [None]:
# Heat map of d_scc (number of strongly connected components of the dispute
# graph): rows follow the n1 thresholds, columns the n2 thresholds.
fig, ax = plt.subplots()
cax = ax.matshow(d_scc, interpolation='nearest')
fig.colorbar(cax)
ax.set_xticks(np.arange(n))
ax.set_yticks(np.arange(n))
ax.set_xticklabels(thresholds_str)
ax.set_yticklabels(thresholds_str)
# Raw strings keep the LaTeX backslashes literal; "\l" in a normal string is
# an invalid escape sequence that newer Python versions warn about.
ax.set_ylabel(r"$\lambda$", rotation='horizontal')
ax.set_xlabel(r"$\rho$")
plt.setp(ax.get_xticklabels(), rotation=90)
plt.savefig('variation_in_number_of_scc_in_dispute_graph_create_debate.eps', format='eps')

# Truncated Create Debate dataset

In [None]:
def build_graph_th(n1 = 0, n2 = 0, Th = 10 ** 10):
    """Build the reply graph of the truncated dataset.

    Only two levels of every thread in the module-level ``threads`` list are
    used: level-1 comments (direct children of ``'root'``) and the direct
    replies to them. Polarity is ignored.

    :param n1: minimum number of level-1 comments an author needs; per thread
        only the first ``Th`` level-1 comments (``metaL`` first, then
        ``metaR``) are counted
    :param n2: minimum number of direct replies the author's level-1 comments
        must have received; each comment contributes at most ``Th`` replies
    :param Th: truncation threshold; it is also the maximum number of
        reply -> parent pairs recorded per thread
    :return: ``(author_map, reverse_map, author_count, ntwrk, matrix)`` where
        ``author_map`` maps an author to an integer id, ``reverse_map`` is the
        inverse list, ``matrix[i][j]`` counts recorded replies by author ``i``
        to level-1 comments of author ``j``, and ``ntwrk`` is an
        ``nx.DiGraph`` with one weighted edge per non-zero matrix entry
    """

    def comment_trees(thread):
        """Yield the nested reply dicts (metaL, then metaR) that contain a 'root' entry."""
        for tree in (thread.metaL, thread.metaR):
            if 'root' in tree:
                yield tree

    # Pass 1: level-1 comments per author, at most Th comments per thread.
    level1_counts = dict()
    for thread in threads:
        seen = 0  # shared by metaL and metaR of the same thread
        for tree in comment_trees(thread):
            for cid in tree['root']:
                if seen >= Th:
                    break
                author = thread.comments[cid].author
                level1_counts[author] = level1_counts.get(author, 0) + 1
                seen += 1
    level1_authors = {a for a, c in level1_counts.items() if c >= n1}

    # Pass 2: replies received on level-1 comments, each comment capped at Th.
    # Insertion order of this dict fixes the author ids below.
    reply_counts = dict()
    for thread in threads:
        for tree in comment_trees(thread):
            for cid, replies in tree['root'].items():
                author = thread.comments[cid].author
                reply_counts[author] = reply_counts.get(author, 0) + min(Th, len(replies))

    # Keep the authors that pass both thresholds and number them.
    authors = [a for a, c in reply_counts.items()
               if c >= n2 and a in level1_authors]
    author_count = len(authors)
    reverse_map = list(authors)
    author_map = {author: idx for idx, author in enumerate(authors)}

    matrix = [[0] * author_count for _ in range(author_count)]

    # Pass 3: reply -> parent pairs between selected authors, at most Th
    # recorded pairs per thread.
    for thread in threads:
        recorded = 0  # shared by metaL and metaR of the same thread
        for tree in comment_trees(thread):
            for cid, replies in tree['root'].items():
                parent_author = thread.comments[cid].author
                if parent_author not in author_map:
                    continue
                parent_id = author_map[parent_author]
                for reply_cid in replies:
                    if recorded >= Th:
                        break
                    reply_author = thread.comments[reply_cid].author
                    if reply_author in author_map:
                        matrix[author_map[reply_author]][parent_id] += 1
                        recorded += 1

    # Edges are added row by row, exactly one per non-zero entry.
    ntwrk = nx.DiGraph()
    ntwrk.add_weighted_edges_from(
        (i, j, weight)
        for i, row in enumerate(matrix)
        for j, weight in enumerate(row)
        if weight != 0
    )

    return (author_map, reverse_map, author_count, ntwrk, matrix)

In [None]:
# Observing variation in properties of group G with variations in n1 and n2
#
# Same sweep on the truncated graph (Th = 50): row index i selects n1 and
# column index j selects n2, both taken from `thresholds`.

thresholds = [1, 2, 5, 10, 20, 50, 100, 200, 500, 1000]
thresholds_str = [str(x) for x in thresholds]
n = len(thresholds)
count = [[0] * n for _ in range(n)]
reci = [[0] * n for _ in range(n)]
n_scc = [[0] * n for _ in range(n)]
for i, n1 in enumerate(thresholds):
    for j, n2 in enumerate(thresholds):
        result = build_graph_th(n1, n2, Th=50)
        cnt, graph = result[2], result[3]
        count[i][j] = cnt
        # networkx raises NetworkXError for the reciprocity of a graph without
        # edges (possible at high thresholds); record NaN for those cells
        # instead of aborting the whole sweep.
        reci[i][j] = (nx.algorithms.reciprocity(graph)
                      if graph.number_of_edges() else float('nan'))
        n_scc[i][j] = nx.number_strongly_connected_components(graph)

In [None]:
# Heat map of n_scc (number of strongly connected components of the truncated
# graph): rows follow the n1 thresholds, columns the n2 thresholds.
fig, ax = plt.subplots()
cax = ax.matshow(n_scc, interpolation='nearest')
fig.colorbar(cax)
ax.set_xticks(np.arange(n))
ax.set_yticks(np.arange(n))
ax.set_xticklabels(thresholds_str)
ax.set_yticklabels(thresholds_str)
# Raw strings keep the LaTeX backslashes literal; "\l" in a normal string is
# an invalid escape sequence that newer Python versions warn about.
ax.set_ylabel(r"$\lambda$", rotation='horizontal')
ax.set_xlabel(r"$\rho$")
plt.setp(ax.get_xticklabels(), rotation=90)
plt.savefig('variation_in_number_of_scc_create_debate_t.eps', format='eps')

# Centrality measures for $n_1 = n_2 = 20$

In [None]:
# Rebuild the graphs used by the centrality cells below.
# NOTE(review): the section heading says n1 = n2 = 20, but this call passes 0
# for both thresholds -- confirm which values are intended.
author_map, reverse_map, author_count, support_graph, support_matrix, dispute_graph, dispute_matrix = build_graph(n1=0, n2=0)

In [None]:
def get_degree_cen(G, author_map, reverse_map, author_count):
    """Print the average degree centrality of ``G`` and plot its ten top authors.

    :param G: graph whose node ids are indices into ``reverse_map``
    :param author_map: not used; kept so existing calls keep working
    :param reverse_map: list giving the author name for a node id
    :param author_count: divisor of the average; the sum of the centralities of
        the nodes of ``G`` is divided by this value, and 0.0 is reported when
        it is zero instead of raising ZeroDivisionError
    """
    deg_map = nx.algorithms.centrality.degree_centrality(G)

    total = 0
    for value in deg_map.values():
        total += value
    avg = total / author_count if author_count else 0.0
    print(f'Average Degree Centrality: {avg}')

    # Highest centrality first; ties are ordered by author name, descending.
    ranked = sorted(((value, reverse_map[node]) for node, value in deg_map.items()),
                    reverse=True)
    top = ranked[:10]
    x = [name for _, name in top]
    y = [value for value, _ in top]

    plt.figure(dpi = 400)
    plt.bar(x, y, color='#8722B9')
    plt.xlabel('Author')
    plt.ylabel('Degree Centrality')
    plt.xticks(rotation=90)
    plt.show()

In [None]:
def get_betweenness_cen(G, author_map, reverse_map, author_count):
    """Print the average betweenness centrality of ``G`` and plot its ten top authors.

    :param G: graph whose node ids are indices into ``reverse_map``
    :param author_map: not used; kept so existing calls keep working
    :param reverse_map: list giving the author name for a node id
    :param author_count: divisor of the average; the sum of the centralities of
        the nodes of ``G`` is divided by this value, and 0.0 is reported when
        it is zero instead of raising ZeroDivisionError
    """
    cen_map = nx.algorithms.centrality.betweenness_centrality(G)

    total = 0
    for value in cen_map.values():
        total += value
    avg = total / author_count if author_count else 0.0
    print(f'Average Betweenness Centrality: {avg}')

    # Highest centrality first; ties are ordered by author name, descending.
    ranked = sorted(((value, reverse_map[node]) for node, value in cen_map.items()),
                    reverse=True)
    top = ranked[:10]
    x = [name for _, name in top]
    y = [value for value, _ in top]

    plt.figure(dpi = 400)
    plt.bar(x, y, color='#8722B9')
    plt.xlabel('Author')
    # Axis label spelling fixed (was 'Betweeness').
    plt.ylabel('Betweenness Centrality')
    plt.xticks(rotation=90)
    plt.show()

In [None]:
def get_influential_nodes(G, reverse_map):
    """Print, as a numbered list, the names of at most the first 20 nodes that VoteRank returns for ``G``.

    :param G: graph whose node ids are indices into ``reverse_map``
    :param reverse_map: list giving the author name for a node id
    """
    ranking = nx.algorithms.centrality.voterank(G)
    for position, node in enumerate(ranking[:20], start=1):
        print(f'{position}. {reverse_map[node]}')

In [None]:
def get_influential_authors(G, reverse_map, group_size=20):
    """Return the VoteRank ordering of ``G`` as author names, split into consecutive groups.

    The number of ranked nodes is printed first. Every ranked node ends up in
    exactly one group; the last group is shorter when the number of ranked
    nodes is not a multiple of ``group_size`` (earlier versions dropped that
    trailing group).

    :param G: graph whose node ids are indices into ``reverse_map``
    :param reverse_map: list giving the author name for a node id
    :param group_size: number of authors per group (default 20)
    :return: list of lists of author names, in ranking order
    :raises ValueError: if ``group_size`` is smaller than 1
    """
    if group_size < 1:
        raise ValueError('group_size must be at least 1')

    ranked = nx.algorithms.centrality.voterank(G)
    sz = len(ranked)
    print(sz)

    # Step through the ranking in strides so the final partial group is kept.
    return [
        [reverse_map[node] for node in ranked[lo:lo + group_size]]
        for lo in range(0, sz, group_size)
    ]

In [None]:
# VoteRank ordering of the dispute graph's authors, grouped into lists of names.
foo = get_influential_authors(dispute_graph, reverse_map)

In [None]:
# Print each group of influential authors on its own line, then the number of groups.
for author_group in foo:
    print(author_group)
print(len(foo))