# Experimental Setup

In [None]:
# Mount Google drive to Colab.
# All data files read by the loading cells below live under
# '/content/gdrive/MyDrive/', i.e. inside this mount point.
# NOTE(review): `force_remount=True` presumably re-mounts even when a mount
# already exists -- confirm against the Colab documentation.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

In [None]:
# Clone `CreateDebateScraper` library from github, then change into the
# directory that presumably holds its `thread` module, so that the
# `from thread import ...` statement in the import cell can resolve.
# NOTE(review): both the clone target and the `%cd` path are relative to the
# current working directory, so this cell is not idempotent -- re-running it in
# the same session starts from inside `src/nested/`. Confirm before relying on
# "Restart & Run All".
!git clone https://github.com/utkarsh512/CreateDebateScraper.git
%cd CreateDebateScraper/src/nested/

In [None]:
import numpy as np
import pandas as pd
import matplotlib
from   matplotlib import pyplot as plt
import seaborn as sns

from thread import (Comment,
                    Thread)

from copy import deepcopy
import pickle
from tqdm import tqdm
from pprint import pprint

import networkx as nx

In [None]:
# Global plotting defaults: seaborn dark-grid theme, then the figure
# resolution, base font size and default figure size used by every plot.
sns.set(style='darkgrid')
matplotlib.rcParams.update({
    'figure.dpi': 120,
    'font.size': 18,
    'figure.figsize': (12, 5),
})

# Analysis

In [None]:
# Topical forums on CreateDebate. We have scraped comments for all of the
# following forums.
categories = [
    'business', 'comedy', 'entertainment', 'health', 'law', 'nsfw',
    'politics2', 'religion', 'science', 'shopping', 'sports',
    'technology', 'travel', 'world',
]

# However, we will be analyzing comments from selected forums only!
# These forums have at least 10k comments each.
categories_selected = [
    'politics2', 'religion', 'world',
    'science', 'law', 'technology',
]

# key  : forum name
# value: list of comment records for that forum (filled by the loading cell)
comments = {forum: list() for forum in categories_selected}

In [None]:
# Loading comments from select forums.
#
# NOTE: the files read here are pickles. `pickle.load` can execute arbitrary
# code, so this cell must only ever be pointed at files we produced ourselves.

for cat in tqdm(categories_selected):
    forum_dir = '/content/gdrive/MyDrive/DL/CreateDebate/' + cat

    # Start from an empty list so that re-running this cell does not append the
    # same comments to `comments[cat]` a second time.
    comments[cat] = list()

    # Get all the `Thread` objects pickled while scraping. The log stores the
    # objects back to back, so keep unpickling until the stream is exhausted.
    # The `with` block closes the file even if unpickling fails midway.
    threads = list()
    with open(forum_dir + '/threads.log', 'rb') as fp:
        try:
            while True:
                threads.append(pickle.load(fp))
        except EOFError:
            pass

    # While classifying CreateDebate comments, we used comments as per author mode.
    # Hence, using the same mode to attach classification score with the comments.
    #
    # score < 0.5 -> ad hominem comment
    #       > 0.5 -> non ad hominem comment
    #
    # key  : author name
    # value: list of (comment object, comment id) in thread iteration order
    authors = dict()
    for thread in threads:
        for k, v in thread.comments.items():
            authors.setdefault(v.author, list()).append((v, k))

    # Load the classification score of the comments.
    with open(forum_dir + '/comments_with_score.log', 'rb') as fp:
        cws = pickle.load(fp)

    # Attach classification score with the comments. `cws` is indexed by a
    # running counter over (author, comment) pairs, so the iteration order here
    # must stay identical to the one used when the scores were produced.
    ctr = 0
    for author_comments in authors.values():
        for comment, cid in author_comments:
            record = deepcopy(comment.__dict__)
            record['tag'] = cat
            # NOTE(review): assumes each `cws` entry is (score, (validation, ...))
            # -- confirm against the notebook that wrote the file.
            record['score'] = cws[ctr][0]
            record['validation'] = cws[ctr][1][0]
            # Comment ids carry a three-character prefix before the number.
            record['id'] = int(cid[3:])
            comments[cat].append(record)
            ctr += 1

In [None]:
# Loading CreateDebate profile characteristics into dataframe
# (the file is read in JSON-lines mode: one record per line).
df = pd.read_json('/content/gdrive/MyDrive/DL/CreateDebate/profile/results.json', lines=True)

# Extract useful characteristics, every map keyed by username.
# The list-valued columns (allies / enemies / hostiles) are reduced to counts.
_usernames = df['username'].tolist()

reward_points_map = dict(zip(_usernames, df['reward_points'].tolist()))
efficiency_map    = dict(zip(_usernames, df['efficiency'].tolist()))
allies_map        = {name: len(members) for name, members in zip(_usernames, df['allies'].tolist())}
enemies_map       = {name: len(members) for name, members in zip(_usernames, df['enemies'].tolist())}
hostiles_map      = {name: len(members) for name, members in zip(_usernames, df['hostiles'].tolist())}

In [None]:
def profile_characteristics_stats(user_subset):
    """
    Returns average and standard deviation of profile characteristics for
    given subset of users.

    :param user_subset: Iterable containing usernames
    :return: ``(avgs, stds)`` -- two lists of five values each, ordered as
        reward points, efficiency, number of allies, number of enemies and
        number of hostiles. An entry is ``nan`` when no user of the subset
        has that characteristic available.

    >>> avgs, stds = profile_characteristics_stats(user_subset)
    >>> rewards_avg, efficiency_avg, n_allies_avg, n_enemies_avg, n_hostiles_avg = avgs
    >>> rewards_std, efficiency_std, n_allies_std, n_enemies_std, n_hostiles_std = stds

    Note that profile characteristics for some users might not be present in our
    dataset as some users might have deleted their account when we scraped the
    forum to obtain these characteristics. Such users are skipped.
    """
    # Same order as the documented return value.
    characteristic_maps = (reward_points_map, efficiency_map, allies_map,
                           enemies_map, hostiles_map)

    # Materialise once so that one-shot iterables can be scanned for every map.
    users = list(user_subset)

    # Only a missing profile is skipped; any other error is left to surface.
    grpd_data = [
        [profile_map[user] for user in users if user in profile_map]
        for profile_map in characteristic_maps
    ]

    # numpy warns (and yields nan) on empty input; return nan explicitly so an
    # empty group stays quiet and is clearly intentional.
    avgs = [np.average(values) if values else np.nan for values in grpd_data]
    stds = [np.std(values) if values else np.nan for values in grpd_data]

    return avgs, stds

In [None]:
# Every distinct author with at least one comment in the selected forums.
user_list = list({
    comment['author']
    for category in categories_selected
    for comment in comments[category]
})

In [None]:
# Get a list of all comment threads (across the selected forums) to build the
# user network graph from.
#
# NOTE: `pickle.load` can execute arbitrary code; only read files we produced.

threads = []

for category in categories_selected:
    reader_addr = f'/content/gdrive/MyDrive/DL/CreateDebate/{category}/threads.log'
    # `with` closes the file on every exit path.
    with open(reader_addr, 'rb') as reader:
        try:
            # The log stores `Thread` objects back to back; read until exhausted.
            while True:
                threads.append(pickle.load(reader))
        except EOFError:
            # End of the stream. Any other error (e.g. a corrupt record) now
            # propagates instead of being mistaken for end-of-file.
            pass

In [None]:
def build_graph(user_subset, n1 = 0, n2 = 0):
    """
    Builds user network graph from hyper-parameters n1 and n2

    Uses the globally defined `threads` variable; which categories are part
    of the network is therefore decided when `threads` is built.

    Inputs
    ------
    :param user_subset: iterable of author names allowed to become nodes
    :param n1: threshold on number of level-1 comments
    :param n2: threshold on number of direct replies received

    Output
    ------
    (
        author_map: dict,     author name -> node number
        reverse_map: list,    node number -> author name
        author_count: int,    number of selected authors
        graph: nx.DiGraph,    weighted reply graph over node numbers
        matrix: list          author_count x author_count weighted adjacency
    )

    `matrix[i][j]` (and the weight of edge i -> j) is the number of times
    node i directly replied to node j.

    Notes
    -----
    * Only authors that wrote at least one level-1 comment can pass the first
      filter, so authors without any level-1 comment are excluded even when
      `n1` is 0.
    * `graph` is built from non-zero matrix entries only, so selected authors
      without any edge are present in `author_map` / `matrix` but are not
      nodes of `graph`.
    * The tree walks are recursive; recursion depth equals reply-chain depth.
    """
    # Set membership keeps the filter below linear even when a list is passed.
    eligible_users = set(user_subset)

    # key  : author name
    # value: count of level-1 comments (children of 'root' in either side tree)
    level1_counts = dict()
    for e in threads:
        for tree in (e.metaL, e.metaR):
            if 'root' in tree:
                for key in tree['root']:
                    author = e.comments[key].author
                    level1_counts[author] = level1_counts.get(author, 0) + 1

    # Authors who satisfy the constraint on number of level-1 comments.
    level1_authors = {name for name, count in level1_counts.items() if count >= n1}

    # key  : author name
    # value: count of direct replies received
    # Insertion order (pre-order over the threads) fixes the node numbering.
    reply_counts = dict()

    # Depth-first search utility to get number of direct replies for each author.
    # `Map` is the dict that contains `cid` as a key; `Map[cid]` holds its replies.
    def dfs(Map, cmntMap, cid='root'):
        if cid != 'root':
            author = cmntMap[cid].author
            reply_counts[author] = reply_counts.get(author, 0) + len(Map[cid])
        for key in Map[cid]:
            dfs(Map[cid], cmntMap, key)

    # Traverse thread-tree to get number of direct replies for each author.
    for e in threads:
        for tree in (e.metaL, e.metaR):
            if 'root' in tree:
                dfs(tree, e.comments)

    # Authors who satisfy both constraints and belong to the requested subset.
    A = [
        name for name, count in reply_counts.items()
        if name in eligible_users and count >= n2 and name in level1_authors
    ]

    # key  : author name
    # value: corresponding node number in the network
    author_map = {name: idx for idx, name in enumerate(A)}

    # To get author name for node number
    reverse_map = list(A)
    author_count = len(A)

    # Weighted adjacency matrix for the user network.
    matrix = [[0 for j in range(author_count)] for i in range(author_count)]

    # Depth-first search utility to fill the adjacency matrix: for every comment
    # by a selected author, credit each direct reply written by a selected author.
    def dfs1(Map, cmntMap, cid='root'):
        if cid != 'root':
            cur_author_id = author_map.get(cmntMap[cid].author)
            if cur_author_id is not None:
                for key in Map[cid]:
                    nxt_author_id = author_map.get(cmntMap[key].author)
                    if nxt_author_id is not None:
                        matrix[nxt_author_id][cur_author_id] += 1
        for key in Map[cid]:
            dfs1(Map[cid], cmntMap, key)

    for e in threads:
        for tree in (e.metaL, e.metaR):
            if 'root' in tree:
                dfs1(tree, e.comments)

    # Create NetworkX graph from the adjacency matrix.
    # We need nx graphs in order to get various network stats provided in nx
    # library.
    graph = nx.DiGraph()
    graph.add_weighted_edges_from(
        (i, j, matrix[i][j])
        for i in range(author_count)
        for j in range(author_count)
        if matrix[i][j] != 0
    )

    return (author_map, reverse_map, author_count, graph, matrix)

In [None]:
# Construct global user network over every author in `user_list`, using the
# default thresholds (n1 = n2 = 0) and the globally loaded `threads`.
# `user_map` (author name -> node number) and `Graph` are reused by the
# centrality / clustering cells below.
user_map, user_reverse_map, user_count, Graph, Matrix = build_graph(user_list)

In [None]:
def get_reciprocity_stats(user_subset):
    """
    Returns reciprocity for given subset of users in local network

    A separate network restricted to `user_subset` is built with
    `build_graph` (default thresholds) and its overall reciprocity returned.

    :param user_subset: iterable of author names
    :return: reciprocity of the local network, or ``None`` when NetworkX
        cannot define it (it raises ``NetworkXError`` for a graph without
        edges)

    >>> r = get_reciprocity_stats(user_subset)
    """
    _, _, _, local_graph, _ = build_graph(user_subset)

    # Only the "not defined for this graph" case maps to None; unrelated
    # failures are allowed to surface instead of being silently swallowed.
    try:
        return nx.algorithms.reciprocity(local_graph)
    except nx.NetworkXError:
        return None

In [None]:
# Get dict containing degree centrality value for each node from global network.
# Keys are the node numbers of `Graph` (see `user_map` for name -> number).
# This will be used for computing stats for user subset.
centrality_dict = nx.algorithms.centrality.degree_centrality(Graph)

In [None]:
def get_centrality_stats(user_subset):
    """
    Returns mean and standard deviation of degree centrality for given user
    subset in the global network.

    Users that have no node number in `user_map`, or whose node has no entry
    in `centrality_dict`, are skipped.

    :param user_subset: iterable of author names
    :return: ``(mean, std)``; ``(nan, nan)`` when no user of the subset has a
        centrality value

    >>> c_avg, c_std = get_centrality_stats(user_subset)
    """
    c = []

    for user in user_subset:
        node = user_map.get(user)
        # `is not None` rather than truthiness: node number 0 is valid.
        if node is not None and node in centrality_dict:
            c.append(centrality_dict[node])

    # numpy warns (and yields nan) on empty input; make that case explicit.
    if not c:
        return np.nan, np.nan

    return np.average(c), np.std(c)

In [None]:
# Get dict containing clustering coefficient for each node from global network.
# Keys are the node numbers of `Graph` (see `user_map` for name -> number).
# This will be used for computing stats for user subset.
clustering_dict = nx.algorithms.cluster.clustering(Graph)

In [None]:
def get_clustering_stats(user_subset):
    """
    Returns mean and standard deviation of clustering coefficient for given user
    subset in the global network.

    Users that have no node number in `user_map`, or whose node has no entry
    in `clustering_dict`, are skipped.

    :param user_subset: iterable of author names
    :return: ``(mean, std)``; ``(nan, nan)`` when no user of the subset has a
        clustering coefficient

    >>> c_avg, c_std = get_clustering_stats(user_subset)
    """
    c = []

    for user in user_subset:
        node = user_map.get(user)
        # `is not None` rather than truthiness: node number 0 is valid.
        if node is not None and node in clustering_dict:
            c.append(clustering_dict[node])

    # numpy warns (and yields nan) on empty input; make that case explicit.
    if not c:
        return np.nan, np.nan

    return np.average(c), np.std(c)

In [None]:
def display_stats(user_subset):
    """
    Prints network and profile statistics for the given subset of users.

    Reports the subset size, reciprocity of the subset's local network, mean
    and standard deviation of degree centrality and clustering coefficient in
    the global network, and of the five profile characteristics.

    :param user_subset: sized collection of author names (``len`` is taken)
    """
    n                          = len(user_subset)
    r                          = get_reciprocity_stats(user_subset)
    deg_avg, deg_std           = get_centrality_stats(user_subset)
    clu_avg, clu_std           = get_clustering_stats(user_subset)
    user_chr_avg, user_chr_std = profile_characteristics_stats(user_subset)

    print('Size: %d' % n)

    # `get_reciprocity_stats` returns None when reciprocity is undefined;
    # formatting None with %.2f would raise TypeError.
    if r is None:
        print('Graph reciprocity: n/a')
    else:
        print('Graph reciprocity: %.2f' % r)

    print('Graph degree centrality: %.5f ± %.5f' % (deg_avg, deg_std))

    print('Graph clustering coeff: %.2f ± %.2f' % (clu_avg, clu_std))

    print('Reward points: %.2f ± %.2f' % (user_chr_avg[0], user_chr_std[0]))
    print('Efficiency   : %.2f ± %.2f' % (user_chr_avg[1], user_chr_std[1]))
    print('# Allies     : %.2f ± %.2f' % (user_chr_avg[2], user_chr_std[2]))
    print('# Enemies    : %.2f ± %.2f' % (user_chr_avg[3], user_chr_std[3]))
    print('# Hostiles   : %.2f ± %.2f' % (user_chr_avg[4], user_chr_std[4]))

# Network study

In [None]:
# Split each forum's comments into two groups based on the `polarity` field:
# comments whose polarity is 'Not Available' go to `perspective_debates`,
# all others go to `for_against_debates`. Every record is deep-copied.
#
# key  : forum name
# value: list of comment records
for_against_debates = {}
perspective_debates = {}

for cat in categories_selected:
    for_against_debates[cat] = [
        deepcopy(entry) for entry in comments[cat]
        if entry['polarity'] != 'Not Available'
    ]
    perspective_debates[cat] = [
        deepcopy(entry) for entry in comments[cat]
        if entry['polarity'] == 'Not Available'
    ]

In [None]:
# For now, only Politics users are considered!
# Distinct authors of the 'politics2' forum, per debate type.
for_against_user_set = {entry['author'] for entry in for_against_debates['politics2']}
perspective_user_set = {entry['author'] for entry in perspective_debates['politics2']}

print(f'{len(for_against_user_set)} & {len(perspective_user_set)}')

In [None]:
# Encoding labels used while classification.
# Refer to notebook#63.
# The position of a class name in this tuple is its integer label.
_label_names = (
    'faulty generalization',
    'false causality',
    'circular reasoning',
    'ad populum',
    'ad hominem',
    'fallacy of logic',
    'appeal to emotion',
    'false dilemma',
    'equivocation',
    'fallacy of extension',
    'fallacy of relevance',
    'fallacy of credibility',
    'intentional',
)

# class name -> integer label
label_map = {name: index for index, name in enumerate(_label_names)}

# integer label -> class name
inverse_label_map = {index: name for name, index in label_map.items()}

In [None]:
def load_obj(file_path):
    """Read one pickled object back from disk.

    :param file_path: Path to the pickle file of the object
    :type file_path: string
    :return: the unpickled object
    """
    with open(file_path, mode='rb') as handle:
        obj = pickle.load(handle)
    return obj

In [None]:
# Load labels and scores obtained during classification
# (one pickle per debate type).
for_against_labels_and_scores = load_obj(
    '/content/gdrive/MyDrive/Temp/63-for_against_labels_and_scores.pkl'
)
perspective_labels_and_scores = load_obj(
    '/content/gdrive/MyDrive/Temp/63-perspective_labels_and_scores.pkl'
)

In [None]:
# key: logical fallacy class
# value: list of comments
# One independent, initially empty list per class name in `label_map`.
for_against_logical = {fallacy: list() for fallacy in label_map}
perspective_logical = {fallacy: list() for fallacy in label_map}

In [None]:
# Group every for/against Politics comment under the class named by the first
# entry of its classification result.

# Empty the buckets first so that re-running this cell does not append every
# comment a second time.
for bucket in for_against_logical.values():
    bucket.clear()

for comment, labels_and_scores in zip(for_against_debates['politics2'], for_against_labels_and_scores):
    raw_label = labels_and_scores[0]['label']
    # Remove the 'LABEL_' prefix explicitly. `str.lstrip('LABEL_')` strips any
    # leading run of the characters L/A/B/E/_ rather than the prefix, which
    # only happened to work because digits are not in that set.
    if raw_label.startswith('LABEL_'):
        raw_label = raw_label[len('LABEL_'):]
    label = int(raw_label)
    for_against_logical[inverse_label_map[label]].append(comment)

In [None]:
# Group every perspective Politics comment under the class named by the first
# entry of its classification result.

# Empty the buckets first so that re-running this cell does not append every
# comment a second time.
for bucket in perspective_logical.values():
    bucket.clear()

for comment, labels_and_scores in zip(perspective_debates['politics2'], perspective_labels_and_scores):
    raw_label = labels_and_scores[0]['label']
    # Remove the 'LABEL_' prefix explicitly. `str.lstrip('LABEL_')` strips any
    # leading run of the characters L/A/B/E/_ rather than the prefix, which
    # only happened to work because digits are not in that set.
    if raw_label.startswith('LABEL_'):
        raw_label = raw_label[len('LABEL_'):]
    label = int(raw_label)
    perspective_logical[inverse_label_map[label]].append(comment)

In [None]:
def get_user_subset_for_against(cls, flag=0):
    """Authors of for/against comments assigned to class `cls`.

    :param cls: logical fallacy class name (key of `for_against_logical`)
    :param flag: when truthy, return the complement instead, i.e. the users
        of `for_against_user_set` that have no comment of that class
    :return: set of author names
    """
    authors = {entry['author'] for entry in for_against_logical[cls]}
    return for_against_user_set - authors if flag else authors

def get_user_subset_perspective(cls, flag=0):
    """Authors of perspective comments assigned to class `cls`.

    :param cls: logical fallacy class name (key of `perspective_logical`)
    :param flag: when truthy, return the complement instead, i.e. the users
        of `perspective_user_set` that have no comment of that class
    :return: set of author names
    """
    authors = {entry['author'] for entry in perspective_logical[cls]}
    return perspective_user_set - authors if flag else authors

In [None]:
# Fallacy classes of interest for this study (all are keys of `label_map`).
# NOTE(review): not referenced by any other cell visible in this notebook.
classes_selected = ('faulty generalization', 'ad hominem', 'fallacy of logic', 'intentional')

In [None]:
# Fallacy class whose user groups are built and inspected in the cells below.
current_logical_cls = 'ad hominem'

In [None]:
# Four user groups for the current class (Politics forum only):
#   A: for/against users with at least one comment of this class
#   B: remaining for/against users (complement of A)
#   C: perspective users with at least one comment of this class
#   D: remaining perspective users (complement of C)
A = get_user_subset_for_against(cls=current_logical_cls, flag=0)
B = get_user_subset_for_against(cls=current_logical_cls, flag=1)
C = get_user_subset_perspective(cls=current_logical_cls, flag=0)
D = get_user_subset_perspective(cls=current_logical_cls, flag=1)

print(len(A), len(B), len(C), len(D))

In [None]:
# Report network and profile statistics for group D; swap in A, B or C to
# inspect the other groups.
display_stats(D)