# Objective

- In this notebook, we analyze the top five logical fallacies (ranked by `%comment`).

- For each logical fallacy, we analyze the top five dependency paths. For each dependency path, we look at the top 5 actual word sequences and 2 example comments.

- We will perform this study on two user groups:
    - Entire CreateDebate users
    - Top 10/100 users (on the basis of coreness value from *core-periphery* study) for *Perspective* debates.

- The goal is to see whether the top dependencies change between the two user groups.

# Notebook Setup

In [None]:
# Mount Google drive to Colab.
# Every data path used in this notebook lives under `/content/gdrive/MyDrive`.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
# Clone `CreateDebateScraper` library from github for handling
# CreateDebate dataset, then change into `src/nested`.
# NOTE(review): the `%cd` is presumably what makes `from thread import ...`
# in the import cell resolve - confirm against the repository layout.
!git clone https://github.com/utkarsh512/CreateDebateScraper.git
%cd CreateDebateScraper/src/nested/

Cloning into 'CreateDebateScraper'...
remote: Enumerating objects: 176, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 176 (delta 5), reused 4 (delta 4), pack-reused 170[K
Receiving objects: 100% (176/176), 207.95 KiB | 3.85 MiB/s, done.
Resolving deltas: 100% (61/61), done.
/content/CreateDebateScraper/src/nested


In [None]:
# Install `cpnet` library for core-periphery analysis
!pip install cpnet

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting cpnet
  Downloading cpnet-0.0.21-py3-none-any.whl (30 kB)
Collecting simanneal>=0.4.2
  Downloading simanneal-0.5.0-py2.py3-none-any.whl (5.6 kB)
Installing collected packages: simanneal, cpnet
Successfully installed cpnet-0.0.21 simanneal-0.5.0


In [None]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib
from   matplotlib import pyplot as plt
import seaborn as sns

from thread import (Comment,
                    Thread)

from collections import deque
from copy import deepcopy
import pickle
import json
from tqdm import tqdm
from pprint import pprint
from functools import lru_cache

import networkx as nx
import spacy
import cpnet
import nltk 



In [None]:
# Setup for plotting
sns.set(style='darkgrid')
matplotlib.rcParams['figure.dpi'] = 120
matplotlib.rcParams['font.size'] = 18
matplotlib.rcParams['figure.figsize'] = (18, 5)

# Setup for nltk
nltk.download('punkt') # For tokenizers
nltk.download('stopwords')
nltk.download('wordnet') # For lemmatizers
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer

# Setup for spacy
# NOTE: despite its name, `scapy_nlp` holds the spaCy pipeline returned by
# `spacy.load`; the parsing utilities below refer to it by this name.
!python -m spacy download en_core_web_sm
scapy_nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


2023-03-06 20:20:05.655851: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-03-06 20:20:05.655969: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-03-06 20:20:07.557708: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:267] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download

# Load CreateDebate Dataset

In [None]:
# Topical forums on CreateDebate. We have scraped comments for all of the
# following forums.
categories = ['business', 'comedy', 'entertainment', 'health', 'law', 'nsfw',
              'politics2', 'religion', 'science', 'shopping', 'sports',
              'technology', 'travel', 'world']

# However, only the forums below are analyzed in this notebook.
# Each of them has at least 10k comments.
categories_selected = ['politics2', 'religion', 'world',
                       'science', 'law', 'technology']

# key  : forum name
# value: list of comment records for that forum (starts out empty)
comments = {forum: list() for forum in categories_selected}

In [None]:
# Loading comments from select forums

for cat in tqdm(categories_selected):
    cat_dir = '/content/gdrive/MyDrive/DL/CreateDebate/' + cat

    # Get all the `Thread` objects pickled while scraping. The log is a
    # sequence of pickles, so we keep loading until the file is exhausted.
    # The `with` block closes the file even if unpickling fails part-way.
    # NOTE: `pickle.load` executes arbitrary code; only read logs you produced.
    threads = list()
    with open(cat_dir + '/threads.log', 'rb') as fp:
        try:
            while True:
                threads.append(pickle.load(fp))
        except EOFError:
            pass

    # While classifying CreateDebate comments, we used comments as per author mode.
    # Hence, using the same mode to attach classification score with the comments.
    #
    # score < 0.5 -> ad hominem comment
    #       > 0.5 -> non ad hominem comment
    #
    # key  : author name
    # value: list of (comment, comment id) pairs, in order of appearance
    authors = dict()
    for thread in threads:
        for k, v in thread.comments.items():
            authors.setdefault(v.author, list()).append((v, k))

    # Load the classification score of the comments.
    with open(cat_dir + '/comments_with_score.log', 'rb') as fp:
        cws = pickle.load(fp)

    # Attach classification score with the comments. `cws` is consumed
    # positionally, so the iteration order here (authors in first-seen order,
    # then each author's comments in order) must match the order used when the
    # scores were produced.
    ctr = 0
    for author_comments in authors.values():
        for comment, cid in author_comments:
            foo = deepcopy(comment.__dict__)
            foo['tag'] = cat
            foo['score'] = cws[ctr][0]
            foo['validation'] = cws[ctr][1][0]
            foo['id'] = int(cid[3:])  # drop the 3-character prefix of the comment id
            comments[cat].append(foo)
            ctr += 1

100%|██████████| 6/6 [00:17<00:00,  2.99s/it]


In [None]:
# Collect every distinct username seen in the selected forums
user_list = list({
    comment_record['author']
    for category in categories_selected
    for comment_record in comments[category]
})

# Utilities for Core-Periphery Analysis

In [None]:
def _iter_thread_comments(tree):
    """Yield `(comment_id, replies)` for every comment below `tree['root']`.

    `tree` is a nested mapping: each comment id maps to the mapping of its
    direct replies. Comments are produced in pre-order (a comment before its
    replies, siblings in mapping order). An explicit stack is used instead of
    recursion so that deeply nested threads cannot hit the recursion limit.
    """
    stack = [iter(tree['root'].items())]
    while stack:
        entry = next(stack[-1], None)
        if entry is None:
            stack.pop()
            continue
        comment_id, replies = entry
        yield comment_id, replies
        stack.append(iter(replies.items()))


def build_graph(user_subset, threads, n1 = 0, n2 = 0):
    """
    Builds user network graph from hyper-parameters n1 and n2

    Inputs
    ------
    :param user_subset: collection of users to consider, rest will be ignored
    :param threads: list containing `Thread`s of comment; each one is read
                    through its `metaL`, `metaR` and `comments` attributes
    :param n1: threshold on number of level-1 comments
    :param n2: threshold on number of direct replies received

    Output
    ------
    (
        author_map: dict,     author name -> node number
        reverse_map: list,    node number -> author name
        author_count: int,    number of nodes
        graph: nx.DiGraph,    weighted reply graph (only nodes with an edge)
        matrix: list          matrix[i][j] = number of times i replied to j
    )
    """
    # Membership is tested once per author; a set keeps that O(1) even when
    # the caller passes a list.
    allowed_users = set(user_subset)

    def thread_sides(thread):
        """Yield the reply trees of a thread (`metaL`, then `metaR`) that have a root."""
        for side in (thread.metaL, thread.metaR):
            if 'root' in side:
                yield side

    # key  : author name
    # value: count of level-1 comments
    level1_count = dict()
    for thread in threads:
        for side in thread_sides(thread):
            for comment_id in side['root']:
                author = thread.comments[comment_id].author
                level1_count[author] = level1_count.get(author, 0) + 1

    # Authors who satisfy the constraint on number of level-1 comments.
    # NOTE(review): an author without any level-1 comment never enters
    # `level1_count`, so such authors are dropped even when n1 == 0. This
    # mirrors the previous behaviour - confirm that it is intended.
    level1_authors = {a for a, count in level1_count.items() if count >= n1}

    # key  : author name
    # value: count of direct replies received
    reply_count = dict()
    for thread in threads:
        for side in thread_sides(thread):
            for comment_id, replies in _iter_thread_comments(side):
                author = thread.comments[comment_id].author
                reply_count[author] = reply_count.get(author, 0) + len(replies)

    # Authors who satisfy both constraints and belong to `user_subset`.
    # The order (first appearance during traversal) fixes the node numbering.
    selected = [
        author
        for author, count in reply_count.items()
        if author in allowed_users and count >= n2 and author in level1_authors
    ]

    # key  : author name
    # value: corresponding node number in the user network
    author_map = {author: node for node, author in enumerate(selected)}

    # To get author name for node number
    reverse_map = list(selected)
    author_count = len(selected)

    # Weighted adjacency matrix for user network.
    # Weight for directed edge b/w Node A and Node B corresponds to the number
    # of times Node A directly replied to Node B.
    matrix = [[0 for j in range(author_count)] for i in range(author_count)]

    for thread in threads:
        for side in thread_sides(thread):
            for comment_id, replies in _iter_thread_comments(side):
                replied_to = author_map.get(thread.comments[comment_id].author)
                if replied_to is None:
                    continue
                for reply_id in replies:
                    replier = author_map.get(thread.comments[reply_id].author)
                    if replier is not None:
                        matrix[replier][replied_to] += 1

    # Create NetworkX graph from the adjacency matrix.
    # We need an nx graph in order to get various network stats provided in nx
    # library.
    graph = nx.DiGraph()
    for i in range(author_count):
        for j in range(author_count):
            if matrix[i][j] != 0:
                graph.add_weighted_edges_from([(i, j, matrix[i][j])])

    return (author_map, reverse_map, author_count, graph, matrix)

In [None]:
def get_coreness_dict(graph):
    """Run core-periphery detection on `graph` and return the coreness values.

    The detector is `cpnet.MINRES` (referred to as the Boyd algorithm in this
    project).

    :param graph: `nx.DiGraph` object
    :return: the result of the detector's `get_coreness()`, i.e. the coreness
             value for each node

    For more info about the algorithm used, see
    https://github.com/skojaku/core-periphery-detection
    """
    detector = cpnet.MINRES()
    detector.detect(graph)
    return detector.get_coreness()

# Utilities for Dependency Parsing

In [None]:
def load_empath_dictionary(path='/content/gdrive/MyDrive/DL/empath/dictionary.tsv'):
    """
    Load the Empath lexicon from a tab-separated file.

    Each non-blank line is expected to be `category<TAB>term<TAB>term...`.

    :param path: location of the TSV file; defaults to the copy on the
                 mounted Google Drive
    :return: dict[str, list] where keys are categories and values are the
             distinct associated words for that category, in file order
    """
    empath_dict = dict()
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank line would otherwise create an empty '' category.
                continue
            name, *terms = line.split("\t")
            # `dict.fromkeys` removes duplicates while keeping the order in
            # which terms appear, so the result is the same on every run.
            empath_dict[name] = list(dict.fromkeys(terms))
    return empath_dict

In [None]:
# key: Empath category name, value: list of words associated with that category
empath_dict = load_empath_dictionary()

In [None]:
# Number of words per Empath category, summarized by mean and std. dev.
tokens_count = [len(category_terms) for category_terms in empath_dict.values()]
print(f'Average token count {np.average(tokens_count)}, Std. dev {np.std(tokens_count)}')

Average token count 83.29381443298969, Std. dev 28.771070501829353


In [None]:
# SOTA slur word dictionary (from Punyajoy).
# It is looked up by lexicon category name when the hate-target word lists
# are assembled further below.
with open('/content/gdrive/MyDrive/DL/slurwords/slur_dictionary.json') as f:
    slur_words_dict = json.load(f)

In [None]:
# Hate-targets broad categories
# Paper: "A Measurement Study of Hate Speech in Social Media", Mainack Mondal
# key  : broad hate-target category (e.g. 'Behavior', 'Class', 'Crime')
# value: list of lexicon category names that make up that target
with open('/content/gdrive/MyDrive/DL/empath/hate_categories.json') as f:
    hate_targets_dict = json.load(f)
pprint(hate_targets_dict)

{'Behavior': ['negative_emotion',
              'timidity',
              'disappointment',
              'animal',
              'smell',
              'anger',
              'torment',
              'shame',
              'lust',
              'sadness',
              'rage',
              'dominant_personality',
              'violence',
              'childish',
              'pet',
              'irritability',
              'fear',
              'sexual',
              'ridicule',
              'wealthy',
              'weakness',
              'nervousness',
              'envy',
              'aggression',
              'hate'],
 'Class': ['economy', 'poor', 'stealing'],
 'Crime': ['prison', 'crime', 'terrorism'],
 'Disablity': ['mental'],
 'Ethnicity': ['immigrant', 'arabs', 'asians'],
 'Gender': ['women', 'feminine'],
 'Physical': ['monster',
              'ugliness',
              'youth',
              'appearance',
              'disgust',
              'hygiene',
        

In [None]:
# key: hate_targets
# value: list of raw tokens associated with that target. For every lexicon
#        category of the target, the slur-dictionary words come first and
#        the Empath words second (whenever the category exists there).
hate_targets_raw = {
    target: [
        token
        for token_type in token_types
        for lexicon in (slur_words_dict, empath_dict)
        if token_type in lexicon
        for token in lexicon[token_type]
    ]
    for target, token_types in hate_targets_dict.items()
}

In [None]:
lemmatizer = WordNetLemmatizer()


def replace_underscores_with_whitespaces(z):
    """Return `z` with every underscore replaced by a single space."""
    return z.replace('_', ' ')


# key: hate_targets
# value: set of processed tokens associated with that target
#        (each raw token is lemmatized first, then has underscores replaced)
hate_targets = {
    target: {
        replace_underscores_with_whitespaces(lemmatizer.lemmatize(raw_token))
        for raw_token in raw_tokens
    }
    for target, raw_tokens in hate_targets_raw.items()
}

# pprint(hate_targets)

In [None]:
def create_dependency_graph(doc):
    """Build an undirected dependency graph from a spaCy-parsed document.

    :param doc: iterable of tokens exposing `i`, `head.i`, `dep_` and `text`
    :return: tuple `(dependency_graph, id_to_text, id_to_token, root)` where
        - dependency_graph maps a token index to a list of
          `(neighbour index, relation)` pairs; every head/child edge is
          stored in both directions,
        - id_to_text maps a token index to the lemmatized token text,
        - id_to_token maps a token index to the token itself,
        - root is the index of the last token tagged 'ROOT' (None if absent).
    """
    edges = []  # (head, child, relation)
    id_to_text = {}
    id_to_token = {}
    root = None
    n_tokens = 0

    for token in doc:
        n_tokens += 1
        idx = token.i
        id_to_text[idx] = lemmatizer.lemmatize(token.text)
        id_to_token[idx] = token
        if token.dep_ == 'ROOT':
            # A root token is its own head, so it contributes no edge.
            root = idx
        else:
            edges.append((token.head.i, idx, token.dep_))

    dependency_graph = {node: [] for node in range(n_tokens)}
    for head, child, rel in edges:
        dependency_graph[head].append((child, rel))
        dependency_graph[child].append((head, rel))

    return dependency_graph, id_to_text, id_to_token, root

In [None]:
def get_personal_pronoun_ids(id_to_token):
    """Index generator: yield the ids of tokens whose fine-grained tag
    (`tag_`) is 'PRP', the personal pronoun tag in spaCy.

    :param id_to_token: id_to_token returned by create_dependency_graph
    """
    yield from (
        token_id
        for token_id, token in id_to_token.items()
        if token.tag_ == 'PRP'
    )

In [None]:
def get_pronoun_ids(id_to_token):
    """Index generator: yield the ids of tokens whose coarse part-of-speech
    (`pos_`) is 'PRON', the pronoun tag in spaCy.

    :param id_to_token: id_to_token returned by create_dependency_graph
    """
    yield from (
        token_id
        for token_id, token in id_to_token.items()
        if token.pos_ == 'PRON'
    )

In [None]:
def get_trigger_ids(id_to_text, trigger_type):
    """Yield the ids of tokens whose text belongs to `hate_targets[trigger_type]`.

    :param id_to_text: id_to_text returned by create_dependency_graph
    :type id_to_text: dict
    :param trigger_type: What type of triggers? (a key of `hate_targets`)
    :type trigger_type: str
    """
    yield from (
        token_id
        for token_id, text in id_to_text.items()
        if text in hate_targets[trigger_type]
    )

In [None]:
def breadth_first_search(dependency_graph, source):
    """Breadth-first traversal of the dependency graph starting at `source`.

    :param dependency_graph: Dependency graph returned by create_dependency_graph
    :type dependency_graph: dict
    :param source: Source node ID
    :type source: int
    :return: tuple `(distance, parent, relation)` of dicts keyed by the
        reachable node ids: number of edges from `source`, BFS parent
        (-1 for `source`) and the relation on the edge to that parent
        (`source` itself has no entry in the relation dict)
    """
    distance = {source: 0}
    bfs_parent = {source: -1}
    edge_relation = {}
    visited = {source}
    frontier = deque([source])

    while frontier:
        node = frontier.popleft()
        for neighbour, rel in dependency_graph[node]:
            if neighbour not in visited:
                visited.add(neighbour)
                frontier.append(neighbour)
                distance[neighbour] = distance[node] + 1
                bfs_parent[neighbour] = node
                edge_relation[neighbour] = rel

    return distance, bfs_parent, edge_relation

In [None]:
def generate_path_from_bfs(source, dest, dist_dict, parent_dict, relation_dict):
    """Reconstruct the BFS path between `source` and `dest`.

    The path is walked backwards, from `dest` up to `source`, so both results
    are listed in that direction.

    :param source: node the BFS was started from
    :param dest: node reached by that BFS
    :param dist_dict, parent_dict, relation_dict: dicts returned by
        breadth_first_search
    :return: `(path_string, indices_list)` - the relations along the path
        joined with ' -> ', and the node ids along the path (dest first,
        source last)
    """
    assert dist_dict[source] == 0
    assert dest in dist_dict

    relations = []
    indices_list = []  # node ids along the path

    node = dest
    while node != source:
        relations.append(relation_dict[node])
        indices_list.append(node)
        node = parent_dict[node]
    indices_list.append(node)

    return ' -> '.join(relations), indices_list

In [None]:
def get_trigger_count(texts, index_generator, n_process=2, batch_size=1000):
    """Count, per hate-target category, how often a trigger token is connected
    (in the dependency graph of the same comment) to a token selected by
    `index_generator`.

    :param texts: list of comment body (text)
    :param index_generator: `get_personal_pronoun_ids` or `get_pronoun_ids`
    :param n_process: No. of processes spawned for processing, refer to pipe utility in spacy
    :param batch_size: Batch size while processing, refer to pipe utility in spacy
    :return: dict mapping every key of `hate_targets` to its count
    """
    trigger_count = {trigger_type: 0 for trigger_type in hate_targets}

    docs = scapy_nlp.pipe(texts, n_process=n_process, batch_size=batch_size)

    for doc in tqdm(docs, total=len(texts)):
        # Parse comment text and create dependency graph
        dependency_graph, id_to_text, id_to_token, _ = create_dependency_graph(doc)

        # Trigger positions depend only on the comment, so look them up once
        # instead of once per selected token.
        triggers = [
            (trigger_type, trigger_id)
            for trigger_type in hate_targets
            for trigger_id in get_trigger_ids(id_to_text, trigger_type)
        ]
        if not triggers:
            continue  # nothing to count; skip the searches entirely

        # Extract the indices using iterator
        for index in index_generator(id_to_token):
            dist, _, _ = breadth_first_search(dependency_graph, index)
            for trigger_type, trigger_id in triggers:
                if trigger_id in dist:
                    trigger_count[trigger_type] += 1

    return trigger_count

In [None]:
def get_trigger_count_by_username_ctx(texts, index_generator, n_process=2, batch_size=1000):
    """Per-user variant of `get_trigger_count`.

    Counts, per hate-target category and per username, how often a trigger
    token is connected (in the dependency graph of the same comment) to a
    token selected by `index_generator`.

    :param texts: list of `(comment body, context)` tuples, as consumed by the
                  spaCy pipe with `as_tuples=True`; `context['username']` is
                  the key under which a comment's hits are counted
    :param index_generator: `get_personal_pronoun_ids` or `get_pronoun_ids`
    :param n_process: No. of processes spawned for processing, refer to pipe utility in spacy
    :param batch_size: Batch size while processing, refer to pipe utility in spacy
    :return: dict mapping every key of `hate_targets` to a dict
             `username -> count`; users without any hit have no entry
    """
    trigger_count = {trigger_type: dict() for trigger_type in hate_targets}

    docs = scapy_nlp.pipe(texts, n_process=n_process, batch_size=batch_size,
                          as_tuples=True)

    for doc, context in tqdm(docs, total=len(texts)):
        # Parse comment text and create dependency graph
        dependency_graph, id_to_text, id_to_token, _ = create_dependency_graph(doc)

        # Trigger positions depend only on the comment, so look them up once
        # instead of once per selected token.
        triggers = [
            (trigger_type, trigger_id)
            for trigger_type in hate_targets
            for trigger_id in get_trigger_ids(id_to_text, trigger_type)
        ]
        if not triggers:
            continue  # nothing to count; skip the searches entirely

        # Extract the indices using iterator
        for index in index_generator(id_to_token):
            dist, _, _ = breadth_first_search(dependency_graph, index)
            for trigger_type, trigger_id in triggers:
                if trigger_id in dist:
                    per_user = trigger_count[trigger_type]
                    username = context['username']
                    per_user[username] = per_user.get(username, 0) + 1

    return trigger_count

In [None]:
def get_trigger_count_by_path(texts, index_generator, n_process=2, batch_size=1000):
    """Computes no. of occurence of dependency paths for given texts and returns it

    A dependency path is the sequence of relations between a token selected by
    `index_generator` and a trigger token of the same comment.

    :param texts: list of comment body (text)
    :param index_generator: `get_personal_pronoun_ids` or `get_pronoun_ids`
    :param n_process: No. of processes spawned for processing, refer to pipe utility in spacy
    :param batch_size: Batch size while processing, refer to pipe utility in spacy
    :return: dict mapping a dependency path string to its no. of occurence
    """
    trigger_count = dict()
    # key: dependency path
    # value: no. of occurence of given dependency path in `texts`

    docs = scapy_nlp.pipe(texts, n_process=n_process, batch_size=batch_size)

    for doc in tqdm(docs, total=len(texts)):
        # Parse comment text and create dependency graph
        dependency_graph, id_to_text, id_to_token, _ = create_dependency_graph(doc)

        # Trigger positions depend only on the comment, so look them up once
        # instead of once per selected token.
        # NOTE: a token that belongs to several hate-target categories is
        # listed (and therefore counted) once per category.
        trigger_ids = [
            trigger_id
            for trigger_type in hate_targets
            for trigger_id in get_trigger_ids(id_to_text, trigger_type)
        ]
        if not trigger_ids:
            continue  # nothing to count; skip the searches entirely

        # Extract the indices using iterator
        for index in index_generator(id_to_token):
            dist, parent, relation = breadth_first_search(dependency_graph,
                                                          index)
            for trigger_id in trigger_ids:
                if trigger_id in dist:
                    path, _ = generate_path_from_bfs(index, trigger_id, dist,
                                                     parent, relation)
                    trigger_count[path] = trigger_count.get(path, 0) + 1

    return trigger_count

In [None]:
def get_words_along_paths(texts, index_generator, n_process=2, batch_size=1000):
    """For each dependency path encountered, it will store the actual words
    which exist as we traverse the path

    :param texts: list of comment body (text)
    :param index_generator: `get_personal_pronoun_ids` or `get_pronoun_ids`
    :param n_process: No. of processes spawned for processing, refer to pipe utility in spacy
    :param batch_size: Batch size while processing, refer to pipe utility in spacy
    :return: dict mapping a dependency path string to the list of word
             sequences (one ' -> '-joined string per occurrence) found along it
    """
    word_path = dict()
    # key: dependency path
    # value: list containing the actual words instead of relationship along the
    #        dependency path

    docs = scapy_nlp.pipe(texts, n_process=n_process, batch_size=batch_size)

    for doc in tqdm(docs, total=len(texts)):
        # Parse comment text and create dependency graph
        dependency_graph, id_to_text, id_to_token, _ = create_dependency_graph(doc)

        # Trigger positions depend only on the comment, so look them up once
        # instead of once per selected token.
        # NOTE: a token that belongs to several hate-target categories is
        # listed (and therefore recorded) once per category.
        trigger_ids = [
            trigger_id
            for trigger_type in hate_targets
            for trigger_id in get_trigger_ids(id_to_text, trigger_type)
        ]
        if not trigger_ids:
            continue  # nothing to record; skip the searches entirely

        # Extract the indices using iterator
        for index in index_generator(id_to_token):
            dist, parent, relation = breadth_first_search(dependency_graph,
                                                          index)
            for trigger_id in trigger_ids:
                if trigger_id not in dist:
                    continue
                path, word_indices = \
                    generate_path_from_bfs(index, trigger_id, dist,
                                           parent, relation)
                words_string = ' -> '.join(id_to_text[word_index]
                                           for word_index in word_indices)
                word_path.setdefault(path, list()).append(words_string)

    return word_path

In [None]:
def get_comments_by_path(texts, index_generator, n_process=2, batch_size=1000):
    """For each dependency path encountered, record which comments contain it
    and the actual words found along the path in each of those comments.

    :param texts: list of comment body (text)
    :param index_generator: `get_personal_pronoun_ids` or `get_pronoun_ids`
    :param n_process: No. of processes spawned for processing, refer to pipe utility in spacy
    :param batch_size: Batch size while processing, refer to pipe utility in spacy
    :return: tuple `(comment_ids, comment_list)` where
        - comment_ids maps a dependency path to a dict
          `comment position -> list of ' -> '-joined word sequences`,
        - comment_list holds the text of every processed comment, so that a
          comment position indexes into it.
    """
    comment_list = list()

    comment_ids = dict()
    # key: dependency path
    # value: dict of <comment_pos, word sequences of given dependency path in the given comment>

    docs = scapy_nlp.pipe(texts, n_process=n_process, batch_size=batch_size)

    for curr_comment_pos, doc in enumerate(tqdm(docs, total=len(texts))):
        # Every comment is stored, even if it yields no path, to keep the
        # positions aligned with the processing order.
        comment_list.append(doc.text)

        # Parse comment text and create dependency graph
        dependency_graph, id_to_text, id_to_token, _ = create_dependency_graph(doc)

        # Trigger positions depend only on the comment, so look them up once
        # instead of once per selected token.
        # NOTE: a token that belongs to several hate-target categories is
        # listed (and therefore recorded) once per category.
        trigger_ids = [
            trigger_id
            for trigger_type in hate_targets
            for trigger_id in get_trigger_ids(id_to_text, trigger_type)
        ]
        if not trigger_ids:
            continue  # nothing to record; skip the searches entirely

        # Extract the indices using iterator
        for index in index_generator(id_to_token):
            dist, parent, relation = breadth_first_search(dependency_graph,
                                                          index)
            for trigger_id in trigger_ids:
                if trigger_id not in dist:
                    continue
                path, word_indices = \
                    generate_path_from_bfs(index, trigger_id, dist,
                                           parent, relation)
                words_string = ' -> '.join(id_to_text[word_index]
                                           for word_index in word_indices)
                per_comment = comment_ids.setdefault(path, dict())
                per_comment.setdefault(curr_comment_pos, list()).append(words_string)

    return comment_ids, comment_list

# Computation for all CreateDebate users

In [None]:
# Split the comments of every forum by debate type: a comment whose polarity
# is 'Not Available' goes to the perspective debates, any other comment goes
# to the for-against debates. Comments are copied, not shared.
for_against_debates = dict()
perspective_debates = dict()

for cat in categories_selected:
    perspective_debates[cat] = [
        deepcopy(entry) for entry in comments[cat]
        if entry['polarity'] == 'Not Available'
    ]
    for_against_debates[cat] = [
        deepcopy(entry) for entry in comments[cat]
        if entry['polarity'] != 'Not Available'
    ]

In [None]:
# For now, only Politics users are considered!
for_against_user_set = {
    entry['author'] for entry in for_against_debates['politics2']
}
perspective_user_set = {
    entry['author'] for entry in perspective_debates['politics2']
}

print(f'{len(for_against_user_set)} & {len(perspective_user_set)}')

6761 & 2002


In [None]:
# Encoding labels used while classification.
# Refer to notebook#63.
label_map = {
    'faulty generalization': 0,
    'false causality': 1,
    'circular reasoning': 2,
    'ad populum': 3,
    'ad hominem': 4,
    'fallacy of logic': 5,
    'appeal to emotion': 6,
    'false dilemma': 7,
    'equivocation': 8,
    'fallacy of extension': 9,
    'fallacy of relevance': 10,
    'fallacy of credibility': 11,
    'intentional': 12,
}

# Reverse lookup: integer code -> fallacy name
inverse_label_map = {code: fallacy for fallacy, code in label_map.items()}

In [None]:
def load_obj(file_path):
    """Read back an object that was stored with pickle.

    :param file_path: Path to the pickle file of the object
    :type file_path: string
    :return: the unpickled object
    """
    with open(file_path, 'rb') as handle:
        obj = pickle.load(handle)
    return obj

In [None]:
def save_obj(obj, file_path):
    """Store an object on disk via pickling (an existing file is overwritten).

    :param obj: Object to pickle
    :param file_path: Path for pickling
    :type file_path: string
    :return: None
    """
    with open(file_path, 'wb') as handle:
        result = pickle.dump(obj, handle)
    return result

In [None]:
# Load labels and scores obtained during classification of for-against and
# perspective debates into the logical fallacies
for_against_labels_and_scores = load_obj(
    '/content/gdrive/MyDrive/Temp/63-for_against_labels_and_scores.pkl'
)
perspective_labels_and_scores = load_obj(
    '/content/gdrive/MyDrive/Temp/63-perspective_labels_and_scores.pkl'
)

In [None]:
# key: logical fallacy class
# value: list of comments
for_against_logical = {fallacy: list() for fallacy in label_map}
perspective_logical = {fallacy: list() for fallacy in label_map}


def _group_comments_by_fallacy(debate_comments, labels_and_scores_list, buckets):
    """Append every comment to the bucket of its predicted fallacy class.

    The i-th entry of `labels_and_scores_list` belongs to the i-th comment;
    its first element carries the predicted label as a string ending in
    `_<integer code>` (e.g. 'LABEL_4').

    :raises ValueError: if the two sequences differ in length, because a
        silent truncation by `zip` would pair comments with wrong labels
    """
    if len(debate_comments) != len(labels_and_scores_list):
        raise ValueError(
            f'{len(debate_comments)} comments but '
            f'{len(labels_and_scores_list)} classification results'
        )
    for comment, labels_and_scores in zip(debate_comments, labels_and_scores_list):
        # Take the text after the last underscore. `str.lstrip('LABEL_')`
        # would strip a *set of characters*, not the prefix.
        label = int(labels_and_scores[0]['label'].rsplit('_', 1)[-1])
        buckets[inverse_label_map[label]].append(comment)


_group_comments_by_fallacy(for_against_debates['politics2'],
                           for_against_labels_and_scores,
                           for_against_logical)
_group_comments_by_fallacy(perspective_debates['politics2'],
                           perspective_labels_and_scores,
                           perspective_logical)

In [None]:
@lru_cache(maxsize=16)
def get_user_subset_for_against(cls):
    """Authors of the for-against comments classified as fallacy `cls`.

    :param cls: logical fallacy class (a key of `for_against_logical`)
    :return: frozenset of author names; memoized per `cls`
    """
    return frozenset(entry['author'] for entry in for_against_logical[cls])

@lru_cache(maxsize=16)
def get_user_subset_perspective(cls):
    """Authors of the perspective comments classified as fallacy `cls`.

    :param cls: logical fallacy class (a key of `perspective_logical`)
    :return: frozenset of author names; memoized per `cls`
    """
    return frozenset(entry['author'] for entry in perspective_logical[cls])

In [None]:
# Fallacy classes examined in the rest of the notebook
classes_selected = (
    'fallacy of relevance',
    'faulty generalization',
    'ad hominem',
    'intentional',
    'appeal to emotion',
)

In [None]:
# Lower-cased comment bodies, keyed by selected fallacy class
for_against_texts = {
    fallacy: [comment['body'].lower() for comment in for_against_logical[fallacy]]
    for fallacy in classes_selected
}
perspective_texts = {
    fallacy: [comment['body'].lower() for comment in perspective_logical[fallacy]]
    for fallacy in classes_selected
}

In [None]:
# Trigger count by path
# (computed per selected fallacy class; for-against texts first, then perspective)

trigger_count_by_path_for_against = {}
trigger_count_by_path_perspective = {}

for fallacy in classes_selected:
    print(fallacy)
    for texts_by_class, counts_by_class in (
        (for_against_texts, trigger_count_by_path_for_against),
        (perspective_texts, trigger_count_by_path_perspective),
    ):
        counts_by_class[fallacy] = get_trigger_count_by_path(texts_by_class[fallacy],
                                                             get_personal_pronoun_ids)

fallacy of relevance


100%|██████████| 15549/15549 [04:52<00:00, 53.14it/s] 
100%|██████████| 5099/5099 [01:17<00:00, 65.50it/s] 


faulty generalization


100%|██████████| 17420/17420 [05:17<00:00, 54.81it/s] 
100%|██████████| 5508/5508 [01:25<00:00, 64.12it/s] 


ad hominem


100%|██████████| 13054/13054 [03:27<00:00, 62.95it/s] 
100%|██████████| 4974/4974 [01:11<00:00, 69.62it/s] 


intentional


100%|██████████| 11516/11516 [02:35<00:00, 74.10it/s] 
100%|██████████| 4478/4478 [00:46<00:00, 95.94it/s] 


appeal to emotion


100%|██████████| 4006/4006 [00:50<00:00, 79.11it/s] 
100%|██████████| 1335/1335 [00:17<00:00, 77.58it/s] 


In [None]:
# Persist the per-class trigger counts of both debate types to Google Drive
save_obj(trigger_count_by_path_for_against, '/content/gdrive/MyDrive/Temp/69-trigger_count_by_path_for_against.pkl')
save_obj(trigger_count_by_path_perspective, '/content/gdrive/MyDrive/Temp/69-trigger_count_by_path_perspective.pkl')

In [None]:
# Words along paths
# (computed per selected fallacy class; for-against texts first, then perspective)

words_along_paths_for_against = {}
words_along_paths_perspective = {}

for fallacy in classes_selected:
    print(fallacy)
    for texts_by_class, words_by_class in (
        (for_against_texts, words_along_paths_for_against),
        (perspective_texts, words_along_paths_perspective),
    ):
        words_by_class[fallacy] = get_words_along_paths(texts_by_class[fallacy],
                                                        get_personal_pronoun_ids)

fallacy of relevance


100%|██████████| 15549/15549 [05:03<00:00, 51.19it/s] 
  0%|          | 0/5099 [00:00<?, ?it/s]Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/queues.py", line 245, in _feed
    send_bytes(obj)
  File "/usr/lib/python3.8/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/usr/lib/python3.8/multiprocessing/connection.py", line 405, in _send_bytes
    self._send(buf)
  File "/usr/lib/python3.8/multiprocessing/connection.py", line 368, in _send
    n = write(self._handle, buf)
BrokenPipeError: [Errno 32] Broken pipe
100%|██████████| 5099/5099 [01:19<00:00, 63.93it/s] 


faulty generalization


100%|██████████| 17420/17420 [05:18<00:00, 54.69it/s] 
100%|██████████| 5508/5508 [01:26<00:00, 63.61it/s] 


ad hominem


100%|██████████| 13054/13054 [03:30<00:00, 61.98it/s] 
100%|██████████| 4974/4974 [01:08<00:00, 72.28it/s]


intentional


100%|██████████| 11516/11516 [02:34<00:00, 74.69it/s] 
100%|██████████| 4478/4478 [00:54<00:00, 81.83it/s] 


appeal to emotion


100%|██████████| 4006/4006 [00:55<00:00, 72.52it/s] 
100%|██████████| 1335/1335 [00:18<00:00, 72.32it/s] 


In [None]:
# Persist the per-class words-along-paths results of both debate types to Google Drive
save_obj(words_along_paths_for_against, '/content/gdrive/MyDrive/Temp/69-words_along_paths_for_against.pkl')
save_obj(words_along_paths_perspective, '/content/gdrive/MyDrive/Temp/69-words_along_paths_perspective.pkl')

In [None]:
# Comments by path
# `get_comments_by_path` returns a pair; its first element is stored in the
# `comment_ids_*` dict and its second in the `comment_list_*` dict.

comment_ids_for_against = {}
comment_list_for_against = {}
comment_ids_perspective = {}
comment_list_perspective = {}

for fallacy in classes_selected:
    print(fallacy)

    ids_and_comments = get_comments_by_path(for_against_texts[fallacy], get_personal_pronoun_ids)
    comment_ids_for_against[fallacy], comment_list_for_against[fallacy] = ids_and_comments

    ids_and_comments = get_comments_by_path(perspective_texts[fallacy], get_personal_pronoun_ids)
    comment_ids_perspective[fallacy], comment_list_perspective[fallacy] = ids_and_comments

fallacy of relevance


100%|██████████| 15549/15549 [05:08<00:00, 50.33it/s]
100%|██████████| 5099/5099 [01:20<00:00, 63.58it/s] 


faulty generalization


100%|██████████| 17420/17420 [05:18<00:00, 54.76it/s] 
100%|██████████| 5508/5508 [01:26<00:00, 63.49it/s] 


ad hominem


100%|██████████| 13054/13054 [03:37<00:00, 59.97it/s] 
100%|██████████| 4974/4974 [01:12<00:00, 68.33it/s] 


intentional


100%|██████████| 11516/11516 [02:28<00:00, 77.69it/s] 
100%|██████████| 4478/4478 [00:50<00:00, 89.31it/s] 


appeal to emotion


100%|██████████| 4006/4006 [00:49<00:00, 80.47it/s] 
100%|██████████| 1335/1335 [00:17<00:00, 74.45it/s] 


In [None]:
# Persist the comment ids and comment lists of both debate types to Google Drive
save_obj(comment_ids_for_against, '/content/gdrive/MyDrive/Temp/69-comment_ids_for_against.pkl')
save_obj(comment_list_for_against, '/content/gdrive/MyDrive/Temp/69-comment_list_for_against.pkl')
save_obj(comment_ids_perspective, '/content/gdrive/MyDrive/Temp/69-comment_ids_perspective.pkl')
save_obj(comment_list_perspective, '/content/gdrive/MyDrive/Temp/69-comment_list_perspective.pkl')

# Computation for Top 20 Perspective users

- Users are ranked by their coreness value from the core-periphery study, and the 20 highest-ranked users are selected.

In [None]:
def get_perspective_threads(categories=None,
                            root_dir='/content/gdrive/MyDrive/DL/CreateDebate'):
    """Returns a list of threads posted as Perspective debates

    A thread is kept when more of its comments have polarity
    `'Not Available'` than any other polarity.

    :param categories: Category names whose logs are read; defaults to the
        notebook-level `categories_selected`
    :param root_dir: Directory containing one `<category>/threads.log` file
        per category
    :type root_dir: string
    :return: List of the thread objects that qualify as Perspective debates
    :raises OSError: If a `threads.log` file cannot be opened
    """
    if categories is None:
        categories = categories_selected

    threads = []

    for category in categories:
        reader_addr = f'{root_dir}/{category}/threads.log'
        # `with` closes the log even if reading a record fails part-way.
        # NOTE: unpickling can execute arbitrary code - only read trusted logs.
        with open(reader_addr, 'rb') as reader:
            while True:
                try:
                    e = pickle.load(reader)
                except EOFError:
                    # End of the log; any other error is a real problem and
                    # is allowed to propagate instead of being swallowed.
                    break

                for_against_counter = 0
                perspective_counter = 0
                for v in e.comments.values():
                    if v.polarity == 'Not Available':
                        perspective_counter += 1
                    else:
                        for_against_counter += 1
                if perspective_counter > for_against_counter:
                    threads.append(e)

    return threads

In [None]:
# Collect the threads that `get_perspective_threads` classifies as Perspective debates
perspective_threads = get_perspective_threads()

In [None]:
# Run `build_graph` on the Perspective threads. Of the five results, `P_graph`
# feeds the coreness computation and `P_reverse_map` is used to turn ranked
# graph keys back into users.
# NOTE(review): the exact meaning of the other returned values is defined by
# `build_graph` earlier in the notebook - confirm there.
P_author_map, P_reverse_map, P_author_count, P_graph, P_matrix = build_graph(user_list, perspective_threads)

In [None]:
# Coreness values for the Perspective graph; the (key, value) items of this
# mapping are ranked by value in the next step
P_coreness = get_coreness_dict(P_graph)

In [None]:
# (key, coreness) pairs ordered from highest to lowest coreness
P_coreness_list = sorted(P_coreness.items(), key=lambda pair: pair[1], reverse=True)

In [None]:
# Number of highest-coreness users to keep
P_TOP_K = 20

# Select the top users on coreness value. Slicing (rather than indexing over
# range(20)) avoids an IndexError when the graph has fewer than P_TOP_K nodes.
P_top_users = [P_reverse_map[node] for node, _coreness in P_coreness_list[:P_TOP_K]]

In [None]:
# Set form of the top users, used for membership tests when filtering comments by author
P_top_users_set = set(P_top_users)

In [None]:
# For every selected fallacy class, keep (as deep copies) only the comments
# written by one of the top users
P_comments_for_against = {}
P_comments_perspective = {}

for fallacy in classes_selected:
    P_comments_for_against[fallacy] = [
        deepcopy(comment)
        for comment in for_against_logical[fallacy]
        if comment['author'] in P_top_users_set
    ]
    P_comments_perspective[fallacy] = [
        deepcopy(comment)
        for comment in perspective_logical[fallacy]
        if comment['author'] in P_top_users_set
    ]

In [None]:
# Per fallacy class: number of top-user comments in for-against / perspective debates
for fallacy in classes_selected:
    n_for_against = len(P_comments_for_against[fallacy])
    n_perspective = len(P_comments_perspective[fallacy])
    print(fallacy, n_for_against, n_perspective)

fallacy of relevance 2995 2348
faulty generalization 3209 2300
ad hominem 3630 2490
intentional 3108 2461
appeal to emotion 804 572


In [None]:
# Lower-cased bodies of the top-user comments, keyed by selected fallacy class
P_texts_for_against = {}
P_texts_perspective = {}

for fallacy in classes_selected:
    print(fallacy)
    for comments_by_class, texts_by_class in (
        (P_comments_for_against, P_texts_for_against),
        (P_comments_perspective, P_texts_perspective),
    ):
        texts_by_class[fallacy] = [comment['body'].lower() for comment in comments_by_class[fallacy]]

fallacy of relevance
faulty generalization
ad hominem
intentional
appeal to emotion


In [None]:
# Trigger count by path (top users only)
P_trigger_count_by_path_for_against = {}
P_trigger_count_by_path_perspective = {}

for fallacy in classes_selected:
    print(fallacy)
    for texts_by_class, counts_by_class in (
        (P_texts_for_against, P_trigger_count_by_path_for_against),
        (P_texts_perspective, P_trigger_count_by_path_perspective),
    ):
        counts_by_class[fallacy] = get_trigger_count_by_path(texts_by_class[fallacy],
                                                             get_personal_pronoun_ids)

# Persist both results to Google Drive
save_obj(P_trigger_count_by_path_for_against, '/content/gdrive/MyDrive/Temp/69-P_trigger_count_by_path_for_against.pkl')
save_obj(P_trigger_count_by_path_perspective, '/content/gdrive/MyDrive/Temp/69-P_trigger_count_by_path_perspective.pkl')

fallacy of relevance


100%|██████████| 2995/2995 [00:53<00:00, 55.71it/s]
100%|██████████| 2348/2348 [00:29<00:00, 78.40it/s] 


faulty generalization


100%|██████████| 3209/3209 [00:55<00:00, 57.99it/s] 
100%|██████████| 2300/2300 [00:35<00:00, 64.73it/s] 


ad hominem


100%|██████████| 3630/3630 [00:47<00:00, 75.92it/s] 
100%|██████████| 2490/2490 [00:27<00:00, 90.17it/s]


intentional


100%|██████████| 3108/3108 [00:33<00:00, 93.38it/s] 
100%|██████████| 2461/2461 [00:21<00:00, 113.57it/s]


appeal to emotion


100%|██████████| 804/804 [00:11<00:00, 68.68it/s] 
100%|██████████| 572/572 [00:07<00:00, 75.57it/s] 


In [None]:
# Words along paths (top users only)
P_words_along_paths_for_against = {}
P_words_along_paths_perspective = {}

for fallacy in classes_selected:
    print(fallacy)
    for texts_by_class, words_by_class in (
        (P_texts_for_against, P_words_along_paths_for_against),
        (P_texts_perspective, P_words_along_paths_perspective),
    ):
        words_by_class[fallacy] = get_words_along_paths(texts_by_class[fallacy],
                                                        get_personal_pronoun_ids)

# Persist both results to Google Drive
save_obj(P_words_along_paths_for_against, '/content/gdrive/MyDrive/Temp/69-P_words_along_paths_for_against.pkl')
save_obj(P_words_along_paths_perspective, '/content/gdrive/MyDrive/Temp/69-P_words_along_paths_perspective.pkl')

fallacy of relevance


100%|██████████| 2995/2995 [00:50<00:00, 59.13it/s] 
100%|██████████| 2348/2348 [00:25<00:00, 90.83it/s] 


faulty generalization


100%|██████████| 3209/3209 [00:54<00:00, 58.90it/s] 
100%|██████████| 2300/2300 [00:32<00:00, 70.35it/s] 


ad hominem


100%|██████████| 3630/3630 [00:54<00:00, 67.10it/s] 
100%|██████████| 2490/2490 [00:26<00:00, 92.89it/s]


intentional


100%|██████████| 3108/3108 [00:35<00:00, 88.28it/s] 
100%|██████████| 2461/2461 [00:20<00:00, 121.75it/s]


appeal to emotion


100%|██████████| 804/804 [00:10<00:00, 74.83it/s] 
100%|██████████| 572/572 [00:08<00:00, 70.41it/s] 


In [None]:
# Comments by path (top users only)
# `get_comments_by_path` returns a pair; its first element is stored in the
# `P_comment_ids_*` dict and its second in the `P_comment_list_*` dict.
P_comment_ids_for_against = {}
P_comment_list_for_against = {}
P_comment_ids_perspective = {}
P_comment_list_perspective = {}

for fallacy in classes_selected:
    print(fallacy)

    ids_and_comments = get_comments_by_path(P_texts_for_against[fallacy], get_personal_pronoun_ids)
    P_comment_ids_for_against[fallacy], P_comment_list_for_against[fallacy] = ids_and_comments

    ids_and_comments = get_comments_by_path(P_texts_perspective[fallacy], get_personal_pronoun_ids)
    P_comment_ids_perspective[fallacy], P_comment_list_perspective[fallacy] = ids_and_comments

# Persist all four results to Google Drive
save_obj(P_comment_ids_for_against, '/content/gdrive/MyDrive/Temp/69-P_comment_ids_for_against.pkl')
save_obj(P_comment_list_for_against, '/content/gdrive/MyDrive/Temp/69-P_comment_list_for_against.pkl')
save_obj(P_comment_ids_perspective, '/content/gdrive/MyDrive/Temp/69-P_comment_ids_perspective.pkl')
save_obj(P_comment_list_perspective, '/content/gdrive/MyDrive/Temp/69-P_comment_list_perspective.pkl')

fallacy of relevance


100%|██████████| 2995/2995 [00:55<00:00, 54.44it/s]
100%|██████████| 2348/2348 [00:24<00:00, 94.41it/s] 


faulty generalization


100%|██████████| 3209/3209 [00:54<00:00, 58.86it/s] 
100%|██████████| 2300/2300 [00:33<00:00, 68.35it/s] 


ad hominem


100%|██████████| 3630/3630 [00:49<00:00, 72.63it/s] 
100%|██████████| 2490/2490 [00:28<00:00, 85.97it/s]


intentional


100%|██████████| 3108/3108 [00:33<00:00, 92.16it/s] 
100%|██████████| 2461/2461 [00:19<00:00, 128.11it/s]


appeal to emotion


100%|██████████| 804/804 [00:11<00:00, 67.55it/s] 
100%|██████████| 572/572 [00:11<00:00, 50.81it/s]


# Analysis

In [None]:
# Labels of the four (debate type, user group) combinations being compared:
# "all" = every user, "top" = the top users selected by coreness
Types = ('for-against-all', 'perspective-all', 'for-against-top', 'perspective-top')

# The four tuples below are parallel to `Types`: the element at position i
# holds the result that belongs to Types[i].

# Trigger counts by dependency path
TriggerCount = (
    trigger_count_by_path_for_against,
    trigger_count_by_path_perspective,
    P_trigger_count_by_path_for_against,
    P_trigger_count_by_path_perspective
)

# Words found along each dependency path
WordsAlongPaths = (
    words_along_paths_for_against,
    words_along_paths_perspective,
    P_words_along_paths_for_against,
    P_words_along_paths_perspective,
)

# First element returned by `get_comments_by_path`
CommentIds = (
    comment_ids_for_against,
    comment_ids_perspective,
    P_comment_ids_for_against,
    P_comment_ids_perspective
)

# Second element returned by `get_comments_by_path`
CommentList = (
    comment_list_for_against,
    comment_list_perspective,
    P_comment_list_for_against,
    P_comment_list_perspective
)

In [None]:
# data[type][fallacy class][explained dependency path] -> up to 5 most frequent words
data = {}

for type_index, type_name in enumerate(Types):
    per_class = data[type_name] = {}

    for cls in classes_selected:
        per_path = per_class[cls] = {}

        # Dependency paths ranked by trigger count, highest first; keep the top 5
        ranked_paths = sorted(TriggerCount[type_index][cls].items(),
                              key=lambda item: item[1],
                              reverse=True)

        for dependency_path, _count in ranked_paths[:5]:
            # Replace each label on the path with its spaCy explanation.
            # NOTE(review): spacy.explain returns None for a label missing from
            # its glossary, which would make the join below fail.
            explained_labels = [spacy.explain(label) for label in dependency_path.split(' -> ')]
            detailed_dependency_path = ' -> '.join(explained_labels)

            # Frequency of the actual words observed along this path
            frequency = {}
            for words in WordsAlongPaths[type_index][cls][dependency_path]:
                frequency[words] = frequency.get(words, 0) + 1

            ranked_words = sorted(frequency.items(), key=lambda item: item[1], reverse=True)
            per_path[detailed_dependency_path] = [words for words, _freq in ranked_words[:5]]

In [None]:
# Write the nested `data` dict to words.json in the current working directory
with open('./words.json', 'w') as f:
    json.dump(data, f, indent=4)

In [None]:
def reduce_array(a):
    """Summarise the items of `a` as a string of `item (xcount), ` fragments

    Fragments are ordered by decreasing count; items with equal counts keep
    the order of their first appearance. Every fragment, including the last
    one, ends with `', '`, and an empty input gives an empty string.

    :param a: Iterable of hashable items
    :return: Concatenated summary string
    """
    tally = {}
    for item in a:
        tally[item] = tally.get(item, 0) + 1

    ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
    return ''.join(f'{item} (x{count}), ' for item, count in ranked)

In [None]:
# data[type][fallacy class][explained dependency path] -> up to 2 example comments
data = {}

for type_index, type_name in enumerate(Types):
    per_class = data[type_name] = {}

    for cls in classes_selected:
        per_path = per_class[cls] = {}
        comment_texts = CommentList[type_index][cls]

        # Dependency paths ranked by trigger count, highest first; keep the top 5
        ranked_paths = sorted(TriggerCount[type_index][cls].items(),
                              key=lambda item: item[1],
                              reverse=True)

        for dependency_path, _count in ranked_paths[:5]:
            # NOTE(review): spacy.explain returns None for a label missing from
            # its glossary, which would make the join below fail.
            detailed_dependency_path = ' -> '.join(
                spacy.explain(label) for label in dependency_path.split(' -> ')
            )
            examples = per_path[detailed_dependency_path] = []

            # Comment ids ordered by how many triggers they have on this path
            triggers_by_comment = CommentIds[type_index][cls][dependency_path]
            ranked_cids = sorted(triggers_by_comment,
                                 key=lambda cid: len(triggers_by_comment[cid]),
                                 reverse=True)

            # Leave out comments longer than 200 whitespace-separated words
            short_cids = [cid for cid in ranked_cids
                          if len(comment_texts[cid].split()) <= 200]

            # Get 2 comments
            for cid in short_cids[:2]:
                examples.append(dict(comment=comment_texts[cid],
                                     path=reduce_array(triggers_by_comment[cid])))

In [None]:
# Write the nested `data` dict to comments.json in the current working directory
with open('./comments.json', 'w') as f:
    json.dump(data, f, indent=4)