In [None]:
import pandas as pd
import re
import nltk
import json
import itertools
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import tokenize
from itertools import *
from click import style
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import logging
import os
import json_lines
import seaborn as sns
# Register tqdm's pandas integration; the `progress_apply` calls later in this
# notebook rely on it.
tqdm.pandas()


# Fetch the NLTK resources used below: the stop-word list (`stopwords.words`) and,
# presumably, the Punkt models behind `sent_tokenize` / `word_tokenize` -- TODO confirm.
nltk.download('stopwords')
nltk.download('punkt')
# Do not truncate long cell contents when a DataFrame is displayed.
pd.set_option('display.max_colwidth', None)

# Load dataset

In [2]:
# Parse the debates JSON file into `debates_dict`.
with open("../data/debateorg/debateorg.json", "r") as d:
    debates_dict = json.load(d)
# Parse the users JSON file into `users_dict`; `extract_data` looks users up in it
# by participant name.
with open("../data/debateorg/users.json", "r") as u:
      users_dict = json.load(u)

In [3]:
def extract_data(debates_data: dict, users_data: dict) -> pd.DataFrame:
    """Flatten debates into one row per argument, joined with the author's profile.

    Every argument whose author can be found in ``users_data`` yields one row
    holding the argument text, the debate's title, category and start date, the
    author's properties listed in ``properties_of_interest`` below, and the
    entries recorded for that author in each vote's ``votes_map``.

    Debates in which neither participant is known are skipped entirely, and
    individual arguments written by an unknown participant are skipped as well.

    Arguments:
    debates_data -- Dictionary containing the debates data.
    users_data -- Dictionary containing the users and their properties,
                  keyed by user name.

    Returns:
    A DataFrame with the columns ``argument``, ``title``, ``category``,
    ``date``, one column per property of interest, and ``votes``.
    """
    properties_of_interest = ["birthday", "ethnicity", "gender", "political_ideology", "education",
                              "interested", "income", "looking", "party", "relationship", "win_ratio",
                              "religious_ideology", "number_of_all_debates", "big_issues_dict"]
    extracted_data = []

    for _, debate in tqdm(debates_data.items()):
        # Debate-level metadata is optional; a missing field becomes None.
        category = debate.get("category")
        title = debate.get("title")
        date = debate.get("start_date")

        # Sometimes, the users of the debate didn't exist anymore at the time
        # the data was collected; such users resolve to None.
        user1_name = debate.get("participant_1_name")
        user2_name = debate.get("participant_2_name")
        user1 = users_data.get(user1_name)
        user2 = users_data.get(user2_name)

        # If both users do not exist, skip this debate
        if not user1 and not user2:
            logging.debug("Both users are absent from debate data. Skipping.")
            continue

        # For each round in this debate...
        for debate_round in debate["rounds"]:
            # For each argument in this round...
            for argument in debate_round:
                # Any side that is not participant 1's position is attributed
                # to participant 2.
                if argument["side"] == debate["participant_1_position"]:
                    arguing_user, arguing_user_name = user1, user1_name
                else:
                    arguing_user, arguing_user_name = user2, user2_name

                # Skip this argument if the arguing user does not exist in the data
                if not arguing_user:
                    continue

                # Collect what each vote recorded for this user.
                # NOTE(review): assumes every vote's "votes_map" has an entry for
                # the arguing user; a missing entry raises KeyError -- confirm
                # against the data.
                votes = [vote['votes_map'][arguing_user_name] for vote in debate['votes']]

                # Keep only the user properties we are interested in.
                properties = {
                    name: value
                    for name, value in arguing_user.items() if name in properties_of_interest}

                extracted_data.append({
                    "argument": argument["text"],
                    "title": title,
                    "category": category,
                    "date": date,
                    **properties,
                    "votes": votes})

    return pd.DataFrame(columns=["argument", "title", "category", "date", *properties_of_interest, "votes"], data=extracted_data)

In [None]:
# Build the argument-level DataFrame (one row per argument with a known author).
df_debates = extract_data(debates_dict, users_dict)

# Load Target Terms

## Queerness

In [10]:
def queerness_tt():
    """Return the term lists for the queerness bias dimension.

    Returns a 4-tuple ``(targets_1, targets_2, attributes_1, attributes_2)``
    of lists of lowercase strings, in that order.
    """
    group_one_targets = [
        "gay", "gays", "lesbian", "lesbians", "bisexual", "bisexuals",
        "homosexual", "homosexuals", "transgender", "transgenders",
        "sapphic", "sapphics", "pansexual", "pansexuals", "queer", "queers",
        "genderqueer", "genderqueers", "transsexual", "transsexuals",
        "crosssexual", "crosssexuals", "epicene", "epicenes", "lgbtq", "lgbtqi",
        "hermaphroditic", "hermaphrodite", "hermaphrodites",
        "androgynous", "androgyne", "androgynes", "intersex",
    ]
    group_two_targets = [
        "straight", "straights", "monosexual", "monosexuals",
        "heterosexual", "heterosexuals", "cisgender", "cisgenders",
        "sexually normal", "hetero", "heteros", "unisexual", "unisexuals",
    ]
    # NOTE(review): several entries look like stems ("depress", "indecisi",
    # "frustrat", "oversex"); confirm the matching code treats them as
    # prefixes rather than as whole tokens.
    group_one_attributes = [
        "mentally ill", "polygamous", "flamboyant", "pedophile",
        "sexual predator", "mental disorder", "irreligious", "villain",
        "serial killer", "depress", "adultery", "drug addict", "sadist",
        "arts", "artistic", "theatre", "dramatic", "show queen", "fashion",
        "predator", "party drugs", "illicit drugs", "alcohol", "sexual abuse",
        "child rapist", "indecisi", "promiscuous", "sexually manipulative",
        "prostitute", "drag queen", "drag king", "criminal", "hypersexual",
        "asexual", "pervert", "perverts", "poofter", "faggot", "fag",
        "weak", "timid", "loser", "loner", "lonely", "sin", "disease",
        "confuse", "strange", "weird", "coward", "sexually abnormal",
        "immoral", "insecure", "repulsive", "frustrat", "sinful", "sensitive",
        "weak minded", "lack self control", "oversex", "submissive", "emotional",
    ]
    # NOTE(review): this list contains repeated entries ("strong", "cheerful",
    # "normal", "moral"), starts with "mentally", "strong" as two separate
    # items, and includes "faggot", which also appears in the first attribute
    # list -- kept verbatim; confirm whether these are intentional.
    group_two_attributes = [
        "mentally", "strong", "monogamous", "modest", "teleiophile",
        "protector", "mental stability", "hero", "police officer", "cheerful",
        "healthy", "compassionate", "scientific", "science", "subtle",
        "unfashion", "defender", "determined", "moral", "chaste", "cisgender",
        "lawful", "decent", "saint", "faggot", "strong", "daring", "winner",
        "cheerful", "virtue", "health", "assured", "normal", "normal", "brave",
        "moral", "confident", "delightful", "satisfied", "innocent", "tough",
        "dominating", "unemotional",
    ]
    return group_one_targets, group_two_targets, group_one_attributes, group_two_attributes

## Religion

In [43]:
def religious_tt():
    """Return the term lists for the religion bias dimension.

    Returns a 4-tuple ``(targets_1, targets_2, attributes_1, attributes_2)``
    of lists of lowercase strings, in that order.
    """
    group_one_targets = [
        "muslim", "muslims", "islam", "islamic", "islamism",
        "arab", "arabs", "quran", "koran",
    ]
    group_two_targets = [
        "christian", "christians", "christianity", "church", "bible", "priest",
    ]
    # NOTE(review): "viel" / "vielded" look like misspellings of "veil" /
    # "veiled" -- kept verbatim; confirm before correcting.
    group_one_attributes = [
        "terrorist", "terrorism", "threat", "threaten", "dangerous", "deadly",
        "attack", "bomb", "murder", "hijack", "criminal", "enemy", "war",
        "enemies", "extremist", "radical", "hate", "rich", "lazy", "flashy",
        "lazy sheik", "oil sheik", "rapist", "slaves", "shooter",
        "belly dancers", "hijab", "viel", "vielded", "oppressed", "patriarchy",
        "misogyny", "patriarchal", "deport", "detain", "thieves", "thug",
        "charlatan", "brute", "power hungry", "beard", "wealthy oilmen",
        "harem maiden", "suicide bomb", "headscarves", "violence", "violent",
        "hostile", "sinister",
    ]
    # NOTE(review): contains repeated entries ("police", "defend", "hero") --
    # kept verbatim; confirm whether the repetition is intentional.
    group_two_attributes = [
        "police", "nonviolence", "safety", "defend", "helpful", "safe",
        "defend", "build", "restore", "relinquish", "lawful", "friend",
        "peace", "friends", "moderate", "conservative", "love", "poor",
        "energetic", "simple", "defender", "freemen", "hero", "hat", "unviel",
        "unvielded", "rejoiced", "matriarchy", "philogyny", "matriarchal",
        "admit", "liberate", "police", "hero", "honest", "mild", "gratified",
        "clean shave", "negotiator", "compassion", "gentle", "kind", "happy",
    ]
    return group_one_targets, group_two_targets, group_one_attributes, group_two_attributes


# Preprocess Dataset

In [5]:
# Returns number of words in string
def countWords(string):
    """Count the words in ``string``.

    A word is a maximal run of characters that are not a space, newline or
    tab. Other whitespace (e.g. carriage return) is treated as part of a word.
    """
    separators = (' ', '\n', '\t')
    word_total = 0
    inside_word = False

    for char in string:
        if char in separators:
            # A separator ends whatever word we were in.
            inside_word = False
        elif not inside_word:
            # First non-separator after a gap: a new word starts here.
            inside_word = True
            word_total += 1

    return word_total
 

def decontracted(phrase):
    """Expand common English contractions in ``phrase`` and return the result.

    The substitutions are applied in order: the two irregular forms first,
    then the generic suffix rules. Every ``'s`` becomes `` is``, so
    possessives are rewritten as well.
    """
    substitutions = (
        # specific
        (r"won\'t", "will not"),
        (r"can\'t", "can not"),
        # general
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, replacement in substitutions:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase

# English stop-word set from the NLTK corpus.
# NOTE(review): not referenced by any function in the visible cells -- confirm it is used later.
stop_words = set(stopwords.words('english')) 

def preprocessStringToTokens(opinion_string):
    """Turn raw text into a list of sentences, each a list of word tokens.

    Steps, in order: lowercase, expand contractions, delete digit runs,
    delete URLs, split into sentences, then within each sentence replace
    every run of non-letters with a single space and tokenize.
    """
    text = decontracted(opinion_string.lower())

    # Digits are stripped before URLs, so URLs are matched with their digits
    # already removed.
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', text)

    return [
        word_tokenize(re.sub(r"[^A-Za-z]+", " ", sentence))
        for sentence in tokenize.sent_tokenize(text)
    ]

def preprocessForAnnotation(opinion_string):
    """Return a lightly cleaned, single-line version of the text.

    The text is lowercased, digit runs and URLs are deleted, and all
    whitespace (including newlines and carriage returns) is collapsed to
    single spaces with leading/trailing whitespace removed. Punctuation and
    contractions are left untouched.
    """
    cleaned = re.sub(r'\d+', '', opinion_string.lower())
    cleaned = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', cleaned)

    # str.split() with no argument splits on any whitespace run, which also
    # covers '\n' and '\r', so no separate replacement pass is needed.
    return ' '.join(cleaned.split())

def splitIntoSentences(opinion_sentence):
    """Split the given text into a list of sentences using NLTK's sentence tokenizer."""
    return tokenize.sent_tokenize(opinion_sentence)

def preprocessStringToSentences(opinion_string):
    """Turn raw text into a list of cleaned sentence strings.

    Steps, in order: lowercase, expand contractions, delete digit runs,
    delete URLs, split into sentences, then within each sentence replace
    every run of non-letters with a single space. Unlike
    ``preprocessStringToTokens`` the sentences are not tokenized.
    """
    text = decontracted(opinion_string.lower())

    # Digits are stripped before URLs, so URLs are matched with their digits
    # already removed.
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', text)

    return [
        re.sub(r"[^A-Za-z]+", " ", sentence)
        for sentence in tokenize.sent_tokenize(text)
    ]

In [None]:
# Derive the preprocessed views of every argument. Each function takes a single
# string, so it is applied to the relevant column directly (Series.progress_apply)
# instead of row-wise over the whole frame: this avoids building a row Series per
# record and always yields exactly one value per row, even for an empty frame.
df_debates['preprocessedArgTokens'] = df_debates['argument'].progress_apply(preprocessStringToTokens)
df_debates['annotationComment'] = df_debates['argument'].progress_apply(preprocessForAnnotation)
# Sentence-split the annotation text (which keeps punctuation and contractions).
df_debates['annotationSentence'] = df_debates['annotationComment'].progress_apply(splitIntoSentences)
df_debates['preprocessedArgSentence'] = df_debates['argument'].progress_apply(preprocessStringToSentences)

In [7]:
# Expose the row index as an explicit 'ID' column. Guarded so that re-running
# this cell does not add a second 'ID' column.
if 'ID' not in df_debates.columns:
    df_debates = df_debates.reset_index().rename(columns={'index': 'ID'})

## Extract biased co-occurrence in sliding window

In [12]:
def get_windows(words, window_size):
    """Yield ``(center_word, context_words)`` for every position in ``words``.

    ``context_words`` holds up to ``window_size`` items on each side of the
    center word, in their original order and without the center word itself.
    The window is clipped at both ends of the sequence.
    """
    for position, center_word in enumerate(words):
        left_start = max(position - window_size, 0)
        # Slicing past the end is harmless, so the right edge needs no clipping.
        right_stop = position + window_size + 1
        context_words = words[left_start:position] + words[position + 1:right_stop]
        yield center_word, context_words

In [13]:
# Create Cartesian product
def cartesianProductList(list1, list2):
    """Return every ``(x, y)`` pair with ``x`` from ``list1`` and ``y`` from ``list2`` as a list of tuples."""
    return list(itertools.product(list1, list2))

# Count occurrences
def countOccurrence(comment, targetWordList):
    """Sum, over all sentences in ``comment``, how often each target word occurs.

    Uses each sentence's own ``count`` method: for token lists this counts
    exact token matches, for plain strings it counts substring occurrences.
    """
    return sum(
        sentence.count(target)
        for sentence in comment
        for target in targetWordList
    )

# Check whether tuple in sentence
def findBias(comment, cartesianProductList):
    """Find sentences that contain both members of any given pair.

    For every sentence in ``comment`` and every pair in
    ``cartesianProductList``, a hit is recorded when both pair elements are
    ``in`` the sentence. A sentence is listed once per matching pair.

    Returns a ``pd.Series`` of three items: the matching sentences, the
    matching pairs (aligned with the sentences), and the number of hits.
    """
    hits = [
        (sentence, pair)
        for sentence in comment
        for pair in cartesianProductList
        if pair[0] in sentence and pair[1] in sentence
    ]
    matched_sentences = [sentence for sentence, _ in hits]
    matched_pairs = [pair for _, pair in hits]
    return pd.Series([matched_sentences, matched_pairs, len(matched_pairs)])

# Returns number of words in string
def countWords(string):
    """Count the words in ``string``.

    A word is a maximal run of characters that are not a space, newline or
    tab. Other whitespace (e.g. carriage return) is treated as part of a word.

    NOTE(review): this repeats the identical ``countWords`` defined earlier in
    this notebook; the later definition silently replaces the earlier one.
    """
    separators = (' ', '\n', '\t')
    word_total = 0
    inside_word = False

    for char in string:
        if char in separators:
            # A separator ends whatever word we were in.
            inside_word = False
        elif not inside_word:
            # First non-separator after a gap: a new word starts here.
            inside_word = True
            word_total += 1

    return word_total
 

# Check whether context word is target word and whether attribute is in context
def findBiasInSlidingWindow(comment, sentences, t_list, a_list, windowSize):
    """Find attribute terms that occur within a window around target terms.

    Arguments:
    comment -- List of tokenized sentences (each a list of tokens).
    sentences -- Sentence strings reported for hits; ``sentences[i]`` is
                 expected to correspond to ``comment[i]``.
    t_list -- Target terms; a token must equal one of them exactly.
    a_list -- Attribute terms. A single-word attribute matches when it is one
              of the context tokens; a multi-word attribute matches when every
              one of its words is among the context tokens (order and
              adjacency are not checked).
    windowSize -- Number of tokens considered on each side of a target token.

    Returns a ``pd.Series`` of three items: the sentences with hits, the
    ``(attribute, target)`` tuples found (aligned with the sentences), and
    the number of hits.
    """
    target_terms = set(t_list)

    # Split each attribute once instead of once per window. Single-word (and
    # empty) attributes are compared as-is.
    attribute_words = []
    for attribute in a_list:
        parts = attribute.split()
        attribute_words.append((attribute, parts if len(parts) > 1 else [attribute]))

    biasedSentences = []
    foundTuples = []
    for index, sentence in enumerate(comment):
        for center_word, context_words in get_windows(sentence, windowSize):
            if center_word not in target_terms:
                continue
            for attribute, required_words in attribute_words:
                if all(word in context_words for word in required_words):
                    # NOTE(review): `comment` and `sentences` come from separate
                    # sentence-splitting passes, so their lengths can differ.
                    # Fall back to the joined tokens rather than raising
                    # IndexError; alignment when lengths differ still needs
                    # confirming upstream.
                    if index < len(sentences):
                        biasedSentences.append(sentences[index])
                    else:
                        biasedSentences.append(" ".join(sentence))
                    foundTuples.append((attribute, center_word))
    return pd.Series([biasedSentences, foundTuples, len(foundTuples)])

# Check whether tuple in sentence
def findBiasCount(comment, cartesianProductList):
    """Count the (sentence, pair) combinations where both pair elements are ``in`` the sentence."""
    return sum(
        1
        for sentence in comment
        for pair in cartesianProductList
        if pair[0] in sentence and pair[1] in sentence
    )

def calculateSummarySlidingWindow(dataSet, targetTermFunction, windowSize):
    """Summarise target-term frequency and T1 x A1 window co-occurrences.

    Arguments:
    dataSet -- DataFrame with the columns ``preprocessedArgTokens`` and
               ``annotationSentence``; it is copied, not modified.
    targetTermFunction -- Callable returning ``(t1, t2, a1, a2)`` term lists.
    windowSize -- Window size passed on to ``findBiasInSlidingWindow``.

    Returns a tuple ``(summary, df)``: ``summary`` is a ``pd.Series`` holding
    the total t1 count, the total t2 count, the total number of T1 x A1 hits,
    and the hits as a percentage of the t1 count (rounded to two decimals, 0
    when there are no hits); ``df`` is the annotated copy of ``dataSet``.
    """
    # All four lists are lowercased; a2 is not used further in this function.
    t1, t2, a1, a2 = (
        [term.lower() for term in terms] for terms in targetTermFunction()
    )
    df = dataSet.copy()

    # Count word occurrences
    for column, targets in (('minorityCount', t1), ('majorityCount', t2)):
        df[column] = df.progress_apply(
            lambda row, targets=targets: countOccurrence(row['preprocessedArgTokens'], targets),
            axis=1)

    totalMinorityOccurrence = df['minorityCount'].sum()
    totalMajorityOccurrence = df['majorityCount'].sum()

    hit_columns = ['T1 x A1 Sentences', 'T1 x A1 Tuples Found', 'T1 x A1 Count']
    df[hit_columns] = df.progress_apply(
        lambda row: findBiasInSlidingWindow(
            row['preprocessedArgTokens'], row['annotationSentence'], t1, a1, windowSize),
        axis=1)

    t1_a1_total = df['T1 x A1 Count'].sum()
    t1_a1_percentage = (
        round((t1_a1_total / totalMinorityOccurrence) * 100, 2)
        if t1_a1_total > 0
        else 0
    )

    summary = pd.Series([totalMinorityOccurrence, totalMajorityOccurrence, t1_a1_total, t1_a1_percentage])
    return summary, df