In [1]:
%load_ext autoreload

In [11]:
import sys
import os
import json
import re
import pandas as pd
import numpy as np
import nltk

from sklearn.model_selection import train_test_split

# Never truncate cell text when a DataFrame is displayed.
pd.options.display.max_colwidth = None

## Load and process Jo's Data:

In [6]:
# Root directory of the dataset. Defaults to the original location but can be
# redirected with the JO_DATA_PATH environment variable so the notebook also
# runs on machines with a different layout. Kept as a plain string because the
# cells below build file names with `data_path + '/...'`.
data_path = os.environ.get('JO_DATA_PATH', '/workspace/ceph_data/argument-undermining/jo_data')

In [7]:
def load_df_from_json(path):
    """Read a JSON-lines file into a DataFrame with one row per record.

    Args:
        path: Location of a file holding one JSON document per line.

    Returns:
        pandas.DataFrame built from the decoded records, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    # Read as UTF-8 explicitly instead of depending on the locale default.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. an empty line at the end of the file) carry no
            # record and would make json.loads raise.
            if not line.strip():
                continue
            records.append(json.loads(line))

    return pd.DataFrame(records)

In [8]:
# Read both post dumps, stack them, keep one row per post id (the first one
# encountered) and derive `post_id` by prefixing the id with 't3_'.
post_dump_paths = [
    data_path + '/emnlp20_arg_attack/data/cmv/posts-radio.jsonlist',
    data_path + '/emnlp20_arg_attack/data/cmv/posts-nairobi.jsonlist',
]
posts_df_1, posts_df_2 = (load_df_from_json(dump_path) for dump_path in post_dump_paths)
posts_df = (
    pd.concat([posts_df_1, posts_df_2])
    .drop_duplicates(subset=['id'])
    .assign(post_id=lambda frame: frame['id'].apply(lambda raw_id: 't3_' + raw_id))
)

In [6]:
# Read both comment dumps, stack them, keep one row per comment id (the first
# one encountered) and derive `comment_id` by prefixing the id with 't1_'.
comment_dump_paths = [
    data_path + '/emnlp20_arg_attack/data/cmv/comments-radio.jsonlist',
    data_path + '/emnlp20_arg_attack/data/cmv/comments-nairobi.jsonlist',
]
comments_df_1, comments_df_2 = (load_df_from_json(dump_path) for dump_path in comment_dump_paths)
comments_df = (
    pd.concat([comments_df_1, comments_df_2])
    .drop_duplicates(subset=['id'])
    .assign(comment_id=lambda frame: frame['id'].apply(lambda raw_id: 't1_' + raw_id))
)

In [7]:
# Table relating posts to comments (presumably one row per post/comment pair —
# TODO confirm). Later cells read its `post_id`, `comment_id`,
# `direct_n_quotes`, `all_4_n_quotes`, `n_sentences`, `direct_sents` and
# `all_4_sents` columns.
post_comments_rel = pd.read_csv(data_path +'/emnlp20_arg_attack/data/posts-qsents.csv')

In [9]:
# Sentence-level inputs, joined later on (`post_id`, `sentence_no`):
# feat-combined.csv supplies `split` and the `direct` / `all_4` columns,
# posts-sents.csv supplies the sentence text.
sents_feats_df = pd.read_csv(data_path +'/feat-combined.csv')
sents_txt_df = pd.read_csv(data_path +'/posts-sents.csv')

#### Preparing data for detecting attackable sentences:

In [64]:
# Attach the per-sentence labels to the sentence text, then add each post's
# title. Both joins use pandas' default (inner) merge.
sentence_key = ['post_id', 'sentence_no']
sentence_text = sents_txt_df[sentence_key + ['sentence']]
sentence_labels = sents_feats_df[
    ['split', 'post_id', 'sentence_no', 'direct', 'success_direct', 'all_4', 'success_all_4']
]
post_titles = posts_df[['post_id', 'title']]

attackable_df = (
    sentence_text
    .merge(sentence_labels, on=sentence_key)
    .merge(post_titles, on='post_id')
)

In [65]:
def _collect(values):
    """Gather a group's values into a plain list."""
    return list(values)


def _first(values):
    """Return the first value of a group."""
    return list(values)[0]


def _either_flag(row):
    """Combine a post's `direct` and `all_4` flags position by position with `or`."""
    return [direct_flag or all_4_flag for direct_flag, all_4_flag in zip(row['direct'], row['all_4'])]


# One row per post: sentences and flags become lists, while `split` and
# `title` take the value found on the post's first row.
attackable_df_grouped = (
    attackable_df
    .groupby('post_id')
    .agg({
        'sentence': _collect,
        'split': _first,
        'direct': _collect,
        'all_4': _collect,
        'title': _first,
    })
    .reset_index()
)
attackable_df_grouped['qouted_sent_idx'] = attackable_df_grouped.apply(_either_flag, axis=1)

In [66]:
# Only the aggregated `sentence` column needs a new name (it now holds the
# whole post as a list of sentences). Renaming by key instead of overwriting
# `.columns` positionally cannot mislabel columns if the aggregation that
# builds this frame is ever reordered or extended.
attackable_df_grouped = attackable_df_grouped.rename(columns={'sentence': 'post'})

In [68]:
# Persist the grouped frame; the counter-claim section further down reloads it
# from this same path.
attackable_df_grouped.to_pickle(data_path + '/vul_data.pickle')

In [69]:
# Number of grouped rows (one per post_id) in each split.
attackable_df_grouped.split.value_counts()

train    25839
val       8763
test      8558
Name: split, dtype: int64

In [33]:
# Preview the first rows of the grouped frame.
attackable_df_grouped.head()

Unnamed: 0,post_id,post,split,direct,all_4,title,qouted_sent_idx
0,t3_1u4mmo,"[I believe that a church is like any other profit making business and therefore most pay similar taxes., I am not a religious person myself so I do not know the complete logistics of the revenue and spending of a church but I do understand they can yield a good amount of cash., http://www.patheos.com/blogs/friendlyatheist/2012/06/16/the-yearly-cost-of-religious-tax-exemptions-71000000000/ not entirely sure how accurate this article is, but it states that we could gain an additional 71 billion dollars per year and in a time where we as a country are in TRILLIONS of dollars of debt taxing a church seems reasonable to me., CMV]",train,"[1, 1, 0, 0]","[0, 0, 0, 0]",I believe that churches and other religious institutions should have to pay taxes. CMV,"[1, 1, 0, 0]"
1,t3_1u4mo5,"[So it seems to be common that, when caught cheating, violence is directed toward the person the significant other (SO) cheated with., I don't understand why., You were in a relationship with your SO., You had no agreement with the other party., If you are going to beat the shit out of anyone, it should be your SO., S/he is the one that cheated on you., The other party, provided they aren't a friend or acquaintance doesn't owe you anything., Why should beating the ever-loving shit out of your SO's cheat-mate be acceptable?]",train,"[1, 0, 1, 1, 1, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0]",I don't believe it is acceptable to attack the person that your SO cheated on you with. CMV,"[1, 0, 1, 1, 1, 0, 0, 0]"
2,t3_1u4txm,"[(I've tried researching discussions on this topic before but I only found ones involving incredibly biased participants., Hope you guys can do better) I honestly don't understand how there is a significant number of people who believe that capitalism* can be replaced., History shows that absolutely no ruler in a socialist or communist regime is capable of upholding their ideals., If you subscribe to the notion that socialism/communism is supposed to exist without a state, then I respond that in any circumstance, whether it's survival or business, etc, people will gravitate towards leaders, whether it's in a formal or informal fashion (for instance, my workplace has a fairly ""flat"" structure but even within my group, I always seek the advice of the more experienced people)., Furthermore, even though I do agree that everyone has the right to have their MOST basic needs fulfilled (via, say, some basic income that allows people to eat, drink water and practice basic hygiene with anything else having to be worked for), I don't understand how one can transition to an economic system that no longer seeks profit., Yes, infinite growth in a finite world and all that, but without profit, we have stagnation., If you were to tell me that my income would not change for the rest of my life, I would only do the bare minimum to get by, regardless of how much I enjoyed my job., I deserve to be rewarded for my efforts., Therefore I do not understand why is it not possible to simply retool the existing system (by enforcing tax laws more harshly on large fortunes and boosting the welfare state, for instance) to allow everyone to have a living wage., Yes, some might call it implausible but I find it much less implausible than the alternative., Forgive my lack of coherence and I understand I might have some incorrect notions about this topic but that's what I am here for., I await a hopefully enlightening discussion., * Note: let's be exact here: I also think that when 
people online call for the end of capitalism, most of them are referring to the capitalism practiced IN THE UNITED STATES: As a European citizen, I honestly look at said system more as borderline Cyberpunk-esque anarcho capitalism more than anything else., Do you think the discussions I mentioned are mostly a result of semantic differences?, (I have heavily considered this as well)., If so, please mention it below.]",train,"[0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","There is no viable alternative to capitalism, though the current system can be altered to serve people's needs. CMV","[0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]"
3,t3_1u4wid,"[I say ""almost always"" because you never know when a LeBron-type athletic QB will come along., My only caveat is a bona fide freak of nature., I've yet to see such a player., My reasons: 1., The NFL is a pass-first league., Without an elite passer, teams cannot compete for Super Bowls., Aside from a few anomalies, this is a hard and fast rule., The role of the QB is to get the ball to the open man downfield., A completed pass, on average, nets more yardage than a run, therefore a completed pass is more valuable than a run., Teams that put a high value on a QB's ability to run the ball are over-valuing this skill., 2., QB's that have the ability to run learn from a young age to bail on pass plays too early whenever they feel the rush closing in., They have likely been the best and fastest athletes on the field during their entire amateur careers, so running has likely led to big plays., In the NFL, the defenses are bigger and faster and can close on a QB much more quickly than high school or college defenders., 3., QB's with elite running ability take more risks during a game., By holding the ball or leaving the pocket, they are opening themselves up to big hits resulting in fumbles and devastating injuries., Injured QB's have no value to their team., Chronically injured QB's drafted as a franchise QB not only have no value, but they can leave a team wallowing in mediocrity for years/decades., Injuries must be avoided at all costs., 4., Most running QB's have to be ""broken"" of their penchant for scrambling in lieu of keeping their eyes downfield., This causes their development to be stunted in comparison to QB's whose sole focus has been honing their elite passing skills only., In a league where the continued employment of coaches and GM's is contingent upon winning right now, a delay in the development of a QB can cost seasons and jobs (and thus continuity)., Franchises with frequent management turnover are not competitive., 5., Tom Brady, Peyton 
Manning, Drew Brees - all have basically zero running skills whatsoever yet all three are the best of their generation and Super Bowl Champs., *The following is an aside and not meant to be part of my view that needs changing., In conclusion (and just to add a little more controversy to my ramblings), it is my position that Cam Newton, RG 3, and Colin Kaepernick will never win a Super Bowl., If we look ahead to the 2014 NFL draft, my belief is that the biggest gamble and potential franchise killer amongst the available QB's is Johnny Manziel., If my analysis holds true, any team that uses a high first round pick on Manziel to be their franchise QB is likely betting their future on a player that has an extremely small probability to lead them to a Super Bowl win., Change my view.]",train,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","Given the option to draft either a QB with elite passing AND elite running skills or a QB with elite passing skills ONLY, NFL teams should almost always draft the QB with elite passing skills only. CMV","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]"
4,t3_1u5fux,"[I don't think that having lots of sex negatively affects people., If you are safe and smart, there aren't many, maybe not any, negative consequences., I like my body, I like the things my body can do, and I like the way it feels when I have sex., So I have a lot of sex., If a slut is a girl who has a lot of sex, I'm a slut., I don't see why being a slut is something I should avoid., I don't see why people act like having sex causes some deep psychological scars., Sex crimes, like the one I was a victim of, are bad and hurtful and cause permanent damage., But after a lot of therapy and time I was able to recognize that sex is something fun that makes you feel close to other people., I think a relationship is when you find someone who is the only person you want to feel so close to., I'm in a serious relationship, and for some reason (I actually don't know why) he is the only person I want to be with., I love him, I love being with him, and I love when we have sex., So if sex is all good warm soft things, then why would being a girl who has lots of sex, or in my situation used to have lots of sex, be a bad or shameful thing?]",train,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",I don't think promiscuity is a bad thing. CMV.,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]"


In [38]:
# Token counts per post (summed over all of its sentences) and per title.
# `post_len` has to be created here: the next cell reads
# `attackable_df_grouped.post_len` and fails on a fresh kernel otherwise.
attackable_df_grouped['post_len'] = attackable_df_grouped.post.apply(
    lambda sents: sum(len(nltk.word_tokenize(sent)) for sent in sents)
)
attackable_df_grouped['post_title_len'] = attackable_df_grouped.title.apply(lambda x: len(nltk.word_tokenize(x)))

In [39]:
# Mean of the `post_len` column.
avg_post_len = np.mean(attackable_df_grouped['post_len'].tolist())
print('Avg post len:', avg_post_len)

Avg post len: 367.974328081557


In [40]:
# Mean of the `post_title_len` column.
avg_title_len = np.mean(attackable_df_grouped['post_title_len'].tolist())
print('Avg title len:', avg_title_len)

Avg title len: 18.107020389249303


#### Preparing data for generating counter claims:

In [9]:
# Reload the grouped frame from the pickle written in the previous section
# (same path), replacing whatever `attackable_df_grouped` currently holds.
attackable_df_grouped =  pd.read_pickle(data_path + '/vul_data.pickle')

In [10]:
# Flag rows where either quote count (`direct_n_quotes` or `all_4_n_quotes`)
# is positive.
has_direct_quote = post_comments_rel['direct_n_quotes'] > 0
has_all_4_quote = post_comments_rel['all_4_n_quotes'] > 0
post_comments_rel['attacked'] = has_direct_quote | has_all_4_quote

In [11]:
# Keep the flagged rows only, join in the post (title, sentences, split) and
# the comment text, then expose the comment `body` under the name `comment`.
attacked_rel = post_comments_rel[post_comments_rel['attacked']]
post_columns = attackable_df_grouped[['post_id', 'title', 'post', 'split']]
comment_columns = comments_df[['comment_id', 'body']]
kept_columns = ['post_id', 'split', 'comment_id', 'title', 'post', 'body', 'n_sentences', 'direct_sents', 'all_4_sents']

post_comments_df = (
    attacked_rel
    .merge(post_columns, on='post_id')
    .merge(comment_columns, on='comment_id')
    .loc[:, kept_columns]
    .rename(columns={'body': 'comment'})
)

In [12]:
# Rows of the joined post/comment frame in each split.
post_comments_df.split.value_counts()

train    67676
test     23067
val      22398
Name: split, dtype: int64

In [13]:
def _split_indices(raw):
    """Split a comma-separated string into its parts; any non-str value yields []."""
    return raw.split(',') if type(raw) is str else []


# The two `*_sents` fields become lists of strings. Not re-runnable as is: a
# second pass sees lists rather than strings and replaces them with [].
for sents_col in ('direct_sents', 'all_4_sents'):
    post_comments_df[sents_col] = post_comments_df[sents_col].apply(_split_indices)

# Remove every 'cmv' / 'CMV' marker (with an optional trailing colon) from titles.
post_comments_df['title'] = post_comments_df['title'].apply(
    lambda title: re.sub(r'((cmv)|(CMV)):?', '', title)
)
# Lower-case each comment and split it into sentences.
post_comments_df['comment_sents'] = post_comments_df['comment'].apply(
    lambda text: nltk.sent_tokenize(text.lower())
)

In [14]:
# NOTE(review): `normalizeString` is not defined or imported in any cell shown
# in this notebook — it has to come from elsewhere for this cell to run on a
# fresh kernel; confirm its source and what it does to the text.
# Post sentences are normalised and then lower-cased; comment sentences have
# the '&gt;' entity removed before normalisation (they were already lower-cased
# when `comment_sents` was built).
post_comments_df['post'] = post_comments_df['post'].apply(lambda post: [normalizeString(sent).lower() for sent in post])
post_comments_df['comment_sents'] = post_comments_df['comment_sents'].apply(lambda comment: [normalizeString(sent.replace('&gt;', '')) for sent in comment])

In [15]:
# Rows per split after the text clean-up (no rows are filtered by it).
post_comments_df.split.value_counts()

train    67676
test     23067
val      22398
Name: split, dtype: int64

In [91]:
# Snapshot of the cleaned frame, taken before the premise/counter pairs are
# extracted. A later cell writes to this same path again.
post_comments_df.to_pickle(data_path + '/gen_data.pickle')

In [27]:
def extract_premise_counter_pairs(row, premise_max_sents=5):
    """Pair quoted post sentences in a comment with the reply text that follows them.

    The comment is scanned sentence by sentence. A sentence counts as a quote
    when it is a substring of some non-blank post sentence, or contains one.
    Consecutive quotes form a premise; the non-quote sentences after it, up to
    the next quote, form its counter.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with `post` and
            `comment_sents`, both iterables of sentence strings.
        premise_max_sents: Maximum number of counter sentences kept per pair
            (despite its name, the cap applies to the counter, not the premise).

    Returns:
        List of `[premise_sentences, counter_text]` pairs in comment order.
        `premise_sentences` is the list of quoted sentences as they appear in
        the comment; `counter_text` is the kept counter sentences joined by
        single spaces. Text before the first quote is ignored, and quotes that
        are not followed by any other sentence yield no pair.
    """
    post_sents = [sent for sent in row['post'] if sent.strip() != '']

    pairs = []
    pending_premise = []
    for sent in row['comment_sents']:
        # A blank sentence is a substring of every post sentence and would be
        # mistaken for a quote, so it is skipped outright.
        if sent.strip() == '':
            continue

        is_quote = any(sent in post_sent or post_sent in sent for post_sent in post_sents)
        if is_quote:
            pending_premise.append(sent)
        elif pending_premise:
            # First non-quote sentence after a run of quotes opens a new pair.
            pairs.append([pending_premise, [sent]])
            pending_premise = []
        elif pairs:
            # Further non-quote sentences extend the most recent counter.
            pairs[-1][1].append(sent)

    return [[premise, " ".join(counter[:premise_max_sents])] for premise, counter in pairs]

In [28]:
# Run the pair extraction on every row, using the default sentence cap.
post_comments_df["premise_counter_premise_pairs"] = post_comments_df.apply(extract_premise_counter_pairs, axis=1)

In [18]:
# Number of extracted pairs per row.
post_comments_df["num_attacks"] = post_comments_df['premise_counter_premise_pairs'].map(len)

In [20]:
# Discard rows for which no pair was extracted.
has_attack = post_comments_df['num_attacks'] > 0
post_comments_df = post_comments_df[has_attack]

In [None]:
# Overwrites the earlier snapshot at this path with the filtered frame that
# includes the extracted pairs.
# NOTE(review): this cell carries no execution count in the saved notebook —
# confirm it was actually run, otherwise the file on disk is the earlier snapshot.
post_comments_df.to_pickle(data_path + '/gen_data.pickle')

In [29]:
# Rows per split that remain after dropping rows without any extracted pair.
post_comments_df.split.value_counts()

train    53558
test     18018
val      17567
Name: split, dtype: int64