In [1]:
%load_ext autoreload

In [2]:
import os
import sys
import re
from argparse import Namespace

# Add the project's ../src-py/ directory to the import path so the local
# modules imported in the following cell can be resolved.
sys.path.append('../src-py/')

# Restrict CUDA to the GPU with index 1. This is set before torch is imported
# (torch is imported in a later cell).
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [3]:
%autoreload

from utils import *
from project_debater_api import *
from mt_bart_v2 import *

In [4]:
import torch
import json

import nltk
import numpy as np
import pandas as pd

from pathlib import Path
from datasets import load_dataset, load_metric, Dataset

from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import BartTokenizer, BartForConditionalGeneration

In [5]:
from tabulate import tabulate

In [6]:
import pandas as pd
pd.set_option('display.max_colwidth', None)

In [7]:
# Prefer the GPU whenever CUDA is usable; otherwise run on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [8]:
device

device(type='cuda')

In [9]:
# Root directory of the shared storage from which the Reddit pickles are read below.
# NOTE(review): absolute, user-specific path -- must be adapted on other machines.
ceph_dir = '/home/sile2804/data-ceph/arguana/arg-generation/multi-taks-counter-argument-generation'
# Project-relative data directory; model checkpoints are loaded from its 'output/' sub-folder.
local_home_dir = '../data'

In [10]:
def generate_ds_attacks(ds, model, tokenizer, premises_clm, conclusion_clm, gen_kwargs, skip_special_tokens=True, batch_size=8):
    """Generate counters ("attacks") for all examples of ``ds`` with ``model``.

    ``ds`` is mapped in batched mode through ``preprocess_function`` (called with
    ``tokenizer``, ``premises_clm``, the literal ``'counter'`` and
    ``conclusion_clm``), formatted as torch tensors restricted to
    ``input_ids`` / ``attention_mask`` / ``labels``, wrapped in a ``DataLoader``
    with ``batch_size`` and handed to ``generate_counters`` together with
    ``gen_kwargs`` and ``skip_special_tokens``.

    Returns whatever ``generate_counters`` returns.
    """
    def _encode(batch):
        return preprocess_function(batch, tokenizer, premises_clm, 'counter', conclusion_clm=conclusion_clm)

    encoded_ds = ds.map(_encode, batched=True)
    encoded_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    loader = torch.utils.data.DataLoader(encoded_ds, batch_size=batch_size)

    return generate_counters(model, tokenizer, loader, gen_kwargs, skip_special_tokens=skip_special_tokens)

#### Load Testing data:

##### Loading validation data from Reddit:

In [11]:
valid_df = pd.read_pickle(ceph_dir + '/reddit_data/conclusion_and_ca_generation/valid_conclusion_comp_remove_75sem_perc_unique_posts_with_arglexrank_conclusions.pkl')

In [15]:
#drop duplicate posts
unique_valid_posts = valid_df.drop_duplicates('post_id')

In [16]:
# Draw a random sample of the de-duplicated validation posts.
# `test_size=0.9` places 90% of the posts in the 'test' split, which is the
# part kept as the sample. The fixed seed makes the shuffle -- and therefore
# the sample written to disk -- identical on every re-run.
valid_ds = Dataset.from_pandas(unique_valid_posts)
tmp_ds = valid_ds.train_test_split(test_size=0.9, seed=42)
sample_valid_ds = tmp_ds['test']

# Materialise the shuffled index mapping before saving the sample.
reddit_sample_valid_ds = sample_valid_ds.flatten_indices()
reddit_sample_valid_ds.save_to_disk('../data/sample_valid_ds')

  0%|          | 0/3 [00:00<?, ?ba/s]

In [30]:
#loading the random sample of validation dataset with the auto-generated conclusion
reddit_sample_valid_ds = Dataset.load_from_disk('../data/sample_valid_ds/')

In [31]:
len(reddit_sample_valid_ds)

2278

### Generate predictions on Reddit:

In [14]:
# Loading Reddit models
# Every model is moved to `device`; tokenizers are loaded from their own directories.

#masked_conclusion_tokenizer = BartTokenizer.from_pretrained(local_home_dir + '/output/extended_tokenizer')
#masked_conclusion_model = BartForConditionalGeneration.from_pretrained(local_home_dir + '/output/masked-conclusion-bart-model/').to(device)

# BART checkpoint stored under 'known-conclusion-bart-model', paired with the
# tokenizer stored under 'extended_tokenizer'.
known_conclusion_model = BartForConditionalGeneration.from_pretrained(local_home_dir  + '/output/known-conclusion-bart-model').to(device)
known_conclusion_tokenizer = BartTokenizer.from_pretrained(local_home_dir + '/output/extended_tokenizer')

# BART checkpoint stored under 'pred-conclusion-bart-model'; model and tokenizer
# come from the same directory.
pred_conclusion_model = BartForConditionalGeneration.from_pretrained(local_home_dir  + '/output/pred-conclusion-bart-model/').to(device)
pred_conclusion_tokenizer = BartTokenizer.from_pretrained(local_home_dir + '/output/pred-conclusion-bart-model/')

# Joint model: BartModelV2 loaded with compute_dynamic_weights=True and conc_decoder=True.
# NOTE(review): the tokenizer is read from '...-no-attention/' while the weights are read
# from '...-no-attention-finetuned/dynamic-weight' -- confirm both share one vocabulary.
join_model_tokenizer = BartTokenizer.from_pretrained('../data/output/joint-con-counter-bart-model-no-attention/')
#join_model  = BartModelV2.from_pretrained('../data/output/joint-con-counter-bart-model-no-attention-finetuned/0-1-1-0/', conc_loss_weight = 0.1, counter_loss_weight=1.0, attention_to_conc=False, conc_decoder=True).to(device)
join_model  = BartModelV2.from_pretrained('../data/output/joint-con-counter-bart-model-no-attention-finetuned/dynamic-weight', compute_dynamic_weights=True, conc_decoder=True).to(device)

In [32]:
def create_predictions_df(reddit_sample_valid_ds, gen_kwargs):
    """Generate counters with every approach and gather them in one DataFrame.

    Six generations are run over ``reddit_sample_valid_ds`` (all reading the
    'masked_premises' column): the joint model, the known-conclusion model with
    the 'title', 'bart_conclusion', 'arglex_rank_conclusion' columns and with no
    conclusion column, and the pred-conclusion model. The latter is decoded with
    special tokens kept, and its output is split on '<counter>' into a
    conclusion part ('joint_conc_baseline') and a counter part
    ('joint_conc_baseline_attacks').

    Returns a DataFrame with the dataset columns, one column per approach, an
    'argument' column ("<conclusion> : <premises joined by spaces>") and the
    premises joined into a single string.
    """
    premises_clm = 'masked_premises'

    def _known_model_attacks(conclusion_clm):
        # All conclusion-conditioned variants share the same model and tokenizer.
        return generate_ds_attacks(reddit_sample_valid_ds, known_conclusion_model, known_conclusion_tokenizer,
                                   premises_clm, conclusion_clm, gen_kwargs)

    # The sequence of generation calls is kept exactly as before.
    joint_attacks = generate_ds_attacks(reddit_sample_valid_ds, join_model, join_model_tokenizer,
                                        premises_clm, None, gen_kwargs)
    known_conc_attacks = _known_model_attacks('title')
    bart_conc_attacks = _known_model_attacks('bart_conclusion')
    arglex_conc_attacks = _known_model_attacks('arglex_rank_conclusion')
    masked_conc_attacks = _known_model_attacks(None)
    # This model would predict the conclusion and then the counter in one sequence,
    # so special tokens are kept to be able to split on '<counter>' below.
    joint_conc_baseline_attacks = generate_ds_attacks(reddit_sample_valid_ds, pred_conclusion_model, pred_conclusion_tokenizer,
                                                      premises_clm, None, gen_kwargs, skip_special_tokens=False)

    # (output column name, values) pairs, in final column order.
    named_columns = [
        ('post_id', reddit_sample_valid_ds['post_id']),
        ('conclusion', reddit_sample_valid_ds['title']),
        ('conclusion_target', reddit_sample_valid_ds['conclusion_targets']),
        ('conclusion_stance', reddit_sample_valid_ds['conclusion_stance']),
        ('bart_conclusion', reddit_sample_valid_ds['bart_conclusion']),
        ('arglex_rank_conclusion', reddit_sample_valid_ds['arglex_rank_conclusion']),
        ('premises', reddit_sample_valid_ds['masked_premises']),
        ('gt_attack', reddit_sample_valid_ds['counter']),
        ('known_conc_attacks', known_conc_attacks),
        ('masked_conc_attacks', masked_conc_attacks),
        ('bart_conc_attacks', bart_conc_attacks),
        ('arglex_conc_attacks', arglex_conc_attacks),
        ('joint_conc_attacks', joint_attacks),
        ('joint_conc_baseline_attacks', joint_conc_baseline_attacks),
    ]
    column_names = [name for name, _ in named_columns]
    rows = list(zip(*(values for _, values in named_columns)))
    reddit_pred_df = pd.DataFrame(rows, columns=column_names)

    def _join_sentences(sentences):
        return ' '.join(sentences)

    def _build_argument(row):
        return row['conclusion'] + ' : ' + _join_sentences(row['premises'])

    # 'argument' must be built while 'premises' still holds the sentence lists.
    reddit_pred_df['argument'] = reddit_pred_df.apply(_build_argument, axis=1)
    reddit_pred_df['premises'] = reddit_pred_df['premises'].apply(_join_sentences)

    def _strip_markers(text):
        return re.sub('<s>|</s>|<conclusion>|<counter>|<pad>', '', text).strip()

    def _conclusion_part(text):
        return _strip_markers(text.split('<counter>')[0])

    def _counter_part(text):
        # Without a '<counter>' marker the whole sequence is treated as the counter.
        if '<counter>' in text:
            return _strip_markers(text.split('<counter>')[1])
        return _strip_markers(text)

    # process the jointly generated conclusion and counter
    raw_joint_output = reddit_pred_df['joint_conc_baseline_attacks']
    reddit_pred_df['joint_conc_baseline'] = raw_joint_output.apply(_conclusion_part)
    reddit_pred_df['joint_conc_baseline_attacks'] = raw_joint_output.apply(_counter_part)

    return reddit_pred_df

In [33]:
# Decoding settings shared by both runs; only `do_sample` differs between them.
base_gen_kwargs = {
    "max_length":100,
    "top_p":0.95,
    "top_k":50,
    "num_beams":4
}

# Seed used before each run so the sampled predictions can be regenerated
# (on the same hardware/software stack).
GENERATION_SEED = 42

# One pass without sampling and one with sampling; each result is pickled under
# a file name that encodes the decoding mode. After the loop, `gen_kwargs` and
# `reddit_pred_df` hold the values of the sampled run, as before.
for do_sample, run_name in [(False, 'nosample'), (True, 'sample')]:
    gen_kwargs = {"do_sample": do_sample, **base_gen_kwargs}

    torch.manual_seed(GENERATION_SEED)
    reddit_pred_df = create_predictions_df(reddit_sample_valid_ds, gen_kwargs)
    reddit_pred_df.to_pickle('../data/output/reddit_pred_df-%s-beamsize-4.pkl' % run_name)

ProConClient: 100%|██████████| 2172/2172 [39:23<00:00,  1.09s/it]
ProConClient: 100%|██████████| 2172/2172 [40:38<00:00,  1.12s/it]


  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]



### Evaluate generated predictions:

In [12]:
reddit_pred_df = pd.read_pickle('../data/output/reddit_pred_df-nosample-beamsize-4.pkl')

In [13]:
reddit_pred_df[['conclusion', 'masked_conc_attacks', 'known_conc_attacks', 'joint_conc_baseline_attacks', 'joint_conc_attacks']].head(n=10)

Unnamed: 0,conclusion,masked_conc_attacks,known_conc_attacks,joint_conc_baseline_attacks,joint_conc_attacks
0,Britons should be proud of the Empire,"i don't think it's a good thing that britain is a part of the world, but it's not a bad thing.","i don't think you should be proud of britain, but i think it's important to remember that the empire was founded on the principles of democracy and individualism. it's not just that the people who built the empire were pro democracy, it's that they were pro individualism and anti individualism as well.","i don't think the empire did much good, but i think it was a big step in the development of the world.","i don't think the empire is evil, but i think it is. i think we should be proud of our culture, our history, and our history."
1,Moral Consequentialism is the only rational basis of evaluating actions in discussions of morality,the problem with consequentialism is that it doesn't have to do with the consequences of an action. it's about the consequences. the consequences are the ultimate basis for any judgment about the rightness or wrongness of that conduct. consequentialism does not have to deal with the moral consequences of the action.,"the problem with consequentialism is that it is not the only rational basis of evaluating actions in discussions of morality. it is the only logical basis for determining whether or not an action benefits a group of people where 'right' means beneficial to their well being and 'wrong' means harmful. the problem with this is that there is no rational basis for evaluating an action that benefits the group. if you want to determine whether an action is beneficial to a group, you have to make sure that","there is no such thing as a 'good' or 'bad' moral evaluation, but there is a good moral evaluation of the consequences of an action.",there is no such thing as 'rightness wrongness' or 'wrongness rightness'. it's just that there is no way to determine whether or not an action is wrong or not.
2,ASAP Ferg’s new song is awful,i don't think asap ferg's new song is really really bad.,"asap ferg's new song is awful. it's not that he's bad, it's just that he doesn't have the ability to make good music.",asap ferg's new song has nothing to do with music. it's just a song that he wrote.,"i don't know what you're talking about, but i think you're missing the point. asap ferg's new song is really really bad. it's not a good song, it's a bad song, but it doesn't have the qualities of good music in any genre."
3,I think hijab is good for the society and for women.,"i'm a muslim, and i think it's a good idea to wear a hijab. i'm not saying that everyone is free to wear whatever he she wants just to make that clear.",i think hijab is good for the society and for women. i don't think it's a good way to reduce rape and harassment.,"i don't think you're right, but i think it's a good idea to change your view.",i don't think hijab is a good approach to reduce rape and harassment.
4,State lines in the United States are drawn unfairly,i don't think it's fair to have two senators in the house of representatives.,"i don't think it's fair to draw the state lines based on the population of the state, but it's not unfair to draw them based on their population.","i think it's unfair that each state has two senators, given the way state lines are drawn now.",i think it's unfair for each state to have two senators.
5,There is no viable alternative to government welfare that offers same level of coverage,the problem with government welfare is that it is not a waste. it is a waste of resources. the problem is that there is no viable alternative that can cover all those people to the same degree as they are taxed.,there is no viable alternative to government welfare that offers same level of coverage as government welfare.,i think you're missing the point. government welfare is a waste of money.,"i don't think it's a waste, but i think it is a waste. if you take away government welfare, there is no viable alternative."
6,Jews should be allowed to circumcise their babies,"i don't think it's a good idea to force circumcision on a child, but i think it should be allowed.","i don't think you should be allowed to circumcise your babies, but i think it's a good idea.","i don't think jews have the right to do so, but i do think they should be able to do it.","i don't think it's a good idea to force people to circumcise their babies, but i think you're missing the point. if you're a jew, you should be able to do it without medical reason."
7,The volume control sliders on video players is a poor method of adjusting the volume.,"i'm not sure what you're talking about, but i think you're missing the point. i'm not saying that the sliders are a poor way of adjusting the volume, but they are a good way to adjust the volume.","i'm not sure what you're talking about, but i think you're missing the point. the volume control sliders are a good way of adjusting the volume of a tv, but they're not the best way to do it.",i don't think the sliders are the best way to adjust the volume.,"i don't think it's a good way to adjust the volume, but i do think that it's not a bad way to change the volume."
8,The entirety of the modern liberal platform is immoral.,i don't think that the modern liberal platform is immoral. they advocate for abortion which is murdering babies. they support gun control which is taking away my natural right to protect myself and my family.,"i don't think that the modern liberal platform is immoral. i think that it is immoral to support abortion, and to support gun control.",i don't think it's immoral to vote for a candidate who is against the 1. i think it is immoral to support a candidate that is not against the 2.,i don't think that the modern liberal platform is immoral. i think it's immoral because it's not immoral.
9,Teachers should not be allowed to conceal carry in school.,"if teachers were allowed to carry, would they also be paid a higher salary for not only teaching but acting as a security guard?",i don't think you should be allowed to conceal carry in school. i think it's a good idea for teachers to be able to carry a gun.,"If teachers were allowed to carry, would they also be paid a higher salary for not only teaching but acting as a security guard? police offers who have gone through years of training often crumble under the pressure of a crisis.",i don't think it's fair to say that teachers should be allowed to carry a gun.


In [14]:
reddit_pred_df.columns

Index(['post_id', 'conclusion', 'conclusion_target', 'conclusion_stance',
       'bart_conclusion', 'arglex_rank_conclusion', 'premises', 'gt_attack',
       'known_conc_attacks', 'masked_conc_attacks', 'bart_conc_attacks',
       'arglex_conc_attacks', 'joint_conc_attacks',
       'joint_conc_baseline_attacks', 'argument', 'joint_conc_baseline'],
      dtype='object')

In [15]:
# Export 50 randomly chosen predictions for manual inspection. The fixed
# random_state makes the exported rows identical on every re-run.
manual_inspec_sample_df = reddit_pred_df.sample(50, random_state=42)
manual_inspec_sample_df[['premises', 'conclusion', 'masked_conc_attacks', 'bart_conc_attacks', 'joint_conc_baseline_attacks', 'joint_conc_attacks']].to_csv('../data/output/valid_sample_manual_inspection.csv')

In [20]:
def get_evaluation_results(reddit_pred_df, df_path):
    """Evaluate the generated counters of every approach.

    Parameters
    ----------
    reddit_pred_df : pandas.DataFrame
        Predictions with the columns 'post_id', 'gt_attack', 'conclusion_target',
        'conclusion_stance' and one '*_attacks' column per approach. The frame is
        not modified; all work happens on copies.
    df_path : str
        Path to a pickled DataFrame with 'post_id' and 'counter' columns. Up to
        the first ten counters of each post (each joined with spaces) serve as
        references for that post.

    Returns
    -------
    dict
        One entry per approach holding the result of ``evaluate_gen_attacks``
        extended with a 'stance_score' (mean absolute difference between the
        stance of the generated counter and the conclusion stance, rounded to
        two decimals), plus 'preds_df' (predictions with references attached)
        and 'stances_df' (rows with a conclusion target, with the stance columns).

    Raises
    ------
    KeyError
        If a 'post_id' of ``reddit_pred_df`` does not occur in the reference file.
    """
    # (key in the returned dict, attacks column, stance column), in evaluation order.
    model_specs = [
        ('Masked Conclusion', 'masked_conc_attacks', 'masked_conc_stances'),
        ('Known Conclusion', 'known_conc_attacks', 'known_conc_stances'),
        ('BART Conclusion', 'bart_conc_attacks', 'bart_conc_stances'),
        ('ArgLexRank Conclusion', 'arglex_conc_attacks', 'arglex_conc_stances'),
        ('Joint Prediction (baseline)', 'joint_conc_baseline_attacks', 'pred_conc_stances'),
        ('Joint Prediction', 'joint_conc_attacks', 'joint_conc_stances'),
    ]
    # Key order of the returned dict (kept as callers have seen it so far).
    result_order = ['Masked Conclusion', 'BART Conclusion', 'ArgLexRank Conclusion',
                    'Joint Prediction (baseline)', 'Joint Prediction', 'Known Conclusion']

    # collect references
    # NOTE: unpickling can execute arbitrary code -- only load trusted files here.
    reference_df = pd.read_pickle(df_path)
    arg_counters = (
        reference_df.groupby('post_id')['counter']
        .agg(lambda counters: [' '.join(c) for c in counters.iloc[:10]])
        .to_dict()
    )

    # Work on a copy so the caller's frame is left untouched.
    preds_df = reddit_pred_df.copy()
    preds_df['gt_attack'] = preds_df['gt_attack'].apply(str)
    preds_df['all_counters'] = preds_df['post_id'].apply(lambda post_id: arg_counters[post_id])
    preds_df = preds_df[preds_df['all_counters'].map(len) > 0]

    scores = {}
    for label, attack_col, _ in model_specs:
        scores[label] = evaluate_gen_attacks(preds_df[attack_col].tolist(), preds_df['all_counters'].tolist())

    # Test stance correctness: only rows with a conclusion target can be scored.
    # The explicit copy avoids assigning new columns onto a slice of `preds_df`
    # (which is what triggered pandas' SettingWithCopyWarning).
    stances_df = preds_df[pd.notna(preds_df['conclusion_target'])].copy()

    # compute the stance of the generated counters towards the conclusion target
    for _, attack_col, stance_col in model_specs:
        stances_df[stance_col] = get_stances(stances_df['conclusion_target'].tolist(), stances_df[attack_col].tolist())

    # compute the distance between the conclusion stance and the attack stance: the bigger the distance the better...
    for label, _, stance_col in model_specs:
        distances = [abs(attack_stance - conclusion_stance)
                     for attack_stance, conclusion_stance in zip(stances_df[stance_col].tolist(),
                                                                 stances_df['conclusion_stance'].tolist())]
        scores[label]['stance_score'] = round(np.mean(distances), 2)

    results = {label: scores[label] for label in result_order}
    results['preds_df'] = preds_df
    results['stances_df'] = stances_df
    return results

In [21]:
reddit_pred_df = pd.read_pickle('../data/output/reddit_pred_df-sample-beamsize-4.pkl')
pred_df_scores = get_evaluation_results(reddit_pred_df, 
                                        ceph_dir + '/reddit_data/conclusion_and_ca_generation/valid_conclusion_comp_remove_75sem_perc.pkl')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

ProConClient:   0%|          | 0/2172 [00:00<?, ?it/s][A
ProConClient:  23%|██▎       | 500/2172 [00:06<00:20, 81.45it/s][A
ProConClient: 100%|██████████| 2172/2172 [00:39<00:00, 70.11it/s][A
ProConClient: 100%|██████████| 2172/2172 [00:51<00:00, 41.77it/s][A

ProConClient:  92%|█████████▏| 2000/2172 [00:27<00:02, 69.89it/s][A
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
ProConClient:  23%|██▎       | 500/2172 [00:06<00:20, 83.17it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try usi

In [22]:
# Tabulate BLEU, BERT F-score and stance score per approach for the scores
# currently held in `pred_df_scores`.
approach_names = ['Masked Conclusion', 'BART Conclusion', 'ArgLexRank Conclusion',
                  'Joint Prediction (baseline)', 'Joint Prediction', 'Known Conclusion']

score_rows = []
for approach in approach_names:
    approach_scores = pred_df_scores[approach]
    score_rows.append([approach,
                       round(approach_scores['bleu'], 2),
                       approach_scores['bert-fscore'],
                       approach_scores['stance_score']])

print('Sample technique')
print(tabulate(score_rows, headers=['bleu', 'bert-f1score', 'stance-score (diff)']))

Sample technique
                               bleu    bert-f1score    stance-score (diff)
---------------------------  ------  --------------  ---------------------
Masked Conclusion              0.14            0.14                   0.85
BART Conclusion                0.16            0.16                   0.86
ArgLexRank Conclusion          0.13            0.13                   0.84
Joint Prediction (baseline)    0.14            0.16                   0.86
Joint Prediction               0.15            0.16                   0.84
Known Conclusion               0.17            0.17                   0.87


In [23]:
reddit_pred_df = pd.read_pickle('../data/output/reddit_pred_df-nosample-beamsize-4.pkl')
pred_df_scores = get_evaluation_results(reddit_pred_df,
                                       ceph_dir + '/reddit_data/conclusion_and_ca_generation/valid_conclusion_comp_remove_75sem_perc.pkl')

ProConClient: 100%|██████████| 2172/2172 [02:43<00:00, 13.27it/s]
ProConClient: 100%|██████████| 2172/2172 [00:32<00:00, 66.32it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
ProConClient: 100%|██████████| 2172/2172 [00:25<00:00, 86.74it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
ProConClient: 100%|██████████| 2172/2172 [00:34<00:00, 62.98it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/use

In [24]:
# Tabulate BLEU, BERT F-score and stance score per approach for the scores
# currently held in `pred_df_scores`.
approach_names = ['Masked Conclusion', 'BART Conclusion', 'ArgLexRank Conclusion',
                  'Joint Prediction (baseline)', 'Joint Prediction', 'Known Conclusion']

score_rows = []
for approach in approach_names:
    approach_scores = pred_df_scores[approach]
    score_rows.append([approach,
                       round(approach_scores['bleu'], 2),
                       approach_scores['bert-fscore'],
                       approach_scores['stance_score']])

print('No sample technique:')
print(tabulate(score_rows, headers=['bleu', 'bert-f1score', 'stance-score (diff)']))

No sample technique:
                               bleu    bert-f1score    stance-score (diff)
---------------------------  ------  --------------  ---------------------
Masked Conclusion              0.15            0.15                   0.85
BART Conclusion                0.18            0.18                   0.85
ArgLexRank Conclusion          0.15            0.14                   0.84
Joint Prediction (baseline)    0.13            0.17                   0.87
Joint Prediction               0.17            0.17                   0.86
Known Conclusion               0.19            0.18                   0.86


ProConClient: 100%|██████████| 2172/2172 [00:46<00:00, 63.65it/s]

In [29]:
x = pred_df_scores['stances_df']
x[['conclusion', 'conclusion_stance', 'conclusion_target', 'known_conc_stances', 'known_conc_attacks', 'joint_conc_baseline_attacks', 'joint_conc_attacks']].head()

Unnamed: 0,conclusion,conclusion_stance,conclusion_target,known_conc_stances,known_conc_attacks,joint_conc_baseline_attacks,joint_conc_attacks
0,Britons should be proud of the Empire,0.973527,the Empire,0.370406,"i don't think you should be proud of britain, but i think it's important to remember that the empire was founded on the principles of democracy and individualism. it's not just that the people who built the empire were pro democracy, it's that they were pro individualism and anti individualism as well.","i don't think the empire did much good, but i think it was a big step in the development of the world.","i don't think the empire is evil, but i think it is. i think we should be proud of our culture, our history, and our history."
1,Moral Consequentialism is the only rational basis of evaluating actions in discussions of morality,0.998206,Moral Consequentialism,0.942047,"the problem with consequentialism is that it is not the only rational basis of evaluating actions in discussions of morality. it is the only logical basis for determining whether or not an action benefits a group of people where 'right' means beneficial to their well being and 'wrong' means harmful. the problem with this is that there is no rational basis for evaluating an action that benefits the group. if you want to determine whether an action is beneficial to a group, you have to make sure that the group is not harmed by the action.","there is no such thing as a 'good' or 'bad' moral evaluation, but there is a good moral evaluation of the consequences of an action.",there is no such thing as 'rightness wrongness' or 'wrongness rightness'. it's just that there is no way to determine whether or not an action is wrong or not.
2,ASAP Ferg’s new song is awful,-0.160894,Ferg,0.062374,"asap ferg's new song is awful. it's not that he's bad, it's just that he doesn't have the ability to make good music.",asap ferg's new song has nothing to do with music. it's just a song that he wrote.,"i don't know what you're talking about, but i think you're missing the point. asap ferg's new song is really really bad. it's not a good song, it's a bad song, but it doesn't have the qualities of good music in any genre."
3,I think hijab is good for the society and for women.,0.998132,hijab,0.566873,i think hijab is good for the society and for women. i don't think it's a good way to reduce rape and harassment.,"i don't think you're right, but i think it's a good idea to change your view.",i don't think hijab is a good approach to reduce rape and harassment.
4,State lines in the United States are drawn unfairly,-0.846689,State lines in the United States,0.055674,"i don't think it's fair to draw the state lines based on the population of the state, but it's not unfair to draw them based on their population.","i think it's unfair that each state has two senators, given the way state lines are drawn now.",i think it's unfair for each state to have two senators.


### Similarity to conclusion:

In [15]:
# Similarity between the conclusion and the counters of each approach.
# Column names follow `reddit_pred_df.columns`: the "pred" approach is stored in
# 'joint_conc_baseline_attacks' and the ground-truth counter in 'gt_attack'
# (the previous names 'pred_conc_attacks' and 'gt' are not columns of the frame).
masked_conc_eval = evaluate_gen_attacks(reddit_pred_df['conclusion'], reddit_pred_df['masked_conc_attacks'].tolist())
known_conc_eval  = evaluate_gen_attacks(reddit_pred_df['conclusion'], reddit_pred_df['known_conc_attacks'].tolist())
#auto_conc_eval   = evaluate_gen_attacks(reddit_pred_df['conclusion'], reddit_pred_df['auto_conc_attacks'].tolist())
pred_conc_eval   = evaluate_gen_attacks(reddit_pred_df['conclusion'], reddit_pred_df['joint_conc_baseline_attacks'].tolist())
# 'gt_attack' is stringified the same way get_evaluation_results does it, so this
# line works whether the column still holds sentence lists or already holds strings.
# NOTE(review): confirm that the stringified form is what should be compared here.
gt_attack_conc_eval = evaluate_gen_attacks(reddit_pred_df['conclusion'], reddit_pred_df['gt_attack'].apply(str).tolist())

In [17]:
from tabulate import tabulate

# One table row per approach: label, BLEU rounded to two decimals, BERT F-score.
similarity_rows = [
    [label, round(scores['bleu'], 2), scores['bert-fscore']]
    for label, scores in [
        ('masked', masked_conc_eval),
        #('auto', auto_conc_eval),
        ('pred', pred_conc_eval),
        ('known', known_conc_eval),
        ('gt_attack', gt_attack_conc_eval),
    ]
]
print(tabulate(similarity_rows, headers=['bleu', 'bert-f1score']))

             bleu    bert-f1score
---------  ------  --------------
masked       0               0.16
pred         0.01            0.18
known        0.03            0.33
gt_attack    0              -0.03
