In [1]:
import os
import sys
import re
from argparse import Namespace

sys.path.append('../src-py/')

os.environ["CUDA_VISIBLE_DEVICES"]="1"

from utils import *
from project_debater_api import *
from mt_bart_v2 import *

import torch
import json

import nltk
import numpy as np
import pandas as pd

from pathlib import Path
from datasets import load_dataset, load_metric, Dataset

from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import BartTokenizer, BartForConditionalGeneration

import matplotlib.pyplot as plt

from tabulate import tabulate

In [2]:
ls /home/sile2804/data-ceph/arguana/arg-generation/multi-taks-counter-argument-generation/reddit_data/conclusion_and_ca_generation/

preprocessed_train_conclusion_all.pkl
sample_test_conclusion_all.pkl
sample_test_conclusion_all_preprocessed.pkl
sample_test_conclusion_all_preprocessed_multi_conclusions.pkl
sample_valid_conclusion_all.pkl
sample_valid_conclusion_all_preprocessed.pkl
test_conclusion_all.pkl
test_conclusion_comp_remove_75sem_perc.pkl
train_conclusion_all.pkl
train_conclusion_comp_remove_75sem_perc.pkl
valid_conclusion_all.pkl
valid_conclusion_comp_remove_75sem_perc.pkl


In [3]:
# Reproducibility: the notebook draws a random validation subset and decodes
# with do_sample=True, so seed the global RNGs once before anything else runs.
# NumPy's global RNG is what pandas sampling falls back to when no
# random_state is given; torch's RNG drives sampling during generation.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Show text columns in full instead of truncating them in DataFrame displays.
pd.set_option('display.max_colwidth', None)
# Use the GPU when one is visible to this process, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Root of the shared dataset storage and of the local data / checkpoint folder.
ceph_dir = '/home/sile2804/data-ceph/arguana/arg-generation/multi-taks-counter-argument-generation'
local_home_dir = '../data'

# Both paths are appended to ceph_dir by plain string concatenation, hence the
# leading slash. data_unique_path is the pickle that gets sampled for
# generation; data_path is the pickle read to collect the reference counters.
data_unique_path = '/reddit_data/conclusion_and_ca_generation/sample_valid_conclusion_all_preprocessed.pkl'
data_path = '/reddit_data/conclusion_and_ca_generation/sample_valid_conclusion_all.pkl'

In [4]:
def generate_ds_attacks(ds, model, tokenizer, premises_clm, conclusion_clm, gen_kwargs, skip_special_tokens=True, batch_size=16):
    """Generate counter-arguments ("attacks") for every example in ``ds``.

    The dataset is encoded with ``preprocess_function`` (using 'counter' as
    the target column), exposed as torch tensors, wrapped in a DataLoader and
    handed to ``generate_counters``.

    Args:
        ds: dataset offering ``map`` and ``set_format`` (a Hugging Face
            ``Dataset`` in this notebook).
        model: generation model forwarded to ``generate_counters``.
        tokenizer: tokenizer used for both encoding and decoding.
        premises_clm: name of the column holding the premises text.
        conclusion_clm: name of the conclusion column, or None.
        gen_kwargs: decoding options forwarded to ``generate_counters``.
        skip_special_tokens: forwarded to ``generate_counters``.
        batch_size: DataLoader batch size.

    Returns:
        Whatever ``generate_counters`` returns; callers here treat it as one
        generated text per example, in dataset order.
    """
    def _encode(batch):
        return preprocess_function(batch, tokenizer, premises_clm, 'counter', conclusion_clm=conclusion_clm)

    encoded_ds = ds.map(_encode, batched=True)
    # Only the model inputs are exposed as tensors to the DataLoader.
    encoded_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    loader = torch.utils.data.DataLoader(encoded_ds, batch_size=batch_size)

    return generate_counters(model, tokenizer, loader, gen_kwargs, skip_special_tokens=skip_special_tokens)

def create_predictions_df(model, tokenizer, reddit_sample_valid_ds, gen_kwargs, premises_clm='masked_premises', conclusion_clm=None):
    """Generate attacks for a dataset and collect them with their inputs.

    Args:
        model: generation model passed to ``generate_ds_attacks``.
        tokenizer: matching tokenizer.
        reddit_sample_valid_ds: dataset with the columns 'post_id', 'title',
            'conclusion_targets', 'conclusion_stance', 'bart_conclusion',
            'counter' and ``premises_clm``.
        gen_kwargs: decoding options passed to ``generate_ds_attacks``.
        premises_clm: column used as premises; each value is joined with
            single spaces, so it is expected to be a sequence of strings.
        conclusion_clm: conclusion column given to the model, or None.

    Returns:
        DataFrame with the columns post_id, conclusion (the title),
        conclusion_target, conclusion_stance, bart_conclusion, premises
        (space-joined), gt_attack, known_conc_attacks (the generated attacks;
        the column keeps this name whichever model produced them) and
        argument ("<conclusion> : <premises>").
    """
    generated_attacks = generate_ds_attacks(reddit_sample_valid_ds, model, tokenizer, premises_clm, conclusion_clm, gen_kwargs)

    # (output column name, values) pairs in the order the columns appear.
    column_specs = [
        ('post_id', reddit_sample_valid_ds['post_id']),
        ('conclusion', reddit_sample_valid_ds['title']),
        ('conclusion_target', reddit_sample_valid_ds['conclusion_targets']),
        ('conclusion_stance', reddit_sample_valid_ds['conclusion_stance']),
        ('bart_conclusion', reddit_sample_valid_ds['bart_conclusion']),
        ('premises', reddit_sample_valid_ds[premises_clm]),
        ('gt_attack', reddit_sample_valid_ds['counter']),
        ('known_conc_attacks', generated_attacks),
    ]
    column_names = [name for name, _ in column_specs]
    # zip stops at the shortest column, exactly like zipping them by hand.
    rows = list(zip(*(values for _, values in column_specs)))
    reddit_pred_df = pd.DataFrame(rows, columns=column_names)

    def _join_tokens(parts):
        return ' '.join(parts)

    def _as_argument(row):
        return row['conclusion'] + ' : ' + _join_tokens(row['premises'])

    # Build the argument text first, while 'premises' still holds the raw sequences.
    reddit_pred_df['argument'] = reddit_pred_df.apply(_as_argument, axis=1)
    reddit_pred_df['premises'] = reddit_pred_df['premises'].apply(_join_tokens)

    return reddit_pred_df

def get_evaluation_results(reddit_pred_df, df_path):
    """Score generated attacks against all reference counters of each post.

    Args:
        reddit_pred_df: predictions frame with the columns 'post_id',
            'known_conc_attacks', 'conclusion_target' and 'conclusion_stance'.
            Side effect: an 'all_counters' column is added to (or overwritten
            on) this frame in place.
        df_path: path of a pickled DataFrame with 'post_id' and 'counter'
            columns; each counter is joined with single spaces.

    Returns:
        The result of ``evaluate_gen_attacks(..., detailed=False)`` with an
        extra 'stance_score' entry: the mean absolute difference, rounded to
        two decimals, between the stance of each generated attack and the
        stance of the conclusion towards the conclusion target.

    Raises:
        KeyError: if a post_id in ``reddit_pred_df`` is absent from the
            reference pickle.
    """
    # Collect references.
    # NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
    df = pd.read_pickle(df_path)
    arg_counters = df.groupby('post_id').agg({
        'counter': lambda counters: [' '.join(c) for c in counters]
    }).reset_index()
    arg_counters = pd.Series(arg_counters.counter.values, index=arg_counters.post_id).to_dict()

    # Attach the non-empty reference counters of each post and drop posts left without any.
    reddit_pred_df['all_counters'] = reddit_pred_df['post_id'].apply(
        lambda post_id: [c for c in arg_counters[post_id] if c != '']
    )
    scored_df = reddit_pred_df[reddit_pred_df.all_counters.map(len) > 0]

    known_conc_eval = evaluate_gen_attacks(scored_df['known_conc_attacks'].tolist(), scored_df['all_counters'].tolist(), detailed=False)

    # Test stance correctness, only where a conclusion target is available.
    # The explicit copy makes the column assignment below write to an
    # independent frame instead of a slice of a slice, which pandas flags with
    # SettingWithCopyWarning and may silently not apply.
    stance_df = scored_df[pd.notna(scored_df.conclusion_target)].copy()
    print('Testing stance on only {} posts'.format(len(stance_df)))

    # Compute the stance of the generated counters towards the conclusion target.
    stance_df['known_conc_stances'] = get_stances(stance_df.conclusion_target.tolist(), stance_df.known_conc_attacks.tolist())

    # Distance between the conclusion stance and the attack stance: the bigger the distance the better.
    stance_gaps = [
        abs(attack_stance - conclusion_stance)
        for attack_stance, conclusion_stance in zip(stance_df.known_conc_stances.tolist(), stance_df.conclusion_stance.tolist())
    ]
    known_conc_eval['stance_score'] = round(np.mean(stance_gaps), 2)

    return known_conc_eval

In [5]:
# Checkpoint folders of the three fine-tuned models, all under the local data directory.
known_conc_ckpt = local_home_dir + '/output/ca-final-models/known-conc-model/checkpoint-9500'
pred_conc_ckpt = local_home_dir + '/output/ca-final-models/pred-conc-model/checkpoint-9500'
joint_ckpt = local_home_dir + '/output/ca-final-models/mt-model-baseline-weighting-scheme/checkpoint-9500'

# Known-conclusion model and its tokenizer.
known_conclusion_model = BartForConditionalGeneration.from_pretrained(known_conc_ckpt).to(device)
known_conclusion_tokenizer = BartTokenizer.from_pretrained(known_conc_ckpt)

# Predicted-conclusion model and its tokenizer.
pred_conclusion_model = BartForConditionalGeneration.from_pretrained(pred_conc_ckpt).to(device)
pred_conclusion_tokenizer = BartTokenizer.from_pretrained(pred_conc_ckpt)

# Joint (multi-task) model; BartModelV2 is expected to come from the mt_bart_v2 star import.
join_model_tokenizer = BartTokenizer.from_pretrained(joint_ckpt)
join_model = BartModelV2.from_pretrained(joint_ckpt, compute_dynamic_weights=False, conc_decoder=True).to(device)

In [6]:
# Draw a fixed-size evaluation subset. random_state pins the draw so that a
# rerun scores the same 1000 posts and the tables below stay comparable.
valid_df = pd.read_pickle(ceph_dir + data_unique_path).sample(1000, random_state=42)

In [7]:
# Wrap the sampled frame in a Dataset for the generation helpers.
print(f'Testing on {len(valid_df)} posts')
# flatten_indices rewrites the data without any indices mapping
# (presumably a no-op right after from_pandas; TODO confirm).
valid_ds = Dataset.from_pandas(valid_df).flatten_indices()

Testing on 1000 posts


Flattening the indices:   0%|          | 0/1 [00:00<?, ?ba/s]

In [8]:
# Save the sampled validation subset to disk (presumably so the exact same
# posts can be reloaded elsewhere; note that rerunning overwrites the file).
valid_df.to_pickle('../data/valid_sample_for_finetuning_sampling_techniques.pkl')

### Studying the effect of Beam Search:

In [9]:
# Decoding settings for the beam-size sweep. Sampling stays enabled
# (do_sample=True with nucleus top_p); only num_beams is varied.
gen_kwargs = dict(
    do_sample=True,
    max_length=100,
    top_p=0.95,  # from fine-tuning the best top_p was 0.95
    num_beams=10,  # initial value only; overwritten on every iteration below
)

# Known-conclusion model: the post supplies the premises, the title the conclusion.
scores = []
for num_beams in (1, 4, 8, 12):
    gen_kwargs.update(num_beams=num_beams)
    # Generate predictions, then score them against all reference counters.
    preds_df = create_predictions_df(
        known_conclusion_model,
        known_conclusion_tokenizer,
        valid_ds,
        gen_kwargs,
        premises_clm='post',
        conclusion_clm='title',
    )
    preds_df_scores = get_evaluation_results(preds_df, ceph_dir + data_path)
    scores += [[num_beams, preds_df_scores]]

  0%|          | 0/1 [00:00<?, ?ba/s]

Testing stance on only 999 posts


ProConClient: 100%|██████████| 999/999 [00:19<00:00, 51.97it/s]


  0%|          | 0/1 [00:00<?, ?ba/s]

Testing stance on only 999 posts


ProConClient: 100%|██████████| 999/999 [00:18<00:00, 54.62it/s]


  0%|          | 0/1 [00:00<?, ?ba/s]

Testing stance on only 999 posts


ProConClient: 100%|██████████| 999/999 [00:18<00:00, 54.96it/s]


  0%|          | 0/1 [00:00<?, ?ba/s]

Testing stance on only 999 posts


ProConClient: 100%|██████████| 999/999 [00:17<00:00, 55.54it/s]


In [10]:
# One row per beam size: BLEU and BERT F-score rounded to two decimals, stance score as stored.
res_table = tabulate(
    [
        (setting, round(metrics['bleu'], 2), round(metrics['bert-fscore'], 2), metrics['stance_score'])
        for setting, metrics in scores
    ],
    headers=['num_beams', 'bleu', 'bert-f1score', 'stance-score (diff)'],
)

print(res_table)

  num_beams    bleu    bert-f1score    stance-score (diff)
-----------  ------  --------------  ---------------------
          1    0.01            0.03                   0.86
          4    0.02            0.06                   0.82
          8    0.02            0.07                   0.75
         12    0.02            0.07                   0.75


In [12]:
# Beam-size sweep for the predicted-conclusion model (no conclusion column is given).
scores = []
for num_beams in (1, 4, 8, 12):
    gen_kwargs.update(num_beams=num_beams)
    # Generate predictions, then score them against all reference counters.
    preds_df = create_predictions_df(
        pred_conclusion_model,
        pred_conclusion_tokenizer,
        valid_ds,
        gen_kwargs,
        premises_clm='post',
        conclusion_clm=None,
    )
    preds_df_scores = get_evaluation_results(preds_df, ceph_dir + data_path)
    scores += [[num_beams, preds_df_scores]]

  0%|          | 0/1 [00:00<?, ?ba/s]

Testing stance on only 999 posts


ProConClient: 100%|██████████| 999/999 [00:19<00:00, 52.15it/s]


  0%|          | 0/1 [00:00<?, ?ba/s]

Testing stance on only 999 posts


ProConClient: 100%|██████████| 999/999 [00:18<00:00, 54.02it/s]


  0%|          | 0/1 [00:00<?, ?ba/s]

Testing stance on only 999 posts


ProConClient: 100%|██████████| 999/999 [00:18<00:00, 55.02it/s]


  0%|          | 0/1 [00:00<?, ?ba/s]

Testing stance on only 999 posts


ProConClient: 100%|██████████| 999/999 [00:17<00:00, 55.57it/s]


In [13]:
# One row per beam size: BLEU and BERT F-score rounded to two decimals, stance score as stored.
res_table = tabulate(
    [
        (setting, round(metrics['bleu'], 2), round(metrics['bert-fscore'], 2), metrics['stance_score'])
        for setting, metrics in scores
    ],
    headers=['num_beams', 'bleu', 'bert-f1score', 'stance-score (diff)'],
)

print(res_table)

  num_beams    bleu    bert-f1score    stance-score (diff)
-----------  ------  --------------  ---------------------
          1    0.01            0.01                   0.78
          4    0.02            0.03                   0.73
          8    0.02            0.03                   0.74
         12    0.02            0.03                   0.69


In [14]:
# Beam-size sweep for the joint (multi-task) model (no conclusion column is given).
scores = []
for num_beams in (1, 4, 8, 12):
    gen_kwargs.update(num_beams=num_beams)
    # Generate predictions, then score them against all reference counters.
    preds_df = create_predictions_df(
        join_model,
        join_model_tokenizer,
        valid_ds,
        gen_kwargs,
        premises_clm='post',
        conclusion_clm=None,
    )
    preds_df_scores = get_evaluation_results(preds_df, ceph_dir + data_path)
    scores += [[num_beams, preds_df_scores]]

  0%|          | 0/1 [00:00<?, ?ba/s]

Testing stance on only 999 posts


ProConClient: 100%|██████████| 999/999 [00:18<00:00, 52.87it/s]


  0%|          | 0/1 [00:00<?, ?ba/s]

Testing stance on only 999 posts


ProConClient: 100%|██████████| 999/999 [00:18<00:00, 54.02it/s]


  0%|          | 0/1 [00:00<?, ?ba/s]

Testing stance on only 999 posts


ProConClient: 100%|██████████| 999/999 [00:18<00:00, 54.00it/s]


  0%|          | 0/1 [00:00<?, ?ba/s]

Testing stance on only 999 posts


ProConClient: 100%|██████████| 999/999 [00:18<00:00, 54.90it/s]


In [15]:
# One row per beam size: BLEU and BERT F-score rounded to two decimals, stance score as stored.
res_table = tabulate(
    [
        (setting, round(metrics['bleu'], 2), round(metrics['bert-fscore'], 2), metrics['stance_score'])
        for setting, metrics in scores
    ],
    headers=['num_beams', 'bleu', 'bert-f1score', 'stance-score (diff)'],
)

print(res_table)

  num_beams    bleu    bert-f1score    stance-score (diff)
-----------  ------  --------------  ---------------------
          1    0.01            0.03                   0.82
          4    0.02            0.06                   0.84
          8    0.02            0.06                   0.83
         12    0.02            0.06                   0.83


### Studying the effect of top-p (nucleus sampling):

In [15]:
# Decoding settings for the top-p sweep: sampling is enabled with a single
# beam, and top_p is overwritten by each sweep iteration in the cells below.
gen_kwargs = dict(
    do_sample=True,
    max_length=100,
    top_p=0.95,
    num_beams=1,
)

In [20]:
# top-p sweep for the known-conclusion model (title is used as the conclusion).
scores1 = []
for p in (0.85, 0.90, 0.95, 1.0):
    gen_kwargs.update(top_p=p)
    # Generate predictions, then score them against all reference counters.
    preds_df = create_predictions_df(
        known_conclusion_model,
        known_conclusion_tokenizer,
        valid_ds,
        gen_kwargs,
        premises_clm='post',
        conclusion_clm='title',
    )
    preds_df_scores = get_evaluation_results(preds_df, ceph_dir + data_path)
    scores1 += [[p, preds_df_scores]]

  0%|          | 0/1 [00:00<?, ?ba/s]

Testing stance on only 500 posts


ProConClient: 100%|██████████| 500/500 [00:09<00:00, 51.71it/s]


  0%|          | 0/1 [00:00<?, ?ba/s]

Testing stance on only 500 posts


ProConClient: 100%|██████████| 500/500 [00:10<00:00, 47.30it/s]


  0%|          | 0/1 [00:00<?, ?ba/s]

Testing stance on only 500 posts


ProConClient: 100%|██████████| 500/500 [00:10<00:00, 48.23it/s]


  0%|          | 0/1 [00:00<?, ?ba/s]

Testing stance on only 500 posts


ProConClient: 100%|██████████| 500/500 [00:11<00:00, 44.61it/s]


In [21]:
# One row per top_p value: BLEU and BERT F-score rounded to two decimals, stance score as stored.
res_table = tabulate(
    [
        (setting, round(metrics['bleu'], 2), round(metrics['bert-fscore'], 2), metrics['stance_score'])
        for setting, metrics in scores1
    ],
    headers=['p', 'bleu', 'bert-f1score', 'stance-score (diff)'],
)

print(res_table)

   p    bleu    bert-f1score    stance-score (diff)
----  ------  --------------  ---------------------
0.85    0.11            0.11                   0.71
0.9     0.1             0.1                    0.67
0.95    0.09            0.1                    0.66
1       0.08            0.09                   0.68


---------

In [22]:
# top-p sweep for the predicted-conclusion model (no conclusion column is given).
scores2 = []
for p in (0.85, 0.90, 0.95, 1.0):
    gen_kwargs.update(top_p=p)
    # Generate predictions, then score them against all reference counters.
    preds_df = create_predictions_df(
        pred_conclusion_model,
        pred_conclusion_tokenizer,
        valid_ds,
        gen_kwargs,
        premises_clm='post',
        conclusion_clm=None,
    )
    preds_df_scores = get_evaluation_results(preds_df, ceph_dir + data_path)
    scores2 += [[p, preds_df_scores]]

  0%|          | 0/1 [00:00<?, ?ba/s]

Testing stance on only 500 posts


ProConClient: 100%|██████████| 500/500 [00:09<00:00, 52.39it/s]


  0%|          | 0/1 [00:00<?, ?ba/s]

Testing stance on only 500 posts


ProConClient: 100%|██████████| 500/500 [00:09<00:00, 52.67it/s]


  0%|          | 0/1 [00:00<?, ?ba/s]

Testing stance on only 500 posts


ProConClient: 100%|██████████| 500/500 [00:09<00:00, 52.95it/s]


  0%|          | 0/1 [00:00<?, ?ba/s]

Testing stance on only 500 posts


ProConClient: 100%|██████████| 500/500 [00:09<00:00, 50.37it/s]


In [23]:
# One row per top_p value: BLEU and BERT F-score rounded to two decimals, stance score as stored.
res_table = tabulate(
    [
        (setting, round(metrics['bleu'], 2), round(metrics['bert-fscore'], 2), metrics['stance_score'])
        for setting, metrics in scores2
    ],
    headers=['p', 'bleu', 'bert-f1score', 'stance-score (diff)'],
)
print(res_table)

   p    bleu    bert-f1score    stance-score (diff)
----  ------  --------------  ---------------------
0.85    0.09            0.08                   0.73
0.9     0.08            0.08                   0.7
0.95    0.08            0.08                   0.72
1       0.07            0.07                   0.7


--------

In [24]:
# top-p sweep for the joint (multi-task) model (no conclusion column is given).
scores3 = []
for p in (0.85, 0.90, 0.95, 1.0):
    gen_kwargs.update(top_p=p)
    # Generate predictions, then score them against all reference counters.
    preds_df = create_predictions_df(
        join_model,
        join_model_tokenizer,
        valid_ds,
        gen_kwargs,
        premises_clm='post',
        conclusion_clm=None,
    )
    preds_df_scores = get_evaluation_results(preds_df, ceph_dir + data_path)
    scores3 += [[p, preds_df_scores]]

# One row per top_p value: BLEU and BERT F-score rounded to two decimals, stance score as stored.
res_table = tabulate(
    [
        (setting, round(metrics['bleu'], 2), round(metrics['bert-fscore'], 2), metrics['stance_score'])
        for setting, metrics in scores3
    ],
    headers=['p', 'bleu', 'bert-f1score', 'stance-score (diff)'],
)
print(res_table)

  0%|          | 0/1 [00:00<?, ?ba/s]

Testing stance on only 500 posts


ProConClient: 100%|██████████| 500/500 [00:09<00:00, 52.69it/s]


  0%|          | 0/1 [00:00<?, ?ba/s]

Testing stance on only 500 posts


ProConClient: 100%|██████████| 500/500 [00:11<00:00, 43.67it/s]


  0%|          | 0/1 [00:00<?, ?ba/s]

Testing stance on only 500 posts


ProConClient: 100%|██████████| 500/500 [00:11<00:00, 43.78it/s]


  0%|          | 0/1 [00:00<?, ?ba/s]

Testing stance on only 500 posts


ProConClient: 100%|██████████| 500/500 [00:09<00:00, 51.62it/s]

   p    bleu    bert-f1score    stance-score (diff)
----  ------  --------------  ---------------------
0.85    0.1             0.1                    0.73
0.9     0.09            0.09                   0.67
0.95    0.08            0.09                   0.74
1       0.07            0.08                   0.72



