In [1]:
%load_ext autoreload

In [2]:
import os
import sys
from scipy import stats
import re
import random
from argparse import Namespace

# Make the project's source directory importable.
# NOTE(review): presumably this is where `utils` / `ca_utils` (imported in a later cell) live — confirm.
sys.path.append('../src-py/')

# Restrict CUDA to device 0. This is set here, before `torch` is imported further down.
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [10]:
%autoreload

from utils import *
from ca_utils import *

2022-08-10 16:22:29,919 loading file ../../data-ceph/arguana/arg-generation/claim-target-tagger/model/final-model.pt
2022-08-10 16:22:57,845 SequenceTagger predicts: Dictionary with 4 tags: <unk>, B-CT, I-CT, O


In [4]:
import pickle
import torch
import json

import nltk
import numpy as np
import pandas as pd

from pathlib import Path
from tabulate import tabulate
pd.set_option('display.max_colwidth', None)

import matplotlib.pyplot as plt

In [5]:
from datasets import load_dataset, load_metric, Dataset

In [6]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

## General:

In [7]:
# Root directories. `ceph_dir` is prefixed to the data paths below when the test
# data is loaded; `local_home_dir` is prefixed to the model checkpoint paths.
# NOTE(review): `ceph_dir` is an absolute, user-specific path — adjust per machine.
ceph_dir = '/home/sile2804/data-ceph/arguana/arg-generation/multi-taks-counter-argument-generation'
local_home_dir = '../data'

# Pickled test data, relative to `ceph_dir` (note the leading slash: the paths are
# joined by plain string concatenation).
# NOTE(review): `data_path` is not referenced in the cells visible here.
data_unique_path = '/reddit_data/conclusion_and_ca_generation/test_conclusion_all_preprocessed.pkl'
data_path = '/reddit_data/conclusion_and_ca_generation/test_conclusion_all.pkl'

In [11]:
def generate_ds_attacks(ds, model, tokenizer, premises_clm, conclusion_clm, gen_kwargs, skip_special_tokens=True, batch_size=5):
    """Generate one counter-argument ("attack") per example in ``ds``.

    The dataset is encoded with ``preprocess_function`` (target column
    ``'counter'``), narrowed to the tensor columns, batched through a
    ``DataLoader`` and passed to ``generate_counters``.

    Args:
        ds: Dataset exposing ``map`` and ``set_format``
            (presumably a HuggingFace ``datasets.Dataset`` — confirm).
        model: Generation model handed to ``generate_counters``.
        tokenizer: Tokenizer used for both encoding and generation.
        premises_clm: Name of the column holding the premises.
        conclusion_clm: Name of the column holding the conclusion, or ``None``.
        gen_kwargs: Generation settings forwarded to ``generate_counters``.
        skip_special_tokens: Forwarded to ``generate_counters``.
        batch_size: Number of examples per ``DataLoader`` batch.

    Returns:
        Whatever ``generate_counters`` returns for the encoded dataset.
    """
    def _encode(batch):
        return preprocess_function(batch, tokenizer, premises_clm, 'counter', conclusion_clm=conclusion_clm)

    encoded_ds = ds.map(_encode, batched=True)
    # Only these columns are exposed (as torch tensors) to the loader.
    encoded_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    loader = torch.utils.data.DataLoader(encoded_ds, batch_size=batch_size)

    return generate_counters(model, tokenizer, loader, gen_kwargs, skip_special_tokens=skip_special_tokens)

def create_predictions_df(reddit_sample_valid_ds, gen_kwargs, premises_clm='masked_premises'):
    """Generate counter-arguments with every approach and collect them in one frame.

    Four sets of counters are generated for the same dataset:

    * ``known_conc_attacks``  - known-conclusion model, conclusion taken from ``'title'``
    * ``bart_conc_attacks``   - known-conclusion model, conclusion taken from ``'bart_conclusion'``
    * ``masked_conc_attacks`` - known-conclusion model, no conclusion column
    * ``joint_conc_baseline_attacks`` - joint model that emits a conclusion
      followed by ``<counter>`` and the counter; its output is split into
      ``joint_conc_baseline`` (conclusion) and ``joint_conc_baseline_attacks``.

    Relies on the notebook-level globals ``known_conclusion_model``,
    ``known_conclusion_tokenizer``, ``pred_conclusion_model`` and
    ``pred_conclusion_tokenizer``.

    Args:
        reddit_sample_valid_ds: Dataset providing the ``post_id``, ``title``,
            ``conclusion_targets``, ``conclusion_stance``, ``bart_conclusion``,
            ``counter`` and ``premises_clm`` columns.
        gen_kwargs: Generation settings; must contain ``'max_length'``.
            The dict is NOT modified (the joint model gets its own copy
            with ``max_length`` raised by 50).
        premises_clm: Name of the column holding the premises.

    Returns:
        ``pandas.DataFrame`` with one row per example.
    """
    known_conc_attacks  = generate_ds_attacks(reddit_sample_valid_ds, known_conclusion_model, known_conclusion_tokenizer, premises_clm, 'title', gen_kwargs)
    bart_conc_attacks   = generate_ds_attacks(reddit_sample_valid_ds, known_conclusion_model, known_conclusion_tokenizer, premises_clm, 'bart_conclusion', gen_kwargs)
    masked_conc_attacks = generate_ds_attacks(reddit_sample_valid_ds, known_conclusion_model, known_conclusion_tokenizer, premises_clm, None, gen_kwargs)

    # The joint model also has to emit the conclusion, so it gets a longer budget.
    # Work on a copy: mutating the caller's dict would permanently change their
    # settings and compound by +50 on every further call.
    joint_gen_kwargs = dict(gen_kwargs)
    joint_gen_kwargs['max_length'] = joint_gen_kwargs['max_length'] + 50
    # Special tokens are kept so the '<counter>' separator survives decoding.
    joint_conc_baseline_attacks  = generate_ds_attacks(reddit_sample_valid_ds, pred_conclusion_model, pred_conclusion_tokenizer, premises_clm, None, joint_gen_kwargs, skip_special_tokens=False)

    reddit_pred_df = pd.DataFrame(list(zip(
                                           reddit_sample_valid_ds['post_id'],
                                           reddit_sample_valid_ds['title'],
                                           reddit_sample_valid_ds['conclusion_targets'],
                                           reddit_sample_valid_ds['conclusion_stance'],
                                           reddit_sample_valid_ds['bart_conclusion'],
                                           reddit_sample_valid_ds[premises_clm],
                                           reddit_sample_valid_ds['counter'],
                                           known_conc_attacks, masked_conc_attacks,
                                           bart_conc_attacks, joint_conc_baseline_attacks)),
                    columns=['post_id', 'conclusion', 'conclusion_target', 'conclusion_stance', 'bart_conclusion',
                             'premises', 'gt_attack', 'known_conc_attacks', 'masked_conc_attacks',
                             'bart_conc_attacks',  'joint_conc_baseline_attacks'])

    # NOTE(review): ' '.join assumes each premises entry is a list of strings — confirm for the column used.
    reddit_pred_df['argument'] = reddit_pred_df.apply(lambda row: row['conclusion'] + ' : ' + ' '.join(row['premises']), axis=1)
    reddit_pred_df['premises'] = reddit_pred_df['premises'].apply(lambda x: ' '.join(x))

    # Split the jointly generated text at the '<counter>' marker: the part before it
    # is the conclusion; the part after it is the counter (the whole text is kept as
    # the counter when the marker is missing).
    raw_joint_output = reddit_pred_df['joint_conc_baseline_attacks']
    joint_conclusions = raw_joint_output.apply(lambda x: x.split('<counter>')[0])
    joint_counters = raw_joint_output.apply(lambda x: x.split('<counter>')[1] if '<counter>' in x else x)

    # Strip the remaining special tokens from both parts.
    special_tokens_pattern = '<s>|</s>|<conclusion>|<counter>|<pad>'
    reddit_pred_df['joint_conc_baseline'] = joint_conclusions.apply(lambda x: re.sub(special_tokens_pattern, '', x).strip())
    reddit_pred_df['joint_conc_baseline_attacks'] = joint_counters.apply(lambda x: re.sub(special_tokens_pattern, '', x).strip())

    return reddit_pred_df

----------------------

## Generated Predictions:

In [12]:
# Model (and matching tokenizer) that generates a counter from premises plus a conclusion.
known_conclusion_model = BartForConditionalGeneration.from_pretrained(
    local_home_dir + '/output/ca-final-models/known-conc-model/checkpoint-9500'
).to(device)
known_conclusion_tokenizer = BartTokenizer.from_pretrained(
    local_home_dir + '/output/ca-final-models/known-conc-model/checkpoint-9500'
)

# Model (and matching tokenizer) used for the joint conclusion + counter baseline.
pred_conclusion_model = BartForConditionalGeneration.from_pretrained(
    local_home_dir + '/output/ca-final-models/pred-conc-model'
).to(device)
pred_conclusion_tokenizer = BartTokenizer.from_pretrained(
    local_home_dir + '/output/ca-final-models/pred-conc-model'
)

In [13]:
# Load the preprocessed posts. Despite the `valid_` prefix, the file name in
# `data_unique_path` refers to the *test* split.
valid_df = pd.read_pickle(ceph_dir + data_unique_path)

In [14]:
# Create a dataset.
# NOTE(review): the message reports the size of the full frame, but only 10 randomly
# sampled rows (no random_state, so a different subset on every run) are turned into
# the dataset — confirm whether the sample is a leftover debugging shortcut.
print('Testing on {} posts'.format(len(valid_df)))
valid_ds = Dataset.from_pandas(valid_df.sample(10))
# Materialise the sampled rows so the dataset no longer carries an indices mapping.
valid_ds = valid_ds.flatten_indices()

Testing on 8533 posts


Flattening the indices:   0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
# Generation settings for the counters ("best parameters" per the original author).
# The dict is handed to create_predictions_df below.
# NOTE(review): presumably forwarded to the model's `generate` call inside
# `generate_counters` — confirm in utils.
gen_kwargs = {
    "do_sample": True, 
    "max_length":100,
    "min_length":50,
    "top_k": 50,
    "no_repeat_ngram_size":3,
    "top_p":0.95, 
    "num_beams":4
}

# Generate predictions for every approach, reading the premises from the 'post' column.
reddit_pred_df = create_predictions_df(valid_ds, gen_kwargs, premises_clm='post')

In [None]:
# Persist the generated predictions.
reddit_pred_df.to_pickle('../data/output/test_all_reddit_pred_test_with_sampling_4beam_df.pkl')

In [None]:
# Load the automatic-evaluation results analysed in the rest of the notebook.
# NOTE(review): this is a different file from the predictions pickle written above;
# presumably it is produced by a separate evaluation step — confirm.
eval_df = pd.read_pickle('../data/output/automatic_evaluation_results.pkl')

In [None]:
# Argument length, measured in whitespace-separated tokens.
eval_df['arg_len'] = eval_df['argument'].map(lambda text: len(text.split()))

In [None]:
# NOTE(review): `split_dataframe_per_conc_similarity` comes from the star imports;
# presumably it adds the `max_sim_to_conclusion` column used by the cells below — confirm.
# The cell rebinds `eval_df`, so re-running it applies the helper to its own output.
eval_df = split_dataframe_per_conc_similarity(eval_df)

In [None]:
# NOTE(review): `pred_clms` is only defined in a later cell (`pred_clms = {...}`), so this
# cell raises NameError on a fresh top-to-bottom run unless the star imports provide it.
# Flag each prediction whose stance has the opposite sign of the conclusion's stance.
for clm in pred_clms.values():
    stance_clm = '{}_stances'.format(clm)
    opposing_clm = '{}_opposing'.format(clm)
    eval_df[opposing_clm] = eval_df.apply(
        lambda row: 1 if row[stance_clm] * row['conclusion_stance'] < 0 else 0,
        axis=1,
    )

In [None]:
# Distribution of argument lengths (in tokens).
eval_df.arg_len.hist()

In [None]:
# Distribution of the `max_sim_to_conclusion` column.
eval_df.max_sim_to_conclusion.hist()

In [None]:
def analyze_effetiveness1(df, clm, measure, num_rows=10, num_clms=3):
    """Average a metric over a grid of (argument length x conclusion similarity) buckets.

    Rows of ``df`` are sorted by ``arg_len`` and divided into ``num_rows``
    near-equal buckets; each bucket is then sorted by
    ``max_sim_to_conclusion`` and divided into ``num_clms`` near-equal
    sub-buckets. Every row of ``df`` lands in exactly one cell: when the
    size is not evenly divisible, the leading buckets receive one extra row
    instead of the remainder being dropped.

    Args:
        df: Frame with ``arg_len``, ``max_sim_to_conclusion`` and
            ``'<clm>_<measure>'`` columns.
        clm: Prefix of the metric column (the approach's column name).
        measure: Suffix of the metric column.
        num_rows: Number of argument-length buckets.
        num_clms: Number of similarity buckets per length bucket.

    Returns:
        ``numpy.ndarray`` of shape ``(num_rows, num_clms)`` holding the mean
        metric of each cell rounded to 2 decimals (NaN for an empty cell).
    """
    metric_clm = '{}_{}'.format(clm, measure)
    df  = df.sort_values('arg_len')
    score_dist = np.zeros((num_rows, num_clms))
    # Split row *positions* rather than slicing with a truncated chunk size, which
    # silently discarded the last len(df) % num_rows (longest) arguments.
    for i, row_positions in enumerate(np.array_split(np.arange(len(df)), num_rows)):
        df_row = df.iloc[row_positions]
        chunk_arg_len = round(df_row['arg_len'].mean(), 2)
        print('Number of samples is {} with average argument length = {}'.format(len(df_row), chunk_arg_len))
        #now split per similarity
        df_row = df_row.sort_values('max_sim_to_conclusion')
        for j, clm_positions in enumerate(np.array_split(np.arange(len(df_row)), num_clms)):
            df_chunk = df_row.iloc[clm_positions]
            chunk_sim = round(df_chunk['max_sim_to_conclusion'].mean(), 2)
            print('Number of samples is {} with average similarity to conclusion = {}'.format(len(df_chunk), chunk_sim))
            score_dist[i,j] = round(df_chunk[metric_clm].mean(), 2)

    return score_dist

def analyze_effetiveness2(df, clm, measure, dimension='arg_len', num_buckets=5):
    """Average a metric over near-equal-sized buckets of one dimension.

    Rows of ``df`` are sorted by ``dimension`` and divided into
    ``num_buckets`` buckets. Every row is assigned to a bucket: when
    ``len(df)`` is not a multiple of ``num_buckets`` the leading buckets
    receive one extra row instead of the remainder being dropped.

    Args:
        df: Frame with the ``dimension`` and ``'<clm>_<measure>'`` columns.
        clm: Prefix of the metric column (the approach's column name).
        measure: Suffix of the metric column.
        dimension: Column to sort and bucket by.
        num_buckets: Number of buckets.

    Returns:
        List of ``(mean of dimension, mean of metric)`` tuples, one per
        bucket in ascending order of ``dimension``, both rounded to 2 decimals.
    """
    metric_clm = '{}_{}'.format(clm, measure)
    df  = df.sort_values(dimension)
    score_dist = []
    # Split row *positions* rather than slicing with a truncated chunk size, which
    # silently discarded the last len(df) % num_buckets (largest-valued) rows.
    for positions in np.array_split(np.arange(len(df)), num_buckets):
        df_chunk = df.iloc[positions]
        chunk_dim = round(df_chunk[dimension].mean(), 2)
        print('Number of samples is {} with average dimension length = {}'.format(len(df_chunk), chunk_dim))
        value = round(df_chunk[metric_clm].mean(), 2)
        score_dist.append((chunk_dim, value))

    return score_dist

def analyze_effetiveness3(df, clm, measure, sim_thresholds=[0, 0.4, 0.7, 1.0], len_thresholds=[0, 300, 500, 1000]):
    """Average a metric over a grid of fixed (argument length x similarity) bins.

    Consecutive threshold pairs define half-open bins ``[low, high)``, so a
    value equal to the final threshold (or outside the thresholds) is not
    counted in any bin.

    Args:
        df: Frame with ``arg_len``, ``max_sim_to_conclusion`` and
            ``'<clm>_<measure>'`` columns.
        clm: Prefix of the metric column (the approach's column name).
        measure: Suffix of the metric column.
        sim_thresholds: Ascending bin edges for ``max_sim_to_conclusion``.
        len_thresholds: Ascending bin edges for ``arg_len``.

    Returns:
        ``numpy.ndarray`` of shape
        ``(len(len_thresholds) - 1, len(sim_thresholds) - 1)`` with the mean
        metric per bin rounded to 3 decimals (NaN for an empty bin).
    """
    metric_clm = '{}_{}'.format(clm, measure)
    score_dist = np.zeros((len(len_thresholds) - 1, len(sim_thresholds) - 1))

    len_bins = zip(len_thresholds[:-1], len_thresholds[1:])
    for i, (len_low, len_high) in enumerate(len_bins):
        in_len_bin = df['arg_len'].ge(len_low) & df['arg_len'].lt(len_high)
        df_row = df[in_len_bin]
        chunk_arg_len = round(df_row['arg_len'].mean(), 2)
        print('Number of samples is {} with average argument length = {}'.format(len(df_row), chunk_arg_len))

        # second level: bin the length bucket by similarity to the conclusion
        sim_bins = zip(sim_thresholds[:-1], sim_thresholds[1:])
        for j, (sim_low, sim_high) in enumerate(sim_bins):
            similarity = df_row['max_sim_to_conclusion']
            df_chunk = df_row[similarity.ge(sim_low) & similarity.lt(sim_high)]
            chunk_sim = round(df_chunk['max_sim_to_conclusion'].mean(), 2)
            print('Number of samples is {} with average similarity to conclusion = {}'.format(len(df_chunk), chunk_sim))
            score_dist[i,j] = round(df_chunk[metric_clm].mean(), 3)

    return score_dist

In [None]:
# Approaches to compare: display label (used as the plot legend entry) -> column-name
# prefix in `eval_df` (metric columns are looked up as '<prefix>_<measure>').
# Commented-out entries are approaches currently excluded from the plots.
pred_clms = {  
#     'BART Conclusion': 'bart_conc_attacks',
#     'Joint Prediction': 'joint_conc_baseline_attacks',
#     'Multi Conclusions (pipeline prediction)': 'multi_counter_pipeline',
     'Multi Conclusions (joint prediction)': 'multi_counter_joint',
#     'Stance Based CAG (w/o stance)': 'single_pred_counter_arguments_no_stance',
    #'Stance Based CAG (M- w/o stance)': 'pred_counter_arguments_no_stance',
    'Known Conclusion': 'known_conc_attacks',
    'Masked Conclusion': 'masked_conc_attacks',
}

In [None]:
# Mean stance score per similarity-to-conclusion bucket, one series per approach.
score_dist = {
    clm_name: analyze_effetiveness2(eval_df, clm, 'our_stance_score', 'max_sim_to_conclusion', num_buckets=4)
    for clm_name, clm in pred_clms.items()
}

# Grouped bar chart: one group per bucket, one bar per approach.
fig = plt.figure(figsize=(15,10))
width = 0.2 #/len(score_dist)
for i, (app, app_scores) in enumerate(score_dist.items()):
    print(app, app_scores)
    xs, ys = zip(*app_scores)
    offset = i * width
    plt.bar([x + offset for x in range(len(xs))], ys, width=width, label=app)

# Tick labels are the bucket means of the last series plotted.
plt.xticks(range(len(xs)), xs)
plt.legend()
plt.savefig('./figures/conc_sim_to_stance_score_correlation.pdf')
plt.show()

In [None]:
# Mean 'bert' score per similarity-to-conclusion bucket, one series per approach.
score_dist = {
    clm_name: analyze_effetiveness2(eval_df, clm, 'bert', 'max_sim_to_conclusion', num_buckets=4)
    for clm_name, clm in pred_clms.items()
}

# Grouped bar chart: one group per bucket, one bar per approach.
fig = plt.figure(figsize=(15,10))
width = 0.2 #/len(score_dist)
for i, (app, app_scores) in enumerate(score_dist.items()):
    print(app, app_scores)
    xs, ys = zip(*app_scores)
    offset = i * width
    plt.bar([x + offset for x in range(len(xs))], ys, width=width, label=app)

# Tick labels are the bucket means of the last series plotted.
plt.xticks(range(len(xs)), xs)
plt.legend()
plt.savefig('./figures/conc_sim_to_bert_correlation.pdf')
plt.show()

In [None]:
# Mean stance score per argument-length bucket, one series per approach.
score_dist = {
    clm_name: analyze_effetiveness2(eval_df, clm, 'our_stance_score', 'arg_len', num_buckets=4)
    for clm_name, clm in pred_clms.items()
}

# Grouped bar chart: one group per bucket, one bar per approach.
fig = plt.figure(figsize=(15,10))
width = 0.2 #/len(score_dist)
for i, (app, app_scores) in enumerate(score_dist.items()):
    print(app, app_scores)
    xs, ys = zip(*app_scores)
    offset = i * width
    plt.bar([x + offset for x in range(len(xs))], ys, width=width, label=app)

# Tick labels are the bucket means of the last series plotted.
plt.xticks(range(len(xs)), xs)
plt.legend()
plt.savefig('./figures/arg_len_to_stance_score_correlation.pdf')
plt.show()

In [None]:
# Mean 'bert' score per argument-length bucket, one series per approach.
score_dist = {
    clm_name: analyze_effetiveness2(eval_df, clm, 'bert', 'arg_len', num_buckets=4)
    for clm_name, clm in pred_clms.items()
}

# Grouped bar chart: one group per bucket, one bar per approach.
fig = plt.figure(figsize=(15,10))
width = 0.2 #/len(score_dist)
for i, (app, app_scores) in enumerate(score_dist.items()):
    xs, ys = zip(*app_scores)
    print(app, app_scores)
    offset = i * width
    plt.bar([x + offset for x in range(len(xs))], ys, width=width, label=app)

# Tick labels are the bucket means of the last series plotted.
plt.xticks(range(len(xs)), xs)
plt.legend()
plt.savefig('./figures/arg_len_to_bert_correlation.pdf')
plt.show()

In [None]:
#score_dist = analyze_effetiveness2(eval_df, 'masked_conc_attacks', 'bleu', len_thresholds=[0, 300, 600, 900, 1000], sim_thresholds=[0, 1.1])

---------