In [1]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

In [2]:
df = pd.read_csv("../data/model_predictions.csv", keep_default_na=False)

In [3]:
df = df.dropna()

In [4]:
df.count()

conclusion             523
premises               523
argument               523
auto_conc              523
gt                     523
masked_conc_attacks    523
known_conc_attacks     523
auto_conc_attacks      523
dtype: int64

In [5]:
df = df[df['gt']!=""]

In [6]:
df.count()

conclusion             522
premises               522
argument               522
auto_conc              522
gt                     522
masked_conc_attacks    522
known_conc_attacks     522
auto_conc_attacks      522
dtype: int64

In [7]:
known_conc_attacks = df['known_conc_attacks'].tolist()
masked_conc_attacks = df['masked_conc_attacks'].tolist()
auto_conc_attacks = df['auto_conc_attacks'].tolist()
refs = df['gt'].tolist()

In [9]:
premises = df['argument'].tolist()

In [14]:
with open("../data_cg/references.txt","w", encoding='utf-8') as outf:
    for line in refs:
        outf.write(line)
        outf.write("\n")

In [12]:
tokenizer = AutoTokenizer.from_pretrained("../models/conclugen-bart-large-all/")
model = AutoModelForSeq2SeqLM.from_pretrained("../models/conclugen-bart-large-all/")
conclugen_pipeline = pipeline("summarization", tokenizer=tokenizer, model=model)

In [4]:
data = pd.read_csv("../data/model_predictions.csv", keep_default_na=False)

In [8]:
def truncate_text(text, remove_extra_tokens=0):
        for i in range(3):
            tokens = tokenizer(
                text, return_tensors="pt", truncation=True
            ).input_ids
            max_model_length = tokens.size()[1]
            truncated_tokens = tokens[0][: max_model_length - remove_extra_tokens]
            text = tokenizer.decode(
                truncated_tokens, clean_up_tokenization_spaces=True
            )
            without_truncate_length = tokenizer(
                text, return_tensors="pt"
            ).input_ids.size()[1]
            if max_model_length > without_truncate_length:
                return tokens, text
        return truncate_text(text, remove_extra_tokens=remove_extra_tokens + 5)

In [24]:
def generate_conclusion(row):
    premises = row['premises']
    tokens, processed_premises = truncate_text(premises)
    conclugen = conclugen_pipeline(processed_premises, clean_up_tokenization_spaces=True)
    conclusion = conclugen[0]['summary_text']
    row['baseline_conclugen'] = conclusion.strip()
    return row

    

In [25]:
conclugen_df = data.progress_apply(generate_conclusion, axis=1)

  3%|▎         | 16/523 [01:03<36:17,  4.30s/it]Your max_length is set to 62, but you input_length is only 24. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
  4%|▍         | 23/523 [01:28<32:24,  3.89s/it]Your max_length is set to 62, but you input_length is only 42. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
  6%|▌         | 29/523 [01:59<42:39,  5.18s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (1026 > 1024). Running this sequence through the model will result in indexing errors
  6%|▌         | 32/523 [02:20<48:14,  5.90s/it]Your max_length is set to 62, but you input_length is only 43. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
  8%|▊         | 44/523 [03:17<43:10,  5.41s/it]Your max_length is set to 62, but you input_length is only 9. You might consider decreasing max_length manually, e.g. su

In [27]:
premises = conclugen_df['premises'].tolist()
references = conclugen_df['conclusion'].tolist()
baseline_conclugen_predictions = conclugen_df['baseline_conclugen'].tolist()

In [30]:
with open('../data/baseline_conclugen_predictions.txt','w', encoding='utf-8') as outf:
    for line in baseline_conclugen_predictions:
        outf.write(line)
        outf.write("\n")

In [31]:
conclugen_df.to_csv("../data/conclugen_predictions.csv", index=False)

In [32]:
se_df = pd.read_csv("../data/conc_comprehension_experiment.csv",keep_default_na=False)

In [33]:
se_df.count()

Unnamed: 0             523
argument               523
gt                     523
masked_conc_attacks    523
known_conc_attacks     523
dtype: int64

In [34]:
articles = se_df['argument'].tolist()
references = se_df['gt'].tolist()
masked_conc_attacks = se_df['masked_conc_attacks'].tolist()
known_conc_attacks = se_df['known_conc_attacks'].tolist()

In [38]:
with open("../data/known_conc.txt",'w', encoding='utf-8') as outf:
    for line in known_conc_attacks:
        outf.write(line)
        outf.write("\n")