In [3]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

In [12]:
tokenizer = AutoTokenizer.from_pretrained("../models/conclugen-bart-large-all/")
model = AutoModelForSeq2SeqLM.from_pretrained("../models/conclugen-bart-large-all/")
conclugen_pipeline = pipeline("summarization", tokenizer=tokenizer, model=model)

In [4]:
data = pd.read_csv("../data/model_predictions.csv", keep_default_na=False)

In [13]:
data.count()

conclusion             523
premises               523
argument               523
gt                     523
masked_conc_attacks    523
known_conc_attacks     523
dtype: int64

In [8]:
def truncate_text(text, remove_extra_tokens=0):
        for i in range(3):
            tokens = tokenizer(
                text, return_tensors="pt", truncation=True
            ).input_ids
            max_model_length = tokens.size()[1]
            truncated_tokens = tokens[0][: max_model_length - remove_extra_tokens]
            text = tokenizer.decode(
                truncated_tokens, clean_up_tokenization_spaces=True
            )
            without_truncate_length = tokenizer(
                text, return_tensors="pt"
            ).input_ids.size()[1]
            if max_model_length > without_truncate_length:
                return tokens, text
        return truncate_text(text, remove_extra_tokens=remove_extra_tokens + 5)

In [21]:
sample = data['premises'].tolist()[0]
sample

"as a non american but one interested in politics, i've seen many discussions about american politics that have talked about the level of anger between the two different parties and they typically blame the two equally. a typical complaint from republicans is that they're treated like they're racist, misogynistic, bigoted, ignorant, selfish and or foolish. from their lies over actual policy issues like health their embrace of white identity politics republicans seem far more interested in cultural issues like nfl protests and sexual assault allegations than issues like the economy and america's standing in the world. their support of trump, roy moore and brett kavanaugh suggests they view improved treatment respect for women as something to be actively fought and a danger to men's justified dominance. their concerns over the deficit rapidly vanished once trump was elected suggesting the tea party movement was as much about race as anything. their support of tax cuts suggests they're mo

In [22]:
tokens, text = truncate_text(sample)
text

"<s><s><s><s>as a non american but one interested in politics, i've seen many discussions about american politics that have talked about the level of anger between the two different parties and they typically blame the two equally. a typical complaint from republicans is that they're treated like they're racist, misogynistic, bigoted, ignorant, selfish and or foolish. from their lies over actual policy issues like health their embrace of white identity politics republicans seem far more interested in cultural issues like nfl protests and sexual assault allegations than issues like the economy and america's standing in the world. their support of trump, roy moore and brett kavanaugh suggests they view improved treatment respect for women as something to be actively fought and a danger to men's justified dominance. their concerns over the deficit rapidly vanished once trump was elected suggesting the tea party movement was as much about race as anything. their support of tax cuts suggest

In [23]:
conclugen = conclugen_pipeline(text, clean_up_tokenization_spaces=True)
conclusion = conclugen[0]['summary_text']
conclusion

'the republicans are just as bad as the tea party.'

In [24]:
def generate_conclusion(row):
    premises = row['premises']
    tokens, processed_premises = truncate_text(premises)
    conclugen = conclugen_pipeline(processed_premises, clean_up_tokenization_spaces=True)
    conclusion = conclugen[0]['summary_text']
    row['baseline_conclugen'] = conclusion.strip()
    return row

    

In [25]:
conclugen_df = data.progress_apply(generate_conclusion, axis=1)

  3%|▎         | 16/523 [01:03<36:17,  4.30s/it]Your max_length is set to 62, but you input_length is only 24. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
  4%|▍         | 23/523 [01:28<32:24,  3.89s/it]Your max_length is set to 62, but you input_length is only 42. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
  6%|▌         | 29/523 [01:59<42:39,  5.18s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (1026 > 1024). Running this sequence through the model will result in indexing errors
  6%|▌         | 32/523 [02:20<48:14,  5.90s/it]Your max_length is set to 62, but you input_length is only 43. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
  8%|▊         | 44/523 [03:17<43:10,  5.41s/it]Your max_length is set to 62, but you input_length is only 9. You might consider decreasing max_length manually, e.g. su

In [26]:
conclugen_df.head()

Unnamed: 0,conclusion,premises,argument,gt,masked_conc_attacks,known_conc_attacks,baseline_conclugen
0,The anger many liberal Americans feel towards...,as a non american but one interested in politi...,The anger many liberal Americans feel towards...,the ultimate thing i can tell you is to always...,i'm not sure what you mean by 'bigoted' or 'bi...,"i agree with you that the anger is justified, ...",the republicans are just as bad as the tea party.
1,Killing an innocent life is wrong,it stemmed from a pro life statement i made. t...,Killing an innocent life is wrong : it stemme...,if i had to the power to go back in time and k...,i'm not sure what you mean by 'rationalized to...,"i'm not sure what you're trying to say here, b...",i think it's wrong to pull the lever because s...
2,High schools should not have credit requirements,according to the national research council and...,High schools should not have credit requireme...,"let's use my own undergraduate school, the uni...",i'm not sure what you mean by 'well rounded.'i...,i don't think it's fair to say that high schoo...,Students should be required to take more class...
3,god existing would make sense,the two biggest problems with people believing...,god existing would make sense : the two bigge...,i am in a bit of an interesting position to an...,i don't think there is any reason to believe t...,i don't think it would make sense for god to e...,I believe that god exists and that he is respo...
4,A pill to stop gender dysphoria would be obje...,this seemed to offend people and i don't get w...,A pill to stop gender dysphoria would be obje...,yeh the thing is i wouldn't want to take such ...,i don't think it's fair to mandate people to t...,i don't think it would be objectively a better...,i don't think people with gender dysphoria sho...


In [27]:
premises = conclugen_df['premises'].tolist()
references = conclugen_df['conclusion'].tolist()
baseline_conclugen_predictions = conclugen_df['baseline_conclugen'].tolist()

In [30]:
with open('../data/baseline_conclugen_predictions.txt','w', encoding='utf-8') as outf:
    for line in baseline_conclugen_predictions:
        outf.write(line)
        outf.write("\n")