In [1]:
from transformers import AutoModelForSeq2SeqLM, pipeline, AutoTokenizer
from tqdm.autonotebook import tqdm
import re
import pandas as pd
# Register tqdm with pandas so that `progress_apply` (used further down) is available.
tqdm.pandas()

# Show full cell contents when displaying DataFrames instead of a shortened preview.
pd.set_option('display.max_colwidth', None)

In [2]:
# Summarization pipeline; the model and its tokenizer are both named by the
# same "facebook/bart-large-cnn" checkpoint.
summarizer = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
    tokenizer="facebook/bart-large-cnn",
)

In [3]:
# Standalone tokenizer for the same checkpoint as the pipeline; truncate_text_and_tokenize
# uses it to shorten inputs before they are handed to the summarizer.
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

In [4]:
def truncate_text_and_tokenize(text, max_tokens=1024, reserved_tokens=3):
    """Tokenize ``text`` with the module-level ``tokenizer`` and cap its length.

    The text is encoded with truncation to at most ``max_tokens`` ids. When the
    encoding fills that whole budget, the trailing ``reserved_tokens`` ids are
    dropped as well, leaving headroom for when the decoded text is tokenized
    again downstream.

    Args:
        text: Raw input string.
        max_tokens: Upper bound on the number of ids kept by the tokenizer.
            Passed explicitly so truncation does not depend on whatever
            maximum length the loaded tokenizer happens to advertise.
        reserved_tokens: Number of ids removed from the end of an encoding
            that reached ``max_tokens``.

    Returns:
        A tuple ``(token_ids, decoded_text)``: the 1-D tensor of kept ids
        (exactly as produced by the tokenizer, minus any trimmed tail) and the
        plain text decoded from them.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_tokens,
    ).input_ids
    kept = encoded[0]
    num_tokens = kept.size(0)
    if num_tokens >= max_tokens:
        kept = kept[:num_tokens - reserved_tokens]
    # Decode WITHOUT special-token markers: summarize_argument passes this string
    # to the summarization pipeline, which tokenizes it again. Literal "<s>"/"</s>"
    # in the text would presumably be re-read as special tokens and end up
    # duplicated in the model input.
    decoded_text = tokenizer.decode(
        kept,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    return (kept, decoded_text)

In [13]:
# Load the neutralized arguments: a tab-separated file without a header row,
# whose two fields are named explicitly right after reading.
# Alternative input (original arguments sample):
#df = pd.read_csv("../data/inappropriate_arguments_sample_100_argsme.csv")
df = pd.read_csv(
    "../data/results-by-corpus/argsme/neutralization/llama_ppo_rewrite_argsme_llama-7b-harmonic-mean-10a-00ss.csv",
    sep="\t",
    header=None,
)
df.columns = ["id", "neutralized_argument"]


In [12]:
# Word counter; a "word" is a maximal run of \w characters. No minimum length is enforced here.
def get_word_count(text):
    """Return the number of word-character runs found in ``text``."""
    return sum(1 for _ in re.finditer(r'\w+', text))

In [14]:
# Count the words of every argument (with a progress bar); summarize_argument
# later derives the summary length bounds from this column.
#df['word_count'] = df['argument'].progress_apply(get_word_count)
df['word_count'] = df['neutralized_argument'].progress_apply(get_word_count)

  0%|          | 0/99 [00:00<?, ?it/s]

In [15]:
def summarize_argument(text, word_count, min_ratio=0.25, max_ratio=0.35, max_summary_length=1024):
    """Summarize ``text`` with the module-level ``summarizer`` pipeline.

    The text is first shortened by ``truncate_text_and_tokenize``. The summary
    length bounds are then derived from ``word_count``: ``min_ratio`` and
    ``max_ratio`` of it (25% and 35% by default), truncated to integers.

    Args:
        text: The argument to summarize.
        word_count: Word count of ``text``, used to scale the length bounds.
        min_ratio: Fraction of ``word_count`` used as ``min_length``.
        max_ratio: Fraction of ``word_count`` used as ``max_length``.
        max_summary_length: Hard cap applied to both bounds.

    Returns:
        The ``summary_text`` of the first pipeline result.
    """
    # Only the decoded, truncated text is needed; the token ids are discarded.
    _, truncated_text = truncate_text_and_tokenize(text)

    # word_count describes the full text, but the model only sees the truncated
    # version, so for very long arguments the raw bounds can grow without limit.
    # Cap them: this assumes the model cannot generate more than 1024 positions
    # (the limit this file already uses when truncating inputs) — TODO confirm
    # if a different checkpoint is used.
    # NOTE(review): the pipeline's min_length/max_length are presumably counted in
    # tokens, whereas word_count counts words — the ratios are therefore approximate.
    max_length = min(int(word_count * max_ratio), max_summary_length)
    min_length = min(int(word_count * min_ratio), max_length)

    results = summarizer(
        truncated_text,
        min_length=min_length,
        max_length=max_length,
        clean_up_tokenization_spaces=True,
    )
    return results[0]['summary_text']

In [16]:
def apply_summarization(row):
    """Attach a summary to a single DataFrame row.

    Reads the row's 'neutralized_argument' text and its 'word_count' (cast to
    int), summarizes the text via ``summarize_argument``, stores the result on
    the row under 'bart_gist', and returns the row.
    """
    # To summarize the original (non-neutralized) text instead, read row['argument'].
    row['bart_gist'] = summarize_argument(
        row['neutralized_argument'],
        int(row['word_count']),
    )
    return row

In [17]:
# Summarize row by row (axis=1) with a progress bar; each returned row carries
# the extra 'bart_gist' entry set by apply_summarization.
s_df = df.progress_apply(apply_summarization, axis=1)

  0%|          | 0/99 [00:00<?, ?it/s]

In [18]:
# Inspect the column names of the summarized frame.
s_df.columns

Index(['id', 'neutralized_argument', 'word_count', 'bart_gist'], dtype='object')

In [19]:
# Persist the neutralized arguments together with their summaries.
# NOTE(review): `index` is left at its default, so the row index is presumably written
# as an extra leading column — confirm that downstream readers expect it.
s_df.to_csv("../data/results-by-corpus/argsme/both/bart_summarized_and_neutralized.csv")