In [None]:
!pip install transformers # for loading model
!pip install sentencepiece # for tokenization in some cases (requires runtime reload after installation in colab)

In [None]:
import pandas as pd

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

In [None]:
# Language of the dataset as an ISO 639-1 code.
# Both the abstracts and the short abstracts are expected to be in this language.
# Values used so far: 'en', 'nl', 'fr'.
lang = 'en'

# Per-language CSV holding the dataframe. Despite the name, this notebook
# reads the dataframe back from this path rather than writing to it.
output_path = f'/data/data_{lang}.csv'

In [None]:
# Read the saved per-language .csv file into a dataframe.
df = pd.read_csv(output_path)

# A CSV that was written together with its index comes back with an extra
# 'Unnamed: 0' column. Drop it when present; errors='ignore' keeps this cell
# from raising KeyError when the file was saved without the index.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

## T5 model

- [Model description on HuggingFace](https://huggingface.co/t5-large)
- [Paper](https://jmlr.org/papers/volume21/20-074/20-074.pdf)

In [None]:
# Load the T5 model and its tokenizer.
sum_model = AutoModelForSeq2SeqLM.from_pretrained("t5-large")
tokenizer = AutoTokenizer.from_pretrained("t5-large")

t5_summaries = []  # one generated summary per row of df, in row order

# Iterate over the column values directly instead of range(len(df)) + .loc[num]:
# .loc is label-based and only matches row positions for a default 0..n-1 index.
for text in df['abstract']:
    # Prefix the abstract with the "summarize: " task prompt and truncate the
    # tokenized input to at most 512 tokens.
    inputs = tokenizer("summarize: " + text, return_tensors="pt", max_length=512, truncation=True)

    # Beam-search generation of a summary between 40 and 100 tokens long.
    outputs = sum_model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=100,
        min_length=40,
        length_penalty=4.0,
        num_beams=4,
        early_stopping=True,
    )

    # Let the tokenizer remove its special tokens instead of cutting a fixed
    # number of characters ([6:-4]) from the decoded string, which silently
    # corrupts the text whenever the surrounding markers differ in length.
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
    t5_summaries.append(summary)

# Add the generated summaries as a column in the dataframe.
df['T5'] = t5_summaries

## Multilingual BART

- [Model description on HuggingFace](https://huggingface.co/facebook/mbart-large-50)
- [Paper](https://arxiv.org/abs/2008.00401)

In [None]:
# Load multilingual BART (mBART-50) and its tokenizer.
# The mBART-50 tokenizer tags every input with a source-language code and
# falls back to English when none is given, so map the notebook's ISO 639-1
# `lang` onto the matching mBART-50 code. Unknown values keep the previous
# behaviour (the English default).
mbart_lang_codes = {'en': 'en_XX', 'nl': 'nl_XX', 'fr': 'fr_XX'}
sum_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50")
tokenizer = AutoTokenizer.from_pretrained(
    "facebook/mbart-large-50",
    src_lang=mbart_lang_codes.get(lang, 'en_XX'),
)

bart_summaries = []  # one generated summary per row of df, in row order

# Iterate over the column values directly instead of range(len(df)) + .loc[num]:
# .loc is label-based and only matches row positions for a default 0..n-1 index.
for text in df['abstract']:
    # Prefix the abstract with "summarize: " and truncate the tokenized input
    # to at most 512 tokens.
    inputs = tokenizer("summarize: " + text, return_tensors="pt", max_length=512, truncation=True)

    # Beam-search generation; output length is left at the model's defaults.
    outputs = sum_model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        length_penalty=4.0,
        num_beams=4,
        early_stopping=True,
    )

    # Drop the first 19 and the last 4 characters of the decoded string.
    # NOTE(review): this fixed slice presumably removes the leading special /
    # language tokens (and possibly an echoed prompt) plus the closing
    # end-of-sequence marker - confirm against real output before replacing
    # it with skip_special_tokens=True.
    bart_summaries.append(tokenizer.decode(outputs[0])[19:-4])

# Add the generated summaries as a column in the dataframe.
df['BART'] = bart_summaries

## BART CNN summarization model

- [Model description on HuggingFace](https://huggingface.co/facebook/bart-large-cnn)
- [Paper](https://arxiv.org/abs/1910.13461)

In [None]:
# Load the CNN-trained BART summarization model and its tokenizer.
sum_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

bart_cnn_summaries = []  # one generated summary per row of df, in row order

# Iterate over the column values directly instead of range(len(df)) + .loc[num]:
# .loc is label-based and only matches row positions for a default 0..n-1 index.
for text in df['abstract']:
    # Prefix the abstract with "summarize: " and truncate the tokenized input
    # to at most 512 tokens.
    # NOTE(review): the "summarize: " prefix is kept from the original cell;
    # confirm that this model actually benefits from it.
    inputs = tokenizer("summarize: " + text, return_tensors="pt", max_length=512, truncation=True)

    # Beam-search generation; output length is left at the model's defaults.
    outputs = sum_model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        length_penalty=4.0,
        num_beams=4,
        early_stopping=True,
    )

    # Let the tokenizer remove its special tokens instead of cutting a fixed
    # number of characters ([7:-4]) from the decoded string.
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
    bart_cnn_summaries.append(summary)

# Add the generated summaries as a column in the dataframe.
# The original cell stored `bart_summaries` (the multilingual BART output)
# here, so the 'BART-CNN' column never contained this model's summaries.
df['BART-CNN'] = bart_cnn_summaries

In [None]:
# Write the dataframe, including the generated summary columns, to a
# per-language CSV. index=False keeps the row index out of the file.
df.to_csv(
    path_or_buf='/content/drive/MyDrive/DBpedia_sum/data/sum_' + lang + '.csv',
    index=False,
)