# Install dependencies

In [None]:
! pip install pandas transformers datasets scikit-learn simpletransformers torch tqdm

# Initialize packages and data

In [53]:
import pandas as pd
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from tqdm import tqdm

# Patch pandas so that Series/DataFrame objects gain `progress_apply`.
tqdm.pandas()

# Read the four CSV files in order, each indexed by its "Body ID" column.
train_stances, train_bodies, test_stances, test_bodies = (
    pd.read_csv(csv_path, index_col="Body ID")
    for csv_path in (
        'train_stances.csv',
        'train_bodies.csv',
        'competition_test_stances.csv',
        'competition_test_bodies.csv',
    )
)

# Choose model

In [None]:
# Decide on a model from this list
# https://huggingface.co/models?search=pegasus
# This identifier is passed to `from_pretrained` for both the tokenizer and the
# model, and is embedded (with "/" replaced by "_") in the output CSV filenames.
model_name = "google/pegasus-cnn_dailymail"

# Initialize model

In [None]:
# Prefer the GPU when one is available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
# Move the weights to `device` so the model and the tokenized inputs handed to
# generate() are on the same device; leaving the model on the CPU while the
# inputs go to CUDA makes generation fail with a device-mismatch error.
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)

def summarize(src_text):
    """Return the model-generated summary of a single text.

    Args:
        src_text: The text to summarize. It is tokenized with
            ``truncation=True``, so over-long input is cut to the
            tokenizer's maximum length before generation.

    Returns:
        The first decoded output sequence, with special tokens removed.
    """
    # Send the inputs to whichever device the model actually lives on, so
    # generate() never receives tensors on a different device than its weights.
    batch = tokenizer(
        src_text, truncation=True, padding="longest", return_tensors="pt"
    ).to(model.device)
    summary_ids = model.generate(**batch)
    decoded = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)
    return decoded[0]

# Run model

In [None]:
# Summarize every training article (with a progress bar), overwrite the
# "articleBody" column with the summaries, and save the frame to a CSV whose
# name carries the model identifier.
train_summaries = train_bodies["articleBody"].progress_apply(summarize)
train_bodies["articleBody"] = train_summaries
train_output_path = f'train_bodies_{model_name.replace("/","_")}.csv'
train_bodies.to_csv(train_output_path)

In [38]:
# Summarize every test article (with a progress bar), overwrite the
# "articleBody" column with the summaries, and save the frame to a CSV whose
# name carries the model identifier.
test_summaries = test_bodies["articleBody"].progress_apply(summarize)
test_bodies["articleBody"] = test_summaries
test_output_path = f'test_bodies_{model_name.replace("/","_")}.csv'
test_bodies.to_csv(test_output_path)

100%|███████████████████████████████████████| 904/904 [1:19:39<00:00,  5.29s/it]
