# Imports

In [None]:
# TODO: Point PATH at the location where results should be saved.
# The save cell uses this value as a filename prefix: the summaries are
# written to PATH + '_text.npy'.
PATH = "."

In [None]:
from datasets import load_dataset
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import scipy.stats as stats

In [None]:
# Load the "section" configuration of the PubMed summarization dataset.
# The cells below read its 'train', 'validation' and 'test' splits.
ds = load_dataset(path="ccdv/pubmed-summarization", name="section")

In [None]:
# Summarization model: tokenizer and seq2seq weights come from the same
# checkpoint. The checkpoint id gets its own name so that `model` only ever
# refers to the loaded model object.
checkpoint = "t5-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# Sentence-embedding model. It is not used by the cells visible here;
# presumably it scores the summaries in a later evaluation step.
eval_model = SentenceTransformer('all-mpnet-base-v2')

In [None]:
# Report how many instances each split holds and its share of the whole dataset.
train_size, val_size, test_size = (
    len(ds[split]) for split in ('train', 'validation', 'test')
)
total_size = sum((train_size, val_size, test_size))

print(f'Number of instances in training set = {train_size}; {train_size / total_size} portion of data')
print(f'Number of instances in validation set = {val_size}; {val_size / total_size} portion of data')
print(f'Number of instances in test set = {test_size}; {test_size / total_size} portion of data')

In [None]:
# Set to True to tokenize every training abstract and print token-length
# statistics (mean, max and selected percentiles) in the next cell.
run_abstract_analysis = False

In [None]:
# Optional exploration: token-length distribution of the training abstracts,
# measured with the summarization model's tokenizer.
if run_abstract_analysis:
    abstract_lengths = [
        len(tokenizer.tokenize(text)) for text in ds['train']['abstract']
    ]

    print(f'Average abstract length = {np.mean(abstract_lengths)}.')
    print(f'Max abstract length = {max(abstract_lengths)}.')
    # One line per reported percentile, in ascending order.
    for label, q in (('5th', 0.05), ('25th', 0.25), ('75th', 0.75),
                     ('95th', 0.95), ('99th', 0.99)):
        print(f'{label} percentile abstract length = {np.quantile(abstract_lengths, q)}.')

# T5

In [None]:
def chunk_paper(text, max_tokens, overlap=50):
    """Split ``text`` into overlapping chunks of at most ``max_tokens`` tokens.

    The text is tokenized with the module-level ``tokenizer``; consecutive
    chunks share ``overlap`` tokens so that content cut at a chunk boundary
    still appears with some context in the following chunk.

    Args:
        text: The document to split.
        max_tokens: Maximum number of tokens per chunk.
        overlap: Number of tokens shared by two consecutive chunks. Must be
            smaller than ``max_tokens``.

    Returns:
        A list of chunk strings in document order; empty when ``text``
        produces no tokens.

    Raises:
        ValueError: If ``overlap`` is not smaller than ``max_tokens``, since
            the window could then never advance.
    """
    stride = max_tokens - overlap
    if stride <= 0:
        raise ValueError(
            f'overlap ({overlap}) must be smaller than max_tokens ({max_tokens})'
        )

    tokens = tokenizer.tokenize(text)
    chunks = []
    for start in range(0, len(tokens), stride):
        window = tokens[start:start + max_tokens]
        chunks.append(tokenizer.convert_tokens_to_string(window))
        # Stop once a window reaches the end of the document. Otherwise the
        # next window would start inside the overlap region and contain only
        # tokens that the current chunk already covers.
        if start + max_tokens >= len(tokens):
            break
    return chunks

In [None]:
# Two-stage summarization of the first 200 test papers:
#   1. split each article into overlapping chunks and summarize every chunk;
#   2. concatenate the chunk summaries and summarize them again to obtain
#      the final summary for the paper.
llm_summaries = []

# T5 selects its task from a text prefix; the summarization prefix is the
# lowercase "summarize: " (with a trailing space). It is applied to both
# stages so the merge pass is also run as a summarization task.
task_prefix = "summarize: "

for i in range(200):
    paper_chunked = chunk_paper(ds['test'][i]['article'], 512, overlap=64)

    # Stage 1: one summary per chunk.
    # NOTE(review): max_length=1024 lets inputs exceed 512 tokens (a 512-token
    # chunk plus the prefix); T5 is commonly used with 512-token inputs, so
    # confirm this limit is intended.
    chunk_summaries = []
    for chunk in paper_chunked:
        inputs = tokenizer(task_prefix + chunk, return_tensors="pt", truncation=True, max_length=1024)
        summary_ids = model.generate(**inputs, max_length=256, min_length=64)
        chunk_summaries.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))

    # Stage 2: summarize the concatenated chunk summaries. The full encoding
    # (input ids and attention mask) is passed to generate().
    all_summaries = ' '.join(chunk_summaries)
    merged_inputs = tokenizer(task_prefix + all_summaries, return_tensors="pt", truncation=True, max_length=1024)
    final_summary = model.generate(**merged_inputs, min_length=100, max_length=606)
    model_summary_text = tokenizer.decode(final_summary[0], skip_special_tokens=True)

    print(f'Paper {i}:', model_summary_text)
    llm_summaries.append(model_summary_text)

In [None]:
# Persist the generated summaries as a NumPy array of strings.
# PATH is used as a filename prefix, not as a directory: with the default
# PATH = '.', the file written is '._text.npy' in the working directory.
save = True
basename = PATH
if save:
    summaries_file = basename + '_text.npy'
    np.save(summaries_file, np.array(llm_summaries))