# Explore models
Evaluate state-of-the-art (SOTA) text summarization models on speech/lecture transcripts.
Since Sat. Oct. 23rd, 2021


In [5]:
## Setup
import torch
# import tensorflow
from transformers import pipeline

from util import *


# Sample text handed to the summarizers defined below.
# NOTE(review): `get_ted_eg` is not defined in this file - presumably it is exported by
# `util` through the star import and returns a TED talk transcript; confirm.
txt = get_ted_eg()



## BigBird
"Big Bird: Transformers for Longer Sequences".
Manzil Zaheer, Guru Guruganesh, Kumar Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
*NeurIPS 2020*.



In [None]:
def distill_bart(text):
    """
    Summarize `text` with the default `transformers` summarization pipeline

    Default: `sshleifer/distilbart-cnn-12-6` in pytorch

    :param text: Text to summarize
    :return: Whatever the pipeline call returns, generated with `min_length=50` and `max_length=200`
        (per the `transformers` docs, a list of dicts with a `summary_text` key)

    .. note:: Long transcripts fail here: they have more tokens than the model maximum of 1024
    """
    summarizer = pipeline('summarization')
    # Hand the summary back to the caller; previously it was computed and silently dropped
    return summarizer(text, min_length=50, max_length=200)


def t5(text):
    """
    Summarize `text` with `t5-base` in tf

    :param text: Text to summarize
    :return: Whatever the pipeline call returns, generated with `min_length=5` and `max_length=20`
        (per the `transformers` docs, a list of dicts with a `summary_text` key)
    """
    summarizer = pipeline('summarization', model='t5-base', tokenizer='t5-base', framework='tf')
    # Hand the summary back to the caller; previously it was computed and silently dropped
    return summarizer(text, min_length=5, max_length=20)


def bigbird(text):
    """
    Try to summarize `text` with `BigBird` (`google/bigbird-roberta-base`) in torch

    Looks like this checkpoint doesn't support summarization

    :param text: Text to summarize
    :return: Whatever the pipeline call returns, generated with `min_length=5` and `max_length=20`
    """
    checkpoint = 'google/bigbird-roberta-base'
    summarizer = pipeline('summarization', model=checkpoint, tokenizer=checkpoint)
    # Hand the result back to the caller; previously it was computed and silently dropped
    return summarizer(text, min_length=5, max_length=20)


def bigbird_pegasus(text):
    """
    Summarize `text` with `google/bigbird-pegasus-large-arxiv` in torch

    The input is truncated to 4096 tokens, and the summary is generated with
    4-beam search, capped at 300 tokens, with early stopping

    :param text: Text to summarize
    :return: List of decoded summaries, one per generated sequence, special tokens removed
    """
    # Only the tokenizer and model classes are needed; the config class was imported but never used
    from transformers import PegasusTokenizer, BigBirdPegasusForConditionalGeneration

    checkpoint = 'google/bigbird-pegasus-large-arxiv'
    model = BigBirdPegasusForConditionalGeneration.from_pretrained(checkpoint)
    tokenizer = PegasusTokenizer.from_pretrained(checkpoint)

    inputs = tokenizer([text], max_length=4096, return_tensors='pt', truncation=True)
    summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=300, early_stopping=True)
    # `batch_decode` decodes each generated sequence in turn, same as a manual loop over `decode`
    return tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)


def bigbird_pegasus_(text):
    """
    Summarize `text` with `google/bigbird-pegasus-large-arxiv` using full encoder attention

    By default the checkpoint's encoder attention is `block_sparse` with
    `num_random_blocks=3`, `block_size=64`; here `attention_type="original_full"` is requested instead.
    Other variants tried: the default (no extra arguments), and `block_size=16, num_random_blocks=2`

    :param text: Text to summarize
    :return: List of decoded predictions from `tokenizer.batch_decode` (special tokens are kept)
    """
    from transformers import BigBirdPegasusForConditionalGeneration, AutoTokenizer

    checkpoint = "google/bigbird-pegasus-large-arxiv"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = BigBirdPegasusForConditionalGeneration.from_pretrained(checkpoint, attention_type="original_full")

    # Cap the input at the same 4096 tokens `bigbird_pegasus` uses, so long transcripts
    # are truncated instead of being fed to the model at full length
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=4096)
    prediction = model.generate(**inputs)
    return tokenizer.batch_decode(prediction)

# Earlier experiments, kept disabled:
# bigbird()
# ic(bigbird_pegasus(txt))
# Run the full-attention BigBird-Pegasus variant on the sample text `txt`;
# as a bare expression its value is left for the notebook to display
bigbird_pegasus_(txt)