# Explore models
Try state-of-the-art (SOTA) text summarization models and assess their performance on speech/lecture transcripts.
Since Sat. Oct. 23rd, 2021


In [1]:
## Setup
import torch
# import tensorflow
from transformers import pipeline

from util import *


# Example text fed to the summarizers below
# (presumably a TED talk transcript, going by the helper's name — confirm in `util`)
txt = get_ted_eg()



## BigBird
"Big Bird: Transformers for Longer Sequences".
Manzil Zaheer, Guru Guruganesh, Kumar Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
*NeurIPS 2020*.



In [4]:
def distill_bart(text):
    """
    Summarize `text` with the default `summarization` pipeline

    Default: `sshleifer/distilbart-cnn-12-6` in pytorch

    :param text: Text to summarize
    :return: Output of the pipeline call
        (per the transformers docs, a list of dicts with a `summary_text` key)
    """
    summarizer = pipeline('summarization')
    # Transcripts can have more tokens than the model maximum of 1024;
    # truncate the input instead of failing on the overflow
    return summarizer(text, min_length=50, max_length=200, truncation=True)


def t5(text):
    """
    Summarize `text` with `t5-base` in tf

    :param text: Text to summarize
    :return: Output of the pipeline call
        (per the transformers docs, a list of dicts with a `summary_text` key)
    """
    summarizer = pipeline('summarization', model='t5-base', tokenizer='t5-base', framework='tf')
    # Return the summary so callers can inspect it; it was previously discarded
    return summarizer(text, min_length=5, max_length=20)


def bigbird(text):
    """
    Attempt to summarize `text` with `BigBird` (`google/bigbird-roberta-base`) in torch

    NOTE: this checkpoint does not appear to support summarization

    :param text: Text to summarize
    :return: Output of the pipeline call, if it succeeds
    """
    checkpoint = 'google/bigbird-roberta-base'
    summarizer = pipeline('summarization', model=checkpoint, tokenizer=checkpoint)
    # Return the summary so callers can inspect it; it was previously discarded
    return summarizer(text, min_length=5, max_length=20)


# bigbird()


def bigbird_pegasus(text):
    """
    Summarize `text` with `google/bigbird-pegasus-large-arxiv` in torch

    The input is truncated to 4096 tokens; generation uses beam search with 8 beams
    and a maximum length of 512

    :param text: Text to summarize
    :return: List of decoded summaries (a single entry, since one text is passed);
        special tokens such as `<s>` are kept since decoding does not skip them
    """
    from transformers import PegasusTokenizer, BigBirdPegasusForConditionalGeneration

    checkpoint = 'google/bigbird-pegasus-large-arxiv'
    model = BigBirdPegasusForConditionalGeneration.from_pretrained(checkpoint)
    tokenizer = PegasusTokenizer.from_pretrained(checkpoint)

    inputs = tokenizer([text], max_length=4096, return_tensors='pt', truncation=True)
    # `early_stopping=True` was also tried here; left off for now
    summary_ids = model.generate(inputs['input_ids'], num_beams=8, max_length=512)
    return tokenizer.batch_decode(summary_ids)


def bigbird_pegasus_(text):
    """
    Summarize `text` with `google/bigbird-pegasus-large-arxiv` in torch, using full attention

    Variant of `bigbird_pegasus`: loads the model with `attention_type="original_full"`,
    the tokenizer via `AutoTokenizer`, and generates with 4 beams and the default length limit

    :param text: Text to summarize
    :return: List of decoded summaries (a single entry, since one text is passed);
        special tokens are kept since decoding does not skip them
    """
    from transformers import BigBirdPegasusForConditionalGeneration, AutoTokenizer

    checkpoint = "google/bigbird-pegasus-large-arxiv"
    # by default encoder-attention is `block_sparse` with num_random_blocks=3, block_size=64;
    # those can be overridden instead, e.g. `from_pretrained(checkpoint, block_size=16, num_random_blocks=2)`
    model = BigBirdPegasusForConditionalGeneration.from_pretrained(checkpoint, attention_type="original_full")
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    # Truncate to 4096 tokens as `bigbird_pegasus` does, so long transcripts don't overflow the model input
    inputs = tokenizer(text, max_length=4096, return_tensors='pt', truncation=True)
    prediction = model.generate(inputs['input_ids'], num_beams=4)
    return tokenizer.batch_decode(prediction)


# Why do the two functions above produce drastically different outputs when they differ only in hyper-parameters?
# Produces
# '<s> we present a brief discussion of the nature of education in the era of '
# 'big data and big rip.<n> we start with a brief history of education in the '
# 'era of big rip.<n> we then turn to a brief discussion of the nature of '
# 'education in the era of big rip.<n> we conclude with a discussion of the '
# 'future of education.'
# i1 = bigbird_pegasus(txt)
# ic(i1)
# i2 = bigbird_pegasus_(txt)
# ic(i2)
# assert i1.equal(i2)



# Swap the example text for the one returned by `get_498_eg(section=True)`
# (helper not shown here; presumably a section of a lecture transcript — confirm in `util`)
txt = get_498_eg(section=True)
# Optional debugging steps: shorten the text / inspect it before summarizing
# txt = get_1st_n_words(txt)
# ic(txt)
# Print the BigBird-Pegasus summary; `ic` echoes the call expression alongside the value
ic(bigbird_pegasus(txt))

ic| bigbird_pegasus(txt): ['<s> this is an example of what a supervised learning model looks like.<n> '
                           'this is an example of what a supervised learning model looks like.<n> this '
                           'is an example of what a supervised learning model looks like.<n> this is an '
                           'example of what a supervised learning model looks like.<n> this is an '
                           'example of what a supervised learning model looks like.<n> this is an '
                           'example of what a supervised learning model looks like.<n> this is an '
                           'example of what a supervised learning model looks like.<n> this is an '
                           'example of what a supervised learning model looks like.<n> this is an '
                           'example of what a supervised learning model looks like.<n> this is an '
                           'example of what a supervised learning model looks like.<n

['<s> this is an example of what a supervised learning model looks like.<n> this is an example of what a supervised learning model looks like.<n> this is an example of what a supervised learning model looks like.<n> this is an example of what a supervised learning model looks like.<n> this is an example of what a supervised learning model looks like.<n> this is an example of what a supervised learning model looks like.<n> this is an example of what a supervised learning model looks like.<n> this is an example of what a supervised learning model looks like.<n> this is an example of what a supervised learning model looks like.<n> this is an example of what a supervised learning model looks like.<n> this is an example of what a supervised learning model looks like.<n> this is an example of what a supervised learning model looks like.<n> this is an example of what a supervised learning model looks like.']