# 0. Installing Transformers and Importing Dependencies

In [1]:
!pip install --quiet transformers

[0m

In [2]:
pip install --quiet --upgrade arxiv pypdf pymupdf

[0mNote: you may need to restart the kernel to use updated packages.


In [20]:
!pip install --quiet openai

[0m

In [3]:
from transformers import pipeline
from bs4 import BeautifulSoup
import requests
import arxiv
from pypdf import PdfReader
import fitz


# download pdf file

In [4]:
def download_arxiv(url):
    id_from_url = url.split("/")[-1:]
    paper = next(arxiv.Search(id_list=id_from_url).results())
    print(paper.title)
    print(paper.authors)
    print(paper.primary_category)
    print(paper.published)
    print(paper.summary)
    # Download the PDF to a specified directory with a custom filename.
    paper.download_pdf(dirpath='./', filename="downloaded-paper.pdf")

In [5]:
url = "https://arxiv.org/abs/1706.03762"
download_arxiv(url)

Attention Is All You Need
[arxiv.Result.Author('Ashish Vaswani'), arxiv.Result.Author('Noam Shazeer'), arxiv.Result.Author('Niki Parmar'), arxiv.Result.Author('Jakob Uszkoreit'), arxiv.Result.Author('Llion Jones'), arxiv.Result.Author('Aidan N. Gomez'), arxiv.Result.Author('Lukasz Kaiser'), arxiv.Result.Author('Illia Polosukhin')]
cs.CL
2017-06-12 17:57:34+00:00
The dominant sequence transduction models are based on complex recurrent or
convolutional neural networks in an encoder-decoder configuration. The best
performing models also connect the encoder and decoder through an attention
mechanism. We propose a new simple network architecture, the Transformer, based
solely on attention mechanisms, dispensing with recurrence and convolutions
entirely. Experiments on two machine translation tasks show these models to be
superior in quality while being more parallelizable and requiring significantly
less time to train. Our model achieves 28.4 BLEU on the WMT 2014
English-to-German translati

In [6]:
!ls /kaggle/working

__notebook_source__.ipynb  downloaded-paper.pdf


In [26]:
!pip install --quiet spacy contractions word2number

[0m

## Preprocess file with NLTK

In [27]:
from bs4 import BeautifulSoup
import spacy
import unidecode
from word2number import w2n
import contractions

def download_spacy_model(model="en_core_web_md"):
    print(f"Downloading spaCy model {model}")
    spacy.cli.download(model)
    print(f"Finished downloading model")

download_spacy_model()

nlp = spacy.load('en_core_web_md')

# exclude words from spacy stopwords list
deselect_stop_words = ['no', 'not']
for w in deselect_stop_words:
    nlp.vocab[w].is_stop = False


    
def strip_html_tags(text):
    """remove html tags from text"""
    soup = BeautifulSoup(text, "html.parser")
    stripped_text = soup.get_text(separator=" ")
    return stripped_text


def remove_whitespace(text):
    """remove extra whitespaces from text"""
    text = text.strip()
    return " ".join(text.split())


def remove_accented_chars(text):
    """remove accented characters from text, e.g. café"""
    text = unidecode.unidecode(text)
    return text


def expand_contractions(text):
    """expand shortened words, e.g. don't to do not"""
    text = contractions.fix(text)
    return text


def text_preprocessing(text, accented_chars=True, contractions=True, 
                       convert_num=True, extra_whitespace=True, 
                       lemmatization=True, lowercase=True, punctuations=True,
                       remove_html=True, remove_num=True, special_chars=True, 
                       stop_words=True):
    """preprocess text with default option set to true for all steps"""
    if remove_html == True: #remove html tags
        text = strip_html_tags(text)
    if extra_whitespace == True: #remove extra whitespaces
        text = remove_whitespace(text)
    if accented_chars == True: #remove accented characters
        text = remove_accented_chars(text)
    if contractions == True: #expand contractions
        text = expand_contractions(text)
    if lowercase == True: #convert all characters to lowercase
        text = text.lower()

    doc = nlp(text) #tokenise text

    clean_text = []
    
    for token in doc:
        flag = True
        edit = token.text
        # remove stop words
        if stop_words == True and token.is_stop and token.pos_ != 'NUM': 
            flag = False
        # remove punctuations
        if punctuations == True and token.pos_ == 'PUNCT' and flag == True: 
            flag = False
        # remove special characters
        if special_chars == True and token.pos_ == 'SYM' and flag == True: 
            flag = False
        # remove numbers
        if remove_num == True and (token.pos_ == 'NUM' or token.text.isnumeric()) \
        and flag == True:
            flag = False
        # convert number words to numeric numbers
        if convert_num == True and token.pos_ == 'NUM' and flag == True:
            edit = w2n.word_to_num(token.text)
        # convert tokens to base form
        elif lemmatization == True and token.lemma_ != "-PRON-" and flag == True:
            edit = token.lemma_
        # append tokens edited and not removed to list 
        if edit != "" and flag == True:
            clean_text.append(edit)        
    return clean_text

Downloading spaCy model en_core_web_md
Collecting en-core-web-md==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.3.0/en_core_web_md-3.3.0-py3-none-any.whl (33.5 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 33.5/33.5 MB 8.9 MB/s eta 0:00:00
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.3.0




[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
Finished downloading model


## Read the pdf file content

In [9]:
# Read the pdf file and read its content:
def parse_paper(path):
    print("Parsing paper")
    reader = PdfReader(path)
    number_of_pages = len(reader.pages)
    print(f"Total number of pages: {number_of_pages}")
    paper_text = []
    for i in range(number_of_pages):
        page = reader.pages[i]
        page_text = []

        def visitor_body(text, cm, tm, fontDict, fontSize):
            x = tm[4]
            y = tm[5]
            # ignore header/footer
            if (y > 50 and y < 720) and (len(text.strip()) > 1) and (fontSize<=10 and fontSize>=6) and len(text)>10:
                page_text.append({
                    'fontsize': fontSize,
                    'text': text.strip().replace('\x03', ''),
                    'x': x,
                    'y': y
                })

        _ = page.extract_text(visitor_text=visitor_body)

        blob_font_size = None
        blob_text = ''
        processed_text = []

        for t in page_text:
            if t['fontsize'] == blob_font_size:
                blob_text += f" {t['text']}"
            else:
                if blob_font_size is not None and len(blob_text) > 1:
                    processed_text.append({
                        'fontsize': blob_font_size,
                        'text': blob_text,
                        'page': i
                    })
                blob_font_size = t['fontsize']
                blob_text = t['text']
        paper_text += processed_text
    return paper_text

def parse_paper2(path):
    print("Parsing paper...")
    doc = fitz.open(path)
    print("TotalPages of doc: ", len(doc))
    text_list = []
    for page in doc:
        text = page.get_text("text", sort=True)
        text_list.append(text)
    return " ".join(text_list)


In [10]:
import re


In [11]:
def preprocess_text(text):
    index = text.find("Abstract")
    print("Remove before abstract!!")
    text = text[index+8:]
    # return list of preprocessed text
#     text = text_preprocessing(text, lowercase=False, punctuations=False)
#     text = ' '.join(text)
    text = re.sub(r'\[.*?\]', '', text) # remove square brackets
    text = text.replace("\n", '') # remove \n
    text = re.sub(r'[^\x00-\x7F]+',' ', text) # remove non-ASCII characters
    return text

In [12]:
path_to_pdf = "/kaggle/working/downloaded-paper.pdf"
text = parse_paper2(path_to_pdf)
text = preprocess_text(text)

Parsing paper...
TotalPages of doc:  15
Remove before abstract!!


# 1. Load Summarization Pipeline

In [13]:
summarizer = pipeline("summarization")


No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 (https://huggingface.co/sshleifer/distilbart-cnn-12-6)


Downloading:   0%|          | 0.00/1.76k [00:00<?, ?B/s]

2023-02-03 02:32:32.058605: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-03 02:32:32.059846: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-03 02:32:32.060515: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-03 02:32:32.062723: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

Downloading:   0%|          | 0.00/1.14G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

## 2. Chunk Text

In [14]:
max_chunk = 500
text = text.replace('.', '.<eos>')
text = text.replace('?', '?<eos>')
text = text.replace('!', '!<eos>')

In [15]:
sentences = text.split('<eos>')
current_chunk = 0 
chunks = []
for sentence in sentences:
    if len(chunks) == current_chunk + 1: 
        if len(chunks[current_chunk]) + len(sentence.split(' ')) <= max_chunk:
            chunks[current_chunk].extend(sentence.split(' '))
        else:
            current_chunk += 1
            chunks.append(sentence.split(' '))
    else:
        print(current_chunk)
        chunks.append(sentence.split(' '))

for chunk_id in range(len(chunks)):
    chunks[chunk_id] = ' '.join(chunks[chunk_id])

0


In [16]:
len(chunks)


12

## 3. Summarize Text

In [17]:
res = summarizer(chunks, max_length=120, min_length=30, do_sample=False)


In [18]:
res[0]


{'summary_text': ' We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions . Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring signi cantlyless time to train .'}

In [21]:
result_summary_hf = ' '.join([summ['summary_text'] for summ in res])


In [22]:
print(result_summary_hf)

 We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions . Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring signi cantlyless time to train .  Attention mechanisms have become an integral part of compelling sequence modeling and transduc-tion models in various tasks . In this work we propose the Transformer, a model architecture eschewing recurrence and insteadrelying entirely on an attention mechanism . The Transformer allows for signi cantly more parallelization and can reach a new state of the art intranslation quality after being trained for as little as twelve hours on eight P100 GPUs .  We call our particular attention "Scaled Dot-Product Attention" (Figure 2) Multi-Head Attention consists of severalattention layers running in parallel . We employ residual connections around each of the sub-layers, followed b

### With advance preprocessing

In [88]:
def preprocess_text(text, advance_preprocess=False):
    # remove abstract
    index = text.find('Abstract')
    text = text[index+8:]
    if advance_preprocess:
        #return list of preprocessed text
        text = text_preprocessing(text, lowercase=False, punctuations=False)
        text = ''.join(text)
    text = re.sub(r'\[.*?\]', '', text) # remove square brackets
    text = text.replace("\n", '') # remove \n
    text = re.sub(r'[^\x00-\x7F]+',' ', text) # remove non-ASCII characters
    return text


def generate_chunks(text):
    max_chunk = 500
    text = text.replace('.', '.<eos>')
    text = text.replace('?', '?<eos>')
    text = text.replace('!', '!<eos>')
    sentences = text.split('<eos>')
    current_chunk = 0 
    chunks = []
    for sentence in sentences:
        if len(chunks) == current_chunk + 1: 
            if len(chunks[current_chunk]) + len(sentence.split(' ')) <= max_chunk:
                chunks[current_chunk].extend(sentence.split(' '))
            else:
                current_chunk += 1
                chunks.append(sentence.split(' '))
        else:
            print(current_chunk)
            chunks.append(sentence.split(' '))

    for chunk_id in range(len(chunks)):
        chunks[chunk_id] = ' '.join(chunks[chunk_id])
    
    return chunks
        
def combine_preprocess_page(doc, advance_preprocess=False):
    texts = []
    for page in doc:
        texts.append(page.get_text("text"))
    text = ' '.join(texts)
    text = preprocess_text(text, advance_preprocess)
    return text
        
def summarize_text_with_hf(path_to_pdf, advance_preprocess=False):
    doc = fitz.open(path_to_pdf)
    text = combine_preprocess_page(doc, advance_preprocess=False)
#     print(text)
    chunks = generate_chunks(text)
#     print(chunks)
    res = summarizer(chunks, max_length=120, min_length=30, do_sample=False)
        
    return ' '.join([summ['summary_text'] for summ in res])

In [89]:
path_to_pdf = "/kaggle/working/downloaded-paper.pdf"
result_hf = summarize_text_with_hf(path_to_pdf, advance_preprocess=True)

0


In [90]:
result_hf

' We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions . Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring signi cantlyless time to train .  Attention mechanisms have become an integral part of compelling sequence modeling and transduc-tion models in various tasks . In this work we propose the Transformer, a model architecture eschewing recurrence and insteadrelying entirely on an attention mechanism . The Transformer allows for signi cantly more parallelization and can reach a new state of the art intranslation quality after being trained for as little as twelve hours on eight P100 GPUs .  We call our particular attention "Scaled Dot-Product Attention" (Figure 2) Multi-Head Attention consists of severalattention layers running in parallel . We employ residual connections around each of the sub-layers, followed 

# OpenAI prediction

In [23]:
import openai
openai.api_key =  ""

In [28]:
def preprocess_text(text, advance_preprocess=False):
    if advance_preprocess:
        #return list of preprocessed text
        text = text_preprocessing(text, lowercase=False, punctuations=False)
        text = ''.join(text)
    text = re.sub(r'\[.*?\]', '', text) # remove square brackets
    text = text.replace("\n", '') # remove \n
    text = re.sub(r'[^\x00-\x7F]+',' ', text) # remove non-ASCII characters
    return text

def summarize_text_with_openai(path_to_pdf, advance_preprocess=False):
    doc = fitz.open(path_to_pdf)
    summary_list =[]
    for page in doc:
        text = page.get_text("text")
        text = preprocess_text(text, advance_preprocess)
        prompt= "summarize this text: "+ text + "\n Tl;dr:"
        response = openai.Completion.create(
            model="text-davinci-003",
            prompt=prompt,
            temperature=0.5,
            max_tokens=120,
            top_p=0.9,
            frequency_penalty=0.0,
            presence_penalty=1
          )
        summary_list.append(response["choices"][0]["text"])
        
    return ''.join(summary_list)


In [29]:
path_to_pdf = "/kaggle/working/downloaded-paper.pdf"
# summary without advance spacy preprocess
summary_text1 = summarize_text_with_openai(path_to_pdf)

In [30]:
print(summary_text1)

 We propose a new model architecture, the Transformer, based solely on attention mechanisms. We show that the Transformer generalizes well to other tasks by applying it successfully to English constituency parsing. Our model achieves state-of-the-art results on two machine translation tasks. 

Transformer is a new network architecture based solely on attention mechanisms. It dispenses with recurrence and convolution entirely, instead relying entirely on self-attention to draw global dependencies between input and output. This makes it much more parallelizable than previous models, leading to faster training times and improved performance. Exper The Transformer is a model architecture that uses self-attention to draw global dependencies between input and output sequences. It allows for significantly more parallelization than recurrent models, and can reach a new state of the art in translation quality after being trained for as little as twelve hours on eight P100 GPUs. The Transformer 

In [31]:
# summary without advance spacy preprocess
summary_text12 = summarize_text_with_openai(path_to_pdf, advance_preprocess=True)

In [32]:
summary_text12

" We propose a new network architecture, the Transformer, based solely on attention mechanisms, which dispenses with recurrence and convolution entirely. Experiments on machine translation tasks show that models based on the Transformer architecture achieve superior quality to parallelizable RNNs and can be trained significantly faster. Our model achieves a BLEU score of 28.4 on the WMT English-German translation task, improving over existing single-model results, including ensembles, by over 2 BLEU. On the WMT English-French translation task, our model establishes a new single-model state The Transformer is a neural sequence transduction model that relies entirely on self-attention to compute representations of input and output sequences. The model has an encoder-decoder structure, with the encoder mapping an input sequence of symbols to a continuous representation and the decoder generating an output sequence of symbols. Self-attention is used to compute representations of different 

In [36]:
# short summary generation
def generate_short_summary(summary_text):
    prompt= summary_text + "\n Tl;dr:"
    response = openai.Completion.create(
        model="text-davinci-003",
        prompt=prompt,
        temperature=0.6,
        max_tokens=400,
        top_p=0.9,
        frequency_penalty=0.0,
        presence_penalty=1
    )
    return response["choices"][0]["text"]

In [37]:
final_summary = generate_short_summary(summary_text12)

In [38]:
final_summary

' This paper presents the Transformer, a neural sequence transduction model based solely on attention mechanisms which replaces recurrence and convolution. Experiments show that models based on the Transformer architecture achieve superior quality to parallelizable RNNs and can be trained significantly faster. Our model achieves a BLEU score of 28.4 on the WMT English-German translation task, improving over existing single-model results by over 2 BLEU.'

## Summarize the paper abstract

In [51]:
def download_arxiv(url):
    
    id_from_url = url.split("/")[-1:]
    paper = next(arxiv.Search(id_list=id_from_url).results())
#     print(paper.title)
    title = paper.title
#     print(paper.authors)
#     print(paper.primary_category)
#     print(paper.published)
#     print(paper.summary)
    summary = paper.summary
#     for a in paper.authors:
#         print(str(a))
    # Download the PDF to a specified directory with a custom filename.
    paper.download_pdf(dirpath='./', filename="downloaded-paper.pdf")
    return str(title), str(summary)

In [52]:
url = "https://arxiv.org/abs/1706.03762"
title, abstract = download_arxiv(url)

<class 'str'> <class 'str'>


In [55]:
paper_abstract = f"The paper entitle on '{title}'. "+abstract
paper_abstract

"The paper entitle on 'Attention Is All You Need'. The dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks in an encoder-decoder configuration. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network architecture, the Transformer, based\nsolely on attention mechanisms, dispensing with recurrence and convolutions\nentirely. Experiments on two machine translation tasks show these models to be\nsuperior in quality while being more parallelizable and requiring significantly\nless time to train. Our model achieves 28.4 BLEU on the WMT 2014\nEnglish-to-German translation task, improving over the existing best results,\nincluding ensembles by over 2 BLEU. On the WMT 2014 English-to-French\ntranslation task, our model establishes a new single-model state-of-the-art\nBLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction\nof the training costs of t

### With OpenAI

In [56]:
def summarize_abstract(text):
    prompt= text + "\n Tl;dr:"
    response = openai.Completion.create(
        model="text-davinci-003",
        prompt=prompt,
        temperature=0.7,
        max_tokens=400,
        top_p=0.9,
        frequency_penalty=0.0,
        presence_penalty=1
    )
    return response["choices"][0]["text"]

In [57]:
abstract_openai_summary = summarize_abstract(paper_abstract)
abstract_openai_summary

' This paper proposes a new simple network architecture, the Transformer, based solely on attention mechanisms which has been applied to machine translation and English constituency parsing tasks. It has been found to be superior in quality while being more parallelizable and requiring significantly less time to train than other models.'

### With Hugging Face

In [64]:
text = paper_abstract
text = text.replace('.', '.<eos>')
text = text.replace('?', '?<eos>')
text = text.replace('!', '!<eos>')

In [65]:
summary_hf_abstract = summarizer(text, max_length=120, min_length=30, do_sample=False)


In [66]:
summary_hf_abstract[0]['summary_text']

" The paper entitle on 'Attention Is All You Need' The dominant sequence transduction models are based on complex recurrent or convolutional neural networks in an encoder-decoder configuration . We propose a new simple network architecture, the Transformer, based solely on attention mechanisms ."