In [1]:
import os
import fitz
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Build the summarization pipeline once at module level; chunk_and_summarize
# reads this global instead of constructing its own model per call.
summarizer = pipeline(
    task="summarization",
    model="sshleifer/distilbart-cnn-12-6",
)

In [3]:
# Locate the input folder: a "doc" directory directly under the current
# working directory, so the result depends on where the kernel was started.
current_cwd = os.getcwd()
data_dir = os.path.join(current_cwd, "doc")
data_dir  # bare expression: echoes the resolved folder as the cell output

'/Users/tonmoy/Desktop/Research/paper-summarizer/src/doc'

In [4]:
# Name of the PDF to summarize; it is looked up inside data_dir.
document_name = "doc.pdf"
document_path = os.path.join(data_dir, document_name)
document_path  # bare expression: echoes the full PDF path as the cell output

'/Users/tonmoy/Desktop/Research/paper-summarizer/src/doc/doc.pdf'

In [5]:
def load_document_data(path: str) -> str:
    """Extract the text of every page of a document and return it as one string.

    Pages are read in order and their text is concatenated with no extra
    separator between pages.

    Args:
        path: Location of the document to open with ``fitz.open``.

    Returns:
        The concatenated text of all pages; an empty string when the
        document has no pages.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction fails part-way through.
    with fitz.open(path) as pdf_document:
        # Join once instead of growing a string with += on every page.
        return "".join(
            pdf_document.load_page(page_num).get_text()
            for page_num in range(pdf_document.page_count)
        )

In [6]:
def chunk_and_summarize(text: str, max_len: int, chunk_size: int = 500, min_len: int = 30) -> str:
    """Summarize ``text`` piece by piece and join the partial summaries.

    The text is cut into consecutive slices of ``chunk_size`` characters.
    Slices are taken purely by character count, so a slice may start or end
    in the middle of a word or sentence. Each non-blank slice is passed to
    the module-level ``summarizer`` and the resulting summaries are joined
    with single spaces.

    Args:
        text: The full text to summarize.
        max_len: Forwarded to the summarizer as ``max_length``.
        chunk_size: Number of characters per slice; must be positive.
        min_len: Forwarded to the summarizer as ``min_length``, capped at
            ``max_len``. Defaults to 30.

    Returns:
        The space-joined summaries, or an empty string when ``text`` is
        empty or contains only whitespace.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    # A negative step makes range() yield nothing, which would silently drop
    # the whole text; reject bad sizes up front instead.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    # The lower length bound must not exceed the upper one.
    length_floor = min(min_len, max_len)

    pieces = (text[start:start + chunk_size] for start in range(0, len(text), chunk_size))
    partial_summaries = [
        summarizer(piece, max_length=max_len, min_length=length_floor, do_sample=False)[0]["summary_text"]
        for piece in pieces
        if piece.strip()  # whitespace-only slices carry nothing to summarize
    ]
    return " ".join(partial_summaries)

In [7]:
# Extract first, then summarize, so the raw text stays available for
# inspection if the summary looks wrong.
document_text = load_document_data(document_path)
summarized_text = chunk_and_summarize(text=document_text, max_len=100)

# Write as UTF-8 explicitly: without an encoding argument open() uses the
# platform default, which may be unable to represent characters in the summary.
with open("summarized_text.txt", "w", encoding="utf-8") as f:
    f.write(summarized_text)