In [1]:
!pip install langchain_community transformers pypdf



In [2]:
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [3]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import BartForConditionalGeneration, BartTokenizer, pipeline

# Load saved model from Drive
model_path = "/content/drive/MyDrive/cnn-summ"
model = BartForConditionalGeneration.from_pretrained(model_path)
tokenizer = BartTokenizer.from_pretrained(model_path)
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)


Device set to use cuda:0


In [4]:
# Load and split PDF
def extract_pdf_chunks(file_path):
    loader = PyPDFLoader(file_path)
    pages = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = text_splitter.split_documents(pages)

    return [chunk.page_content for chunk in chunks]


In [14]:
# Summarize each chunk and combine
def summarize_large_pdf(file_path):
    chunks = extract_pdf_chunks(file_path)
    all_summaries = []

    for i, chunk in enumerate(chunks):
        try:
            input_length = len(tokenizer.encode(chunk))
            max_len = max(30, input_length // 3)

            summary = summarizer(chunk, max_length=max_len, min_length=15, do_sample=False)

            all_summaries.append(summary[0]['summary_text'])
        except Exception as e:
            print(f"Error summarizing chunk {i}: {e}")

    return " ".join(all_summaries)


In [15]:
# Example usage
pdf_path = "/content/drive/MyDrive/environment.pdf"
final_summary = summarize_large_pdf(pdf_path)
print("Final Summary:\n", final_summary)

Your max_length is set to 30, but your input_length is only 8. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=4)


Final Summary:
