In [None]:
pip install transformers torch accelerate llama-index arxiv pypdf


Collecting llama-index
  Downloading llama_index-0.12.25-py3-none-any.whl.metadata (12 kB)
Collecting arxiv
  Downloading arxiv-2.1.3-py3-none-any.whl.metadata (6.1 kB)
Collecting pypdf
  Downloading pypdf-5.4.0-py3-none-any.whl.metadata (7.3 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (

In [3]:
import os
import torch
import PyPDF2
from transformers import AutoModelForCausalLM, AutoTokenizer
from llama_index.core import (
    VectorStoreIndex,
    ServiceContext
)
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core.prompts import PromptTemplate

def extract_pdf_text(pdf_path):
    """
    Extract text from PDF using PyPDF2

    Args:
        pdf_path (str): Path to the PDF file

    Returns:
        str: Extracted text from the PDF
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)

        # Extract text from all pages
        full_text = []
        for page in reader.pages:
            full_text.append(page.extract_text())

        return "\n".join(full_text)

def summarize_pdf(pdf_path, model_name="facebook/opt-350m"):
    """
    Summarize PDF using open-source LLM

    Args:
        pdf_path (str): Path to the PDF file
        model_name (str): Hugging Face model to use

    Returns:
        dict: Comprehensive and section summaries
    """
    # Extract text from PDF
    pdf_text = extract_pdf_text(pdf_path)

    # Configure device
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map=device,
        torch_dtype=torch.float16 if device == "cuda" else torch.float32
    )

    # Configure LLamaIndex LLM
    llm = HuggingFaceLLM(
        model=model,
        tokenizer=tokenizer,
        context_window=2048,
        max_new_tokens=512,
        generate_kwargs={"temperature": 0.1, "do_sample": False}
    )

    # Comprehensive summary prompt
    comprehensive_prompt = PromptTemplate(
        "Provide a detailed academic-style summary of this research paper. "
        "Include key points about the research question, methodology, "
        "key findings, and potential implications:\n\n{context}\n\n"
        "Comprehensive Summary:"
    )

    # Section summary prompt
    section_prompt = PromptTemplate(
        "Summarize the following section of the research paper. "
        "Focus on the main ideas and critical information:\n\n{context}\n\n"
        "Section Summary:"
    )

    # Generate comprehensive summary
    comprehensive_prompt_filled = comprehensive_prompt.format(context=pdf_text[:4000])
    comprehensive_summary = llm.complete(comprehensive_prompt_filled)

    # Generate section summaries (split text into chunks)
    section_summaries = []
    text_chunks = [pdf_text[i:i+2000] for i in range(0, len(pdf_text), 2000)]
    for chunk in text_chunks[:3]:  # Limit to first 3 chunks
        section_prompt_filled = section_prompt.format(context=chunk)
        section_summary = llm.complete(section_prompt_filled)
        section_summaries.append(str(section_summary))

    return {
        "comprehensive_summary": str(comprehensive_summary),
        "section_summaries": section_summaries
    }

def main():
    pdf_path = "./sample_data/2312.00812.pdf"

    try:
        summaries = summarize_pdf(pdf_path)

        print("🔍 Comprehensive Summary:")
        print(summaries['comprehensive_summary'])

        print("\n📋 Section Summaries:")
        for idx, summary in enumerate(summaries['section_summaries'], 1):
            print(f"Section {idx}:\n{summary}\n{'='*50}\n")

    except Exception as e:
        print(f"Error during PDF summarization: {e}")

if __name__ == "__main__":
    main()



🔍 Comprehensive Summary:

The present study investigates the impact of the use of large language models (LLMs) in autonomous driving. The study is based on a simulation of a real-world autonomous vehicle (AV) system, which is equipped with a large number of LLMs. The study is based on a simulation of a real-world autonomous vehicle (AV) system, which is equipped with a large number of LLMs. The study is based on a simulation of a real-world autonomous vehicle (AV) system, which is equipped with a large number of LLMs. The study is based on a simulation of a real-world autonomous vehicle (AV) system, which is equipped with a large number of LLMs. The study is based on a simulation of a real-world autonomous vehicle (AV) system, which is equipped with a large number of LLMs. The study is based on a simulation of a real-world autonomous vehicle (AV) system, which is equipped with a large number of LLMs. The study is based on a simulation of a real-world autonomous vehicle (AV) system, whi

In [2]:
pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
