In [None]:
%pip install langchain pypdf tiktoken python-dotenv streamlit rich langchain-community langchain-community langchain-ollama

In [58]:
%pip uninstall torch torchvision torchaudio -y
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128


Note: you may need to restart the kernel to use updated packages.




Looking in indexes: https://download.pytorch.org/whl/cu128
Collecting torch
  Downloading https://download.pytorch.org/whl/cu128/torch-2.9.0%2Bcu128-cp313-cp313-win_amd64.whl.metadata (29 kB)
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cu128/torchvision-0.24.0%2Bcu128-cp313-cp313-win_amd64.whl.metadata (6.1 kB)
Collecting torchaudio
  Downloading https://download.pytorch.org/whl/cu128/torchaudio-2.9.0%2Bcu128-cp313-cp313-win_amd64.whl.metadata (7.0 kB)
Downloading https://download.pytorch.org/whl/cu128/torch-2.9.0%2Bcu128-cp313-cp313-win_amd64.whl (2860.2 MB)
   ---------------------------------------- 0.0/2.9 GB ? eta -:--:--
   ---------------------------------------- 0.0/2.9 GB 8.6 MB/s eta 0:05:33
   ---------------------------------------- 0.0/2.9 GB 9.0 MB/s eta 0:05:18
   ---------------------------------------- 0.0/2.9 GB 9.3 MB/s eta 0:05:08
   ---------------------------------------- 0.0/2.9 GB 9.0 MB/s eta 0:05:17
   --------------------------------

In [1]:
import torch
print("Torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("GPU name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "None")

Torch version: 2.9.0+cu128
CUDA available: True
GPU name: NVIDIA GeForce RTX 4090 Laptop GPU


- Stuffing method: for document summarization, condense entire content of PDF into one comprehensive query that LLM can interpret and summarize
- Map-reduce method: for targeted document querying, disect document into manageable pieces and applying specific queries (mapping) to each segment

# Load PDFs 
- Using PDF document loader by LangChain

In [2]:
from langchain_community.document_loaders.pdf import PyPDFLoader

file_path = "papers\ml_model_cardio_disease_detection.pdf"
loader = PyPDFLoader(file_path)
docs = loader.load()

  file_path = "papers\ml_model_cardio_disease_detection.pdf"
  from .autonotebook import tqdm as notebook_tqdm


In [3]:
print(len(docs))
docs[-3:]

19


[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-01-24T17:06:41+08:00', 'author': 'Adedayo Ogunpola, Faisal Saeed, Shadi Basurra, Abdullah M. Albarrak and Sultan Noman Qasem', 'keywords': 'cardiovascular diseases; deep learning; disease detection; heart diseases; machine learning; ensemble learning; XGBoost', 'moddate': '2024-01-24T10:15:42+01:00', 'subject': 'Cardiovascular diseases present a significant global health challenge that emphasizes the critical need for developing accurate and more effective detection methods. Several studies have contributed valuable insights in this field, but it is still necessary to advance the predictive models and address the gaps in the existing detection approaches. For instance, some of the previous studies have not considered the challenge of imbalanced datasets, which can lead to biased predictions, especially when the datasets include minority classes. This study’s primary focus is the ea

In [5]:
from langchain_ollama import ChatOllama

# Example with Yi-1.5 model running locally via Ollama
llm = ChatOllama(
    model="qwen:32b",       # Or "mixtral:latest", "llama3", etc.
    temperature=0.1,
    device = "cuda"
)

In [None]:


from typing import TypedDict, List, Optional

class ResearchState(TypedDict, total=False):
    user_input: str
    pdf_path: Optional[str]
    documents: Optional[List[str]]      # raw extracted pages
    chunks: Optional[List[str]]         # token-split chunks
    summary: Optional[str]              # final output summary
    answer: Optional[str]               # answers to user questions
    web_results: Optional[str]          # scraped content


In [22]:
from tiktoken import get_encoding

enc = get_encoding("cl100k_base")
text = " ".join(page.page_content for page in docs[:-2]) # remove last 2 pages of references
num_tokens = len(enc.encode(text))
print(f"Estimated tokens in paper: {num_tokens}")

# Qwen-32B default context window = 32k tokens (~24k words)

Estimated tokens in paper: 13883


In [23]:
# PDF summarisation prompt

from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

paper_summary_prompt = PromptTemplate.from_template("""
You are an AI research assistant. Summarize the following research paper content with:

1. Title & Authors
2. Research Problem & Motivation
3. Dataset(s) Used (name, size, preprocessing)
4. Methodology / Model Architecture (algorithms, components)
5. Key Results & Metrics (include numeric values)
6. Conclusion & Future Work
7. Limitations (if mentioned)

Content:
{chunk}
""")

paper_summary_chain = paper_summary_prompt | llm | StrOutputParser()


In [24]:
from langchain_community.document_loaders import PyPDFLoader
from tiktoken import get_encoding

enc = get_encoding("cl100k_base")  # safe tokenizer

def load_and_chunk_pdf(state: ResearchState):
    loader = PyPDFLoader(state["pdf_path"])
    pages = loader.load()
    state["documents"] = [p.page_content for p in pages[:-2]]

    text = "\n".join(state["documents"])
    tokens = enc.encode(text)

    chunk_size = 1500
    overlap = 200

    chunks = []
    for i in range(0, len(tokens), chunk_size - overlap):
        chunk_text = enc.decode(tokens[i:i+chunk_size])
        chunks.append(chunk_text)

    state["chunks"] = chunks
    return state


In [25]:
def generate_pdf_summary(state: ResearchState):
    summaries = []
    for chunk in state["chunks"]:
        resp = paper_summary_chain.invoke({"chunk": chunk})
        summaries.append(resp)
    state["summary"] = "\n\n".join(summaries)
    return state


In [44]:
# Web Research Prompt

web_prompt = PromptTemplate.from_template("""
You are an AI research assistant gathering recent developments on: {topic}

1. Extract information only from credible sources (papers, arXiv, official docs)
2. Produce a structured summary:
   - Topic Overview
   - Recent advancements (last 12–24 months)
   - Key findings, models, benchmarks
   - Datasets / Tools used
   - Research gaps / open problems
3. List references (Author(s), Year, Title, URL)

If no reliable data found, say: "No verifiable information found."

Here is the raw web content you found:
{web_results}

Generate the final answer:
""")

web_chain = web_prompt | llm | StrOutputParser()

In [45]:
# %pip install -U ddgs

Note: you may need to restart the kernel to use updated packages.


In [46]:
from langchain_community.tools import DuckDuckGoSearchRun

search = DuckDuckGoSearchRun()

def web_search_and_summarize(state: ResearchState):
    query = state["user_input"]
    results = search.run(query)
    state["web_results"] = results

    state["summary"] = web_chain.invoke({
        "topic": query,
        "web_results": results
    })

    return state


In [47]:
def decide_next_node(state: ResearchState):
    if state.get("pdf_path"):
        return "load_pdf"
    else:
        return "web_search"

def route(state: ResearchState):
    return state


In [48]:
from langgraph.graph import StateGraph, END

graph = StateGraph(ResearchState)

graph.add_node("router", route)
graph.add_node("load_pdf", load_and_chunk_pdf)
graph.add_node("summarize_pdf", generate_pdf_summary)
graph.add_node("web_search", web_search_and_summarize)

graph.set_entry_point("router")  # dynamic routing
graph.add_conditional_edges(
    "router", 
    decide_next_node, 
    {
        "load_pdf": "load_pdf",
        "web_search": "web_search"
    })

graph.add_edge("load_pdf", "summarize_pdf")
graph.add_edge("summarize_pdf", END)

graph.add_edge("web_search", END)

compiled_graph = graph.compile()


In [49]:
# For PDF
result = compiled_graph.invoke({
    "user_input": "Summarize the paper",
    "pdf_path": "papers\ml_model_cardio_disease_detection.pdf"
})
print(result["summary"])

# For Web
result = compiled_graph.invoke({
    "user_input": "Recent SOTA models for medical image segmentation"
})
print(result["summary"])


  "pdf_path": "papers\ml_model_cardio_disease_detection.pdf"


No verifiable information found on recent advancements in summarizing long texts or research papers within the last 12-24 months from credible sources like academic papers, arXiv, or official documentation. The web content provided mostly consists of promotional materials for various tools (Scribbr's free summarizer, Paperpal Write, ChatGPT, Noiz, and scienceOS) that offer AI-powered text summarization services. These tools claim to help users summarize documents, articles, or research papers quickly and efficiently by analyzing the content and extracting key information. However, they do not provide in-depth technical details, benchmarks, datasets, or research gaps related to the underlying AI algorithms or methodologies used for summarization.


KeyboardInterrupt: 

In [None]:
from tiktoken import get_encoding

enc = get_encoding("cl100k_base")
text = " ".join(page.page_content for page in docs[:-2]) # remove last 2 pages of references
num_tokens = len(enc.encode(text))
print(f"Estimated tokens in paper: {num_tokens}")

# Qwen-32B default context window = 32k tokens (~24k words)

Estimated tokens in paper: 13883


# testing qwen 3.2b for PDF summary portion

In [1]:
import time
from langchain_community.document_loaders import PyPDFLoader
from langchain_ollama import ChatOllama
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from tiktoken import get_encoding
import torch

# ===========================
# 1. GPU Setup
# ===========================
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)
if device == "cuda":
    print("GPU name:", torch.cuda.get_device_name(0))

# ===========================
# 2. Load local LLM on GPU
# ===========================
print("\n🔄 Loading local LLM model...")
llm = ChatOllama(
    model="qwen:32b",   # or "yi:1.5", "mixtral:latest"
    temperature=0.1,
    device=device
)
print("✅ LLM loaded.\n")

# ===========================
# 3. Define chunk summarization prompt
# ===========================
chunk_prompt = PromptTemplate.from_template("""
You are an AI research assistant. Summarize the following research paper content with:

1. Paper Info – Title, Authors, Year  
2. Problem Statement  
3. Dataset(s) Used  
4. Model / Methodology  
5. Training setup (hyperparameters, hardware)  
6. Results & Metrics (Accuracy, F1, AUC, etc.)  
7. Key Findings  
8. Limitations & Future Work

Paper Content:
{chunk}

Provide a concise, structured summary for this chunk.
""")

chunk_chain = chunk_prompt | llm | StrOutputParser()

# ===========================
# 4. Load PDF and chunk it
# ===========================
def load_and_chunk_pdf(pdf_path, chunk_size=1500, overlap=200):
    print(f"📄 Loading PDF from: {pdf_path}")
    loader = PyPDFLoader(pdf_path)
    pages = loader.load()
    print(f"✅ PDF loaded. Total pages: {len(pages)}")
    
    full_text = "\n".join([p.page_content for p in pages])
    print(f"📝 Total characters in document: {len(full_text)}")

    enc = get_encoding("cl100k_base")
    tokens = enc.encode(full_text)
    print(f"🔢 Total tokens in document: {len(tokens)}")

    chunks = []
    for i in range(0, len(tokens), chunk_size - overlap):
        chunk_text = enc.decode(tokens[i:i + chunk_size])
        chunks.append(chunk_text)

    print(f"✅ Document split into {len(chunks)} chunks (chunk_size={chunk_size}, overlap={overlap})\n")
    return chunks

# ===========================
# 5. Summarize chunks
# ===========================
def summarize_chunks(chunks):
    chunk_summaries = []
    for i, chunk in enumerate(chunks):
        print(f"⚙️  Summarizing chunk {i+1}/{len(chunks)}...")
        start_time = time.time()
        summary = chunk_chain.invoke({"chunk": chunk})
        end_time = time.time()
        print(f"✅ Chunk {i+1} summarized in {end_time - start_time:.2f}s")
        print(f"🔍 Preview:\n{summary[:300]}...\n")
        chunk_summaries.append(summary)
    return chunk_summaries

# ===========================
# 6. Combine summaries into final structured summary
# ===========================
final_prompt = PromptTemplate.from_template("""
You are an expert AI research assistant.

You received partial summaries from different chunks of a research paper.
Combine them into a **single coherent summary** including:

- Title, Authors, Year
- Research Problem
- Dataset(s)
- Methodology / Model Used
- Training Setup (hyperparameters, compute)
- Results & Metrics
- Key Contributions
- Limitations
- Future Work

Partial Summaries:
{chunk_summaries}

Write the final structured summary in continuous, well-organized form. Do not separate by chunk.
""")

final_chain = final_prompt | llm | StrOutputParser()

def generate_final_summary(chunk_summaries):
    print("🔄 Generating final combined summary from all chunks...")
    start_time = time.time()
    final_summary = final_chain.invoke({"chunk_summaries": "\n\n".join(chunk_summaries)})
    end_time = time.time()
    print(f"✅ Final summary generated in {end_time - start_time:.2f}s\n")
    return final_summary

# ===========================
# 7. Full PDF summarization workflow
# ===========================
def summarize_pdf(pdf_path):
    chunks = load_and_chunk_pdf(pdf_path)
    chunk_summaries = summarize_chunks(chunks)
    final_summary = generate_final_summary(chunk_summaries)
    return final_summary

# ===========================
# 8. Run test
# ===========================

pdf_path = "papers/ml_model_cardio_disease_detection.pdf"
final_summary = summarize_pdf(pdf_path)

print("📌 FINAL SUMMARY (first 1000 chars):\n")
print(final_summary[:1000] + "...")


  from .autonotebook import tqdm as notebook_tqdm


Device: cuda
GPU name: NVIDIA GeForce RTX 4090 Laptop GPU

🔄 Loading local LLM model...
✅ LLM loaded.

📄 Loading PDF from: papers/ml_model_cardio_disease_detection.pdf
✅ PDF loaded. Total pages: 19
📝 Total characters in document: 71347
🔢 Total tokens in document: 16223
✅ Document split into 13 chunks (chunk_size=1500, overlap=200)

⚙️  Summarizing chunk 1/13...
✅ Chunk 1 summarized in 194.01s
🔍 Preview:
1. Paper Info: Title - "Machine Learning-Based Predictive Models for Detection of Cardiovascular Diseases", Authors - Adedayo Ogunpola, Faisal Saeed, Shadi Basurra, Abdullah M. Albarrak, Sultan Noman Qasem, Year - 2024
2. Problem Statement: The study addresses the challenge of accurately detecting c...

⚙️  Summarizing chunk 2/13...
✅ Chunk 2 summarized in 154.56s
🔍 Preview:
1. Paper Info: Title - "Comparative Analysis of Machine Learning Techniques for Heart Disease Detection"; Authors not provided; Year - 2024
2. Problem Statement: The paper addresses the complexity and variability in