In [2]:
import re
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.prompts import PromptTemplate
from langchain_ollama import ChatOllama
from tqdm import tqdm

# -----------------------------
# 🧠 Step 1: Setup Model
# -----------------------------
llm = ChatOllama(model="qwen2.5:7b", temperature=0.2)

# -----------------------------
# 📄 Step 2: Load PDF
# -----------------------------
pdf_path = "papers/ml_model_cardio_disease_detection.pdf"
print(f"[INFO] Loading PDF: {pdf_path}")
loader = PyPDFLoader(pdf_path)
docs = loader.load()
full_text = "\n".join([d.page_content for d in docs])
print(f"[INFO] Loaded {len(docs)} pages ({len(full_text)} characters)")

# -----------------------------
# 🧩 Step 3: Detect Sections
# -----------------------------

SECTION_PATTERNS = [
    r"abstract",
    r"introduction",
    r"related work",
    r"background",
    r"methods?",
    r"methodology",
    r"experiments?",
    r"results?",
    r"discussion",
    r"conclusion",
]

def split_by_sections(text):
    # Apply the case-insensitive flag globally at the start
    pattern = r"(?i)\b(" + "|".join(SECTION_PATTERNS) + r")\b"
    
    matches = list(re.finditer(pattern, text))
    sections = []

    if not matches:
        sections.append({"section_name": "Full Paper", "content": text})
        return sections

    for i, match in enumerate(matches):
        start = match.end()
        end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        section_name = match.group(1).strip().title()
        sections.append({
            "section_name": section_name,
            "content": text[start:end].strip()
        })

    return sections

sections = split_by_sections(full_text)
print(f"[INFO] Detected {len(sections)} section(s): {[s['section_name'] for s in sections]}")

# # -----------------------------
# # 🧠 Step 4: Define Summarization Prompt
# # -----------------------------
# section_prompt = PromptTemplate.from_template("""
# You are an AI research assistant.
# Summarize the following section of a scientific paper.

# Focus on:
# - Key ideas and objectives
# - Data and methods used
# - Metrics and results
# - Limitations if any

# Section Name: {section_name}
# Section Text:
# {context}

# Output format:
# ## {section_name}
# <concise academic summary>
# """)

# # -----------------------------
# # 🧮 Step 5: Summarize Each Section
# # -----------------------------
# summaries = []
# for sec in tqdm(sections, desc="Summarizing sections"):
#     prompt_filled = section_prompt.format(
#         section_name=sec["section_name"],
#         context=sec["content"][:6000]  # truncate to stay within context limits
#     )
#     summary = llm.invoke(prompt_filled)
#     summaries.append(f"## {sec['section_name']}\n{summary.strip()}\n")
#     print(f"\n[DEBUG] Finished summarizing section: {sec['section_name']}")

# # -----------------------------
# # 🧩 Step 6: Combine into Final Structured Summary
# # -----------------------------
# final_summary = "# Research Paper Summary\n\n" + "\n".join(summaries)

# # Save output
# output_path = "outputs/structured_summary.md"
# import os
# os.makedirs("outputs", exist_ok=True)
# with open(output_path, "w", encoding="utf-8") as f:
#     f.write(final_summary)

# print(f"\n✅ Structured summary saved to: {output_path}")
# print("\n--- Preview ---")
# print("\n".join(final_summary.splitlines()[:30]))


[INFO] Loading PDF: papers/ml_model_cardio_disease_detection.pdf
[INFO] Loaded 19 pages (71347 characters)
[INFO] Detected 60 section(s): ['Abstract', 'Methods', 'Results', 'Introduction', 'Methods', 'Methods', 'Background', 'Methods', 'Result', 'Results', 'Method', 'Results', 'Results', 'Methods', 'Methods', 'Method', 'Results', 'Background', 'Methods', 'Methods', 'Results', 'Methods', 'Methods', 'Methods', 'Experiments', 'Methods', 'Methods', 'Method', 'Experiments', 'Methods', 'Methods', 'Method', 'Method', 'Method', 'Result', 'Results', 'Methods', 'Conclusion', 'Results', 'Results', 'Results', 'Results', 'Results', 'Results', 'Results', 'Results', 'Results', 'Results', 'Results', 'Results', 'Results', 'Results', 'Results', 'Results', 'Discussion', 'Results', 'Methodology', 'Methods', 'Methods', 'Methods']


# trial 2 
RAG FOR STRUCTURED SECTION SUMMARIES 
- ingest pDF into a vector database for easy section text retrieval 


In [15]:
from langchain_community.document_loaders import PyPDFLoader
from tiktoken import get_encoding

def load_and_chunk_pdf(pdf_path, chunk_size=2500, overlap=200):
    loader = PyPDFLoader(pdf_path)
    pages = loader.load()
    full_text = "\n".join([p.page_content for p in pages])
    
    enc = get_encoding("cl100k_base")
    tokens = enc.encode(full_text)

    chunks = []
    for i in range(0, len(tokens), chunk_size - overlap):
        chunk_text = enc.decode(tokens[i:i + chunk_size])
        chunks.append(chunk_text)

    return chunks

chunks = load_and_chunk_pdf("papers\ml_model_cardio_disease_detection.pdf")


  chunks = load_and_chunk_pdf("papers\ml_model_cardio_disease_detection.pdf")


In [None]:
len(chunks) # 8 chunks of chunk size 2500

8

In [18]:
chunks[5]

' implies that when the model indicates an individual as having heart \ndisease, the likelihood of accuracy is notably high, signifying a significant advancement \nin the landscape of medical diagnostics. Future directions for this study cou ld involve \nexpanding the scope by incorporating more extensive medical imaging datasets. Leverag-\ning such data could enhance image-based heart disease prediction, potentially leading to \neven more accurate and robust diagnostic tools in the field of cardiovascular health. Fur-\nthermore, exploring ensemble models that merge the strengths of multiple algorithms \nmay offer promising avenues for further improving predictive accuracy in the field of \nheart disease prediction. These considerations shed light on the mul tifaceted nature of \nFigure 5. Accuracy of machine learning models on both datasets.\nAcross both datasets, these models consistently demonstrate exceptional performance,\nemphasizing their efficacy in heart disease prediction. No

In [20]:
%pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-5.1.2-py3-none-any.whl.metadata (16 kB)
Collecting scikit-learn (from sentence-transformers)
  Using cached scikit_learn-1.7.2-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting scipy (from sentence-transformers)
  Using cached scipy-1.16.2-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn->sentence-transformers)
  Using cached joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn->sentence-transformers)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading sentence_transformers-5.1.2-py3-none-any.whl (488 kB)
Using cached scikit_learn-1.7.2-cp313-cp313-win_amd64.whl (8.7 MB)
Using cached joblib-1.5.2-py3-none-any.whl (308 kB)
Using cached scipy-1.16.2-cp313-cp313-win_amd64.whl (38.5 MB)
Using cached threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, joblib, sci


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [28]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load embedding model (local, free)
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

def embed_chunks(chunks):
    embeddings = embed_model.encode(chunks, convert_to_numpy=True)
    return embeddings


embeddings = embed_chunks(chunks)

In [29]:
# %pip install faiss-cpu
import faiss

def create_faiss_index(embeddings):
    dimension = embeddings.shape[1] # Get dimensionality of embedding
    index = faiss.IndexFlatL2(dimension) # Create index that uses Euclidean distance 
    index.add(embeddings) # Add precomputed embeddings (one per PDF chunk) into index
    return index 

index = create_faiss_index(embeddings)


In [30]:
section_queries = {
    "Problem Statement": "What research problem does this paper address?",
    "Dataset": "Which datasets are used in the experiments?",
    "Methodology": "Describe the model or methods used.",
    "Evaluation": "List the metrics and results of the models tested.",
    "Limitations": "What are the limitations mentioned?",
    "Future Work": "What future work is proposed?"
}


In [None]:
def retrieve_chunks(query, index, chunks, top_k=3):
    query_emb = embed_model.encode([query], convert_to_numpy=True)
    distances, indices = index.search(query_emb, top_k) # gives top 3 most similar chunks (lowest L2 distance)
    print(f"Retrieved indices: {indices}")
    retrieved_texts = [chunks[i] for i in indices[0]]
    return retrieved_texts




In [32]:
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_ollama import ChatOllama
from tqdm import tqdm

# -----------------------------
# 🧠 Step 1: Setup Model
# -----------------------------
llm = ChatOllama(model="qwen:32b", temperature=0.2)


llm_section_prompt = PromptTemplate.from_template("""
You are an AI research assistant.

From the following text, generate a concise, structured summary for the section: {section_name}

Text:
{context}

Write in clear academic style.
""")

llm_section_chain = llm_section_prompt | llm | StrOutputParser()

def summarize_section(section_name, retrieved_texts):
    context = "\n\n".join(retrieved_texts)
    summary = llm_section_chain.invoke({
        "section_name": section_name,
        "context": context
    })
    return summary


In [33]:
final_summary = {}
for section, query in section_queries.items():
    print(f"Section: {section}")
    print(f"Query: {query}")
    retrieved = retrieve_chunks(query, index, chunks, top_k=3)
    final_summary[section] = summarize_section(section, retrieved)
    break

# Optional: Combine into a single formatted string
structured_summary = "\n\n".join([f"## {s}\n{final_summary[s]}" for s in final_summary])
print(structured_summary)


Section: Problem Statement
Query: What research problem does this paper address?
Retrieved indices: [[7 6 5]]
## Problem Statement
The World Health Organization (WHO) highlights cardiovascular diseases as a significant global health concern, emphasizing the need for innovative approaches to detection and management. Artificial intelligence (AI), particularly machine learning, has emerged as a powerful tool in this domain, enabling more accurate and efficient diagnosis.

Recent studies have demonstrated the potential of AI in detecting and interpreting heart disease through various methods, such as conditional variational auto-encoders, stacked ensemble-learning frameworks, and intelligent medical cyber-physical systems. These advancements can support early identification and monitoring of conditions like heart valve diseases, improving patient outcomes.

Machine learning algorithms have also proven effective in chronic heart failure diagnosis, leveraging large datasets to identify patt

# Hybrid GPT+style summarization pipeline 
Model : Qwen:32B (ollama) model
Embeddings: Ollama embeddings
Vector database: FAISS

**What it does (high level)**

- Load a PDF and chunk it (page-aware, token-based)

- Create Ollama embeddings for each chunk and store them in FAISS

- For each target section (Problem, Dataset, Methodology, Evaluation, Limitations, Future work), it:
    - Runs a semantic retrieval (top-k)
    - Optionally reranks retrieved chunks using the LLM for higher precision
    - Summarizes the retrieved context with a section-specific prompt

- Combines all section outputs into a single polished final summary (one more LLM pass)

- Logs pages/chunk indices and provides short previews for debugging

In [None]:
%pip install ollama langchain-community langchain faiss-cpu tiktoken pypdf
# or faiss-gpu if you've GPU and compatible build

In [5]:
# Hybrid RAG + Sectioned Summarizer for research papers
# Works with: Ollama (Qwen-32B), Ollama embeddings, FAISS
# Author: ChatGPT (adapted for your environment)

import os
import time
import numpy as np
import faiss
import torch
from tiktoken import get_encoding

# langchain / ollama imports
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.chat_models import ChatOllama
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

# --------------------------
# Configuration
# --------------------------
PDF_PATH = "papers/hospital_bed_capacity_planning.pdf"  # change path
OLLAMA_BASE_URL = "http://localhost:11434"
EMBEDDING_MODEL_NAME = "nomic-embed-text"  # adjust if different on your Ollama
LLM_MODEL_NAME = "llama3.1:latest"           # your Qwen model in Ollama
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Chunking / retrieval hyperparams tuned for Qwen-32B
CHUNK_SIZE = 512      # tokens per chunk (reduced for embedding model compatibility)
CHUNK_OVERLAP = 50
TOP_K = 4             # number of chunks to retrieve for each section
RERANK_WITH_LLM = True  # set True to use LLM-based reranker (more precise, more compute)

print("DEVICE:", DEVICE)

# --------------------------
# Initialize Ollama clients (LangChain wrappers)
# --------------------------
print("Initializing Ollama embeddings and LLM wrappers...")
embedding_model = OllamaEmbeddings(model=EMBEDDING_MODEL_NAME, base_url=OLLAMA_BASE_URL)
llm = ChatOllama(model=LLM_MODEL_NAME, temperature=0.05, base_url=OLLAMA_BASE_URL, device=DEVICE)
print("Done.\n")

# --------------------------
# Utility: Load & chunk PDF (page-aware)
# --------------------------
def load_and_chunk_pdf(pdf_path, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    print(f"Loading PDF: {pdf_path}")
    loader = PyPDFLoader(pdf_path)
    pages = loader.load()  # list of Page objects with .page_content
    page_texts = [p.page_content for p in pages[:-3]]
    print(f"Total pages loaded: {len(page_texts)}")

    # Join with page separators to keep page breaks
    full_text_with_pages = "\n\n[PAGEREF]\n\n".join(page_texts)

    enc = get_encoding("cl100k_base")
    tokens = enc.encode(full_text_with_pages)
    print(f"Total tokens in PDF (approx): {len(tokens)}")

    # create token-wise chunks, but also record which page ranges each chunk covers
    chunks = []
    token_to_page = []  # map every token index to page index for biasing
    # build token->page mapping by encoding each page separately
    page_token_counts = [len(enc.encode(t)) for t in page_texts]
    page_start = 0
    page_token_starts = []
    for cnt in page_token_counts:
        page_token_starts.append(page_start)
        page_start += cnt

    # make chunks using token slices of the full_text (encode/decode)
    for i in range(0, len(tokens), chunk_size - overlap):
        t_slice = tokens[i:i + chunk_size]
        chunk_text = enc.decode(t_slice)
        # estimate page span by finding nearest page start
        # find first token index's page
        start_page = None
        end_page = None
        # find which page contains token index i and i+len-1 approximate
        # naive approach: compare against page_token_starts cumulative
        for pi, start_idx in enumerate(page_token_starts):
            if i >= start_idx:
                start_page = pi
            else:
                break
        # end page
        j = i + len(t_slice) - 1
        end_page = start_page
        for pi, start_idx in enumerate(page_token_starts[start_page:], start=start_page):
            if j >= start_idx:
                end_page = pi
            else:
                break

        chunks.append({
            "text": chunk_text,
            "start_token": i,
            "token_len": len(t_slice),
            "start_page": start_page,
            "end_page": end_page
        })

    print(f"Created {len(chunks)} chunks (chunk_size={chunk_size}, overlap={overlap})")
    return chunks

# --------------------------
# Build FAISS index using Ollama embeddings
# --------------------------
def build_faiss_index(chunks, embedding_model):
    print("Embedding chunks with Ollama embeddings (this may take a while)...")
    vectors = []
    for i, c in enumerate(chunks):
        emb = embedding_model.embed_query(c["text"])
        # ensure vector is a list of floats
        vec = np.array(emb, dtype=np.float32)
        vectors.append(vec)
        if (i + 1) % 20 == 0 or (i + 1) == len(chunks):
            print(f"  ▸ Embedded {i+1}/{len(chunks)} chunks")

    embeddings_array = np.vstack(vectors).astype("float32")
    d = embeddings_array.shape[1]
    print(f"Embedding dimension: {d}")

    # create FAISS index (simple FlatL2 index - replace with IndexIVFFlat for large corpora)
    index = faiss.IndexFlatL2(d)
    index.add(embeddings_array)
    print(f"FAISS index built. Total vectors in index: {index.ntotal}")
    return index, embeddings_array

# --------------------------
# Retrieval with optional position bias & reranking
# --------------------------
def retrieve_chunks_for_query(query, index, chunks, embedding_model, top_k=TOP_K, position_bias=True):
    q_emb = np.array([embedding_model.embed_query(query)], dtype="float32")
    # search larger set then filter/rerank if needed
    search_k = max(top_k * 3, top_k + 5)
    distances, indices = index.search(q_emb, search_k)
    indices = indices[0].tolist()
    distances = distances[0].tolist()

    # Build candidates with metadata
    candidates = []
    for idx, dist in zip(indices, distances):
        if idx < 0 or idx >= len(chunks):
            continue
        c = chunks[idx]
        candidates.append({
            "idx": idx,
            "dist": float(dist),
            "start_page": c["start_page"],
            "end_page": c["end_page"],
            "text": c["text"]
        })

    # apply position bias: prefer earlier pages for queries like "problem statement" or "abstract"
    if position_bias:
        def score_with_bias(item):
            # bias factor: earlier pages get lower score (better)
            page_bias = (item["start_page"] or 0) * 0.03
            return item["dist"] + page_bias
        candidates = sorted(candidates, key=score_with_bias)
    else:
        candidates = sorted(candidates, key=lambda x: x["dist"])

    # trim to top_k final
    final = candidates[:top_k]
    # debug info
    print("Retrieved (idx, pages):", [(c["idx"], (c["start_page"], c["end_page"])) for c in final])
    return final

# Optional LLM-based reranker to improve precision (reranks the candidate texts by asking LLM a short question)
def rerank_with_llm(query, candidates, llm, rerank_k=TOP_K):
    # prepare a short prompt asking the LLM to score relevance (1-10)
    prompt_template = """You are a helpful research assistant. Given the question: "{query}"
For each candidate passage, give a relevance score between 1 (irrelevant) and 10 (directly answers the question).
Return lines in the format: score<TAB>passage_index

Question:
{query}

Candidates:
{listings}

Provide only the scored lines.
"""
    listings = "\n\n".join([f"[{i}] {c['text'][:400].replace('\\n',' ')}" for i, c in enumerate(candidates)])
    prompt = prompt_template.format(query=query, listings=listings)
    # call LLM to score (small response)
    response = llm.invoke(prompt)
    text = response if isinstance(response, str) else (response.content if hasattr(response, "content") else str(response))
    # parse lines
    scores = []
    for line in text.splitlines():
        line = line.strip()
        if not line:
            continue
        parts = line.split()
        # naive parse: first token is score, last token contains [i]
        try:
            score = float(parts[0])
            # find passage index between brackets
            idx_token = [p for p in parts if p.startswith("[") and p.endswith("]")]
            if idx_token:
                pid = int(idx_token[0].strip("[]"))
                scores.append((pid, score))
        except Exception:
            continue
    # map back to candidates and sort by score desc
    scored = []
    for pid, sc in scores:
        if 0 <= pid < len(candidates):
            scored.append((candidates[pid], sc))
    if not scored:
        # fallback: return original candidates
        return candidates[:rerank_k]
    scored_sorted = sorted(scored, key=lambda x: -x[1])
    reranked = [item for item, s in scored_sorted][:rerank_k]
    # return reranked list
    return reranked

# --------------------------
# Section prompts & summarization
# --------------------------
SECTION_QUERIES = {
    "Title & Authors": "What is the title and who are the authors of the paper? Provide year if present.",
    "Problem Statement": "What research problem or objective does this paper address? Summarize briefly (1-2 sentences).",
    "Dataset": "Which datasets were used in the experiments? Provide names, sizes, sources if available.",
    "Methodology": "Describe the model architecture or methods proposed in the paper. Include algorithm names or key components.",
    "Evaluation & Metrics": "What evaluation metrics and experimental results are reported? Provide numeric values when present.",
    "Limitations": "What limitations or weaknesses do the authors mention?",
    "Future Work": "What future work or extensions do the authors propose?"
}

SECTION_PROMPT_TEMPLATE = """
You are an expert AI research assistant. Create a concise academic-style summary for the section: "{section_name}".

Context (retrieved passages):
{context}

Instructions:
- Extract facts from the context only (do not hallucinate).
- If you cannot find explicit info, say "Not stated in retrieved passages".
- Be concise and include numeric metrics where possible.
"""

section_prompt = PromptTemplate.from_template(SECTION_PROMPT_TEMPLATE)
section_chain = section_prompt | llm | StrOutputParser()

def summarize_section(section_name, query, index, chunks, embedding_model, llm, top_k=TOP_K, rerank=RERANK_WITH_LLM):
    print(f"\n--- Section: {section_name} ---")
    candidates = retrieve_chunks_for_query(query, index, chunks, embedding_model, top_k=top_k, position_bias=True)
    # prepare contexts
    if rerank:
        reranked = rerank_with_llm(query, candidates, llm, rerank_k=top_k)
        candidates = reranked
        print("After rerank (idx, pages):", [(c["idx"], (c["start_page"], c["end_page"])) for c in candidates])
    context = "\n\n".join([c["text"] for c in candidates])
    # debug: show snippet of retrieved context
    for c in candidates:
        print(f"  -> idx {c['idx']} pages {c['start_page']}-{c['end_page']} (preview): {c['text'][:200]!r}")
    # call LLM summarizer for this section
    out = section_chain.invoke({"section_name": section_name, "context": context})
    print(f"Section summary preview: {out[:300]}...\n")
    return out

# --------------------------
# Combine section summaries & final polish
# --------------------------
FINAL_COMBINE_PROMPT = """
You are an expert AI research assistant. Combine the following section-level summaries into one coherent and well-structured paper summary.
Keep the headings and produce a short global conclusion (2-3 sentences) at the end.

Section summaries:
{sections_text}

Combine and output as a clean, academic-style summary.
"""

final_prompt = PromptTemplate.from_template(FINAL_COMBINE_PROMPT)
final_chain = final_prompt | llm | StrOutputParser()

def combine_and_refine(section_summaries: dict):
    sections_text = "\n\n".join([f"## {k}\n{v}" for k, v in section_summaries.items()])
    combined = final_chain.invoke({"sections_text": sections_text})
    return combined

# --------------------------
# Full pipeline runner
# --------------------------
def run_full_pipeline(pdf_path):
    # 1) Load & chunk
    chunks = load_and_chunk_pdf(pdf_path)
    # 2) Build embeddings index
    index, embeddings_array = build_faiss_index(chunks, embedding_model)
    # 3) For each section: retrieve, rerank, summarize
    section_summaries = {}
    for sname, sq in SECTION_QUERIES.items():
        summary = summarize_section(sname, sq, index, chunks, embedding_model, llm, top_k=TOP_K, rerank=RERANK_WITH_LLM)
        section_summaries[sname] = summary

    # 4) Combine & final polish
    final_summary = combine_and_refine(section_summaries)
    return {
        "sections": section_summaries,
        "final_summary": final_summary,
        "chunks": chunks,
        "index": index
    }

# --------------------------
# Example usage
# --------------------------

start = time.time()
out = run_full_pipeline(PDF_PATH)
elapsed = time.time() - start
print("\n\n==== FINAL SUMMARY (first 1500 chars) ====\n")
print(out["final_summary"][:1500] + "...\n")
print(f"\nPipeline completed in {elapsed:.2f} seconds.")


DEVICE: cuda
Initializing Ollama embeddings and LLM wrappers...
Done.

Loading PDF: papers/hospital_bed_capacity_planning.pdf
Total pages loaded: 8
Total tokens in PDF (approx): 7411
Created 17 chunks (chunk_size=512, overlap=50)
Embedding chunks with Ollama embeddings (this may take a while)...
  ▸ Embedded 17/17 chunks
Embedding dimension: 768
FAISS index built. Total vectors in index: 17

--- Section: Title & Authors ---
Retrieved (idx, pages): [(9, (3, 3)), (4, (1, 1)), (10, (3, 5)), (11, (5, 5))]
After rerank (idx, pages): [(11, (5, 5)), (9, (3, 3))]
  -> idx 11 pages 5-5 (preview): 'is et al. (2022) [62] ✓\nVieira et al. (2023) [63] ✓\nAlvarez-Chaves et al. (2023) [64] ✓ ✓\nThis paper ✓ ✓ ✓\nTable 4\nThe considered periods for testing the significance of NHP fluctuations.\nNo Period Ab'
  -> idx 9 pages 3-3 (preview): ' ✓ ✓ ✓\nNas and Koyuncu (2019) ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✓\nLaghmati et al. (2020) [41] ✓ ✓ ✓ ✓\nFathi et al. (2020) [42] ✓\nManiruzzaman et al (2020) [43] ✓ ✓ ✓\nRahman et al

In [6]:
out["final_summary"]

"**Title:** Predicting Hospital Bed Capacity: A Machine Learning Approach to Forecasting Length of Stay and Number of Hospitalized Patients\n\n**Abstract:**\n\nThis study presents a comprehensive framework for predicting hospital bed capacity using machine learning (ML) and deep learning (DL) algorithms. The proposed approach aims to forecast the number of hospitalized patients (NHP) and length of stay (LOS) in the Heart ward, considering factors such as patient demographics, LOS classification, and NHP distribution. The study employs a hybrid approach combining data analysis, ML, DL, statistical inference, and mathematical modeling to improve hospital bed capacity forecasting.\n\n**Introduction:**\n\nHospital bed capacity planning is a critical challenge in healthcare systems worldwide. Accurate forecasting of NHP and LOS is essential for ensuring adequate bed capacity and reducing the risk of overcrowding. However, existing studies have limitations, including limited application of M

In [7]:
pdf_path = "papers/hospital_bed_capacity_planning.pdf"

loader = PyPDFLoader(pdf_path)
pages = loader.load()  # list of Page objects with .page_content
page_texts = [p.page_content for p in pages[:-3]]
print(f"Total pages loaded: {len(page_texts)}")

Total pages loaded: 8


In [8]:
pages[0]

Document(metadata={'producer': 'pdfTeX', 'creator': 'Elsevier', 'creationdate': '2023-12-04T09:37:14+05:30', 'gts_pdfa1version': 'PDF/A-1b:2005', 'moddate': '2023-12-04T09:37:14+05:30', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'subject': 'Healthcare Analytics, 4 (2023) 100245. doi:10.1016/j.health.2023.100245', 'trapped': '/False', 'source': 'papers/hospital_bed_capacity_planning.pdf', 'total_pages': 11, 'page': 0, 'page_label': '1'}, page_content='Healthcare Analytics 4 (2023) 100245\nContents lists available at ScienceDirect\nHealthcare Analytics\njournal homepage: www.elsevier.com/locate/health\nA forecasting approach for hospital bed capacity planning using machine\nlearning and deep learning with application to public hospitals\nYounes Mahmoudian, Arash Nemati∗, Abdul Sattar Safaei\nDepartment of Industrial Engineering, Babol Noshirvani University of Technology, Babol, Iran\nA R T I C L E I N F O\nKeywords:\nMachin

In [9]:
pdf_path = "papers/ml_model_cardio_disease_detection.pdf"

loader = PyPDFLoader(pdf_path)
pages = loader.load()  # list of Page objects with .page_content
page_texts = [p.page_content for p in pages[:-3]]
print(f"Total pages loaded: {len(page_texts)}")

Total pages loaded: 16


In [10]:
pages[0]


Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-01-24T17:06:41+08:00', 'author': 'Adedayo Ogunpola, Faisal Saeed, Shadi Basurra, Abdullah M. Albarrak and Sultan Noman Qasem', 'keywords': 'cardiovascular diseases; deep learning; disease detection; heart diseases; machine learning; ensemble learning; XGBoost', 'moddate': '2024-01-24T10:15:42+01:00', 'subject': 'Cardiovascular diseases present a significant global health challenge that emphasizes the critical need for developing accurate and more effective detection methods. Several studies have contributed valuable insights in this field, but it is still necessary to advance the predictive models and address the gaps in the existing detection approaches. For instance, some of the previous studies have not considered the challenge of imbalanced datasets, which can lead to biased predictions, especially when the datasets include minority classes. This study’s primary focus is the ear

# GROBID for hospital bed pdf

In [12]:
import requests

pdf_path = "papers/hospital_bed_capacity_planning.pdf"
url = "http://localhost:8070/api/processFulltextDocument"

with open(pdf_path, 'rb') as pdf_file:
    response = requests.post(url, files={'input': pdf_file})
    
xml_output = response.text
print(xml_output)

<?xml version="1.0" encoding="UTF-8"?>
<TEI xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd"
 xmlns:xlink="http://www.w3.org/1999/xlink">
	<teiHeader xml:lang="en">
		<fileDesc>
			<titleStmt>
				<title level="a" type="main">A forecasting approach for hospital bed capacity planning using machine learning and deep learning with application to public hospitals</title>
			</titleStmt>
			<publicationStmt>
				<publisher/>
				<availability status="unknown"><licence/></availability>
			</publicationStmt>
			<sourceDesc>
				<biblStruct>
					<analytic>
						<author>
							<persName><forename type="first">Healthcare</forename><surname>Analytics</surname></persName>
							<affiliation key="aff0">
								<orgName type="department">Department of Industrial Engineering</orgName>
								<

In [19]:
from lxml import etree

root = etree.fromstring(xml_output.encode('utf-8'))
sections = []

for div in root.findall('.//{*}body/{*}div'):
    heading = div.findtext('{*}head')
    print(f"Heading: {heading}")
    text_parts = [t for t in div.itertext() if t.strip() != heading]
    text = " ".join(text_parts)
    if heading and text: 
        sections.append({
            "heading": heading,
            "content": text
        })

for s in sections:
    print(f"\n## {s['heading']}\n{s['content'][:500]}...")

Heading: Introduction
Heading: Literature review
Heading: A hybrid data-driven approach to HBC forecasting
Heading: New framework for HBC forecasting
Heading: LOS classification algorithms
Heading: NHP forecasting algorithms
Heading: Case study
Heading: Data set
Heading: Data analysis
Heading: Table 2
Heading: Research
Heading: NHP descriptive analysis
Heading: Age-based NHP descriptive analysis
Heading: LOS descriptive analysis
Heading: LOS classification
Heading: NHP forecasting
Heading: Heart ward bed capacity forecasting
Heading: Managerial insights
Heading: Conclusions, limitations, and recommendations
Heading: Ethical statement

## Introduction
The Hospital Bed Capacity (HBC) forecasting problem has taken significant attention because of its effects on the sustainability of hospitals, particularly in terms of hospital economic efficiency, and patient satisfaction  [1] [2] [3] [4] [5] [6] [7] . The traditional approach to this problem is using simulation or programming models invo

# GROBID hierarchical section extraction

In [20]:
from lxml import etree
import json

def parse_div(div):
    """
    Recursively parse a <div> element and extract:
    - heading text
    - content text
    - subsections
    """
    heading = div.findtext('{*}head')
    
    # Remove heading text from main content
    all_text_parts = list(div.itertext())
    if heading:
        all_text_parts = [t for t in all_text_parts if t.strip() != heading.strip()]
    content = " ".join(all_text_parts).strip()

    # Parse subsections (nested divs)
    subsections = []
    for sub_div in div.findall('{*}div'):
        subsections.append(parse_div(sub_div))

    return {
        "heading": heading.strip() if heading else None,
        "content": content,
        "subsections": subsections
    }

def extract_sections_from_grobid(xml_text):
    """
    Parse GROBID TEI XML and extract hierarchical sections.
    """
    root = etree.fromstring(xml_text.encode('utf-8'))

    # Find the <body> element
    body = root.find('.//{*}body')
    if body is None:
        raise ValueError("No <body> element found in TEI XML")

    sections = []
    for div in body.findall('{*}div'):
        sections.append(parse_div(div))

    return sections



sections = extract_sections_from_grobid(xml_output)

# Pretty print JSON
print(json.dumps(sections, indent=2, ensure_ascii=False))

# Optional: print readable summary
def print_sections(sections, level=1):
    for s in sections:
        indent = "#" * level
        print(f"\n{indent} {s['heading']}")
        print(s['content'][:300], "...")
        if s["subsections"]:
            print_sections(s["subsections"], level + 1)

print_sections(sections)


[
  {
    "heading": "Introduction",
    "content": "The Hospital Bed Capacity (HBC) forecasting problem has taken significant attention because of its effects on the sustainability of hospitals, particularly in terms of hospital economic efficiency, and patient satisfaction  [1] [2] [3] [4] [5] [6] [7] . The traditional approach to this problem is using simulation or programming models involving several issues, such as the need for some assumptions on attributes of some quantities, for example, the probability distribution of some factors  [8] [9] [10] [11] . In addition, reaching optimum or reasonable solutions is a big challenge of the bed capacity programming models, particularly in the case of large-scale, multi-objective, and integer models. Consequently, using model-free methods for HBC forecasting seems to be a facilitator. Nowadays, business analytics, including Data Analysis (DA), Machine Learning (ML), and Deep Learning (DL) techniques, are widely used as model-free methods 

In [21]:
from lxml import etree
pdf_path = "papers\ml_model_cardio_disease_detection.pdf"
url = "http://localhost:8070/api/processFulltextDocument"

with open(pdf_path, 'rb') as pdf_file:
    response = requests.post(url, files={'input': pdf_file})
    
xml_output = response.text
print(xml_output)
root = etree.fromstring(xml_output.encode('utf-8'))
sections = []

for div in root.findall('.//{*}body/{*}div'):
    heading = div.findtext('{*}head')
    print(f"Heading: {heading}")
    text_parts = [t for t in div.itertext() if t.strip() != heading]
    text = " ".join(text_parts)
    if heading and text: 
        sections.append({
            "heading": heading,
            "content": text
        })

for s in sections:
    print(f"\n## {s['heading']}\n{s['content'][:500]}...")

  pdf_path = "papers\ml_model_cardio_disease_detection.pdf"


<?xml version="1.0" encoding="UTF-8"?>
<TEI xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd"
 xmlns:xlink="http://www.w3.org/1999/xlink">
	<teiHeader xml:lang="en">
		<fileDesc>
			<titleStmt>
				<title level="a" type="main">Machine Learning-Based Predictive Models for Detection of Cardiovascular Diseases</title>
			</titleStmt>
			<publicationStmt>
				<publisher/>
				<availability status="unknown"><licence/></availability>
				<date type="published" when="2024-01-08">8 January 2024</date>
			</publicationStmt>
			<sourceDesc>
				<biblStruct>
					<analytic>
						<author>
							<persName><forename type="first">Adedayo</forename><surname>Ogunpola</surname></persName>
							<email>adedayo.ogunpola@mail.bcu.ac.uk</email>
							<affiliation key="aff0">
								<orgName type="depar

# GROBID + Hybrid RAG + Sectioned Summariser

In [None]:

import os
import time
import numpy as np
import faiss
import torch
from tiktoken import get_encoding

# langchain / ollama imports
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.chat_models import ChatOllama
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

# --------------------------
# Configuration
# --------------------------
PDF_PATH = "papers/hospital_bed_capacity_planning.pdf"  # change path
OLLAMA_BASE_URL = "http://localhost:11434"
EMBEDDING_MODEL_NAME = "nomic-embed-text"  # adjust if different on your Ollama
LLM_MODEL_NAME = "llama3.1:latest"           # your Qwen model in Ollama
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Chunking / retrieval hyperparams tuned for Qwen-32B
CHUNK_SIZE = 512      # tokens per chunk (reduced for embedding model compatibility)
CHUNK_OVERLAP = 50
TOP_K = 4             # number of chunks to retrieve for each section
RERANK_WITH_LLM = True  # set True to use LLM-based reranker (more precise, more compute)

print("DEVICE:", DEVICE)

# --------------------------
# Initialize Ollama clients (LangChain wrappers)
# --------------------------
print("Initializing Ollama embeddings and LLM wrappers...")
embedding_model = OllamaEmbeddings(model=EMBEDDING_MODEL_NAME, base_url=OLLAMA_BASE_URL)
llm = ChatOllama(model=LLM_MODEL_NAME, temperature=0.05, base_url=OLLAMA_BASE_URL, device=DEVICE)
print("Done.\n")

# --------------------------
# Utility: Load & chunk PDF (page-aware)
# --------------------------
def load_and_chunk_pdf(pdf_path, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    print(f"Loading PDF: {pdf_path}")
    loader = PyPDFLoader(pdf_path)
    pages = loader.load()  # list of Page objects with .page_content
    page_texts = [p.page_content for p in pages[:-3]]
    print(f"Total pages loaded: {len(page_texts)}")

    # Join with page separators to keep page breaks
    full_text_with_pages = "\n\n[PAGEREF]\n\n".join(page_texts)

    enc = get_encoding("cl100k_base")
    tokens = enc.encode(full_text_with_pages)
    print(f"Total tokens in PDF (approx): {len(tokens)}")

    # create token-wise chunks, but also record which page ranges each chunk covers
    chunks = []
    token_to_page = []  # map every token index to page index for biasing
    # build token->page mapping by encoding each page separately
    page_token_counts = [len(enc.encode(t)) for t in page_texts]
    page_start = 0
    page_token_starts = []
    for cnt in page_token_counts:
        page_token_starts.append(page_start)
        page_start += cnt

    # make chunks using token slices of the full_text (encode/decode)
    for i in range(0, len(tokens), chunk_size - overlap):
        t_slice = tokens[i:i + chunk_size]
        chunk_text = enc.decode(t_slice)
        # estimate page span by finding nearest page start
        # find first token index's page
        start_page = None
        end_page = None
        # find which page contains token index i and i+len-1 approximate
        # naive approach: compare against page_token_starts cumulative
        for pi, start_idx in enumerate(page_token_starts):
            if i >= start_idx:
                start_page = pi
            else:
                break
        # end page
        j = i + len(t_slice) - 1
        end_page = start_page
        for pi, start_idx in enumerate(page_token_starts[start_page:], start=start_page):
            if j >= start_idx:
                end_page = pi
            else:
                break

        chunks.append({
            "text": chunk_text,
            "start_token": i,
            "token_len": len(t_slice),
            "start_page": start_page,
            "end_page": end_page
        })

    print(f"Created {len(chunks)} chunks (chunk_size={chunk_size}, overlap={overlap})")
    return chunks

# --------------------------
# Build FAISS index using Ollama embeddings
# --------------------------
def build_faiss_index(chunks, embedding_model):
    print("Embedding chunks with Ollama embeddings (this may take a while)...")
    vectors = []
    for i, c in enumerate(chunks):
        emb = embedding_model.embed_query(c["text"])
        # ensure vector is a list of floats
        vec = np.array(emb, dtype=np.float32)
        vectors.append(vec)
        if (i + 1) % 20 == 0 or (i + 1) == len(chunks):
            print(f"  ▸ Embedded {i+1}/{len(chunks)} chunks")

    embeddings_array = np.vstack(vectors).astype("float32")
    d = embeddings_array.shape[1]
    print(f"Embedding dimension: {d}")

    # create FAISS index (simple FlatL2 index - replace with IndexIVFFlat for large corpora)
    index = faiss.IndexFlatL2(d)
    index.add(embeddings_array)
    print(f"FAISS index built. Total vectors in index: {index.ntotal}")
    return index, embeddings_array

# --------------------------
# Retrieval with optional position bias & reranking
# --------------------------
def retrieve_chunks_for_query(query, index, chunks, embedding_model, top_k=TOP_K, position_bias=True):
    q_emb = np.array([embedding_model.embed_query(query)], dtype="float32")
    # search larger set then filter/rerank if needed
    search_k = max(top_k * 3, top_k + 5)
    distances, indices = index.search(q_emb, search_k)
    indices = indices[0].tolist()
    distances = distances[0].tolist()

    # Build candidates with metadata
    candidates = []
    for idx, dist in zip(indices, distances):
        if idx < 0 or idx >= len(chunks):
            continue
        c = chunks[idx]
        candidates.append({
            "idx": idx,
            "dist": float(dist),
            "start_page": c["start_page"],
            "end_page": c["end_page"],
            "text": c["text"]
        })

    # apply position bias: prefer earlier pages for queries like "problem statement" or "abstract"
    if position_bias:
        def score_with_bias(item):
            # bias factor: earlier pages get lower score (better)
            page_bias = (item["start_page"] or 0) * 0.03
            return item["dist"] + page_bias
        candidates = sorted(candidates, key=score_with_bias)
    else:
        candidates = sorted(candidates, key=lambda x: x["dist"])

    # trim to top_k final
    final = candidates[:top_k]
    # debug info
    print("Retrieved (idx, pages):", [(c["idx"], (c["start_page"], c["end_page"])) for c in final])
    return final

# Optional LLM-based reranker to improve precision (reranks the candidate texts by asking LLM a short question)
def rerank_with_llm(query, candidates, llm, rerank_k=TOP_K):
    # prepare a short prompt asking the LLM to score relevance (1-10)
    prompt_template = """You are a helpful research assistant. Given the question: "{query}"
For each candidate passage, give a relevance score between 1 (irrelevant) and 10 (directly answers the question).
Return lines in the format: score<TAB>passage_index

Question:
{query}

Candidates:
{listings}

Provide only the scored lines.
"""
    listings = "\n\n".join([f"[{i}] {c['text'][:400].replace('\\n',' ')}" for i, c in enumerate(candidates)])
    prompt = prompt_template.format(query=query, listings=listings)
    # call LLM to score (small response)
    response = llm.invoke(prompt)
    text = response if isinstance(response, str) else (response.content if hasattr(response, "content") else str(response))
    # parse lines
    scores = []
    for line in text.splitlines():
        line = line.strip()
        if not line:
            continue
        parts = line.split()
        # naive parse: first token is score, last token contains [i]
        try:
            score = float(parts[0])
            # find passage index between brackets
            idx_token = [p for p in parts if p.startswith("[") and p.endswith("]")]
            if idx_token:
                pid = int(idx_token[0].strip("[]"))
                scores.append((pid, score))
        except Exception:
            continue
    # map back to candidates and sort by score desc
    scored = []
    for pid, sc in scores:
        if 0 <= pid < len(candidates):
            scored.append((candidates[pid], sc))
    if not scored:
        # fallback: return original candidates
        return candidates[:rerank_k]
    scored_sorted = sorted(scored, key=lambda x: -x[1])
    reranked = [item for item, s in scored_sorted][:rerank_k]
    # return reranked list
    return reranked

# --------------------------
# Section prompts & summarization
# --------------------------
SECTION_QUERIES = {
    "Title & Authors": "What is the title and who are the authors of the paper? Provide year if present.",
    "Problem Statement": "What research problem or objective does this paper address? Summarize briefly (1-2 sentences).",
    "Dataset": "Which datasets were used in the experiments? Provide names, sizes, sources if available.",
    "Methodology": "Describe the model architecture or methods proposed in the paper. Include algorithm names or key components.",
    "Evaluation & Metrics": "What evaluation metrics and experimental results are reported? Provide numeric values when present.",
    "Limitations": "What limitations or weaknesses do the authors mention?",
    "Future Work": "What future work or extensions do the authors propose?"
}

SECTION_PROMPT_TEMPLATE = """
You are an expert AI research assistant. Create a concise academic-style summary for the section: "{section_name}".

Context (retrieved passages):
{context}

Instructions:
- Extract facts from the context only (do not hallucinate).
- If you cannot find explicit info, say "Not stated in retrieved passages".
- Be concise and include numeric metrics where possible.
"""

section_prompt = PromptTemplate.from_template(SECTION_PROMPT_TEMPLATE)
section_chain = section_prompt | llm | StrOutputParser()

def summarize_section(section_name, query, index, chunks, embedding_model, llm, top_k=TOP_K, rerank=RERANK_WITH_LLM):
    print(f"\n--- Section: {section_name} ---")
    candidates = retrieve_chunks_for_query(query, index, chunks, embedding_model, top_k=top_k, position_bias=True)
    # prepare contexts
    if rerank:
        reranked = rerank_with_llm(query, candidates, llm, rerank_k=top_k)
        candidates = reranked
        print("After rerank (idx, pages):", [(c["idx"], (c["start_page"], c["end_page"])) for c in candidates])
    context = "\n\n".join([c["text"] for c in candidates])
    # debug: show snippet of retrieved context
    for c in candidates:
        print(f"  -> idx {c['idx']} pages {c['start_page']}-{c['end_page']} (preview): {c['text'][:200]!r}")
    # call LLM summarizer for this section
    out = section_chain.invoke({"section_name": section_name, "context": context})
    print(f"Section summary preview: {out[:300]}...\n")
    return out

# --------------------------
# Combine section summaries & final polish
# --------------------------
FINAL_COMBINE_PROMPT = """
You are an expert AI research assistant. Combine the following section-level summaries into one coherent and well-structured paper summary.
Keep the headings and produce a short global conclusion (2-3 sentences) at the end.

Section summaries:
{sections_text}

Combine and output as a clean, academic-style summary.
"""

final_prompt = PromptTemplate.from_template(FINAL_COMBINE_PROMPT)
final_chain = final_prompt | llm | StrOutputParser()

def combine_and_refine(section_summaries: dict):
    sections_text = "\n\n".join([f"## {k}\n{v}" for k, v in section_summaries.items()])
    combined = final_chain.invoke({"sections_text": sections_text})
    return combined

# --------------------------
# Full pipeline runner
# --------------------------
def run_full_pipeline(pdf_path):
    # 1) Load & chunk
    chunks = load_and_chunk_pdf(pdf_path)
    # 2) Build embeddings index
    index, embeddings_array = build_faiss_index(chunks, embedding_model)
    # 3) For each section: retrieve, rerank, summarize
    section_summaries = {}
    for sname, sq in SECTION_QUERIES.items():
        summary = summarize_section(sname, sq, index, chunks, embedding_model, llm, top_k=TOP_K, rerank=RERANK_WITH_LLM)
        section_summaries[sname] = summary

    # 4) Combine & final polish
    final_summary = combine_and_refine(section_summaries)
    return {
        "sections": section_summaries,
        "final_summary": final_summary,
        "chunks": chunks,
        "index": index
    }

# --------------------------
# Example usage
# --------------------------

start = time.time()
out = run_full_pipeline(PDF_PATH)
elapsed = time.time() - start
print("\n\n==== FINAL SUMMARY (first 1500 chars) ====\n")
print(out["final_summary"][:1500] + "...\n")
print(f"\nPipeline completed in {elapsed:.2f} seconds.")

In [26]:
import os
import time
import numpy as np
import faiss
import torch
import requests
from lxml import etree
from tiktoken import get_encoding

# LangChain / Ollama imports
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.chat_models import ChatOllama
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

# --------------------------
# Configuration
# --------------------------
PDF_PATH = "papers/hospital_bed_capacity_planning.pdf"  # adjust path
GROBID_URL = "http://localhost:8070/api/processFulltextDocument"
OLLAMA_BASE_URL = "http://localhost:11434"
EMBEDDING_MODEL_NAME = "nomic-embed-text"
LLM_MODEL_NAME = "llama3.1:latest"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

TOP_K = 4  # top sections to retrieve per query
RERANK_WITH_LLM = True

print("DEVICE:", DEVICE)

# --------------------------
# Initialize Ollama clients
# --------------------------
print("Initializing Ollama embeddings and LLM wrappers...")
embedding_model = OllamaEmbeddings(model=EMBEDDING_MODEL_NAME, base_url=OLLAMA_BASE_URL)
llm = ChatOllama(model=LLM_MODEL_NAME, temperature=0.05, base_url=OLLAMA_BASE_URL, device=DEVICE)
print("Done.\n")

# --------------------------
# GROBID section extraction
# --------------------------
def parse_div(div):
    """Recursive parsing of TEI <div> elements. Includes heading in content."""
    heading = div.findtext('{*}head')
    content = " ".join(list(div.itertext())).strip()
    subsections = [parse_div(d) for d in div.findall('{*}div')]
    return {
        "heading": heading.strip() if heading else None,
        "content": content,
        "subsections": subsections
    }

def extract_sections_from_grobid(xml_text):
    root = etree.fromstring(xml_text.encode("utf-8"))
    body = root.find('.//{*}body')
    if body is None:
        raise ValueError("No <body> element found in TEI XML")
    sections = [parse_div(d) for d in body.findall('{*}div')]
    return sections

def flatten_sections(sections):
    """Flatten nested sections into a list."""
    flat = []
    for s in sections:
        flat.append({"heading": s["heading"], "content": s["content"]})
        flat.extend(flatten_sections(s["subsections"]))
    return flat

def load_pdf_sections_via_grobid(pdf_path, grobid_client_url=GROBID_URL):
    with open(pdf_path, "rb") as f:
        resp = requests.post(grobid_client_url, files={"input": f})
    xml_text = resp.text
    sections = extract_sections_from_grobid(xml_text)
    flat_sections = flatten_sections(sections)
    # Filter empty content
    flat_sections = [s for s in flat_sections if s["content"].strip()]
    print(f"Extracted {len(flat_sections)} sections from GROBID")
    return flat_sections

# --------------------------
# Build FAISS index for sections
# --------------------------
def build_section_index(sections, embedding_model):
    print("Embedding sections...")
    vectors = []
    for i, s in enumerate(sections):
        emb = np.array(embedding_model.embed_query(s["content"]), dtype=np.float32)
        vectors.append(emb)
    embeddings_array = np.vstack(vectors).astype("float32")
    d = embeddings_array.shape[1]
    index = faiss.IndexFlatL2(d)
    index.add(embeddings_array)
    print(f"FAISS index built with {index.ntotal} vectors (dimension {d})")
    return index, embeddings_array

# --------------------------
# Retrieval
# --------------------------
def retrieve_sections_for_query(query, index, sections, embedding_model, top_k=TOP_K):
    q_emb = np.array([embedding_model.embed_query(query)], dtype=np.float32)
    distances, indices = index.search(q_emb, top_k)
    retrieved = []
    for dist, idx in zip(distances[0], indices[0]):
        retrieved.append({
            "idx": idx,
            "text": sections[idx]["content"],
            "heading": sections[idx]["heading"]
        })
    return retrieved

# Optional LLM reranker
def rerank_with_llm(query, candidates, llm, rerank_k=TOP_K):
    prompt_template = """You are a helpful research assistant. Given the question: "{query}"
For each candidate passage, give a relevance score between 1 (irrelevant) and 10 (directly answers the question).
Return lines in the format: score<TAB>passage_index

Question:
{query}

Candidates:
{listings}

Provide only the scored lines.
"""
    listings = "\n\n".join([f"[{i}] {c['text'][:400].replace('\\n',' ')}" for i, c in enumerate(candidates)])
    prompt = prompt_template.format(query=query, listings=listings)
    response = llm.invoke(prompt)
    text = response if isinstance(response, str) else (response.content if hasattr(response, "content") else str(response))
    scores = []
    for line in text.splitlines():
        line = line.strip()
        if not line:
            continue
        parts = line.split()
        try:
            score = float(parts[0])
            idx_token = [p for p in parts if p.startswith("[") and p.endswith("]")]
            if idx_token:
                pid = int(idx_token[0].strip("[]"))
                scores.append((pid, score))
        except Exception:
            continue
    scored = [(candidates[pid], s) for pid, s in scores if 0 <= pid < len(candidates)]
    if not scored:
        return candidates[:rerank_k]
    scored_sorted = sorted(scored, key=lambda x: -x[1])
    return [item for item, s in scored_sorted][:rerank_k]

# --------------------------
# Section prompts & summarization
# --------------------------
SECTION_QUERIES = {
    "Title & Authors": "What is the title and who are the authors of the paper? Provide year if present.",
    "Problem Statement": "What research problem or objective does this paper address? Summarize briefly (1-2 sentences).",
    "Dataset": "Which datasets were used in the experiments? Provide names, sizes, sources if available.",
    "Methodology": "Describe the model architecture or methods proposed in the paper. Include algorithm names or key components.",
    "Evaluation & Metrics": "What evaluation metrics and experimental results are reported? Provide numeric values when present.",
    "Limitations": "What limitations or weaknesses do the authors mention?",
    "Future Work": "What future work or extensions do the authors propose?"
}

SECTION_PROMPT_TEMPLATE = """
You are an expert AI research assistant. Create a concise academic-style summary for the section: "{section_name}".

Context (retrieved passages):
{context}

Instructions:
- Extract facts from the context only (do not hallucinate).
- If you cannot find explicit info, say "Not stated in retrieved passages".
- Be concise and include numeric metrics where possible.
"""
section_prompt = PromptTemplate.from_template(SECTION_PROMPT_TEMPLATE)
section_chain = section_prompt | llm | StrOutputParser()

def summarize_section_with_headings(section_name, query, index, sections, embedding_model, llm, top_k=TOP_K, rerank=RERANK_WITH_LLM):
    print(f"\n--- Section: {section_name} ---")
    candidates = retrieve_sections_for_query(query, index, sections, embedding_model, top_k)
    if rerank:
        candidates = rerank_with_llm(query, candidates, llm, rerank_k=top_k)
    context = "\n\n".join([f"### {c['heading']}\n{c['text']}" for c in candidates])
    out = section_chain.invoke({"section_name": section_name, "context": context})
    return out

# --------------------------
# Combine section summaries
# --------------------------
FINAL_COMBINE_PROMPT = """
You are an expert AI research assistant. Combine the following section-level summaries into one coherent and well-structured paper summary.
Keep the headings and produce a short global conclusion (2-3 sentences) at the end.

Section summaries:
{sections_text}

Combine and output as a clean, academic-style summary.
"""
final_prompt = PromptTemplate.from_template(FINAL_COMBINE_PROMPT)
final_chain = final_prompt | llm | StrOutputParser()

def combine_and_refine(section_summaries: dict):
    sections_text = "\n\n".join([f"## {k}\n{v}" for k, v in section_summaries.items()])
    combined = final_chain.invoke({"sections_text": sections_text})
    return combined

# --------------------------
# Full pipeline runner
# --------------------------
def run_full_pipeline(pdf_path):
    # 1) Extract sections via GROBID
    sections = load_pdf_sections_via_grobid(pdf_path)
    
    # 2) Build FAISS index on sections
    index, embeddings_array = build_section_index(sections, embedding_model)
    
    # 3) Summarize each target section
    section_summaries = {}
    for sname, sq in SECTION_QUERIES.items():
        summary = summarize_section_with_headings(sname, sq, index, sections, embedding_model, llm)
        section_summaries[sname] = summary
    
    # 4) Combine & final polish
    final_summary = combine_and_refine(section_summaries)
    
    return {
        "sections": section_summaries,
        "final_summary": final_summary
    }

# --------------------------
# Example usage
# --------------------------

start = time.time()
output = run_full_pipeline(PDF_PATH)
elapsed = time.time() - start
print("\n\n==== FINAL SUMMARY (first 1500 chars) ===\n")
print(output["final_summary"][:1500] + "...\n")
print(f"\nPipeline completed in {elapsed:.2f} seconds.")


DEVICE: cuda
Initializing Ollama embeddings and LLM wrappers...
Done.



Extracted 20 sections from GROBID
Embedding sections...
FAISS index built with 20 vectors (dimension 768)

--- Section: Title & Authors ---

--- Section: Problem Statement ---

--- Section: Dataset ---

--- Section: Methodology ---

--- Section: Evaluation & Metrics ---

--- Section: Limitations ---

--- Section: Future Work ---


==== FINAL SUMMARY (first 1500 chars) ===

**Title:** Optimizing Hospital Bed Capacity Forecasting: A Machine Learning Approach
**Authors:** Not stated in retrieved passages

**Abstract:**

This study aims to optimize hospital bed capacity forecasting using machine learning algorithms. The problem statement revolves around optimizing the Length of Stay (LOS) classification for patients in a Heart ward, highlighting issues such as inefficient use of bed capacity during holidays, high proportion of children requiring long LOS, and insufficient bed capacity. A dataset containing 51,231 records collected between 2011 and 2018 was used to examine LOS classificatio

In [27]:
output["final_summary"]

"**Title:** Optimizing Hospital Bed Capacity Forecasting: A Machine Learning Approach\n**Authors:** Not stated in retrieved passages\n\n**Abstract:**\n\nThis study aims to optimize hospital bed capacity forecasting using machine learning algorithms. The problem statement revolves around optimizing the Length of Stay (LOS) classification for patients in a Heart ward, highlighting issues such as inefficient use of bed capacity during holidays, high proportion of children requiring long LOS, and insufficient bed capacity. A dataset containing 51,231 records collected between 2011 and 2018 was used to examine LOS classification accuracy using various tools, with Support Vector Machine (SVM) providing the best accuracy.\n\nThe study employed a classification approach to predict LOS using machine learning algorithms, including Bayesian Network (BN), K-Nearest Neighbor (KNN), SVM, Decision Tree (DT), and Logistic Regression (LR). The results show that only 6% of patients experienced LOS for m

In [24]:
load_pdf_sections_via_grobid("papers/hospital_bed_capacity_planning.pdf")

[{'heading': 'Introduction',
  'content': "Introduction The Hospital Bed Capacity (HBC) forecasting problem has taken significant attention because of its effects on the sustainability of hospitals, particularly in terms of hospital economic efficiency, and patient satisfaction  [1] [2] [3] [4] [5] [6] [7] . The traditional approach to this problem is using simulation or programming models involving several issues, such as the need for some assumptions on attributes of some quantities, for example, the probability distribution of some factors  [8] [9] [10] [11] . In addition, reaching optimum or reasonable solutions is a big challenge of the bed capacity programming models, particularly in the case of large-scale, multi-objective, and integer models. Consequently, using model-free methods for HBC forecasting seems to be a facilitator. Nowadays, business analytics, including Data Analysis (DA), Machine Learning (ML), and Deep Learning (DL) techniques, are widely used as model-free metho

# Upgraded adaptive pipline 
- only feed the most relevant sections to the LLM for each schema field 
- model sees fewer, more focused passages, saving tokens and improve accuracy for long papers