In [3]:
%pip install openai langchain-openai

Collecting openai
  Using cached openai-2.6.1-py3-none-any.whl.metadata (29 kB)
Collecting langchain-openai
  Downloading langchain_openai-1.0.1-py3-none-any.whl.metadata (1.8 kB)
Using cached openai-2.6.1-py3-none-any.whl (1.0 MB)
Downloading langchain_openai-1.0.1-py3-none-any.whl (81 kB)
Installing collected packages: openai, langchain-openai

   ---------------------------------------- 0/2 [openai]
   ---------------------------------------- 0/2 [openai]
   ---------------------------------------- 0/2 [openai]
   ---------------------------------------- 0/2 [openai]
   ---------------------------------------- 0/2 [openai]
   ---------------------------------------- 0/2 [openai]
   ---------------------------------------- 0/2 [openai]
   ---------------------------------------- 0/2 [openai]
   ---------------------------------------- 0/2 [openai]
   ---------------------------------------- 0/2 [openai]
   ---------------------------------------- 0/2 [openai]
   ---------------------


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
%pip install dotenv

Collecting dotenv
  Downloading dotenv-0.9.9-py2.py3-none-any.whl.metadata (279 bytes)
Downloading dotenv-0.9.9-py2.py3-none-any.whl (1.9 kB)
Installing collected packages: dotenv
Successfully installed dotenv-0.9.9
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import os
import time
import numpy as np
import faiss
import torch
import requests
from lxml import etree
from langchain_community.embeddings import OllamaEmbeddings
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

from dotenv import load_dotenv
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")

def summarize_research_paper(
    pdf_path,
    grobid_url="http://localhost:8070/api/processFulltextDocument",
    ollama_base_url="http://localhost:11434",
    embedding_model_name="nomic-embed-text",
    llm_model_name="gpt-4o-mini",
    top_k=4,
    rerank_with_llm=True,
    temperature=0.05,
):
    """Run full data science paper summarization pipeline on one PDF."""

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    # Initialize models
    embedding_model = OllamaEmbeddings(model=embedding_model_name, base_url=ollama_base_url)
    llm = ChatOpenAI(model=llm_model_name, temperature=temperature)

    # ---------------- GROBID Extraction ----------------
    def parse_div(div):
        heading = div.findtext('{*}head')
        content = " ".join(list(div.itertext())).strip()
        subsections = [parse_div(d) for d in div.findall('{*}div')]
        return {"heading": heading.strip() if heading else None, "content": content, "subsections": subsections}

    def extract_sections_from_grobid(xml_text):
        root = etree.fromstring(xml_text.encode("utf-8"))
        body = root.find('.//{*}body')
        if body is None:
            raise ValueError("No <body> element found in TEI XML")
        sections = [parse_div(d) for d in body.findall('{*}div')]
        return sections

    def flatten_sections(sections):
        flat = []
        for s in sections:
            flat.append({"heading": s["heading"], "content": s["content"]})
            flat.extend(flatten_sections(s["subsections"]))
        return flat

    def load_pdf_sections_via_grobid(pdf_path):
        with open(pdf_path, "rb") as f:
            resp = requests.post(grobid_url, files={"input": f})
        xml_text = resp.text
        sections = extract_sections_from_grobid(xml_text)
        flat_sections = flatten_sections(sections)
        flat_sections = [s for s in flat_sections if s["content"].strip()]
        print(f"Extracted {len(flat_sections)} sections from GROBID")
        return flat_sections

    # ---------------- Build FAISS index ----------------
    def build_section_index(sections):
        print("Embedding and indexing sections...")
        vectors = [np.array(embedding_model.embed_query(s["content"]), dtype=np.float32) for s in sections]
        embeddings_array = np.vstack(vectors).astype("float32")
        d = embeddings_array.shape[1]
        index = faiss.IndexFlatL2(d)
        index.add(embeddings_array)
        print(f"FAISS index built with {index.ntotal} vectors (dimension {d})")
        return index, embeddings_array

    # ---------------- Retrieval & Reranking ----------------
    def retrieve_sections_for_query(query, index, sections):
        q_emb = np.array([embedding_model.embed_query(query)], dtype=np.float32)
        distances, indices = index.search(q_emb, top_k)
        retrieved = [{"idx": idx, "text": sections[idx]["content"], "heading": sections[idx]["heading"]}
                     for dist, idx in zip(distances[0], indices[0])]
        return retrieved

    def rerank(query, candidates):
        prompt_template = """You are a helpful research assistant. Given the question: "{query}"
For each candidate passage, give a relevance score between 1 (irrelevant) and 10 (directly answers the question).
Return lines in the format: score<TAB>passage_index

Question:
{query}

Candidates:
{listings}
"""
        listings = "\n\n".join([f"[{i}] {c['text'][:400].replace('\\n',' ')}" for i, c in enumerate(candidates)])
        prompt = prompt_template.format(query=query, listings=listings)
        response = llm.invoke(prompt)
        text = response if isinstance(response, str) else getattr(response, "content", str(response))
        scores = []
        for line in text.splitlines():
            parts = line.strip().split()
            try:
                score = float(parts[0])
                idx_token = [p for p in parts if p.startswith("[") and p.endswith("]")]
                if idx_token:
                    pid = int(idx_token[0].strip("[]"))
                    scores.append((pid, score))
            except Exception:
                continue
        if not scores:
            return candidates
        scored_sorted = sorted(scores, key=lambda x: -x[1])
        return [candidates[i] for i, _ in scored_sorted[:top_k]]

    # ---------------- Section Summaries ----------------
    SECTION_QUERIES = {
        "Title & Authors": "What is the title and who are the authors of the paper? Provide year if present.",
        "Problem Statement": "What research problem or objective does this paper address?",
        "Dataset": "Which datasets were used in the experiments? Provide names, sizes, sources if available.",
        "Methodology": "Describe the model architecture or methods proposed in the paper.",
        "Evaluation & Metrics": "What evaluation metrics and experimental results are reported?",
        "Limitations": "What limitations or weaknesses do the authors mention?",
        "Future Work": "What future work or extensions do the authors propose?"
    }

    section_prompt = PromptTemplate.from_template("""
You are an expert AI research assistant. Summarize the section "{section_name}" based on context below.
{context}
""")
    section_chain = section_prompt | llm | StrOutputParser()

    def summarize_section(section_name, query, index, sections):
        candidates = retrieve_sections_for_query(query, index, sections)
        if rerank_with_llm:
            candidates = rerank(query, candidates)
        context = "\n\n".join([f"### {c['heading']}\n{c['text']}" for c in candidates])
        return section_chain.invoke({"section_name": section_name, "context": context})

    # ---------------- Combine Final Summary ----------------
    final_prompt = PromptTemplate.from_template("""
You are an expert AI assistant. Combine the following section summaries into a cohesive academic paper summary.
{sections_text}
""")
    final_chain = final_prompt | llm | StrOutputParser()

    def combine_summaries(section_summaries):
        text = "\n\n".join([f"## {k}\n{v}" for k, v in section_summaries.items()])
        return final_chain.invoke({"sections_text": text})

    # ---------------- Run Pipeline ----------------
    start = time.time()
    sections = load_pdf_sections_via_grobid(pdf_path)
    index, _ = build_section_index(sections)

    section_summaries = {}
    for name, query in SECTION_QUERIES.items():
        print(f"Summarizing section: {name}")
        section_summaries[name] = summarize_section(name, query, index, sections)

    final_summary = combine_summaries(section_summaries)
    elapsed = time.time() - start

    print(f"\nPipeline completed in {elapsed:.2f}s.\n")
    print("==== Final Summary (First 1000 chars) ====")
    print(final_summary[:1000] + "...\n")

    return {
        "sections": section_summaries,
        "final_summary": final_summary,
        "elapsed_sec": elapsed
    }


In [11]:
result = summarize_research_paper(
    pdf_path="papers/hospital_bed_capacity_planning.pdf",
    embedding_model_name="nomic-embed-text",
    top_k=4,
    rerank_with_llm=True
)

Using device: cuda
Extracted 20 sections from GROBID
Embedding and indexing sections...
FAISS index built with 20 vectors (dimension 768)
Summarizing section: Title & Authors
Summarizing section: Problem Statement
Summarizing section: Dataset
Summarizing section: Methodology
Summarizing section: Evaluation & Metrics
Summarizing section: Limitations
Summarizing section: Future Work

Pipeline completed in 127.02s.

==== Final Summary (First 1000 chars) ====
### Summary of the Research Paper on Hospital Bed Capacity Forecasting

This research paper addresses the critical issue of hospital bed capacity forecasting, specifically focusing on Length of Stay (LOS) classification within the Heart ward. The authors, whose names and affiliations are acknowledged in the introductory section, highlight a significant gap in existing literature: while there is extensive research on LOS forecasting, studies specifically targeting LOS classification remain scarce. The authors argue that effective class

In [12]:
result['final_summary']

'### Summary of the Research Paper on Hospital Bed Capacity Forecasting\n\nThis research paper addresses the critical issue of hospital bed capacity forecasting, specifically focusing on Length of Stay (LOS) classification within the Heart ward. The authors, whose names and affiliations are acknowledged in the introductory section, highlight a significant gap in existing literature: while there is extensive research on LOS forecasting, studies specifically targeting LOS classification remain scarce. The authors argue that effective classification of patients based on various features is essential for optimizing hospital bed management and improving patient care strategies.\n\nThe dataset utilized in this study comprises 51,231 records collected from 2011 to 2018, focusing on relevant features such as age and LOS. After filtering out outliers, 47,605 records were retained, with a division of 70% for training and 30% for testing purposes. This structured dataset serves as the foundation 

# Multi-agent 
- Agent 1: Section Extractor 
- 

## Section Extractor GROBID

In [20]:
import requests
from lxml import etree

class GrobidSectionAgent:
    """
    Agent for detecting and extracting sections from a research paper PDF using GROBID.
    """

    def __init__(self, grobid_url: str = "http://localhost:8070/api/processFulltextDocument"):
        self.grobid_url = grobid_url

    def extract_sections(self, pdf_path: str):
        """
        Uploads a PDF to GROBID and extracts section structure as a flat list.
        """
        print(f"[GROBID] Processing {pdf_path} ...")
        with open(pdf_path, "rb") as f:
            response = requests.post(self.grobid_url, files={"input": f})

        if response.status_code != 200:
            raise RuntimeError(f"GROBID request failed: {response.status_code} {response.text[:500]}")

        xml_text = response.text
        # print("Printing xml_text...", xml_text)
        sections = self._parse_tei_xml(xml_text)
        flat_sections = self._flatten_sections(sections)
        return [s for s in flat_sections if s["content"].strip()]

    # ---------------- Private Helpers ----------------

    def _parse_tei_xml(self, xml_text: str):
        """Parse TEI XML and recursively extract <div> sections."""
        root = etree.fromstring(xml_text.encode("utf-8"))
        body = root.find(".//{*}body")
        if body is None:
            raise ValueError("No <body> element found in TEI XML")

        def parse_div(div):
            heading = div.findtext("{*}head")
            content = " ".join(list(div.itertext())).strip()
            subsections = [parse_div(d) for d in div.findall("{*}div")]
            return {"heading": heading.strip() if heading else None, "content": content, "subsections": subsections}

        return [parse_div(d) for d in body.findall("{*}div")]

    def _flatten_sections(self, sections):
        """Flatten nested section hierarchy into a single list."""
        flat = []
        for s in sections:
            flat.append({"heading": s["heading"], "content": s["content"]})
            flat.extend(self._flatten_sections(s["subsections"]))
        return flat

    # ----- NOT IN USE: Table Extraction -----------
    # def extract_tables_from_pdf(pdf_path):
    #     tables = camelot.read_pdf(pdf_path, pages="all")
    #     extracted = []
    #     for i, t in enumerate(tables):
    #         extracted.append({
    #             "table_index": i,
    #             "caption": None,  # optional: detect caption via proximity
    #             "data": t.df.to_dict(orient="records")
    #         })
    #     return extracted


# ---------------- Example Usage ----------------
def main(): 
    agent = GrobidSectionAgent()

    pdf_path = "papers/hospital_bed_capacity_planning.pdf"  # Path to your PDF
    sections = agent.extract_sections(pdf_path)

    print(f"✅ Extracted {len(sections)} sections:\n")
    for s in sections[:5]:
        print(f"--- {s['heading']} ---\n{s['content'][:400]}...\n")


In [21]:
main() # takes 6s

[GROBID] Processing papers/hospital_bed_capacity_planning.pdf ...
✅ Extracted 20 sections:

--- Introduction ---
Introduction The Hospital Bed Capacity (HBC) forecasting problem has taken significant attention because of its effects on the sustainability of hospitals, particularly in terms of hospital economic efficiency, and patient satisfaction  [1] [2] [3] [4] [5] [6] [7] . The traditional approach to this problem is using simulation or programming models involving several issues, such as the need for som...

--- Literature review ---
Literature review The literature on healthcare centers' bed capacity forecasting is reviewed to clear the paper's contributions and novelty aspects.  Bachouch et al. (2012)  considered some limitations, such as budget, shared resources, and beds needed by acute and emergency patients, in the problem of hospital bed planning. Using constraints such as incompatibility between pathologies and continu...

--- A hybrid data-driven approach to HBC forecastin

## Section Summary Agent

In [None]:
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

class SectionSummaryAgent:
    def __init__(self, llm_model="gpt-4o-mini", temperature=0.05):
        self.llm = ChatOpenAI(model=llm_model, temperature=temperature)
        # Define prompt template
        self.prompt_template = PromptTemplate.from_template("""
You are an expert AI research assistant. Summarize the section "{section_name}" 
from the following content. Produce a concise, structured, and academic-style summary.

Section Text:
{section_text}

Instructions:
- Include key points and methodology if present
- Mention any datasets, results, or experiments
- Keep summary factual and concise
""")
        self.parser = StrOutputParser()

    def summarize(self, section_name, section_text):
        # Fill prompt
        prompt_input = {
            "section_name": section_name,
            "section_text": section_text
        }
        # Generate summary using LLM
        summary = self.prompt_template | self.llm | self.parser
        return summary.invoke(prompt_input)

## Summary Aggregator Agent

In [35]:

class SummaryAggregatorAgent:
    def __init__(self, llm_model="gpt-4o-mini", temperature=0.05):
        self.llm = ChatOpenAI(model=llm_model, temperature=temperature)
        # Define prompt template
        self.prompt_template = PromptTemplate.from_template("""
You are an expert AI research assistant. Your task is to combine the following individual section summaries into one cohesive and well-structured academic summary of the research paper.

Guidelines:
- Reorganize and merge overlapping or overly detailed sections as needed.
- Choose the most logical and meaningful section headings yourself (e.g., Introduction, Methods, Results, Discussion, Conclusion), based on content.
- Ensure smooth transitions and consistent academic tone throughout.
- Preserve key technical details, results, and findings from the input summaries.
- Do not add information that is not supported by the summaries.

Input Section Summaries:
{sections_text}

Output:
A coherent and complete academic-style summary written in paragraphs with appropriate section headings.
""")
        self.parser = StrOutputParser()

    def combine(self, section_summaries):
        sections_text = "\n\n".join(
            [f"## {s['section']}\n{s['summary']}" for s in section_summaries]
        )
        chain = self.prompt_template | self.llm | self.parser
        return chain.invoke({"sections_text": sections_text})

# Final Integration

In [36]:
### to add in .py file 

# from section_detection_agent import GrobidSectionAgent

pdf_path = "papers/hospital_bed_capacity_planning.pdf"

# Initialize GROBID agent
grobid_agent = GrobidSectionAgent()
sections = grobid_agent.extract_sections(pdf_path)
print(sections)

# Initialize your summarization agent
section_summarizer = SectionSummaryAgent()

summaries = []
for section in sections:
    name = section.get("heading") or "Unnamed Section"
    text = section.get("content", "")
    
    if not text.strip():
        continue  # skip empty sections
    
    print(f"Summarizing: {name}...")
    summary = section_summarizer.summarize(name, text)
    summaries.append({"section": name, "summary": summary})

# Initialize aggregator agent 
summary_aggregator = SummaryAggregatorAgent()
final_summary = summary_aggregator.combine(summaries)



[GROBID] Processing papers/hospital_bed_capacity_planning.pdf ...
[{'heading': 'Introduction', 'content': "Introduction The Hospital Bed Capacity (HBC) forecasting problem has taken significant attention because of its effects on the sustainability of hospitals, particularly in terms of hospital economic efficiency, and patient satisfaction  [1] [2] [3] [4] [5] [6] [7] . The traditional approach to this problem is using simulation or programming models involving several issues, such as the need for some assumptions on attributes of some quantities, for example, the probability distribution of some factors  [8] [9] [10] [11] . In addition, reaching optimum or reasonable solutions is a big challenge of the bed capacity programming models, particularly in the case of large-scale, multi-objective, and integer models. Consequently, using model-free methods for HBC forecasting seems to be a facilitator. Nowadays, business analytics, including Data Analysis (DA), Machine Learning (ML), and De

In [27]:
summaries

[{'section': 'Introduction',
  'summary': '**Summary of the Introduction Section**\n\nThe Hospital Bed Capacity (HBC) forecasting problem is critical for enhancing hospital sustainability, economic efficiency, and patient satisfaction. Traditional forecasting methods rely on simulation or programming models, which often require assumptions about various factors, such as probability distributions, and face challenges in achieving optimal solutions, especially in large-scale, multi-objective, and integer models. As a result, model-free methods have emerged as a promising alternative for HBC forecasting.\n\nRecent advancements in business analytics, particularly Data Analysis (DA), Machine Learning (ML), and Deep Learning (DL), have been effectively utilized across various sectors, including healthcare. These techniques provide insights into market trends without depending on traditional simulation or mathematical models. Successful applications in healthcare include forecasting Length of

In [None]:
final_summary # most recent output 

"# Summary of Research on Hospital Bed Capacity Forecasting\n\n## Introduction\nThe forecasting of Hospital Bed Capacity (HBC) is essential for improving hospital sustainability, economic efficiency, and patient satisfaction. Traditional forecasting methods, which often rely on simulation or programming models, face challenges in achieving optimal solutions, particularly in large-scale, multi-objective, and integer models. To overcome these limitations, model-free methods, particularly business analytics techniques such as Data Analysis (DA), Machine Learning (ML), and Deep Learning (DL), are increasingly being adopted. These methods have demonstrated success in various healthcare applications, including Length of Stay (LOS) forecasting, patient classification, healthcare resource forecasting, and disease diagnosis. This paper aims to expand on existing research by integrating diverse methodologies to explore significant factors influencing bed capacity, such as LOS, patient age, and n

# Most recent output (31.10.2025)

#### Summary of Research on Hospital Bed Capacity Forecasting\n\n
## Introduction\n
The forecasting of Hospital Bed Capacity (HBC) is essential for improving hospital sustainability, economic efficiency, and patient satisfaction. Traditional forecasting methods, which often rely on simulation or programming models, face challenges in achieving optimal solutions, particularly in large-scale, multi-objective, and integer models. To overcome these limitations, model-free methods, particularly business analytics techniques such as Data Analysis (DA), Machine Learning (ML), and Deep Learning (DL), are increasingly being adopted. These methods have demonstrated success in various healthcare applications, including Length of Stay (LOS) forecasting, patient classification, healthcare resource forecasting, and disease diagnosis. This paper aims to expand on existing research by integrating diverse methodologies to explore significant factors influencing bed capacity, such as LOS, patient age, and non-hospitalized patients (NHP).\n\n
## Literature Review\n
The literature on HBC forecasting reveals a variety of methodologies employed to manage healthcare resources effectively. Key contributions include Integer Linear Programming, Multi-Objective Stochastic Programming, Discrete-Event Simulation, and Macrosimulation. Recent studies have also highlighted the application of ML techniques, such as recurrent neural networks and artificial neural networks, for predicting bed occupancy and patient arrivals. Despite the predominance of simulation and programming techniques, there remains a limited application of ML tools in this domain. This paper seeks to fill this gap by proposing a hybrid data-driven framework that combines statistical methods with ML techniques to enhance predictive accuracy.\n\n
## Methodology\n
### Hybrid Data-Driven Framework\n
The proposed hybrid data-driven framework for HBC forecasting integrates various classification and forecasting algorithms. It emphasizes the importance of utilizing diverse datasets and aims to improve forecasting performance by leveraging the strengths of different algorithms. The framework begins with standard data collection and preprocessing steps, including dataset cleaning and feature transformation. Key features analyzed include patients' age, NHP, and LOS, which inform managerial decisions and enhance forecasting accuracy.\n\n
### LOS and NHP Forecasting Algorithms\n
The study evaluates various ML algorithms for LOS classification and NHP forecasting. For LOS classification, algorithms such as Support Vector Machines (SVM), Decision Trees, and Random Forests are highlighted, with SVM achieving the highest accuracy. In terms of NHP forecasting, time series methods, particularly Linear Regression and Long Short-Term Memory (LSTM) neural networks, are identified as the most effective techniques.\n\n
## Case Study: Rouhani Hospital\n
The case study focuses on Rouhani Hospital in Babol City, Iran, which has a total of 508 beds and emphasizes cardiac care. The Heart ward is the most active department, and the study utilizes a dataset comprising 51,231 records from 2011 to 2018, focusing on age and LOS. After data cleaning, 47,605 records were retained for analysis.\n\n
## Data Analysis\n
The data analysis section examines the relationship between NHP, patient age distribution, and LOS in forecasting bed requirements. Key findings indicate that fluctuations in NHP are influenced by seasonal factors, particularly during the Nowruz holidays, which lead to significant drops in admissions. Additionally, a bimodal distribution of NHP based on age reveals a growing prevalence of heart diseases among younger populations, underscoring the need for specialized healthcare resources.\n\n
## Forecasting Results\n
The forecasting results indicate a significant increase in required bed capacity, projecting an average need of 120 beds by 2026, compared to the current capacity of 45 beds. This necessitates strategic planning by hospital administration to expand the heart ward's capacity to 137 beds by the target year. The analysis highlights the impact of an aging patient population on future bed requirements.\n\n
## Managerial Insights\n
The study identifies mismanagement in bed capacity utilization, particularly during holiday periods, and recommends implementing equitable scheduling for healthcare specialists. The establishment of specialized pediatric Heart wards is also emphasized, given the significant proportion of children among heart patients.\n\n
## Conclusions, Limitations, and Recommendations\n
This research contributes to the field of HBC forecasting by proposing a data-driven methodology that integrates DA, ML, and DL techniques. The findings indicate a projected rise in required beds, emphasizing the need for enhanced focus on child heart patients and the rising incidence of heart diseases among younger populations. However, the study is limited to HBC forecasting and does not account for other critical resources such as specialists and equipment. Future research should incorporate these factors and utilize advanced multivariate ML tools to address uncertainties in decision-making processes related to HBC forecasting.\n\n
## Ethical Statement\n
The authors affirm that the research does not utilize any real clinical data, underscoring their ethical commitment to avoiding the use of sensitive or personal information in the study.


In [31]:
total_wc

3142