In [2]:
import re
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.prompts import PromptTemplate
from langchain_ollama import ChatOllama
from tqdm import tqdm

# -----------------------------
# 🧠 Step 1: Setup Model
# -----------------------------
llm = ChatOllama(model="qwen2.5:7b", temperature=0.2)

# -----------------------------
# 📄 Step 2: Load PDF
# -----------------------------
pdf_path = "papers/ml_model_cardio_disease_detection.pdf"
print(f"[INFO] Loading PDF: {pdf_path}")
loader = PyPDFLoader(pdf_path)
docs = loader.load()
full_text = "\n".join([d.page_content for d in docs])
print(f"[INFO] Loaded {len(docs)} pages ({len(full_text)} characters)")

# -----------------------------
# 🧩 Step 3: Detect Sections
# -----------------------------

SECTION_PATTERNS = [
    r"abstract",
    r"introduction",
    r"related work",
    r"background",
    r"methods?",
    r"methodology",
    r"experiments?",
    r"results?",
    r"discussion",
    r"conclusion",
]

def split_by_sections(text):
    # Apply the case-insensitive flag globally at the start
    pattern = r"(?i)\b(" + "|".join(SECTION_PATTERNS) + r")\b"
    
    matches = list(re.finditer(pattern, text))
    sections = []

    if not matches:
        sections.append({"section_name": "Full Paper", "content": text})
        return sections

    for i, match in enumerate(matches):
        start = match.end()
        end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        section_name = match.group(1).strip().title()
        sections.append({
            "section_name": section_name,
            "content": text[start:end].strip()
        })

    return sections

sections = split_by_sections(full_text)
print(f"[INFO] Detected {len(sections)} section(s): {[s['section_name'] for s in sections]}")

# # -----------------------------
# # 🧠 Step 4: Define Summarization Prompt
# # -----------------------------
# section_prompt = PromptTemplate.from_template("""
# You are an AI research assistant.
# Summarize the following section of a scientific paper.

# Focus on:
# - Key ideas and objectives
# - Data and methods used
# - Metrics and results
# - Limitations if any

# Section Name: {section_name}
# Section Text:
# {context}

# Output format:
# ## {section_name}
# <concise academic summary>
# """)

# # -----------------------------
# # 🧮 Step 5: Summarize Each Section
# # -----------------------------
# summaries = []
# for sec in tqdm(sections, desc="Summarizing sections"):
#     prompt_filled = section_prompt.format(
#         section_name=sec["section_name"],
#         context=sec["content"][:6000]  # truncate to stay within context limits
#     )
#     summary = llm.invoke(prompt_filled)
#     summaries.append(f"## {sec['section_name']}\n{summary.strip()}\n")
#     print(f"\n[DEBUG] Finished summarizing section: {sec['section_name']}")

# # -----------------------------
# # 🧩 Step 6: Combine into Final Structured Summary
# # -----------------------------
# final_summary = "# Research Paper Summary\n\n" + "\n".join(summaries)

# # Save output
# output_path = "outputs/structured_summary.md"
# import os
# os.makedirs("outputs", exist_ok=True)
# with open(output_path, "w", encoding="utf-8") as f:
#     f.write(final_summary)

# print(f"\n✅ Structured summary saved to: {output_path}")
# print("\n--- Preview ---")
# print("\n".join(final_summary.splitlines()[:30]))


[INFO] Loading PDF: papers/ml_model_cardio_disease_detection.pdf
[INFO] Loaded 19 pages (71347 characters)
[INFO] Detected 60 section(s): ['Abstract', 'Methods', 'Results', 'Introduction', 'Methods', 'Methods', 'Background', 'Methods', 'Result', 'Results', 'Method', 'Results', 'Results', 'Methods', 'Methods', 'Method', 'Results', 'Background', 'Methods', 'Methods', 'Results', 'Methods', 'Methods', 'Methods', 'Experiments', 'Methods', 'Methods', 'Method', 'Experiments', 'Methods', 'Methods', 'Method', 'Method', 'Method', 'Result', 'Results', 'Methods', 'Conclusion', 'Results', 'Results', 'Results', 'Results', 'Results', 'Results', 'Results', 'Results', 'Results', 'Results', 'Results', 'Results', 'Results', 'Results', 'Results', 'Results', 'Discussion', 'Results', 'Methodology', 'Methods', 'Methods', 'Methods']


# trial 2 
RAG FOR STRUCTURED SECTION SUMMARIES 
- ingest pDF into a vector database for easy section text retrieval 


In [15]:
from langchain_community.document_loaders import PyPDFLoader
from tiktoken import get_encoding

def load_and_chunk_pdf(pdf_path, chunk_size=2500, overlap=200):
    loader = PyPDFLoader(pdf_path)
    pages = loader.load()
    full_text = "\n".join([p.page_content for p in pages])
    
    enc = get_encoding("cl100k_base")
    tokens = enc.encode(full_text)

    chunks = []
    for i in range(0, len(tokens), chunk_size - overlap):
        chunk_text = enc.decode(tokens[i:i + chunk_size])
        chunks.append(chunk_text)

    return chunks

chunks = load_and_chunk_pdf("papers\ml_model_cardio_disease_detection.pdf")


  chunks = load_and_chunk_pdf("papers\ml_model_cardio_disease_detection.pdf")


In [18]:
chunks[5]

' implies that when the model indicates an individual as having heart \ndisease, the likelihood of accuracy is notably high, signifying a significant advancement \nin the landscape of medical diagnostics. Future directions for this study cou ld involve \nexpanding the scope by incorporating more extensive medical imaging datasets. Leverag-\ning such data could enhance image-based heart disease prediction, potentially leading to \neven more accurate and robust diagnostic tools in the field of cardiovascular health. Fur-\nthermore, exploring ensemble models that merge the strengths of multiple algorithms \nmay offer promising avenues for further improving predictive accuracy in the field of \nheart disease prediction. These considerations shed light on the mul tifaceted nature of \nFigure 5. Accuracy of machine learning models on both datasets.\nAcross both datasets, these models consistently demonstrate exceptional performance,\nemphasizing their efficacy in heart disease prediction. No

In [20]:
%pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-5.1.2-py3-none-any.whl.metadata (16 kB)
Collecting scikit-learn (from sentence-transformers)
  Using cached scikit_learn-1.7.2-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting scipy (from sentence-transformers)
  Using cached scipy-1.16.2-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn->sentence-transformers)
  Using cached joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn->sentence-transformers)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading sentence_transformers-5.1.2-py3-none-any.whl (488 kB)
Using cached scikit_learn-1.7.2-cp313-cp313-win_amd64.whl (8.7 MB)
Using cached joblib-1.5.2-py3-none-any.whl (308 kB)
Using cached scipy-1.16.2-cp313-cp313-win_amd64.whl (38.5 MB)
Using cached threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, joblib, sci


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [21]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load embedding model (local, free)
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

def embed_chunks(chunks):
    embeddings = embed_model.encode(chunks, convert_to_numpy=True)
    return embeddings


embeddings = embed_chunks(chunks)

In [None]:
# %pip install faiss-cpu
import faiss

def create_faiss_index(embeddings):
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    return index

index = create_faiss_index(embeddings)


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [11]:
section_queries = {
    "Problem Statement": "What research problem does this paper address?",
    "Dataset": "Which datasets are used in the experiments?",
    "Methodology": "Describe the model or methods used.",
    "Evaluation": "List the metrics and results.",
    "Limitations": "What are the limitations mentioned?",
    "Future Work": "What future work is proposed?"
}


In [12]:
def retrieve_chunks(query, index, chunks, top_k=3):
    query_emb = embed_model.encode([query], convert_to_numpy=True)
    distances, indices = index.search(query_emb, top_k)
    retrieved_texts = [chunks[i] for i in indices[0]]
    return retrieved_texts


In [14]:
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_ollama import ChatOllama
from tqdm import tqdm

# -----------------------------
# 🧠 Step 1: Setup Model
# -----------------------------
llm = ChatOllama(model="qwen2.5:7b", temperature=0.2)


llm_section_prompt = PromptTemplate.from_template("""
You are an AI research assistant.

From the following text, generate a concise, structured summary for the section: {section_name}

Text:
{context}

Write in clear academic style.
""")

llm_section_chain = llm_section_prompt | llm | StrOutputParser()

def summarize_section(section_name, retrieved_texts):
    context = "\n\n".join(retrieved_texts)
    summary = llm_section_chain.invoke({
        "section_name": section_name,
        "context": context
    })
    return summary


In [None]:
final_summary = {}
for section, query in section_queries.items():
    retrieved = retrieve_chunks(query, index, chunks, top_k=3)
    final_summary[section] = summarize_section(section, retrieved)

# Optional: Combine into a single formatted string
structured_summary = "\n\n".join([f"## {s}\n{final_summary[s]}" for s in final_summary])
print(structured_summary)


In [10]:
load_and_chunk_pdf("papers/ml_model_cardio_disease_detection.pdf")

['Citation: Ogunpola, A.; Saeed, F.;\nBasurra, S.; Albarrak, A.M.; Qasem,\nS.N. Machine Learning-Based\nPredictive Models for Detection of\nCardiovascular Diseases. Diagnostics\n2024, 14, 144. https://doi.org/\n10.3390/diagnostics14020144\nAcademic Editor: Mugahed A.\nAl-antari\nReceived: 27 November 2023\nRevised: 21 December 2023\nAccepted: 25 December 2023\nPublished: 8 January 2024\nCopyright: © 2024 by the authors.\nLicensee MDPI, Basel, Switzerland.\nThis article is an open access article\ndistributed under the terms and\nconditions of the Creative Commons\nAttribution (CC BY) license (https://\ncreativecommons.org/licenses/by/\n4.0/).\ndiagnostics \nArticle\nMachine Learning-Based Predictive Models for Detection of\nCardiovascular Diseases\nAdedayo Ogunpola 1, Faisal Saeed 1, *\n , Shadi Basurra 1, Abdullah M. Albarrak 2\n and Sultan Noman Qasem 2\n1 DAAI Research Group, College of Computing and Digital Technology, Birmingham City University,\nBirmingham B4 7XG, UK; adedayo.ogun