In [1]:
%pip install ollama

Collecting ollama
  Downloading ollama-0.6.0-py3-none-any.whl.metadata (4.3 kB)
Collecting httpx>=0.27 (from ollama)
  Using cached httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting pydantic>=2.9 (from ollama)
  Downloading pydantic-2.12.3-py3-none-any.whl.metadata (87 kB)
Collecting anyio (from httpx>=0.27->ollama)
  Downloading anyio-4.11.0-py3-none-any.whl.metadata (4.1 kB)
Collecting certifi (from httpx>=0.27->ollama)
  Downloading certifi-2025.10.5-py3-none-any.whl.metadata (2.5 kB)
Collecting httpcore==1.* (from httpx>=0.27->ollama)
  Using cached httpcore-1.0.9-py3-none-any.whl.metadata (21 kB)
Collecting idna (from httpx>=0.27->ollama)
  Downloading idna-3.11-py3-none-any.whl.metadata (8.4 kB)
Collecting h11>=0.16 (from httpcore==1.*->httpx>=0.27->ollama)
  Using cached h11-0.16.0-py3-none-any.whl.metadata (8.3 kB)
Collecting annotated-types>=0.6.0 (from pydantic>=2.9->ollama)
  Using cached annotated_types-0.7.0-py3-none-any.whl.metadata (15 kB)
Collecting pydantic-co

# Load PDF and extract text from pdf

In [3]:
%pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Note: you may need to restart the kernel to use updated packages.


In [7]:
import os
os.getcwd()

'c:\\Users\\Yue Ning\\Desktop\\MiscLearning\\ai_researcher_database'

In [1]:
import PyPDF2
import json
import time
import ollama

def extract_text_from_pdf(pdf_path):
    """
    Extract text from a PDF file.
    """
    text = ""
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            text += page.extract_text() or ""
    return text


In [2]:
def is_valid_markdown(summary_text):
    """
    Basic validation: check that required headings exist.
    Returns True if all required sections are present.
    """
    required_headings = [
        "# Title",
        "# Authors",
        "# Problem Statement",
        "# Dataset",
        "# Models and Methods",
        "# Results Summary",
        "# Conclusion",
        "# Keywords"
    ]
    for heading in required_headings:
        if heading not in summary_text:
            return False
    return True

def summarize(pdf_text, model, max_retries=3, delay=2):
    """
    Send extracted PDF text to local Mistral 3.2 model through Ollama with structured prompt.
    """
    prompt = f"""
You are an expert research assistant specialized in AI and healthcare papers.

You will be given the text of a research paper. Extract a detailed structured summary. INCLUDE numbers from the paper.

Summarize the research paper using the following Markdown structure:

# Title  
# Authors  
# Problem Statement  
# Dataset  
- Description  
- Source  
- Size  
- Preprocessing  

# Models and Methods  
For each model:  
- Name  
- Type (e.g., CNN, Random Forest, Transformer)  
- Architecture details  
- Hyperparameters  
- Performance metrics (only those mentioned in the paper, e.g., Accuracy, F1-score, ROC-AUC, MAE, etc.)

# Results Summary  
# Conclusion  
# Keywords 

--- Research Paper Text Starts ---
{pdf_text}
--- Research Paper Text Ends ---
    """

    attempt = 0
    while attempt < max_retries:
        response = ollama.chat(
            model= model, 
            messages=[{"role": "user", "content": prompt}],
            options={
                "temperature": 0.5
            })
        summary = response["message"]["content"]

        if is_valid_markdown(summary):
            return summary
        else: 
            print(f"⚠️ Attempt {attempt+1}: Invalid output, retrying...")
            attempt +=1
            time.sleep(delay)
    
    # If all retries fail, return what we got with a warning
    print("❌ Max retries reached. Returning last output (may be incomplete).")
    return summary


def main():
    pdf_path = "../../Resources/ml_model_cardio_disease_detection.pdf"

    print("Extracting text from PDF...")
    pdf_text = extract_text_from_pdf(pdf_path)

    print("Summarizing using Ollama...")
    summary = summarize(pdf_text, "llama3.1:latest")

    print("\n--- Summary ---\n")
    print(summary)


if __name__ == "__main__":
    main()

Extracting text from PDF...
Summarizing using Ollama...
⚠️ Attempt 1: Invalid output, retrying...
⚠️ Attempt 2: Invalid output, retrying...
⚠️ Attempt 3: Invalid output, retrying...
❌ Max retries reached. Returning last output (may be incomplete).

--- Summary ---

This is a research paper on the topic of heart disease prediction using machine learning algorithms. The paper reviews various studies and techniques used for predicting heart disease, including:

1. **ECG signal analysis**: Several studies have used ECG signals to predict heart disease by analyzing features such as heart rate variability, QRS complex, and T-wave morphology.
2. **Machine learning algorithms**: Various machine learning algorithms, including neural networks, support vector machines, random forests, and gradient boosting, have been applied to predict heart disease from various datasets.
3. **Deep learning models**: Deep learning models, such as convolutional neural networks (CNNs) and recurrent neural networks 

In [3]:
# %pip install pandas
# %pip install camelot
%pip install camelot-py
%pip install ghostscript

# If you're using tabula instead of Camelot:
# pip install tabula-py

# Optional - PromptLayer for tracking prompts
%pip install promptlayer

# Optional - LangChain if you want templated prompts / chaining
%pip install langchain

Collecting camelot-py
  Downloading camelot_py-1.0.9-py3-none-any.whl.metadata (9.8 kB)
Collecting click>=8.0.1 (from camelot-py)
  Using cached click-8.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting chardet>=5.1.0 (from camelot-py)
  Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)
Collecting openpyxl>=3.1.0 (from camelot-py)
  Using cached openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting pdfminer-six>=20240706 (from camelot-py)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdf<6.0,>=4.0 (from camelot-py)
  Downloading pypdf-5.9.0-py3-none-any.whl.metadata (7.1 kB)
Collecting tabulate>=0.9.0 (from camelot-py)
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Collecting opencv-python-headless>=4.7.0.68 (from camelot-py)
  Using cached opencv_python_headless-4.12.0.88-cp37-abi3-win_amd64.whl.metadata (20 kB)
Collecting pypdfium2>=4 (from camelot-py)
  Downloading pypdfium2-4.30.0-py3-none-win_amd64.whl.met

  You can safely remove it manually.
  You can safely remove it manually.


Collecting ghostscript
  Downloading ghostscript-0.8.1-py3-none-any.whl.metadata (4.4 kB)
Downloading ghostscript-0.8.1-py3-none-any.whl (25 kB)
Installing collected packages: ghostscript
Successfully installed ghostscript-0.8.1
Note: you may need to restart the kernel to use updated packages.
Collecting promptlayer
  Downloading promptlayer-1.0.71-py3-none-any.whl.metadata (4.9 kB)
Collecting ably<3.0.0,>=2.0.11 (from promptlayer)
  Downloading ably-2.1.1-py3-none-any.whl.metadata (2.5 kB)
Collecting aiohttp<4.0.0,>=3.10.10 (from promptlayer)
  Downloading aiohttp-3.13.1-cp313-cp313-win_amd64.whl.metadata (8.4 kB)
Collecting opentelemetry-api<2.0.0,>=1.26.0 (from promptlayer)
  Downloading opentelemetry_api-1.38.0-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-sdk<2.0.0,>=1.26.0 (from promptlayer)
  Downloading opentelemetry_sdk-1.38.0-py3-none-any.whl.metadata (1.5 kB)
Collecting requests<3.0.0,>=2.31.0 (from promptlayer)
  Downloading requests-2.32.5-py3-none-any.whl.me

# More complicated pipeline 
- Extract text and tables
- Chunking 
- Output JSON

In [1]:
import os
import re
import json
import PyPDF2
import pandas as pd
import ollama
import camelot  # for table extraction
def extract_tables_from_pdf(pdf_path):
    tables = []
    # Camelot works with PDFs that have text-based tables
    try:
        camelot_tables = camelot.read_pdf(pdf_path, pages='all', flavor='stream')
        for t in camelot_tables:
            tables.append(t.df.to_dict(orient='records'))
    except Exception as e:
        print(f"⚠️ Table extraction failed: {e}")
    return tables

extract_tables_from_pdf("../../Resources/ml_model_cardio_disease_detection.pdf")

ModuleNotFoundError: No module named 'camelot'

In [None]:
import os
import re
import json
import PyPDF2
import pandas as pd
import ollama
import camelot  # for table extraction

# ---------------------------
# 1️⃣ PDF Text Extraction
# ---------------------------
def extract_text_from_pdf(pdf_path):
    text = ""
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            text += page.extract_text() or ""
    return text

# ---------------------------
# 2️⃣ Table Extraction
# ---------------------------
def extract_tables_from_pdf(pdf_path):
    tables = []
    # Camelot works with PDFs that have text-based tables
    try:
        camelot_tables = camelot.read_pdf(pdf_path, pages='all', flavor='stream')
        for t in camelot_tables:
            tables.append(t.df.to_dict(orient='records'))
    except Exception as e:
        print(f"⚠️ Table extraction failed: {e}")
    return tables

# ---------------------------
# 3️⃣ Chunk Text for Large PDFs
# ---------------------------
def chunk_text(text, chunk_size=2000):
    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

# ---------------------------
# 4️⃣ Call Mistral Small 3.2 via Ollama
# ---------------------------
def summarize_chunk(chunk_text, tables=None):
    table_text = ""
    if tables:
        # Flatten tables into markdown-like string
        for i, table in enumerate(tables):
            df = pd.DataFrame(table)
            table_text += f"\nTable {i+1}:\n"
            table_text += df.to_csv(index=False, sep='|')

    prompt = f"""
You are a strict JSON-only research assistant. You will be given text from a research paper and optional tables.

Extract a detailed structured summary in **valid JSON only** using this schema:

{{
  "title": "",
  "authors": "",
  "problem_statement": "",
  "dataset": {{
      "description": "",
      "size": "",
      "source": "",
      "preprocessing": ""
  }},
  "models": [
      {{
          "name": "",
          "type": "",
          "architecture_details": "",
          "hyperparameters": "",
          "results": {{
              "accuracy": "",
              "f1_score": "",
              "roc_auc": "",
              "other_metrics": {{}}
          }}
      }}
  ],
  "results_summary": "",
  "conclusion": "",
  "keywords": []
}}

--- PAPER TEXT START ---
{chunk_text}
--- PAPER TEXT END ---

--- TABLES START ---
{table_text}
--- TABLES END ---
    """

    response = ollama.chat(
        model="mistral-small:latest",
        messages=[
            {"role": "system", "content": "You are a strict JSON-only research assistant. Output only valid JSON."},
            {"role": "user", "content": prompt}
        ]
    )
    return response["message"]["content"]

# ---------------------------
# 5️⃣ Post-process and Fix JSON
# ---------------------------
def fix_json(raw_text):
    # remove trailing commas before closing braces/brackets
    cleaned = re.sub(r",(\s*[\]}])", r"\1", raw_text)
    if not cleaned.strip().startswith("{"):
        cleaned = "{" + cleaned
    if not cleaned.strip().endswith("}"):
        cleaned = cleaned + "}"
    return cleaned

# ---------------------------
# 6️⃣ Main Workflow
# ---------------------------
def summarize_paper(pdf_path):
    # Extract text and tables
    text = extract_text_from_pdf(pdf_path)
    tables = extract_tables_from_pdf(pdf_path)

    # Chunk the text
    chunks = chunk_text(text, chunk_size=3000)
    summaries = []

    for chunk in chunks:
        raw_summary = summarize_chunk(chunk, tables)
        fixed_summary = fix_json(raw_summary)
        try:
            summaries.append(json.loads(fixed_summary))
        except json.JSONDecodeError:
            print("⚠️ Could not parse JSON for chunk. Raw output:")
            print(raw_summary)

    # Merge summaries (simple concatenation for now)
    final_summary = {
        "title": summaries[0].get("title", "") if summaries else "",
        "authors": summaries[0].get("authors", "") if summaries else "",
        "problem_statement": " ".join(s.get("problem_statement", "") for s in summaries),
        "dataset": summaries[0].get("dataset", {}) if summaries else {},
        "models": [m for s in summaries for m in s.get("models", [])],
        "results_summary": " ".join(s.get("results_summary", "") for s in summaries),
        "conclusion": " ".join(s.get("conclusion", "") for s in summaries),
        "keywords": list({k for s in summaries for k in s.get("keywords", [])})
    }

    return final_summary

# ---------------------------
# 7️⃣ Run
# ---------------------------
if __name__ == "__main__":
    pdf_path = "../../Resources/ml_model_cardio_disease_detection.pdf"
    summary = summarize_paper(pdf_path)
    print(json.dumps(summary, indent=2))


# Dolphin 3 (archived)
This model (llama3) does not support multimodal inputs 

In [None]:
import requests
import json

def ask_dolphin(prompt):
    response = requests.post(
        "http://localhost:11434/api/generate",
        json={"model": "dolphin3", "prompt": prompt, "stream": False}
    )
    return response.json()["response"]

schema = """
You are an AI that outputs ONLY valid JSON. 
Schema example:
{
  "title": "",
  "summary": "",
  "methodology": {
    "datasets": [],
    "architecture": "",
    "training": ""
  },
  "results": "",
  "citations": []
}
"""

paper_text = "Paste your paper text here or load from file."

prompt = f"{schema}\nSummarize the following paper:\n{paper_text}"

response = ask_dolphin(prompt)

try:
    json_output = json.loads(response)
    print("✅ Valid JSON:", json_output)
except json.JSONDecodeError:
    print("❌ Invalid JSON received:\n", response)
