In [1]:
pip install faiss-cpu sentence-transformers pandas tqdm

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-5.1.0-py3-none-any.whl.metadata (16 kB)
Collecting pandas
  Downloading pandas-2.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (91 kB)
Collecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.56.0-py3-none-any.whl.metadata (40 kB)
Collecting scikit-learn (from sentence-transformers)
  Downloading scikit_learn-1.7.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting scipy (from sentence-transformers)
  Downloading scipy-1.16.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (61 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-0.34.4-py3-none-any.whl

In [2]:
pip install biopython

Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install datasets

Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-21.0.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting aiohttp!=4.0.0a0,!=4.0.0a1 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading aiohttp-3.12.15-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting aiohappyeyeballs>=2.5.0 (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading aiohappyeyeballs-2.6.1-py3-none-any.whl.metadata (5.9 kB)
Collecting aiosignal>=1.4.0 (from aiohttp!=4.

In [4]:
from Bio import Entrez
from time import sleep
from datasets import Dataset
import pandas as pd
import os
import re
from transformers import BertTokenizer
import random
from transformers import DataCollatorForLanguageModeling
from transformers import BertForMaskedLM
from transformers import TrainingArguments, Trainer

In [5]:
Entrez.email = "edisonzjy@gmail.com"
search_term = "biomedical research"
max_articles = 10000
batch_size = 100

output_file = "pubmed_abstracts.txt"

def fetch_pubmed_abstracts():
    print(f"Searching PubMed for: {search_term}")
    search_handle = Entrez.esearch(db="pubmed", term=search_term, retmax=max_articles)
    record = Entrez.read(search_handle)
    search_handle.close()

    pmids = record["IdList"]
    print(f"Found {len(pmids)} articles.")

    with open(output_file, "w", encoding="utf-8") as f:
        for start in range(0, len(pmids), batch_size):
            end = min(start + batch_size, len(pmids))
            batch_pmids = pmids[start:end]
            print(f"Fetching batch: {start + 1}–{end}")
            try:
                handle = Entrez.efetch(
                    db="pubmed",
                    id=",".join(batch_pmids),
                    rettype="abstract",
                    retmode="text"
                )
                data = handle.read()
                handle.close()

                articles = []
                raw_entries = data.split("\n\n")
                for entry in raw_entries:
                    match = re.search(r"(?:\n\n)?(?:[A-Z].+?\.){2,}", entry, re.DOTALL)
                    if match:
                        cleaned = match.group().strip().replace("\n", " ")
                        articles.append(cleaned)
                for article in articles:
                    f.write(article + "\n")

                sleep(0.5)
            except Exception as e:
                print(f"Error fetching batch {start + 1}–{end}: {e}")
                sleep(2)

    print(f"\nDone. Saved clean abstracts to: {output_file}")

if __name__ == "__main__":
    fetch_pubmed_abstracts()

Searching PubMed for: biomedical research
Found 9999 articles.
Fetching batch: 1–100
Fetching batch: 101–200
Fetching batch: 201–300
Fetching batch: 301–400
Fetching batch: 401–500
Fetching batch: 501–600
Fetching batch: 601–700
Fetching batch: 701–800
Fetching batch: 801–900
Fetching batch: 901–1000
Fetching batch: 1001–1100
Fetching batch: 1101–1200
Fetching batch: 1201–1300
Fetching batch: 1301–1400
Fetching batch: 1401–1500
Fetching batch: 1501–1600
Fetching batch: 1601–1700
Fetching batch: 1701–1800
Fetching batch: 1801–1900
Fetching batch: 1901–2000
Fetching batch: 2001–2100
Fetching batch: 2101–2200
Fetching batch: 2201–2300
Fetching batch: 2301–2400
Fetching batch: 2401–2500
Fetching batch: 2501–2600
Fetching batch: 2601–2700
Fetching batch: 2701–2800
Fetching batch: 2801–2900
Fetching batch: 2901–3000
Fetching batch: 3001–3100
Fetching batch: 3101–3200
Fetching batch: 3201–3300
Fetching batch: 3301–3400
Fetching batch: 3401–3500
Fetching batch: 3501–3600
Fetching batch: 3601–3

In [None]:
# compare biobert, kebiolm, sapbert, and biolinkbert

In [7]:
import faiss
import pandas as pd
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import numpy as np

# -------------------
# CONFIG
# -------------------
MIMIC_FILE = "final_combined_notes.csv"
PUBMED_FILE = "pubmed_abstracts.txt"
TOP_K = 3

# Models to compare
MODELS = {
    "BioBERT": "pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb",
    "SapBERT": "cambridgeltl/SapBERT-from-PubMedBERT-fulltext",
    "MiniLM (baseline)": "sentence-transformers/all-MiniLM-L6-v2",
    "SciBERT (NLI)": "gsarti/scibert-nli"
}


# -------------------
# LOAD DATA
# -------------------
def load_mimic_notes(csv_path, text_column="TEXT"):
    df = pd.read_csv(csv_path, engine="python", on_bad_lines="skip")
    df = df.dropna(subset=[text_column])
    return df[text_column].tolist()

def load_pubmed_abstracts(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        lines = [line.strip() for line in f if line.strip()]
    return lines


# -------------------
# BUILD EMBEDDINGS + INDEX
# -------------------
def build_faiss_index(embeddings):
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings)
    return index

def embed_documents(model, documents):
    return model.encode(documents, show_progress_bar=True, convert_to_numpy=True, batch_size=16)


# -------------------
# MAIN PIPELINE
# -------------------
def main():
    print("📄 Loading documents...")
    mimic_docs = load_mimic_notes(MIMIC_FILE)
    pubmed_docs = load_pubmed_abstracts(PUBMED_FILE)
    combined_docs = mimic_docs + pubmed_docs
    print(f"Total documents: {len(combined_docs)}")

    indexes = {}

    for name, model_name in MODELS.items():
        print(f"\n🔍 Loading model: {name}")
        model = SentenceTransformer(model_name)

        print(f"🔬 Embedding documents with {name}...")
        embeddings = embed_documents(model, combined_docs)

        print(f"📦 Building FAISS index for {name}...")
        index = build_faiss_index(embeddings)
        indexes[name] = (model, index, combined_docs)

    # Query loop
    while True:
        query = input("\n🧠 Enter your medical query (or 'exit'): ").strip()
        if query.lower() in ["exit", "quit"]:
            break

        for name, (model, index, documents) in indexes.items():
            print(f"\n=== Top {TOP_K} results from {name} ===")
            query_vec = model.encode([query], convert_to_numpy=True)
            D, I = index.search(query_vec, TOP_K)

            for rank, idx in enumerate(I[0]):
                snippet = documents[idx][:300].replace("\n", " ")
                print(f"\nResult {rank+1} (Score: {D[0][rank]:.2f}):")
                print(snippet)


if __name__ == "__main__":
    main()

📄 Loading documents...
Total documents: 3332

🔍 Loading model: BioBERT
🔬 Embedding documents with BioBERT...


Batches:   0%|          | 0/209 [00:00<?, ?it/s]

📦 Building FAISS index for BioBERT...

🔍 Loading model: SapBERT


No sentence-transformers model found with name cambridgeltl/SapBERT-from-PubMedBERT-fulltext. Creating a new one with mean pooling.


config.json:   0%|          | 0.00/462 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/198 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

🔬 Embedding documents with SapBERT...


Batches:   0%|          | 0/209 [00:00<?, ?it/s]

📦 Building FAISS index for SapBERT...

🔍 Loading model: MiniLM (baseline)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

🔬 Embedding documents with MiniLM (baseline)...


Batches:   0%|          | 0/209 [00:00<?, ?it/s]

📦 Building FAISS index for MiniLM (baseline)...

🔍 Loading model: SciBERT (NLI)


No sentence-transformers model found with name gsarti/scibert-nli. Creating a new one with mean pooling.


config.json:   0%|          | 0.00/450 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/135 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

🔬 Embedding documents with SciBERT (NLI)...


Batches:   0%|          | 0/209 [00:00<?, ?it/s]

📦 Building FAISS index for SciBERT (NLI)...



🧠 Enter your medical query (or 'exit'):  pneumonia oxygen levels



=== Top 3 results from BioBERT ===

Result 1 (Score: 173.52):
Admission Date:  [**2183-8-6**]       Discharge Date: [**2183-9-15**]   Service:  PRIMARY DIAGNOSIS: 1.  Volume overload anasarca. 2.  Status post posterior trach perforation and repair. 3.  Respiratory failure with ventilatory dependence.     Mechanical ventilatory dependence. 4.  Atrial fibrillati

Result 2 (Score: 178.54):
BackgroundNeonates are more susceptible to acute respiratory failure than older  children. It is unknown to what extent high-flow nasal cannula (HFNC) alters  intrathoracic pressure (ITP), potentially decreasing cardiac output (CO) due to  cardiopulmonary interactions. This study evaluated the impac

Result 3 (Score: 185.37):
Ascent to high altitude is accompanied by physiological responses that, to an  extent, mitigate the challenge of hypobaric hypoxia, maintaining arterial oxygen  content and convective oxygen delivery. Nevertheless, arterial oxygen tension  (pO2) remains low and tissue hypoxia pers


🧠 Enter your medical query (or 'exit'):  exit


In [15]:
pip install tabulate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting tabulate
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: tabulate
Successfully installed tabulate-0.9.0
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [16]:
from collections import defaultdict
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss

def main():
    print("📄 Loading documents...")
    mimic_docs = load_mimic_notes(MIMIC_FILE)
    pubmed_docs = load_pubmed_abstracts(PUBMED_FILE)
    combined_docs = mimic_docs + pubmed_docs
    print(f"Total documents: {len(combined_docs)}")

    # Load models and create indexes
    indexes = {}
    for name, model_name in MODELS.items():
        print(f"\n🔍 Loading model: {name}")
        model = SentenceTransformer(model_name)
        embeddings = embed_documents(model, combined_docs)
        index = build_faiss_index(embeddings)
        indexes[name] = (model, index, combined_docs)

    # Store scores for numeric comparison
    results_summary = defaultdict(list)

    # Query loop
    while True:
        query = input("\n🧠 Enter your medical query (or 'exit'): ").strip()
        if query.lower() in ["exit", "quit"]:
            break

        for name, (model, index, documents) in indexes.items():
            print(f"\n=== Top {TOP_K} results from {name} ===")
            query_vec = model.encode([query], convert_to_numpy=True)
            D, I = index.search(query_vec, TOP_K)

            # Store numeric metrics
            results_summary[name].append({
                "top1": D[0][0],
                "avg_topk": np.mean(D[0]),
                "min_topk": np.min(D[0]),
                "max_topk": np.max(D[0])
            })

            for rank, idx in enumerate(I[0]):
                snippet = documents[idx][:300].replace("\n", " ")
                print(f"\nResult {rank+1} (Score: {D[0][rank]:.2f}):")
                print(snippet)

    # Summary after all queries
    print("\n📊 Summary of Model Performance Across Queries")
    summary_data = []
    for name, scores in results_summary.items():
        top1s = [s["top1"] for s in scores]
        avg_topks = [s["avg_topk"] for s in scores]
        summary_data.append({
            "Model": name,
            "Avg Top-1 Distance": np.mean(top1s),
            "Avg Top-K Distance": np.mean(avg_topks),
            "Queries Run": len(scores)
        })
    df_summary = pd.DataFrame(summary_data).sort_values(by="Avg Top-K Distance")
    print(df_summary.to_markdown(index=False))

In [17]:
if __name__ == "__main__":
    main()

📄 Loading documents...
Total documents: 4512

🔍 Loading model: BioBERT


Batches:   0%|          | 0/282 [00:00<?, ?it/s]


🔍 Loading model: SapBERT


No sentence-transformers model found with name cambridgeltl/SapBERT-from-PubMedBERT-fulltext. Creating a new one with mean pooling.


Batches:   0%|          | 0/282 [00:00<?, ?it/s]


🔍 Loading model: MiniLM (baseline)


Batches:   0%|          | 0/282 [00:00<?, ?it/s]


🔍 Loading model: SciBERT (NLI)


No sentence-transformers model found with name gsarti/scibert-nli. Creating a new one with mean pooling.


Batches:   0%|          | 0/282 [00:00<?, ?it/s]


🧠 Enter your medical query (or 'exit'):  pneumonia oxygen levels



=== Top 3 results from BioBERT ===

Result 1 (Score: 173.52):
Admission Date:  [**2183-8-6**]       Discharge Date: [**2183-9-15**]   Service:  PRIMARY DIAGNOSIS: 1.  Volume overload anasarca. 2.  Status post posterior trach perforation and repair. 3.  Respiratory failure with ventilatory dependence.     Mechanical ventilatory dependence. 4.  Atrial fibrillati

Result 2 (Score: 178.54):
BackgroundNeonates are more susceptible to acute respiratory failure than older  children. It is unknown to what extent high-flow nasal cannula (HFNC) alters  intrathoracic pressure (ITP), potentially decreasing cardiac output (CO) due to  cardiopulmonary interactions. This study evaluated the impac

Result 3 (Score: 185.37):
Ascent to high altitude is accompanied by physiological responses that, to an  extent, mitigate the challenge of hypobaric hypoxia, maintaining arterial oxygen  content and convective oxygen delivery. Nevertheless, arterial oxygen tension  (pO2) remains low and tissue hypoxia pers


🧠 Enter your medical query (or 'exit'):  exit



📊 Summary of Model Performance Across Queries
| Model             |   Avg Top-1 Distance |   Avg Top-K Distance |   Queries Run |
|:------------------|---------------------:|---------------------:|--------------:|
| MiniLM (baseline) |             0.931602 |             0.940506 |             1 |
| BioBERT           |           173.519    |           179.144    |             1 |
| SapBERT           |           166.674    |           182.986    |             1 |
| SciBERT (NLI)     |           412.917    |           419.034    |             1 |
