In [1]:
# ==== CONSTANTS: model + dims + index name ====
# Hugging Face model id; passed to HuggingFaceEmbedding below and echoed in the evaluation banner.
MODEL_NAME = "NeuML/pubmedbert-base-embeddings"  # PubMedBERT embeddings
# Not referenced in the cells visible here — presumably the dimension used when creating the Pinecone index (TODO confirm).
EMBED_DIM  = 768                                  # PubMedBERT vector size
# Not referenced in the cells visible here — presumably consumed by a later Pinecone upload step (TODO confirm).
INDEX_NAME = "PubMedBERT-Index"                   # Pinecone index name for this notebook

# 1. Data Loading and Filtering Records with Focus (Primary or Secondary)

In [2]:
import json

# Load the full set of annotated records.
with open("Data/meta_test.json", "r", encoding="utf-8") as f:
    records = json.load(f)

# Keep only entries whose metadata "focus" list contains "primary" or "secondary".
# Missing "metadata" and missing/null "focus" are treated as "no focus" instead of
# raising, which matches how metadata_restructuring reads records (.get("metadata", {})).
filtered_records = [
    entry
    for entry in records
    if any(
        level in ("primary", "secondary")
        for level in ((entry.get("metadata") or {}).get("focus") or [])
    )
]

# Share of records that survived the filter; an empty input file reports 0.0
# rather than dividing by zero.
filtered_records_percent = (
    round((len(filtered_records) / len(records)) * 100, 2) if records else 0.0
)

print(f"Only {filtered_records_percent}% of entire records are Primary or Secondary ")



Only 85.42% of entire records are Primary or Secondary 


# 2. Data Restructuring

In [3]:
def metadata_restructuring(records):
    """Return one ``{"metadata": {...}}`` wrapper per input record.

    Each wrapper holds a shallow copy of the record's "metadata" dict (an empty
    dict when the record has none), extended with whichever of the top-level
    fields root_name, search_term, synonyms, PMID and pubmed_type are present
    on the record. A top-level value replaces a same-named metadata value.
    The input records themselves are not modified.
    """
    promoted_fields = ("root_name", "search_term", "synonyms", "PMID", "pubmed_type")

    def _wrap(entry):
        # Shallow copy so the assignments below never touch the caller's dict.
        merged = entry.get("metadata", {}).copy()
        merged.update({name: entry[name] for name in promoted_fields if name in entry})
        return {"metadata": merged}

    return [_wrap(entry) for entry in records]

# Rebuild the focus-filtered records so each one carries a single "metadata" dict
# that also includes the promoted top-level fields.
restructured_records = metadata_restructuring(filtered_records)

### Optional: Validation Checkpoint to get matching record from json_list

In [4]:
# def get_record_by_pmid(json_list, pmid):
#     """Pass PMID and get matching record from json_list"""
#     for record in json_list:
#         if record['metadata']['PMID'] == pmid:
#             return record
#     return None


# # Example usage:
# result = get_record_by_pmid(restructured_records, 11524119)

# if result:
#     print(json.dumps(result, indent=2))  # Prints the entire matching record
# else:
#     print("PMID not found")

# 3. Flattening the Data

In [5]:
# Flatten nested interventions/outcomes into parallel lists (element i of every
# intervention_* list describes the same intervention; likewise per outcome domain).
#
# NOTE(review): the flattened lists are written to the record's top level, while the
# node-building cell reads only record["metadata"], so they do not reach node
# metadata — confirm this is intended.
for record in restructured_records:
    metadata = record["metadata"]

    # Idempotence guard: once a record has been flattened its source keys are gone
    # from metadata. Without this check, re-running the cell would overwrite every
    # flattened list with [].
    if (
        "intervention_names" in record
        and "interventions" not in metadata
        and "outcomes" not in metadata
    ):
        continue

    # Process interventions with parallel indexing (`or []` also covers an explicit null).
    interventions = metadata.get("interventions") or []
    record["intervention_names"] = [i.get("ingredient") for i in interventions]
    record["intervention_dosages"] = [i.get("daily_dosage") for i in interventions]
    record["intervention_units"] = [i.get("units") or "" for i in interventions]
    record["intervention_original_texts"] = [i.get("original_text") for i in interventions]

    # Process outcomes with parallel indexing, one group of three lists per domain.
    # .get() is used (as for interventions) so an outcome missing a key yields
    # None / is skipped instead of aborting the whole cell with KeyError.
    outcomes = metadata.get("outcomes") or []
    for domain in ("biomarker", "function", "condition"):
        in_domain = [o for o in outcomes if o.get("domain") == domain]
        record[f"{domain}_names"] = [o.get("name") for o in in_domain]
        record[f"{domain}_types"] = [o.get("type") for o in in_domain]
        record[f"{domain}_results"] = [o.get("result") for o in in_domain]

    # Delete original detailed fields
    for key in ["interventions", "outcomes", "biomarkers", "functions", "conditions"]:
        metadata.pop(key, None)


In [6]:
# Snapshot the flattened records to disk as indented UTF-8 JSON; ensure_ascii=False
# writes non-ASCII characters as-is instead of \u escapes.
with open("Data/flatten.json", mode="w", encoding="utf-8") as flatten_file:
    json.dump(
        restructured_records,
        flatten_file,
        indent=2,
        ensure_ascii=False,
    )

# 4. Data Processing

### 4a. Converting into Embeddings and performing Semantic Chunking

In [7]:
import pandas as pd
from llama_index.core import Document, VectorStoreIndex
from llama_index.core.vector_stores import SimpleVectorStore
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from UPDATED_meta_data_generation import *
from dotenv import load_dotenv
from tqdm import tqdm
load_dotenv()

# --------------------------
# Embedding model + semantic chunker (both driven by MODEL_NAME)
# --------------------------
embed_model = HuggingFaceEmbedding(model_name=MODEL_NAME)

# The splitter embeds text with the same model that is later handed to the index.
splitter = SemanticSplitterNodeParser(
    embed_model=embed_model,
    buffer_size=1,
    breakpoint_percentile_threshold=95,
)

# Flat list of every title Document and abstract chunk, in record order.
all_nodes = []

# `idx` is not used by the loop body; it is kept so ad-hoc debug prints can reference it.
for idx, row in enumerate(tqdm(restructured_records, desc="Processing papers")):
    md = row["metadata"]  # per-paper metadata, spread onto every node built below
    # fetch_extract_and_abstract is presumably provided by the star import from
    # UPDATED_meta_data_generation — TODO confirm. It is indexed by 'title'/'abstract'.
    paper = fetch_extract_and_abstract(md['PMID'])
    title = paper['title']
    abstract = paper['abstract']

    # --------------------------
    # Title node: always node_index 0. `md` is unpacked last, so a metadata key
    # named "type" or "node_index" would take precedence over the literals.
    # --------------------------
    title_node = Document(text=title, metadata={"type": "title", "node_index": 0, **md})
    all_nodes.append(title_node)

    # --------------------------
    # Abstract nodes: one Document per abstract, split by the semantic chunker.
    # --------------------------
    abstract_doc = Document(text=abstract, metadata={"type": "abstract", **md})
    abstract_nodes = splitter.get_nodes_from_documents([abstract_doc])

    # Number the chunks from 1 (0 is the title), then add them in order.
    for i, node in enumerate(abstract_nodes, start=1):
        node.metadata["node_index"] = i
    all_nodes.extend(abstract_nodes)

print(f"Built {len(all_nodes)} nodes from {len(restructured_records)} records")

  from .autonotebook import tqdm as notebook_tqdm
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 32888f13-e89a-465a-9c2b-aeb025ba663d)')' thrown while requesting HEAD https://huggingface.co/NeuML/pubmedbert-base-embeddings/resolve/main/./modules.json
Retrying in 1s [Retry 1/5].
Processing papers: 100%|██████████| 82/82 [00:51<00:00,  1.61it/s]

Built 247 nodes from 82 records





### 4b. Storing Embedded Chunks into Local Storage

In [8]:
from llama_index.core import StorageContext

# In-memory vector store that will hold the chunk embeddings.
simple_vs = SimpleVectorStore()

# VectorStoreIndex takes its vector store from a StorageContext. A bare
# `vector_store=` keyword is not one of its parameters and is swallowed by
# **kwargs, which left `simple_vs` empty and unused; routing it through the
# storage context makes the index actually write into `simple_vs`.
storage_context = StorageContext.from_defaults(vector_store=simple_vs)

index = VectorStoreIndex(
    all_nodes,
    storage_context=storage_context,
    embed_model=embed_model,
    show_progress=True,
)

Generating embeddings: 100%|██████████| 247/247 [00:14<00:00, 17.52it/s]


# 5. Assessing Embedding Model Performance

In [9]:
import re

# Rank cutoff for the evaluation below: retrieved results are sliced to the first K before scoring.
K = 5  # top-k to evaluate (keep in sync with retriever)

# Natural-language evaluation queries, three per ingredient group.
# GOLD and RELEVANCE_RULES build their keys from positions in this list
# (test_queries[0] ... test_queries[11]), so inserting or reordering entries
# changes which query each gold set / rule is attached to.
test_queries = [
    #Cedarwood
    "Against Streptococcus mutans in vitro, how did cedarwood oil perform relative to cinnamon and lemongrass oils, and which assay was used to measure activity?",
    "In patients receiving radiotherapy, did inhaled aromatherapy (lavender/bergamot/cedarwood) reduce anxiety or mood symptoms versus control?",
    "Under supercritical CO2 vs. liquid CO2 extraction, how did temperature/pressure alter the cedrol/cedrene ratio in cedarwood oil?",
    #Eucalyptus
    "In patients with tinea pedis/corporis/cruris, what percent recovered completely vs. showed significant improvement after 14 days of Eucalyptus pauciflora oil/BSHT ointment?",
    "Which hepatic xenobiotic-metabolizing enzymes/biomarkers were induced in brushtail possums fed eucalyptus terpenes for 10 days?",
    "In children with acute/chronic maxillary sinusitis or peritonsillar abscess, what anti-inflammatory effects were reported with eucalymine?",
    #Tobacco
    "In Catalonia’s case–control study of new rheumatoid arthritis diagnoses, how did current vs. ex-smoking relate to RA risk?",
    "Among children prenatally exposed to toxic substances, how did tobacco exposure affect RNFL/GCL thickness on OCT compared with alcohol or other drugs?",
    "Inside private cars in Spain, what proportion of children were exposed to second-hand smoke, and what prevention implication did the authors note?",
    #Marjoram
    "With supercritical CO₂ vs. ethanol Soxhlet extraction of Origanum majorana, how did chlorophyll and carotenoid recovery compare?",
    "Do marjoram extracts inhibit DPP-IV and PTP1B in vitro, and how do they compare to rosemary/mexican oregano for these targets?",
    "Under NaCl (100 mmol L⁻¹) stress, how did marjoram’s essential-oil yield and major constituents (e.g., terpinen-4-ol, sabinene hydrate) change?",
]

# Optional: gold PMIDs (leave empty to use weak keyword rules)
# Maps each query string to the set of PMIDs (stored as strings) accepted as correct.
# The evaluation loop compares str(PMID) of each retrieved node against this set;
# a query with no entry or an empty set falls back to RELEVANCE_RULES instead.
GOLD = {
    #Cedarwood
    test_queries[0]: {"22430697"},
    test_queries[1]: {"12805340"},
    test_queries[2]: {"15080642"},
    #Eucalyptus
    test_queries[3]: {"10657767"},
    test_queries[4]: {"10661715"},
    test_queries[5]: {"10081402"},
    #Tobacco
    test_queries[6]: {"33060030"},
    test_queries[7]: {"30270036"},
    test_queries[8]: {"23608016"},
    #Marjoram
    test_queries[9]: {"11929287"},
    test_queries[10]: {"24881464"},
    test_queries[11]: {"22750822"},
}

# Fallback keyword rules, keyed by query string. The evaluation loop consults a rule
# only when the query has no gold PMIDs. weak_relevant() then counts a retrieved chunk
# as relevant when at least one "ingredient" term AND at least one "intent" term is
# found in the chunk text plus its root_name / synonyms / search_term / pubmed_type.
# Terms are written in lowercase because the searched text is lowercased first.
RELEVANCE_RULES = {
    # 0) Cedarwood vs cinnamon/lemongrass on S. mutans + assay
    test_queries[0]: {
        "ingredient": [
            "cedarwood", "cedrus", "cedar", "juniperus", "cedrol", "cedrene",
            "cinnamon", "cinnamomum", "lemongrass", "cymbopogon"
        ],
        "intent": [
            "antibacterial", "streptococcus", "s. mutans", "mutans",
            "agar well diffusion", "zone of inhibition", "mic", "minimum inhibitory concentration",
            "assay", "in vitro", "comparative", "relative"
        ],
    },

    # 1) Aromatherapy (lavender/bergamot/cedarwood) during radiotherapy → anxiety/mood
    test_queries[1]: {
        "ingredient": [
            "aromatherapy", "lavender", "bergamot", "cedarwood", "cedrus", "cedar",
            "inhalation", "inhaled"
        ],
        "intent": [
            "radiotherapy", "radiation therapy", "oncology",
            "anxiety", "mood", "depression", "stress", "randomized", "double blind", "control"
        ],
    },

    # 2) Cedarwood oil extraction: supercritical vs liquid CO2; cedrol/cedrene ratio; T/P
    test_queries[2]: {
        "ingredient": [
            "cedarwood", "cedrus", "juniperus", "cedrol", "cedrene", "essential oil"
        ],
        "intent": [
            "supercritical co2", "sc-co2", "liquid co2", "extraction",
            "temperature", "pressure", "ratio", "composition", "cedrol/cedrene"
        ],
    },

    # 3) Eucalyptus pauciflora antifungal clinical (tinea); BSHT; 14 days; outcomes
    test_queries[3]: {
        "ingredient": [
            "eucalyptus pauciflora", "eucalyptus", "bsht", "ointment", "topical"
        ],
        "intent": [
            "antifungal", "tinea", "dermatophyte", "pedis", "corporis", "cruris",
            "clinical", "patients", "recovered", "improvement", "14 days", "treatment"
        ],
    },

    # 4) Eucalyptus terpenes induce hepatic/xenobiotic enzymes in brushtail possums
    test_queries[4]: {
        "ingredient": [
            "eucalyptus", "terpenes", "dietary terpenes", "pauciflora"
        ],
        "intent": [
            "brushtail possum", "possum", "marsupial",
            "xenobiotic", "hepatic", "enzyme induction", "biomarker",
            "cytochrome p450", "cyp", "gst", "glutathione s-transferase",
            "udp-glucuronyltransferase", "phase i", "phase ii", "10 days"
        ],
    },

    # 5) Pediatric ENT: eucalymine anti-inflammatory; sinusitis / peritonsillar abscess
    test_queries[5]: {
        "ingredient": [
            "eucalymine", "eucalyptus", "preparation", "extract"
        ],
        "intent": [
            "children", "pediatric", "sinusitis", "maxillary sinusitis",
            "peritonsillar abscess", "anti-inflammatory", "ent", "otolaryngology",
            "clinical", "outcomes", "symptoms"
        ],
    },

    #Tobacco
    test_queries[6]: {"ingredient": ["tobacco", "smoking"], "intent": ["rheumatoid", "arthritis", "risk"]},
    test_queries[7]: {"ingredient": ["tobacco"], "intent": ["prenatal", "retina", "rnfl", "gcl", "oct"]},
    test_queries[8]: {"ingredient": ["tobacco", "smoke"], "intent": ["second-hand", "children", "vehicle", "car"]},
    #Marjoram
    test_queries[9]: {"ingredient": ["marjoram", "origanum majorana"], "intent": ["supercritical", "soxhlet", "chlorophyll", "carotenoid"]},
    test_queries[10]: {"ingredient": ["marjoram"], "intent": ["dpp-iv", "ptp1b", "inhibit"]},
    test_queries[11]: {"ingredient": ["marjoram", "origanum majorana"], "intent": ["nacl", "salt", "terpinen-4-ol", "sabinene", "essential oil", "yield"]},
}

In [10]:
def _to_text(x):
    """Coerce lists/dicts/None to a single string."""
    if x is None:
        return ""
    if isinstance(x, (list, tuple, set)):
        return " ".join(map(_to_text, x))
    if isinstance(x, dict):
        return " ".join(f"{k}:{_to_text(v)}" for k, v in x.items())
    return str(x)

def _norm(x):
    """Flatten x to text, collapse each whitespace run to one space, lowercase, and trim both ends."""
    collapsed = re.sub(r"\s+", " ", _to_text(x))
    return collapsed.lower().strip()

def weak_relevant(text, meta, rule):
    """Keyword-based relevance check used when a query has no gold PMIDs.

    Args:
        text: Chunk text to search.
        meta: Mapping that may carry "root_name", "synonyms", "search_term" and
            "pubmed_type"; missing keys (or ``None`` for the whole mapping) are
            treated as empty.
        rule: Dict with "ingredient" and "intent" term lists.

    Returns:
        True when at least one ingredient term AND at least one intent term
        occurs in the normalized text + metadata, else False.

    Raises:
        KeyError: if ``rule`` lacks "ingredient" or "intent".

    Terms must start at a word boundary. Plain substring containment let short
    terms match inside unrelated words ("ent" in "patients", "mic" in
    "antimicrobial", "car" in "carotenoid"). Anchoring only the start keeps stem
    matches such as "inhibit" -> "inhibits" / "inhibition". Terms are normalized
    like the haystack, and blank terms are ignored (an empty string would
    otherwise match everything).
    """
    meta = meta or {}
    hay = " ".join([
        _norm(text),
        _norm(meta.get("root_name")),
        _norm(meta.get("synonyms")),
        _norm(meta.get("search_term")),
        _norm(meta.get("pubmed_type")),
    ])

    def _mentions(terms):
        for term in terms:
            needle = _norm(term)
            # (?<!\w): the character before the term must not be a word character.
            if needle and re.search(r"(?<!\w)" + re.escape(needle), hay):
                return True
        return False

    # Evaluate both sides eagerly so a malformed rule fails loudly regardless of the text.
    ingr_ok = _mentions(rule["ingredient"])
    intent_ok = _mentions(rule["intent"])
    return ingr_ok and intent_ok

def hit_at_k(rels):  # rels = list[bool] for ranks 1..K
    """Return 1.0 when any of the ranked relevance flags is truthy, otherwise 0.0."""
    return float(any(rels))

def mrr(rels):
    """Reciprocal rank of the first truthy flag (ranks start at 1); 0.0 when none is truthy."""
    first_hit_rank = next(
        (rank for rank, flag in enumerate(rels, start=1) if flag),
        None,
    )
    if first_hit_rank is None:
        return 0.0
    return 1.0 / first_hit_rank

def _fmt_score(s):
    try:
        return f"{float(s):.4f}"
    except Exception:
        return "NA"

print(f"\n=== QUICK EVAL — {MODEL_NAME} ===")
# Request exactly K candidates so retrieval depth and the metric cutoff cannot drift
# apart (this was a hard-coded 5 that had to be kept in sync with K by hand).
retriever = index.as_retriever(similarity_top_k=K)

# Per-query metrics, collected for the summary printed after the loop.
hit_scores = []
mrr_scores = []

for q in test_queries:
    results = retriever.retrieve(q)[:K]
    rel_flags = []

    print(f"\nQuery: {q}")
    print("-" * 80)

    # Gold PMIDs take precedence; keyword rules are only a fallback for queries without them.
    gold = {str(x) for x in GOLD.get(q, set())}
    use_gold = len(gold) > 0

    for rank, res in enumerate(results, start=1):
        pmid = str(res.node.metadata.get("PMID"))
        text = (res.node.text or "").replace("\n", " ")
        meta = {
            "root_name": res.node.metadata.get("root_name"),
            "synonyms": res.node.metadata.get("synonyms"),
            "search_term": res.node.metadata.get("search_term"),
            "pubmed_type": res.node.metadata.get("pubmed_type"),
        }
        is_rel = (pmid in gold) if use_gold else weak_relevant(text, meta, RELEVANCE_RULES[q])
        rel_flags.append(bool(is_rel))

        preview = text[:160] + ("..." if len(text) > 160 else "")
        print(f"[{rank}] score={_fmt_score(getattr(res, 'score', None))}  PMID={pmid}  type={res.node.metadata.get('type')}  relevant={is_rel}")
        print(f"     {preview}")

    # Metrics (relevance is judged per retrieved chunk, so several chunks of the
    # same paper each count towards Relevant@K).
    h = hit_at_k(rel_flags)
    r = mrr(rel_flags)
    # Missing or None scores count as 0.0; max(..., 1) avoids dividing by zero on empty results.
    avg_score = (
        sum(float(getattr(r_, "score", 0.0) or 0.0) for r_ in results) / max(len(results), 1)
    )
    hit_scores.append(h)
    mrr_scores.append(r)

    print("-" * 80)
    print(f"Hit@{K}: {h:.2f}   MRR: {r:.3f}   AvgScore@{K}: {_fmt_score(avg_score)}   Relevant@{K}: {sum(rel_flags)}/{len(rel_flags)}")

# Aggregate over all queries so different embedding models can be compared on one line.
if hit_scores:
    print("\n" + "=" * 80)
    print(
        f"Queries: {len(hit_scores)}   "
        f"Mean Hit@{K}: {sum(hit_scores) / len(hit_scores):.2f}   "
        f"Mean MRR: {sum(mrr_scores) / len(mrr_scores):.3f}"
    )


=== QUICK EVAL — NeuML/pubmedbert-base-embeddings ===

Query: Against Streptococcus mutans in vitro, how did cedarwood oil perform relative to cinnamon and lemongrass oils, and which assay was used to measure activity?
--------------------------------------------------------------------------------
[1] score=0.6918  PMID=22430697  type=abstract  relevant=True
     Agar well diffusion assay was used to measure antibacterial activity. Zone of inhibition was measured around the filter paper in millimeters with vernier calipe...
[2] score=0.6646  PMID=22430697  type=title  relevant=True
     Antimicrobial activity of commercially available essential oils against Streptococcus mutans.
[3] score=0.6580  PMID=22430697  type=abstract  relevant=True
     Many essential oils have been advocated for use in complementary medicine for bacterial and fungal infections. However, few of the many claims of therapeutic ef...
[4] score=0.4957  PMID=10399193  type=abstract  relevant=False
     It is notew

In [11]:
# Echo the query -> gold-PMID mapping as the cell output for a quick visual check.
GOLD

{'Against Streptococcus mutans in vitro, how did cedarwood oil perform relative to cinnamon and lemongrass oils, and which assay was used to measure activity?': {'22430697'},
 'In patients receiving radiotherapy, did inhaled aromatherapy (lavender/bergamot/cedarwood) reduce anxiety or mood symptoms versus control?': {'12805340'},
 'Under supercritical CO2 vs. liquid CO2 extraction, how did temperature/pressure alter the cedrol/cedrene ratio in cedarwood oil?': {'15080642'},
 'In patients with tinea pedis/corporis/cruris, what percent recovered completely vs. showed significant improvement after 14 days of Eucalyptus pauciflora oil/BSHT ointment?': {'10657767'},
 'Which hepatic xenobiotic-metabolizing enzymes/biomarkers were induced in brushtail possums fed eucalyptus terpenes for 10 days?': {'10661715'},
 'In children with acute/chronic maxillary sinusitis or peritonsillar abscess, what anti-inflammatory effects were reported with eucalymine?': {'10081402'},
 'In Catalonia’s case–contr