# 1. Data Loading and Filtering Records with Focus (Primary or Secondary)

In [42]:
import json
with open("Data/meta_test.json", "r", encoding="utf-8") as f:
    records = json.load(f)

# Keep only the entries whose metadata "focus" list contains "primary" or "secondary".
# `or {}` / `or []` make a missing or null "metadata"/"focus" count as "no focus"
# instead of raising KeyError/TypeError.
filtered_records = [
    entry
    for entry in records
    if any(
        level in ("primary", "secondary")
        for level in ((entry.get("metadata") or {}).get("focus") or [])
    )
]

# Share of records kept by the focus filter, in percent (2 decimals).
# An empty input file yields 0.0 rather than a ZeroDivisionError.
filtered_records_percent = (
    round((len(filtered_records) / len(records)) * 100, 2) if records else 0.0
)

print(f"Only {filtered_records_percent}% of entire records are Primary or Secondary ")



Only 85.42% of entire records are Primary or Secondary 


# 2. Data Restructuring

In [43]:
def list_all_roots(records):
    """Return the distinct ``root_name`` values found in ``records``, sorted.

    Each name is converted to ``str`` and stripped of surrounding whitespace.
    Records with no ``metadata`` key or with a falsy ``root_name`` are skipped.
    """
    raw_names = (rec.get("metadata", {}).get("root_name") for rec in records)
    return sorted({str(name).strip() for name in raw_names if name})

def build_alias_map(records):
    """Build a lowercase ``alias -> root`` lookup from the records' metadata.

    Every non-empty ``root_name`` maps to itself, and each of its ``synonyms``
    maps to that root. Keys and values are stripped and lowercased.

    Args:
        records: iterable of dicts, each optionally holding a ``metadata`` dict
            with ``root_name`` and ``synonyms``.

    Returns:
        dict mapping alias (str) to root name (str).

    Notes:
        * ``None`` or blank synonyms are ignored, so they never create
          ``"none"`` or ``""`` aliases.
        * A bare string in ``synonyms`` is treated as a single alias rather
          than being iterated character by character.
        * A root name always maps to itself, even when another record lists it
          as a synonym. Between ordinary synonyms the last record wins.
    """
    alias2root = {}
    roots = set()
    synonym_pairs = []  # (alias, root) pairs, applied after all roots are known

    for r in records:
        md = r.get("metadata") or {}
        root = str(md.get("root_name") or "").strip().lower()
        if not root:
            continue
        roots.add(root)
        alias2root[root] = root  # map root to itself

        syns = md.get("synonyms") or []
        if isinstance(syns, str):
            syns = [syns]
        for s in syns:
            if s is None:
                continue
            alias = str(s).strip().lower()
            if alias:
                synonym_pairs.append((alias, root))

    # Second pass: synonyms must not override a root's self-mapping.
    for alias, root in synonym_pairs:
        if alias not in roots:
            alias2root[alias] = root
    return alias2root

def metadata_restructuring(records):
    """Return new records of the form ``{"metadata": {...}}``.

    Each output metadata dict is a shallow copy of the input record's
    ``metadata`` (empty when absent). The top-level fields ``root_name``,
    ``search_term``, ``synonyms``, ``PMID`` and ``pubmed_type`` are copied in
    when the record has them, overriding same-named metadata keys. Any other
    top-level field is dropped. The input records are not modified.
    """
    promoted_fields = ("root_name", "search_term", "synonyms", "PMID", "pubmed_type")

    def _merge(source):
        merged = source.get("metadata", {}).copy()  # shallow copy keeps the input untouched
        merged.update({name: source[name] for name in promoted_fields if name in source})
        return {"metadata": merged}

    return [_merge(rec) for rec in records]

# Fold the promoted top-level fields into each record's metadata
# (only the focus-filtered records are carried forward).
restructured_records = metadata_restructuring(filtered_records)

# ALIAS2ROOT: lowercased alias -> lowercased root name lookup.
# ALL_ROOTS: sorted list of the distinct root names that lookup maps to.
ALIAS2ROOT = build_alias_map(restructured_records) 
ALL_ROOTS  = sorted(set(ALIAS2ROOT.values()))

### Optional: Validation Checkpoint to get matching record from json_list

In [44]:
# def get_record_by_pmid(json_list, pmid):
#     """Pass PMID and get matching record from json_list"""
#     for record in json_list:
#         if record['metadata']['PMID'] == pmid:
#             return record
#     return None


# # Example usage:
# result = get_record_by_pmid(restructured_records, 11524119)

# if result:
#     print(json.dumps(result, indent=2))  # Prints the entire matching record
# else:
#     print("PMID not found")

# 3. Flattening the Data

In [45]:
def _flatten_record(record):
    """Flatten and normalise one restructured record in place.

    * Builds parallel (index-aligned) lists from ``metadata["interventions"]``
      and ``metadata["outcomes"]`` and stores them on ``record`` itself, next
      to the ``"metadata"`` key.
    * Normalises types and casing of metadata fields used for filtering.
    * Removes the nested source fields from ``metadata``.

    Safe to re-run: once the nested source fields have been removed, the
    flattened lists produced by the first run are left untouched instead of
    being overwritten with empty lists.

    NOTE(review): the flattened lists live on the record, not inside
    ``record["metadata"]``. The ingestion cell only reads ``row["metadata"]``,
    so these lists are not part of the indexed metadata. Confirm that is
    intended.
    """
    metadata = record["metadata"]

    # Interventions -> parallel lists. Rebuild only when the source is still
    # present, or when the lists were never created.
    if "interventions" in metadata or "intervention_names" not in record:
        interventions = metadata.get("interventions") or []
        record["intervention_names"] = [i.get("ingredient") for i in interventions]
        record["intervention_dosages"] = [i.get("daily_dosage") for i in interventions]
        record["intervention_units"] = [i.get("units") or "" for i in interventions]
        record["intervention_original_texts"] = [i.get("original_text") for i in interventions]

    # Outcomes -> one name/type/result triple of parallel lists per domain.
    # `.get` mirrors the interventions handling, so an outcome missing a key
    # yields None rather than aborting the whole cell.
    if "outcomes" in metadata or "biomarker_names" not in record:
        outcomes = metadata.get("outcomes") or []
        for domain in ("biomarker", "function", "condition"):
            in_domain = [o for o in outcomes if o.get("domain") == domain]
            record[f"{domain}_names"] = [o.get("name") for o in in_domain]
            record[f"{domain}_types"] = [o.get("type") for o in in_domain]
            record[f"{domain}_results"] = [o.get("result") for o in in_domain]

    # Force consistent types for filtering: numeric year, string PMID.
    year = metadata.get("published_year")
    if isinstance(year, str) and year.isdigit():
        metadata["published_year"] = int(year)
    if "PMID" in metadata:
        metadata["PMID"] = str(metadata["PMID"])

    # Lowercase/canonicalize list fields (beyond synonyms), dropping null/blank items.
    for key in ("study_type", "species", "experimental_model", "usage",
                "keywords", "benefits", "diseases", "symptoms", "sample_gender"):
        if isinstance(metadata.get(key), list):
            metadata[key] = [
                str(x).strip().lower()
                for x in metadata[key]
                if x is not None and str(x).strip()
            ]

    # Lowercase the single-string fields used for matching.
    for key in ("population", "location"):
        if isinstance(metadata.get(key), str):
            metadata[key] = metadata[key].strip().lower()

    # Synonyms: unique, lowercased and sorted; anything that is not a list becomes [].
    syns = metadata.get("synonyms") or []
    if isinstance(syns, list):
        syns = sorted({str(x).strip().lower() for x in syns if x is not None and str(x).strip()})
    else:
        syns = []
    metadata["synonyms"] = syns

    # Make sure the root name itself can be matched through "synonyms".
    if metadata.get("root_name"):
        rn = str(metadata["root_name"]).strip().lower()
        if rn and rn not in metadata["synonyms"]:
            metadata["synonyms"].append(rn)

    # Delete the original nested fields now that they have been flattened.
    for key in ["interventions", "outcomes", "biomarkers", "functions", "conditions"]:
        metadata.pop(key, None)


for record in restructured_records:
    _flatten_record(record)


In [46]:
# Persist the flattened records as pretty-printed UTF-8 JSON (non-ASCII kept verbatim).
with open("Data/flatten.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(restructured_records, indent=2, ensure_ascii=False))

# 4. Data Ingestion into PineCone 

### 4a. Converting into Embeddings and performing Semantic Chunking

In [47]:
# %pip install -U \
#   pandas \
#   "llama-index" \
#   "llama-index-embeddings-huggingface" \
#   "llama-index-vector-stores-pinecone" \
#   "llama-index-retrievers-bm25" \
#   pinecone-client \
#   "sentence-transformers" \
#   transformers \
#   "torch" \
#   python-dotenv \
#   tqdm \
#     biopython

In [48]:
#pip install "numpy<2"

In [50]:
import pandas as pd
from llama_index.core import Document, VectorStoreIndex, StorageContext
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec
from UPDATED_meta_data_generation import *
from dotenv import load_dotenv
import os
from tqdm import tqdm

# Load environment variables (API keys) from a .env file, if one is present.
load_dotenv()

# --------------------------
# Initialize Pinecone
# --------------------------
INDEX_NAME = "pubmed-abstracts-v5"
# The API key is read from the PINECONE_API environment variable (None when unset).
client = Pinecone(api_key=os.getenv("PINECONE_API"))
spec = ServerlessSpec(cloud="aws", region="us-east-1")

# Create the index only when no index with this name exists yet.
# NOTE(review): dimension=768 presumably matches the output size of the embedding
# model configured below; confirm before switching models.
if INDEX_NAME not in client.list_indexes().names():
    client.create_index(
        name=INDEX_NAME,
        dimension=768,
        metric="cosine",
        spec=spec
    )

# Handle to the index, wrapped as a LlamaIndex vector store.
pinecone_index = client.Index(INDEX_NAME)
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)

# --------------------------
# Initialize embedding + semantic chunker
# --------------------------
embed_model = HuggingFaceEmbedding(model_name="NeuML/pubmedbert-base-embeddings")

# The splitter uses the same embedding model to decide where to break the text.
splitter = SemanticSplitterNodeParser(
    buffer_size=1,
    breakpoint_percentile_threshold=95,
    embed_model=embed_model
)


# --------------------------
# Build all semantic nodes
# --------------------------
# Collects, for every record, one title Document followed by the semantic
# chunks of its abstract.
all_nodes = []

for idx, row in enumerate(tqdm(restructured_records, desc="Processing papers")):
    md = row["metadata"]
    # fetch_extract_and_abstract is not defined in this notebook (presumably it
    # comes from the star import of UPDATED_meta_data_generation). It is used
    # here as returning a mapping with 'title' and 'abstract'.
    # TODO confirm what it returns when a paper has no title or abstract.
    paper = fetch_extract_and_abstract(md['PMID'])
    title = paper['title']
    abstract = paper['abstract']

    # Title node
    # Keys from md are unpacked last, so an md key named "type" or "node_index"
    # would override the values set here.
    title_node = Document(
        text=title,
        metadata={"type": "title", "node_index": 0, **md}
    )
    title_node.doc_id = f"{md['PMID']}:0"
    all_nodes.append(title_node)

    # Abstract nodes
    abstract_doc = Document(
        text=abstract,
        metadata={"type": "abstract", **md}
    )
    abstract_doc.doc_id = str(md["PMID"])

    # Split the abstract into chunks and number them from 1 (0 is the title).
    # NOTE(review): only the title node and the parent abstract document get an
    # explicit id in this cell; the chunk nodes keep whatever id the splitter
    # assigns. If those ids are not stable across runs, re-running ingestion
    # would add vectors instead of overwriting them. Check against the index stats.
    abstract_nodes = splitter.get_nodes_from_documents([abstract_doc])
    for i, node in enumerate(abstract_nodes, start=1):
        node.metadata["node_index"] = i
        all_nodes.append(node)

# --------------------------
# Save nodes both to Pinecone (for vector) and local docstore
# --------------------------
# print("Indexing nodes into Pinecone and persisting locally...")

# index = VectorStoreIndex(
#     all_nodes,
#     storage_context=storage_context,
#     embed_model=embed_model,
#     show_progress=True
# )

# Persist docstore + metadata to disk
        
# --------------------------
# Create a persistent docstore
# --------------------------
# Local document store holding every title/chunk node built above.
docstore = SimpleDocumentStore()

docstore.add_documents(all_nodes)

# Bundle the Pinecone-backed vector store with the local docstore.
storage_context = StorageContext.from_defaults(
    vector_store=vector_store,
    docstore=docstore
)

# Write the storage context to the "pubmed_nodes" directory; a later cell
# reloads the docstore from that directory.
storage_context.persist(persist_dir="pubmed_nodes")



Processing papers: 100%|██████████| 82/82 [00:49<00:00,  1.64it/s]


### 4b. Injecting Embedded Chunks into PineCone

In [51]:
# --------------------------
# 4b. Store nodes in Pinecone (cloud) via LlamaIndex
# --------------------------
# Start from an empty index bound to the Pinecone-backed storage context,
# then embed and upsert all nodes in one call.
index = VectorStoreIndex(
    [],
    storage_context=storage_context,
    embed_model=embed_model,
)
if not all_nodes:
    print("WARNING: No nodes to upsert.")
else:
    index.insert_nodes(all_nodes, show_progress=True)

Generating embeddings:   0%|          | 0/247 [00:00<?, ?it/s]

Upserted vectors:   0%|          | 0/247 [00:00<?, ?it/s]

In [52]:
# Fetch the index statistics from Pinecone (dimension, metric, vector count per
# namespace) and show them as the cell output.
stats = pinecone_index.describe_index_stats()
stats

{'dimension': 768,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 493}},
 'total_vector_count': 493,
 'vector_type': 'dense'}

# Debugging: Similarity Search 

In [61]:
from llama_index.core.vector_stores import MetadataFilter, MetadataFilters, FilterCondition
from openai import OpenAI
import os, json

# Read the OpenAI key from the environment (None when the variable is unset).
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# NOTE(review): this rebinds `client`, which the ingestion cell bound to the
# Pinecone client. Any later code expecting the Pinecone client under this name
# will get the OpenAI client instead.
client = OpenAI(api_key=OPENAI_API_KEY)

def llm_extract_facets_simple(query_text: str) -> dict:
    """Ask the LLM for retrieval facets and return them in a normalised shape.

    Args:
        query_text: the user's free-text query.

    Returns:
        A dict that always has exactly this structure, whatever the model sent::

            {
              "candidate_ingredients": [str],
              "year_min": int|None,
              "year_max": int|None,
              "hints": {
                "study_type": [str],
                "species": [str],
                "population": str|None,
                "benefits": [str],
                "diseases": [str],
                "symptoms": [str],
                "location": str|None
              }
            }

        Unparseable, missing or wrongly typed values degrade to ``[]`` / ``None``.

    Raises:
        Whatever the OpenAI client raises for a failed request; only response
        parsing is handled on a best-effort basis.
    """
    system = (
        "You extract minimal retrieval facets for PubMed-style search.\n"
        "Return STRICT JSON with EXACT keys: candidate_ingredients, year_min, year_max, hints.\n"
        "The 'hints' value MUST be a JSON OBJECT with EXACT keys:\n"
        "  study_type (list of strings),\n"
        "  species (list of strings),\n"
        "  population (string or null),\n"
        "  benefits (list of strings),\n"
        "  diseases (list of strings),\n"
        "  symptoms (list of strings),\n"
        "  location (string or null).\n"
        "Do NOT return 'hints' as an array. Use null for unknown scalars and [] for unknown lists.\n"
        "Return ONLY valid JSON. No extra keys."
    )

    # NOTE(review): the user message refers to a schema "shown below", but no
    # example is appended; the schema is only described in the system prompt.
    # The wording is kept as-is so the model input does not change.
    user = (
        f"Query: {query_text}\n"
        "Return only JSON in the exact schema shown below (keys and types must match):\n"
    )

    resp = client.chat.completions.create(
        model="gpt-4o-mini",
        response_format={"type": "json_object"},
        messages=[{"role": "system", "content": system},
                  {"role": "user", "content": user}],
        temperature=0.0,
    )
    try:
        data = json.loads(resp.choices[0].message.content)
    except Exception:
        # Best effort: an empty or malformed reply yields an all-empty facet dict.
        data = {}
    if not isinstance(data, dict):
        data = {}

    # --- Minimal post-parse normalization ---
    hints = data.get("hints")
    if isinstance(hints, list):
        # Use the optional project helper when it is available in this
        # namespace. It is not defined in this notebook, so fall back to
        # "no hints" instead of raising NameError.
        coerce = globals().get("_coerce_hints_list_to_object")
        hints = coerce(hints) if callable(coerce) else {}
    if not isinstance(hints, dict):
        hints = {}

    def _str_list(value):
        """Coerce a list (or a single string) to a list of non-blank strings."""
        if isinstance(value, str):
            value = [value]
        if not isinstance(value, (list, tuple)):
            return []
        return [str(x) for x in value if x is not None and str(x).strip()]

    def _int_or_none(value):
        """Keep real integers only (bool is a subclass of int and is rejected)."""
        return value if isinstance(value, int) and not isinstance(value, bool) else None

    def _str_or_none(value):
        return value if isinstance(value, str) else None

    return {
        "candidate_ingredients": _str_list(data.get("candidate_ingredients")),
        "year_min": _int_or_none(data.get("year_min")),
        "year_max": _int_or_none(data.get("year_max")),
        "hints": {
            "study_type": _str_list(hints.get("study_type")),
            "species":    _str_list(hints.get("species")),
            "population": _str_or_none(hints.get("population")),
            "benefits":   _str_list(hints.get("benefits")),
            "diseases":   _str_list(hints.get("diseases")),
            "symptoms":   _str_list(hints.get("symptoms")),
            "location":   _str_or_none(hints.get("location")),
        },
    }

    
def llm_map_to_roots(candidates: list[str], allowed_roots: list[str]) -> list[str]:
    """Map free-text ingredient mentions to canonical root names via the LLM.

    The model is given the candidates and the allowed roots and asked for
    ``{"roots": [...]}``. Only returned names that appear verbatim in
    ``allowed_roots`` are kept. The result is de-duplicated and sorted.

    Returns ``[]`` without calling the model when ``candidates`` is empty, and
    ``[]`` when the reply cannot be parsed.
    """
    if not candidates:
        return []

    messages = [
        {
            "role": "system",
            "content": (
                "Map ingredient mentions to canonical names from a provided list. "
                "If no match, omit it. Return JSON: {\"roots\": [..canonical names..]}"
            ),
        },
        {
            "role": "user",
            "content": (
                "Candidates: " + json.dumps(candidates) + "\n"
                "Allowed roots: " + json.dumps(allowed_roots) + "\n"
                "Return only JSON."
            ),
        },
    ]
    resp = client.chat.completions.create(
        model="gpt-4o-mini",
        response_format={"type": "json_object"},
        messages=messages,
        temperature=0.0,
    )

    try:
        payload = json.loads(resp.choices[0].message.content)
        # Discard anything the model invented that is not in the allowed list.
        return sorted({name for name in payload.get("roots", []) if name in allowed_roots})
    except Exception:
        return []

def pinecone_filters_from_facets(roots: list[str], year_min: int|None, year_max: int|None):
    fs = []
    if roots:
        # we indexed root + aliases in "synonyms"; this matches both
        fs.append(MetadataFilter(key="synonyms", operator="in", value=[r.lower() for r in roots]))
    if isinstance(year_min, int):
        fs.append(MetadataFilter(key="published_year", operator=">=", value=year_min))
    if isinstance(year_max, int):
        fs.append(MetadataFilter(key="published_year", operator="<=", value=year_max))
    return MetadataFilters(filters=fs, condition=FilterCondition.AND) if fs else None


In [62]:
def _lc_set(x):
    if isinstance(x, list): return {str(v).strip().lower() for v in x if v is not None}
    if isinstance(x, str):  return {x.strip().lower()}
    return set()

def rerank_with_facets(candidates, hints: dict, weights=None, top_k=20):
    """
    candidates: list[NodeWithScore] from vector retrieval
    hints: dict (study_type, species, population, benefits, diseases, symptoms, location)
    weights: per-field boosts (default small)
    """
    if weights is None:
        weights = {
            "study_type": 0.10,
            "species": 0.08,
            "population": 0.06,
            "benefits": 0.06,
            "diseases": 0.06,
            "symptoms": 0.06,
            "location": 0.05,
        }

    # normalize hints to lowercase sets/strings
    h = {
        "study_type": _lc_set(hints.get("study_type", [])),
        "species":    _lc_set(hints.get("species", [])),
        "population": (hints.get("population") or "").strip().lower(),
        "benefits":   _lc_set(hints.get("benefits", [])),
        "diseases":   _lc_set(hints.get("diseases", [])),
        "symptoms":   _lc_set(hints.get("symptoms", [])),
        "location":   (hints.get("location") or "").strip().lower(),
    }

    rescored = []
    for n in candidates:
        base = float(n.score or 0.0)
        md = n.node.metadata or {}

        # list overlaps
        def jaccard(a, b):
            if not a or not b: return 0.0
            inter = len(a & b); uni = len(a | b)
            return inter / uni if uni else 0.0

        st = jaccard(_lc_set(md.get("study_type", [])), h["study_type"])
        sp = jaccard(_lc_set(md.get("species", [])), h["species"])
        bf = jaccard(_lc_set(md.get("benefits", [])), h["benefits"])
        ds = jaccard(_lc_set(md.get("diseases", [])), h["diseases"])
        sy = jaccard(_lc_set(md.get("symptoms", [])), h["symptoms"])

        # scalar contains
        pop_hit = 1.0 if h["population"] and isinstance(md.get("population"), str) and h["population"] in md["population"].lower() else 0.0
        loc_hit = 1.0 if h["location"] and isinstance(md.get("location"), str) and h["location"] in md["location"].lower() else 0.0

        boost = (
            weights["study_type"] * st +
            weights["species"]    * sp +
            weights["benefits"]   * bf +
            weights["diseases"]   * ds +
            weights["symptoms"]   * sy +
            weights["population"] * pop_hit +
            weights["location"]   * loc_hit
        )
        rescored.append((base + boost, n))

    rescored.sort(key=lambda x: x[0], reverse=True)
    return [n for _, n in rescored[:top_k]]

In [63]:
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.retrievers import QueryFusionRetriever

def make_minimal_hybrid(index, docstore, filters=None, vec_k=40, bm25_k=50, final_k=20):
    """Build a fusion retriever combining vector search and BM25.

    The vector retriever is created from ``index`` with ``filters`` applied and
    returns up to ``vec_k`` hits. The BM25 retriever is built over ``docstore``
    and returns up to ``bm25_k`` hits; it is not given the filters. Results are
    fused with reciprocal-rank reranking into ``final_k`` hits, and the query
    is used as-is (``num_queries=1``).
    """
    retrievers = [
        index.as_retriever(similarity_top_k=vec_k, filters=filters),
        BM25Retriever.from_defaults(docstore=docstore, similarity_top_k=bm25_k),
    ]
    return QueryFusionRetriever(
        retrievers=retrievers,
        mode="reciprocal_rerank",
        num_queries=1,
        similarity_top_k=final_k,
    )

def _passes_simple_filters(md, roots_lc, year_min, year_max):
    # roots via synonyms (you indexed all aliases in "synonyms")
    if roots_lc:
        syns = {str(x).strip().lower() for x in (md.get("synonyms") or [])}
        if not syns.intersection(set(roots_lc)):
            return False
    # year range
    y = md.get("published_year")
    if isinstance(y, int):
        if year_min is not None and y < year_min: return False
        if year_max is not None and y > year_max: return False
    return True

In [None]:
def map_candidates_to_roots_via_alias(cands, alias2root):
    """Translate candidate mentions into root names through the alias table.

    Each candidate is stripped and lowercased before lookup, and unknown
    candidates are dropped. Returns the distinct roots in sorted order.
    """
    keys = (str(cand).strip().lower() for cand in cands)
    return sorted({alias2root[key] for key in keys if key in alias2root})

def rag_retrieve(query_text, vec_top_k=50, final_top_k=20):
    """Run the full retrieval pipeline for one query.

    Steps: extract facets with the LLM, map ingredient mentions to roots via
    ``ALIAS2ROOT``, retrieve with the hybrid (vector + BM25) retriever,
    re-apply the ingredient/year filters locally, fall back to unfiltered
    vector search when nothing is left, then re-rank with the facet hints.

    Returns:
        ``(reranked_nodes, used_filters, hints)``, where ``used_filters`` is a
        dict with ``synonyms``, ``year_min`` and ``year_max``.
    """
    # 1) LLM -> facets (printed for debugging)
    facets = llm_extract_facets_simple(query_text)
    print(facets)
    year_lo = facets.get("year_min")
    year_hi = facets.get("year_max")

    # 2) ingredient aliases -> canonical roots
    roots = map_candidates_to_roots_via_alias(facets.get("candidate_ingredients", []), ALIAS2ROOT)

    # 3) metadata filters for the vector side (ingredient + year)
    filters = pinecone_filters_from_facets(roots=roots, year_min=year_lo, year_max=year_hi)

    # 4) hybrid retrieval: only the vector retriever receives the filters
    hybrid = make_minimal_hybrid(
        index=index,
        docstore=docstore,
        filters=filters,
        vec_k=vec_top_k,
        bm25_k=max(vec_top_k, 50),  # small bump to keep BM25 recall decent
        final_k=max(final_top_k, 20),
    )
    candidates = hybrid.retrieve(query_text)

    # 4b) apply the same filters locally so unfiltered BM25 hits get trimmed
    roots_lc = [r.lower() for r in roots] if roots else []
    if roots_lc or year_lo is not None or year_hi is not None:
        candidates = [
            hit for hit in candidates
            if _passes_simple_filters(hit.node.metadata, roots_lc, year_lo, year_hi)
        ]

    # Nothing survived: relax completely and use plain vector search.
    if not candidates:
        candidates = index.as_retriever(similarity_top_k=vec_top_k).retrieve(query_text)

    # 5) facet-aware soft re-rank
    raw_hints = facets.get("hints")
    hints = raw_hints if isinstance(raw_hints, dict) else {}
    reranked = rerank_with_facets(candidates, hints, top_k=final_top_k)

    used_filters = {"synonyms": roots, "year_min": year_lo, "year_max": year_hi}
    return reranked, used_filters, hints

# NOT HYBRID
# NOT HYBRID
# NOT HYBRID
# def rag_retrieve(query_text, vec_top_k=50, final_top_k=20):
#     # 1) LLM → facets
#     raw = llm_extract_facets_simple(query_text)

#     # 2) map ingredient aliases → roots
#     roots = map_candidates_to_roots_via_alias(raw.get("candidate_ingredients", []), ALIAS2ROOT)

#     # 3) build ultra-simple pinecone filters (ingredient + year)
#     filters = pinecone_filters_from_facets(roots, raw.get("year_min"), raw.get("year_max"))

#     # 4) vector retrieve (first with filters, fall back without if 0)
#     vret = index.as_retriever(similarity_top_k=vec_top_k, filters=filters)
#     cands = vret.retrieve(query_text)
#     if not cands and filters is not None:
#         # fallback: remove filters entirely, keep recall
#         vret = index.as_retriever(similarity_top_k=vec_top_k)
#         cands = vret.retrieve(query_text)

#     # 5) facet-aware re-rank (soft boosts)
#     hints = raw.get("hints") if isinstance(raw.get("hints"), dict) else {}
#     reranked = rerank_with_facets(cands, hints, top_k=final_top_k)

#     # return also a compact view of the (used) filters
#     used_filters = {"synonyms": roots, "year_min": raw.get("year_min"), "year_max": raw.get("year_max")}
#     return reranked, used_filters, raw.get("hints", {})


In [70]:
# Create a retriever for similarity search
#retriever = index.as_retriever(similarity_top_k=5)  # retrieve top 5 similar chunks
# Example query exercising ingredient, year and study-type facets.
query_text = "Show human randomized, double-blind clinical trials since the late 1990s where cedarwood oil was used topically to treat alopecia areata and report primary clinical outcomes."
hits, used_filters, hints = rag_retrieve(
    query_text,
    vec_top_k=50,
    final_top_k=25
)

print("used_filters:", used_filters)
print("hints:", hints)
print("Number of hits:", len(hits))
# The printed score is the retriever's own score: rerank_with_facets re-orders
# the hits but does not write the boosted score back, so the values shown need
# not be in descending order.
for r in hits:
    md = r.node.metadata
    print(f"Score {r.score:.4f} | PMID {md.get('PMID')} | {md.get('type')} | year {md.get('published_year')} | root {md.get('root_name')}")

print("\n")
# results = retriever.retrieve(query_text)
# for res in results:
#     print("Score:", res.score)
#     print("Text:", res.node.text)
#     print("PMID:", res.node.metadata.get("PMID"))
#     print("Type:", res.node.metadata.get("type"))
#     print("-" * 80)


{'candidate_ingredients': ['cedarwood oil'], 'year_min': 1997, 'year_max': 2023, 'hints': {'study_type': ['randomized controlled trial', 'double-blind study'], 'species': ['Homo sapiens'], 'population': None, 'benefits': ['improvement in hair regrowth', 'reduction in hair loss'], 'diseases': ['alopecia areata'], 'symptoms': ['hair loss'], 'location': None}}
used_filters: {'synonyms': ['cedarwood'], 'year_min': 1997, 'year_max': 2023}
hints: {'study_type': ['randomized controlled trial', 'double-blind study'], 'species': ['Homo sapiens'], 'population': None, 'benefits': ['improvement in hair regrowth', 'reduction in hair loss'], 'diseases': ['alopecia areata'], 'symptoms': ['hair loss'], 'location': None}
Number of hits: 21
Score 0.0492 | PMID 9828867 | abstract | year 1998 | root Cedarwood
Score 0.0479 | PMID 9828867 | abstract | year 1998 | root Cedarwood
Score 0.0328 | PMID 9828867 | title | year 1998 | root Cedarwood
Score 0.0426 | PMID 12805340 | abstract | year 2003 | root Cedarwo

In [None]:
#12805340
#"I’m looking for solid human studies on cedar leaf oil to help with anxiety during radiotherapy. Can you pull primary-outcome evidence since about 2000, inhalation use only, and ignore chemistry/extraction papers?"

#12805340
#"Find human randomized, double-blind studies where cedarwood or cedar leaf oil was delivered by inhalation to patients undergoing radiotherapy to reduce anxiety, from ~2000 onward. Ignore extraction or chemistry papers."

#9828867
#"Show human randomized, double-blind clinical trials since the late 1990s where cedarwood oil was used topically to treat alopecia areata and report primary clinical outcomes."

# Debugging: Reconstructing the Paper

In [None]:
# Function to reconstruct a paper from nodes
def reconstruct_paper(all_nodes, pmid):
    """Reassemble a paper's text from its nodes.

    Selects the nodes whose ``PMID`` metadata equals ``pmid`` (compared as
    strings), orders them by ``node_index`` (missing index counts as 0), prints
    how many were found, and returns their texts joined by newlines.
    """
    wanted = str(pmid)
    matching = (n for n in all_nodes if str(n.metadata.get("PMID")) == wanted)
    ordered = sorted(matching, key=lambda n: n.metadata.get("node_index", 0))
    print("Noumber of Nodes:",len(ordered))
    return "\n".join(n.text for n in ordered)

# Example usage
# Rebuild the first record's paper from the in-memory nodes as a sanity check.
pmid_to_reconstruct = restructured_records[0]['metadata']['PMID']
full_paper_text = reconstruct_paper(all_nodes, pmid_to_reconstruct)

print("Reconstructed Paper Text:")
print(full_paper_text)


### Optional: Delete PineCone Index

In [None]:
# from pinecone import Pinecone
# import os
# from dotenv import load_dotenv
# load_dotenv()

# INDEX_NAME = "pubmed-abstracts"

# # Initialize Pinecone client
# client = Pinecone(api_key=os.getenv("PINECONE_API"))

# try:
#     client.delete_index(name=INDEX_NAME)
#     print("Index deleted")
# except:
#     print("Data base is empty")

### Optional: Check available Indices

In [None]:
# indexes = client.list_indexes()
# print(f"Available indexes: {indexes.names()}")
# print(f"Current index name: {INDEX_NAME}")

# Hybrid Search Retrieval Pipeline directly from Pinecone
Note: Make sure to restart the kernel before you run the cells below, so that data is not being retrieved from local in-memory state (RAM).

In [None]:
from llama_index.core import StorageContext
#from llama_index.core.storage.docstore import SimpleDocumentStore

# Just point to the folder where you persisted
# Reload the storage context that the ingestion section persisted to "pubmed_nodes".
storage_context = StorageContext.from_defaults(persist_dir="pubmed_nodes")

# Keep a handle on the persisted documents; the next cell rebinds
# `storage_context`, but `docstore` stays available for BM25.
docstore = storage_context.docstore
print("Number of documents:", len(docstore.docs))


In [None]:
from pinecone import Pinecone
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.core.llms import MockLLM
from llama_index.core.retrievers import QueryFusionRetriever
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.retrievers.bm25 import BM25Retriever
from dotenv import load_dotenv

import os

load_dotenv()

# This cell is meant to run on a freshly restarted kernel, so everything it
# needs is imported/defined here. The name must match the index that the
# ingestion section populated. `docstore` comes from the previous cell, which
# reloads it from disk.
INDEX_NAME = "pubmed-abstracts-v5"

pc = Pinecone(api_key=os.getenv("PINECONE_API"))
pinecone_index = pc.Index(INDEX_NAME)

vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

embed_model = HuggingFaceEmbedding(model_name="NeuML/pubmedbert-base-embeddings")

# Attach to the vectors already stored in Pinecone; nothing is re-embedded here.
index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store,
    storage_context=storage_context,
    embed_model=embed_model
)

# Create vector retriever
vector_retriever = index.as_retriever(similarity_top_k=5)

# Create BM25 retriever for keyword-based search over the persisted docstore
bm25_retriever = BM25Retriever.from_defaults(
    docstore=docstore,
    similarity_top_k=5
)

# Hybrid retriever: fuse both result lists with reciprocal-rank reranking,
# the same mode make_minimal_hybrid uses. num_queries=1 keeps the query as-is,
# and MockLLM avoids needing an OpenAI key just to construct the retriever.
hybrid_retriever = QueryFusionRetriever(
    retrievers=[vector_retriever, bm25_retriever],
    llm=MockLLM(),
    mode="reciprocal_rerank",
    num_queries=1,
    similarity_top_k=5,
    use_async=False,
)

# Perform hybrid search
query = "Which analytical method was used to photosynthetic tissues?"
results = hybrid_retriever.retrieve(query)

# Display results
for res in results:
    print("Score:", res.score)
    print("Text:", res.node.text)
    print("PMID:", res.node.metadata.get("PMID"))
    print("Type:", res.node.metadata.get("type"))
    print("-" * 80)
