# 1. Data Loading and Filtering Records with Focus (Primary or Secondary)

In [1]:
import json
with open("output/test_enriched_10|17.json", "r", encoding="utf-8") as f:
    records = json.load(f)
# Filter JSON entries where focus is primary or secondary
filtered_records = [entry for entry in records if any(f in ["primary", "secondary"] for f in entry["metadata"].get("focus", []))]

# Calculation of Primary and Secondary records %
filtered_records_percent = round(((len(filtered_records)/len(records)) * 100), 2)

print(f"Only {filtered_records_percent}% of entire records are Primary or Secondary ")



Only 60.0% of entire records are Primary or Secondary 


# 2. Data Restructuring

In [2]:
def metadata_restructuring(records):
    restructured_records = []
    for record in records:
        metadata = record.get("metadata", {}).copy()  # copy to avoid mutating original

        # Explicitly ensure top-level fields are part of metadata
        for field in ["root_name", "search_term", "synonyms", "PMID", "pubmed_type"]:
            if field in record:
                metadata[field] = record[field]
        
        restructured_records.append({"metadata": metadata})
    return restructured_records

restructured_records = metadata_restructuring(filtered_records)

### Optional: Validation Checkpoint to get matching record from json_list

In [3]:
# def get_record_by_pmid(json_list, pmid):
#     """Pass PMID and get matching record from json_list"""
#     for record in json_list:
#         if record['metadata']['PMID'] == pmid:
#             return record
#     return None


# # Example usage:
# result = get_record_by_pmid(restructured_records, 11524119)

# if result:
#     print(json.dumps(result, indent=2))  # Prints the entire matching record
# else:
#     print("PMID not found")

# 3. Flattening the Data

In [3]:
for record in restructured_records:
    metadata = record["metadata"]
    
    # Process interventions with Parallel - Indexing
    interventions = metadata.get("interventions", [])
    record["intervention_names"] = [i.get("ingredient") for i in interventions]
    record["intervention_dosages"] = [i.get("daily_dosage") for i in interventions]
    record["intervention_units"] = [i.get("units") if i.get("units") else "" for i in interventions]
    record["intervention_original_texts"] = [i.get("original_text") for i in interventions]
    
    # Process outcomes with Parallel - Indexing
    outcomes = metadata.get("outcomes", [])
    record["biomarker_names"] = [o["name"] for o in outcomes if o["domain"] == "biomarker"]
    record["biomarker_types"] = [o["type"] for o in outcomes if o["domain"] == "biomarker"]
    record["biomarker_results"] = [o["result"] for o in outcomes if o["domain"] == "biomarker"]

    record["function_names"] = [o["name"] for o in outcomes if o["domain"] == "function"]
    record["function_types"] = [o["type"] for o in outcomes if o["domain"] == "function"]
    record["function_results"] = [o["result"] for o in outcomes if o["domain"] == "function"]

    record["condition_names"] = [o["name"] for o in outcomes if o["domain"] == "condition"]
    record["condition_types"] = [o["type"] for o in outcomes if o["domain"] == "condition"]
    record["condition_results"] = [o["result"] for o in outcomes if o["domain"] == "condition"]

    
    # Delete original detailed fields
    for key in ["interventions", "outcomes", "biomarkers", "functions", "conditions"]:
        metadata.pop(key, None)


In [4]:
with open("flatten.json", "w", encoding="utf-8") as f:
    json.dump(restructured_records, f, indent=2, ensure_ascii=False)

# 4. Data Ingestion into PineCone 

### 4a. Converting into Embeddings and performing Sematic Chunking

In [5]:
import pandas as pd
from llama_index.core import Document, VectorStoreIndex, StorageContext
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec
from UPDATED_meta_data_generation import *
from dotenv import load_dotenv
import os
from tqdm import tqdm

load_dotenv()

# --------------------------
# Initialize Pinecone
# --------------------------
INDEX_NAME = "pubmed-abstracts"
client = Pinecone(api_key=os.getenv("PINECONE_API"))
spec = ServerlessSpec(cloud="aws", region="us-east-1")

if INDEX_NAME not in client.list_indexes().names():
    client.create_index(
        name=INDEX_NAME,
        dimension=768,
        metric="cosine",
        spec=spec
    )

pinecone_index = client.Index(INDEX_NAME)
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)

# --------------------------
# Initialize embedding + semantic chunker
# --------------------------
embed_model = HuggingFaceEmbedding(model_name="NeuML/pubmedbert-base-embeddings")

splitter = SemanticSplitterNodeParser(
    buffer_size=1,
    breakpoint_percentile_threshold=95,
    embed_model=embed_model
)


# --------------------------
# Build all semantic nodes
# --------------------------
all_nodes = []

for idx, row in enumerate(tqdm(restructured_records, desc="Processing papers")):
    md = row["metadata"]
    paper = fetch_extract_and_abstract(md['PMID'])
    title = paper['title']
    abstract = paper['abstract']

    # Title node
    title_node = Document(
        text=title,
        metadata={"type": "title", "node_index": 0, **md}
    )
    all_nodes.append(title_node)

    # Abstract nodes
    abstract_doc = Document(
        text=abstract,
        metadata={"type": "abstract", **md}
    )

    abstract_nodes = splitter.get_nodes_from_documents([abstract_doc])
    for i, node in enumerate(abstract_nodes, start=1):
        node.metadata["node_index"] = i
        all_nodes.append(node)

# --------------------------
# Save nodes both to Pinecone (for vector) and local docstore
# --------------------------
# print("Indexing nodes into Pinecone and persisting locally...")

# index = VectorStoreIndex(
#     all_nodes,
#     storage_context=storage_context,
#     embed_model=embed_model,
#     show_progress=True
# )

# Persist docstore + metadata to disk
        
# --------------------------
# Create a persistent docstore
# --------------------------
docstore = SimpleDocumentStore()

docstore.add_documents(all_nodes)

storage_context = StorageContext.from_defaults(
    vector_store=vector_store,
    docstore=docstore
)

storage_context.persist(persist_dir="pubmed_nodes")



Processing papers: 100%|██████████| 3/3 [00:02<00:00,  1.15it/s]


### 4b. Injecting Embedded Chunks into PineCone

In [6]:
# --------------------------
# 4️ Store nodes in Pinecone on Cloud via LlamaIndex
# --------------------------
index = VectorStoreIndex([], storage_context=storage_context, embed_model=embed_model)
if all_nodes:
    index.insert_nodes(all_nodes, show_progress=True)
else:
    print("WARNING: No nodes to upsert.")

Generating embeddings:   0%|          | 0/9 [00:00<?, ?it/s]

Upserted vectors:   0%|          | 0/9 [00:00<?, ?it/s]

In [9]:
# Get Stats of Vector Index
stats = pinecone_index.describe_index_stats()
stats

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 9}},
 'total_vector_count': 9}

# Debugging: Similarity Search 

In [10]:
# Create a retriever for similarity search
retriever = index.as_retriever(similarity_top_k=5)  # retrieve top 5 similar chunks
query_text = " Cedrus, Hesperopeuce, Keteleeria, Pseudolarix, and Tsuga and preliminary inferences on the taxonomy of Pinaceae."

results = retriever.retrieve(query_text)
for res in results:
    print("Score:", res.score)
    print("Text:", res.node.text)
    print("PMID:", res.node.metadata.get("PMID"))
    print("Type:", res.node.metadata.get("type"))
    print("-" * 80)


Score: 0.61619097
Text: appear to be a major exception, as their seed FA compositions are much like those of species from the Pinoid group. In this respect, Hesperopeuce mertensiana, also known as Tsuga mertensiana, has little in common with Abietoids and fits the general FA pattern of Pinoids well. Tsuga spp. and H. mertensiana, from their seed FA compositions, should perhaps be separated from the Abietoid group and their taxonomic position revised. It is suggested that a "Tsugoid" subfamily be created, with seed FA in compliance with the Pinoid pattern and other botanical and immunological criteria of the Abietoid type. All Pinaceae genera, with the exception of Pinus, are quite homogeneous when considering their overall seed FA compositions, including delta5-olefinic acids. In all cases but one (Pinus), variations from one species to another inside a given genus are of small amplitude. Pinus spp., on the other hand, have highly variable levels of delta5-olefinic acids in their FA co

# Debugging: Reconstruction the Paper

In [11]:
# Function to reconstruct a paper from nodes
def reconstruct_paper(all_nodes, pmid):
    # Filter nodes belonging to this paper
    paper_nodes = [node for node in all_nodes if str(node.metadata.get("PMID")) == str(pmid)]
    
    # Sort nodes by node_index
    paper_nodes = sorted(paper_nodes, key=lambda x: x.metadata.get("node_index", 0))
    print("Noumber of Nodes:",len(paper_nodes))
    # Concatenate the text
    full_text = "\n".join([node.text for node in paper_nodes])
    
    return full_text

# Example usage
pmid_to_reconstruct = restructured_records[0]['metadata']['PMID']
full_paper_text = reconstruct_paper(all_nodes, pmid_to_reconstruct)

print("Reconstructed Paper Text:")
print(full_paper_text)


Noumber of Nodes: 3
Reconstructed Paper Text:
Screening of Turkish anti-ulcerogenic folk remedies for anti-Helicobacter pylori activity.
The anti-Helicobacter pylori effect of the extracts and fractions obtained from seven Turkish plants, which are used in folk medicine for the treatment of gastric ailments including peptic ulcers, were studied against one standard strain and eight clinical isolates of H. pylori by using the agar dilution method. Flowers of Cistus laurifolius and Spartium junceum, cones of Cedrus libani, herbs and flowers of Centaurea solstitialis ssp. 
solstitialis, fruits of Momordica charantia, herbaceous parts of Sambucus ebulus, and flowering herbs of Hypericum perforatum were evaluated in this study. Results showed that all except one extract from six of these plants showed activity against the microorganism with MICs between 1.95 and 250 microg/ml, with S. junceum being the only inactive species. Amongst the active plants the inhibitory properties of C. laurifol

### Optional: Delete PineCone Index

In [12]:
# from pinecone import Pinecone
# import os
# from dotenv import load_dotenv
# load_dotenv()

# INDEX_NAME = "pubmed-abstracts"

# # Initialize Pinecone client
# client = Pinecone(api_key=os.getenv("PINECONE_API"))

# try:
#     client.delete_index(name=INDEX_NAME)
#     print("Index deleted")
# except:
#     print("Data base is empty")

### Opitional: Check available Indices

In [8]:
# indexes = client.list_indexes()
# print(f"Available indexes: {indexes.names()}")
# print(f"Current index name: {INDEX_NAME}")

# Hybrid Search Retrival Pipeline directly from Pinecone
Note: Make Sure to restart the kernal before you run the below cell to ensure that data is not being retrived from local- memory/in-memory/RAM

In [13]:
from llama_index.core import StorageContext
#from llama_index.core.storage.docstore import SimpleDocumentStore

# Just point to the folder where you persisted
storage_context = StorageContext.from_defaults(persist_dir="pubmed_nodes")

# Now access your persisted documents
docstore = storage_context.docstore
print("Number of documents:", len(docstore.docs))


Number of documents: 9


In [14]:
from pinecone import Pinecone
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.retrievers import QueryFusionRetriever
from llama_index.core.llms import MockLLM
from dotenv import load_dotenv
import os

load_dotenv()

pc = Pinecone(api_key=os.getenv("PINECONE_API"))
pinecone_index = pc.Index("pubmed-abstracts")

vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

embed_model = HuggingFaceEmbedding(model_name="NeuML/pubmedbert-base-embeddings")

index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store,
    storage_context=storage_context,
    embed_model=embed_model
)

# Create vector retriever
vector_retriever = index.as_retriever(similarity_top_k=5)

# Create BM25 retriever for keyword-based search
# Ensure you have the documents loaded in memory for BM25
bm25_retriever = BM25Retriever.from_defaults(
    docstore=docstore,
    similarity_top_k=5
)

# Create hybrid retriever using QueryFusionRetriever
# This combines results from both retrievers
hybrid_retriever = QueryFusionRetriever(
    retrievers=[vector_retriever, bm25_retriever],
    retriever_weights=[0.5, 0.5],  # Equal weight to both retrievers
    llm=MockLLM(),  # Use MockLLM to avoid needing OpenAI API key
    use_async=False,
    #mode="reciprocal_rerank",
)

# Perform hybrid search
query = "Which analytical method was used to photosynthetic tissues?"
results = hybrid_retriever.retrieve(query)

# Display results
for res in results:
    print("Score:", res.score)
    print("Text:", res.node.text)
    print("PMID:", res.node.metadata.get("PMID"))
    print("Type:", res.node.metadata.get("type"))
    print("-" * 80)


Score: 2.0504798889160156
Text: The anti-Helicobacter pylori effect of the extracts and fractions obtained from seven Turkish plants, which are used in folk medicine for the treatment of gastric ailments including peptic ulcers, were studied against one standard strain and eight clinical isolates of H. pylori by using the agar dilution method. Flowers of Cistus laurifolius and Spartium junceum, cones of Cedrus libani, herbs and flowers of Centaurea solstitialis ssp. 
PMID: 10473175
Type: abstract
--------------------------------------------------------------------------------
Score: 1.329048752784729
Text: The fatty acid composition of photosynthetic tissues from 137 species of gymnosperms belonging to 14 families was determined by gas chromatography. Statistical analysis clearly discriminated four groups. 
PMID: 11524119
Type: abstract
--------------------------------------------------------------------------------
