# 1. Data Loading and Filtering Records with Focus (Primary or Secondary)

In [1]:
import json
with open("Data/meta_test.json", "r", encoding="utf-8") as f:
    records = json.load(f)

# Focus labels that qualify a record for the rest of the pipeline.
TARGET_FOCUS = ("primary", "secondary")


def _has_target_focus(entry):
    """Return True when the entry's metadata focus contains a target label.

    A missing "metadata" key, a missing "focus" key and an explicit null
    focus are all treated as "no focus", so the entry is filtered out instead
    of raising. A bare string focus is wrapped in a list; iterating the string
    directly would compare single characters and never match.
    """
    focus = (entry.get("metadata") or {}).get("focus") or []
    if isinstance(focus, str):
        focus = [focus]
    return any(label in TARGET_FOCUS for label in focus)


# Filter JSON entries where focus is primary or secondary
filtered_records = [entry for entry in records if _has_target_focus(entry)]

# Share of primary/secondary records in percent; an empty input file yields
# 0.0 rather than a ZeroDivisionError.
filtered_records_percent = (
    round((len(filtered_records) / len(records)) * 100, 2) if records else 0.0
)

print(f"Only {filtered_records_percent}% of entire records are Primary or Secondary ")



Only 95.65% of entire records are Primary or Secondary 


# 2. Data Restructuring

In [2]:
def metadata_restructuring(records):
    """Collapse each record into a single ``{"metadata": {...}}`` dictionary.

    For every input record a shallow copy of its ``"metadata"`` dict is taken
    (an empty dict when the key is absent), so the input record's own metadata
    dict is not modified. Selected top-level fields are then written into that
    copy, overwriting any metadata entry with the same name.

    Args:
        records: Iterable of record dicts.

    Returns:
        A new list with one ``{"metadata": merged_metadata}`` dict per record,
        in input order. Top-level fields that are not promoted are dropped.
    """
    promoted_fields = ("root_name", "search_term", "synonyms", "PMID", "pubmed_type")

    def _merge_top_level(record):
        # Shallow copy keeps the caller's metadata dict untouched.
        merged = record.get("metadata", {}).copy()
        merged.update(
            {name: record[name] for name in promoted_fields if name in record}
        )
        return {"metadata": merged}

    return [_merge_top_level(record) for record in records]

# Metadata-only view of the filtered records; the later cells read this list.
restructured_records = metadata_restructuring(filtered_records)

### Optional: Validation Checkpoint to get matching record from json_list

In [3]:
# def get_record_by_pmid(json_list, pmid):
#     """Pass PMID and get matching record from json_list"""
#     for record in json_list:
#         if record['metadata']['PMID'] == pmid:
#             return record
#     return None


# # Example usage:
# result = get_record_by_pmid(restructured_records, 11524119)

# if result:
#     print(json.dumps(result, indent=2))  # Prints the entire matching record
# else:
#     print("PMID not found")

# 3. Flattening the Data

In [4]:
# Outcome domains that are each split into parallel name/type/result lists.
OUTCOME_DOMAINS = ("biomarker", "function", "condition")
# Nested metadata fields that are removed once they have been flattened.
NESTED_METADATA_KEYS = ("interventions", "outcomes", "biomarkers", "functions", "conditions")

# NOTE(review): the flattened lists are stored on the record itself, not inside
# record["metadata"]. The ingestion loop only reads row["metadata"], so these
# lists do not appear to reach the vector-store metadata -- confirm this is
# intended.
for record in restructured_records:
    metadata = record["metadata"]

    # Re-run guard: the nested fields are popped at the end of this loop, so a
    # second execution of the cell would read them as empty and overwrite the
    # flattened lists with []. Skip records that were already processed.
    if (
        "intervention_names" in record
        and "interventions" not in metadata
        and "outcomes" not in metadata
    ):
        continue

    # Process interventions with Parallel - Indexing:
    # position k in every list describes the same intervention.
    # `or []` also covers an explicit null value in the source JSON.
    interventions = metadata.get("interventions") or []
    record["intervention_names"] = [i.get("ingredient") for i in interventions]
    record["intervention_dosages"] = [i.get("daily_dosage") for i in interventions]
    record["intervention_units"] = [i.get("units") or "" for i in interventions]
    record["intervention_original_texts"] = [i.get("original_text") for i in interventions]

    # Process outcomes with Parallel - Indexing, one triple of lists per domain
    # (e.g. biomarker_names / biomarker_types / biomarker_results).
    outcomes = metadata.get("outcomes") or []
    for domain in OUTCOME_DOMAINS:
        domain_outcomes = [o for o in outcomes if o["domain"] == domain]
        record[f"{domain}_names"] = [o["name"] for o in domain_outcomes]
        record[f"{domain}_types"] = [o["type"] for o in domain_outcomes]
        record[f"{domain}_results"] = [o["result"] for o in domain_outcomes]

    # Delete original detailed fields
    for key in NESTED_METADATA_KEYS:
        metadata.pop(key, None)


In [5]:
# Persist the flattened records as indented UTF-8 JSON (non-ASCII characters
# are written as-is because ensure_ascii is disabled).
with open("Data/flatten.json", "w", encoding="utf-8") as flatten_file:
    json.dump(restructured_records, flatten_file, indent=2, ensure_ascii=False)

# 4. Data Ingestion into Pinecone

### 4a. Converting into Embeddings and Performing Semantic Chunking

In [6]:
# %pip install -U \
#   pandas \
#   "llama-index" \
#   "llama-index-embeddings-huggingface" \
#   "llama-index-vector-stores-pinecone" \
#   pinecone-client \
#   "sentence-transformers" \
#   transformers \
#   "torch" \
#   python-dotenv \
#   tqdm \
#     biopython

In [7]:
import pandas as pd
from llama_index.core import Document, VectorStoreIndex, StorageContext
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.pinecone import PineconeVectorStore
from pinecone import Pinecone
from pinecone import ServerlessSpec
from UPDATED_meta_data_generation import *
from dotenv import load_dotenv
import os
from tqdm import tqdm
load_dotenv()

# --------------------------
# Pinecone index setup
# --------------------------
INDEX_NAME = "pubmed-abstracts"
EMBEDDING_MODEL_NAME = "NeuML/pubmedbert-base-embeddings"
EMBEDDING_DIMENSION = 768  # PubMedBERT embedding dim=768

# Pinecone client; the API key is read from the PINECONE_API environment variable
client = Pinecone(api_key=os.getenv("PINECONE_API"))

# Serverless deployment target for the index
spec = ServerlessSpec(cloud="aws", region="us-east-1")

# Create the index only when it is not already listed for this account
existing_index_names = client.list_indexes().names()
if INDEX_NAME not in existing_index_names:
    client.create_index(
        name=INDEX_NAME,
        dimension=EMBEDDING_DIMENSION,
        metric="cosine",
        spec=spec,
    )

# Handle to the index, wrapped for LlamaIndex
pinecone_index = client.Index(INDEX_NAME)
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# --------------------------
# Embedding model and semantic chunker
# --------------------------
embed_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL_NAME)

# The splitter uses the same embedding model to decide where to break the text.
splitter = SemanticSplitterNodeParser(
    embed_model=embed_model,
    buffer_size=1,
    breakpoint_percentile_threshold=95,
)

# Every node (title + abstract chunks) of every paper, in processing order.
all_nodes = []

for record in tqdm(restructured_records, desc="Processing papers"):
    md = record["metadata"]  # full metadata dict of the record, including PMID

    # fetch_extract_and_abstract is defined outside this cell; the loop only
    # relies on the result exposing 'title' and 'abstract'.
    paper = fetch_extract_and_abstract(md['PMID'])
    title_text, abstract_text = paper['title'], paper['abstract']

    # --------------------------
    # Title node: always node_index 0
    # --------------------------
    title_metadata = {"type": "title", "node_index": 0, **md}
    all_nodes.append(Document(text=title_text, metadata=title_metadata))

    # --------------------------
    # Abstract nodes: semantic split of the abstract text
    # --------------------------
    abstract_doc = Document(text=abstract_text, metadata={"type": "abstract", **md})
    abstract_chunks = splitter.get_nodes_from_documents([abstract_doc])

    # Chunks are numbered from 1 so they sort after the title node.
    for position, chunk in enumerate(abstract_chunks, start=1):
        chunk.metadata["node_index"] = position
        all_nodes.append(chunk)

# --------------------------
# 4️ Store nodes in Pinecone Local memory via LlamaIndex (Should be skipped if need to upload)
# --------------------------
# index = VectorStoreIndex(
#     all_nodes,
#     vector_store=vector_store,
#     embed_model=embed_model,
#     show_progress=True)


  from .autonotebook import tqdm as notebook_tqdm
Processing papers: 100%|██████████| 44/44 [00:25<00:00,  1.74it/s]


### 4b. Injecting Embedded Chunks into Pinecone

In [8]:
# --------------------------
# Store nodes in Pinecone on Cloud via LlamaIndex
# --------------------------
# Start from an empty index bound to the Pinecone-backed storage context,
# then insert the prepared nodes (embedding happens during insertion).
index = VectorStoreIndex([], storage_context=storage_context, embed_model=embed_model)
if not all_nodes:
    print("WARNING: No nodes to upsert.")
else:
    index.insert_nodes(all_nodes, show_progress=True)

Generating embeddings: 100%|██████████| 131/131 [00:08<00:00, 15.90it/s]
Upserted vectors: 100%|██████████| 131/131 [00:01<00:00, 75.44it/s]


In [15]:
# Get Stats of Vector Index
# describe_index_stats() is called on the Pinecone index handle; leaving `stats`
# as the last expression makes the notebook display the result.
stats = pinecone_index.describe_index_stats()
stats

{'dimension': 768,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 131}},
 'total_vector_count': 131,
 'vector_type': 'dense'}

# Debugging: Similarity Search 

In [16]:
# Similarity search against the index: retrieve the 5 most similar chunks
retriever = index.as_retriever(similarity_top_k=5)
query_text = " Cedrus, Hesperopeuce, Keteleeria, Pseudolarix, and Tsuga and preliminary inferences on the taxonomy of Pinaceae."

results = retriever.retrieve(query_text)

# Print score, chunk text and the identifying metadata of every hit.
separator = "-" * 80
for hit in results:
    print("Score:", hit.score)
    hit_node = hit.node
    print("Text:", hit_node.text)
    print("PMID:", hit_node.metadata.get("PMID"))
    print("Type:", hit_node.metadata.get("type"))
    print(separator)


Score: 0.338782787
Text: The effect of two plant secondary metabolites, tannins and formylated phloroglucinol compounds (FPCs), on the intake of Eucalyptus foliage by common ringtail (Pseudocheirus peregrinus) and common brushtail possums (Trichosurus vulpecula) was studied. We manipulated the amount of tannin that was free to bind with protein by coating foliage with polyethylene glycol 4000 (PEG) and relied on natural intraspecific variation in FPC concentrations. 
PMID: 12592445
Type: abstract
--------------------------------------------------------------------------------
Score: 0.334806472
Text: Ectomycorrhizas produced between Pisolithus tinctorius and Eucalyptus pilularis under axenic conditions were rapidly frozen, freeze-substituted in tetrahydrofuran and embedded anhydrously, and dry-sectioned for X-ray microanalysis. The vacuoles of the sheath and Hartig net hyphae were rich in phosphorus and potassium. 
PMID: 10512669
Type: abstract
-----------------------------------------

# Debugging: Reconstructing the Paper

In [17]:
# Function to reconstruct a paper from nodes
def reconstruct_paper(all_nodes, pmid):
    """Rebuild one paper's text from its nodes.

    Nodes are selected by comparing the string form of their "PMID" metadata
    with the string form of ``pmid``, ordered by their "node_index" metadata
    (nodes without one sort as 0) and joined with newlines. The number of
    selected nodes is printed as a side effect.

    Args:
        all_nodes: Iterable of objects exposing ``.metadata`` (dict) and ``.text``.
        pmid: PMID of the paper to rebuild; int and str forms are equivalent.

    Returns:
        The node texts joined by newlines, or "" when no node matches.
    """
    wanted_pmid = str(pmid)

    # Keep only this paper's nodes, then put them in reading order.
    matching_nodes = [
        candidate
        for candidate in all_nodes
        if str(candidate.metadata.get("PMID")) == wanted_pmid
    ]
    matching_nodes.sort(key=lambda candidate: candidate.metadata.get("node_index", 0))

    print("Noumber of Nodes:", len(matching_nodes))

    return "\n".join(candidate.text for candidate in matching_nodes)

# Example usage: rebuild the paper of the first restructured record
pmid_to_reconstruct = restructured_records[0]['metadata']['PMID']
full_paper_text = reconstruct_paper(all_nodes, pmid_to_reconstruct)

# Heading and text on separate lines
print("Reconstructed Paper Text:", full_paper_text, sep="\n")


Noumber of Nodes: 3
Reconstructed Paper Text:
Inhalation aromatherapy during radiotherapy: results of a placebo-controlled double-blind randomized trial.
To determine whether the inhalation of aromatherapy during radiotherapy reduces anxiety. Three hundred thirteen patients undergoing radiotherapy were randomly assigned to receive either carrier oil with fractionated oils, carrier oil only, or pure essential oils of lavender, bergamot, and cedarwood administered by inhalation concurrently with radiation treatment. Patients underwent assessment by the Hospital Anxiety and Depression Scale (HADS) and the Somatic and Psychological Health Report (SPHERE) at baseline and at treatment completion. 
There were no significant differences in HADS depression or SPHERE scores between the randomly assigned groups. However, HADS anxiety scores were significantly lower at treatment completion in the carrier oil only group compared with either of the fragrant arms (P =.04). Aromatherapy, as administer

### Optional: Delete Pinecone Index

In [18]:
# try:
#     client.delete_index(name=INDEX_NAME)
#     print("Index deleted")
# except:
#     print("Data base is empty")

### Optional: Check available Indices

In [19]:
# indexes = client.list_indexes()
# print(f"Available indexes: {indexes.names()}")
# print(f"Current index name: {INDEX_NAME}")

# Retrieval Pipeline directly from Pinecone
Note: Make sure to restart the kernel before running the cell below, so that results are retrieved from Pinecone rather than from objects still held in local memory (RAM).

In [20]:
from pinecone import Pinecone
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from dotenv import load_dotenv
import os
load_dotenv()

# Names of the existing Pinecone index and of the embedding model; the same
# model name is used where the chunks are embedded for ingestion.
INDEX_NAME = "pubmed-abstracts"
EMBEDDING_MODEL_NAME = "NeuML/pubmedbert-base-embeddings"

# Connect to the existing index; the API key is read from PINECONE_API
pc = Pinecone(api_key=os.getenv("PINECONE_API"))
pinecone_index = pc.Index(INDEX_NAME)

# Wrap the index for LlamaIndex
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

embed_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL_NAME)

# Build the index object on top of the vector store; no nodes are passed in.
index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store,
    storage_context=storage_context,
    embed_model=embed_model,
)



# Retrieve the 5 most similar chunks for a sample query
retriever = index.as_retriever(similarity_top_k=5)

query_text = "Pinaceae species are characterized by the genera Larix, and Abies and Cedrus,"
results = retriever.retrieve(query_text)

# Print score, chunk text and the identifying metadata of every hit.
separator = "-" * 80
for hit in results:
    print("Score:", hit.score)
    hit_node = hit.node
    print("Text:", hit_node.text)
    print("PMID:", hit_node.metadata.get("PMID"))
    print("Type:", hit_node.metadata.get("type"))
    print(separator)



Score: 0.313742638
Text: Which majestic ornamental tree, conspicuous in the Australian landscape, has ten million hectares in cultivation worldwide--yielding timber, fuel, essential oil and cut foliage? It could only be eucalyptus.
PMID: 11190248
Type: abstract
--------------------------------------------------------------------------------
Score: 0.286951095
Text: Ectomycorrhizas produced between Pisolithus tinctorius and Eucalyptus pilularis under axenic conditions were rapidly frozen, freeze-substituted in tetrahydrofuran and embedded anhydrously, and dry-sectioned for X-ray microanalysis. The vacuoles of the sheath and Hartig net hyphae were rich in phosphorus and potassium. 
PMID: 10512669
Type: abstract
--------------------------------------------------------------------------------
Score: 0.281497955
Text: The spatial root distribution after two years of three energy crops was investigated, with the influence of two rates of dairy pond effluent application, applied every fortnig