# 1. Meta Data Loading and Filtering Records with Focus (Primary or Secondary)

In [1]:
import json
with open("output/test_enriched_10|31.json", "r", encoding="utf-8") as f:
    records = json.load(f)
# Filter JSON entries where focus is primary or secondary
filtered_records = [entry for entry in records if any(f in ["primary", "secondary"] for f in entry["metadata"].get("focus", []))]

# Calculation of Primary and Secondary records %
filtered_records_percent = round(((len(filtered_records)/len(records)) * 100), 2)

print(f"Only {filtered_records_percent}% of entire records are Primary or Secondary ")



Only 97.27% of entire records are Primary or Secondary 


# 2. Meta Data Restructuring

In [2]:
def metadata_restructuring(records):
    restructured_records = []
    for record in records:
        metadata = record.get("metadata", {}).copy()  # copy to avoid mutating original

        # Explicitly ensure top-level fields are part of metadata
        for field in ["root_name", "search_term", "synonyms", "PMID", "pubmed_type"]:
            if field in record:
                metadata[field] = record[field]
        
        restructured_records.append({"metadata": metadata})
    return restructured_records

restructured_records = metadata_restructuring(filtered_records)

### Optional: Validation Checkpoint to get matching record from json_list

In [3]:
# def get_record_by_pmid(json_list, pmid):
#     """Pass PMID and get matching record from json_list"""
#     for record in json_list:
#         if record['metadata']['PMID'] == pmid:
#             return record
#     return None


# # Example usage:
# result = get_record_by_pmid(restructured_records, 11524119)

# if result:
#     print(json.dumps(result, indent=2))  # Prints the entire matching record
# else:
#     print("PMID not found")

# 3. Flattening the Meta Data

In [3]:
for record in restructured_records:
    metadata = record["metadata"]

    # Process interventions
    interventions = metadata.get("interventions", [])
    metadata["intervention_names"] = [i.get("ingredient") for i in interventions]
    #metadata["intervention_dosages"] = [i.get("daily_dosage") for i in interventions]
    metadata["intervention_dosages"] = [str(i.get("daily_dosage")) for i in interventions if i.get("daily_dosage") is not None]

    metadata["intervention_units"] = [i.get("units") or "" for i in interventions]
    metadata["intervention_original_texts"] = [i.get("original_text") for i in interventions]

    # Process outcomes
    outcomes = metadata.get("outcomes", [])
    metadata["biomarker_names"] = [o["name"] for o in outcomes if o["domain"] == "biomarker"]
    metadata["biomarker_types"] = [o["type"] for o in outcomes if o["domain"] == "biomarker"]
    metadata["biomarker_results"] = [o["result"] for o in outcomes if o["domain"] == "biomarker"]

    metadata["function_names"] = [o["name"] for o in outcomes if o["domain"] == "function"]
    metadata["function_types"] = [o["type"] for o in outcomes if o["domain"] == "function"]
    metadata["function_results"] = [o["result"] for o in outcomes if o["domain"] == "function"]

    metadata["condition_names"] = [o["name"] for o in outcomes if o["domain"] == "condition"]
    metadata["condition_types"] = [o["type"] for o in outcomes if o["domain"] == "condition"]
    metadata["condition_results"] = [o["result"] for o in outcomes if o["domain"] == "condition"]

    # Delete original nested fields
    for key in ["interventions", "outcomes", "biomarkers", "functions", "conditions"]:
        metadata.pop(key, None)


In [4]:
with open("Output/flatten.json", "w", encoding="utf-8") as f:
    json.dump(restructured_records, f, indent=2, ensure_ascii=False)

# 4. Data Ingestion into PineCone 

### 4a. Converting into Embeddings and performing Sematic Chunking

In [8]:
import pandas as pd
from llama_index.core import Document, VectorStoreIndex, StorageContext
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec
from llama_index.core.extractors import QuestionsAnsweredExtractor
from llama_index.core.ingestion import IngestionPipeline
from meta_data_generation import *
from llama_index.llms.openai import OpenAI
from dotenv import load_dotenv
import os
from tqdm import tqdm

load_dotenv()

# --------------------------
# Initialize Pinecone
# --------------------------
INDEX_NAME = "pubmed-abstracts"
client = Pinecone(api_key=os.getenv("PINECONE_API"))
spec = ServerlessSpec(cloud="aws", region="us-east-1")

if INDEX_NAME not in client.list_indexes().names():
    client.create_index(
        name=INDEX_NAME,
        dimension=768,
        metric="cosine",
        spec=spec
    )

pinecone_index = client.Index(INDEX_NAME)
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)

# --------------------------
# Initialize embedding + semantic chunker
# --------------------------
embed_model = HuggingFaceEmbedding(model_name="NeuML/pubmedbert-base-embeddings")

splitter = SemanticSplitterNodeParser(
    buffer_size=1,
    breakpoint_percentile_threshold=95,
    embed_model=embed_model
)

# --------------------------
# Initialize Question–Answer Extractor
# --------------------------

qa_llm = OpenAI(model='gpt-4o-mini', temperature=0.1,api_key=os.getenv("OPEN_AI_API"))
qa_extractor = QuestionsAnsweredExtractor(
    llm = qa_llm,
    questions=3,          # extract top 3 questions per node
    metadata_mode="all"   # include metadata context
)


# --------------------------
# Build all semantic nodes
# --------------------------
all_nodes = []

for idx, row in enumerate(tqdm(restructured_records, desc="Processing papers")):
    md = row["metadata"]
    paper = fetch_extract_and_abstract(md['PMID'])
    title = paper['title']
    abstract = paper['abstract']

    # Title node
    title_node = Document(
        text=title,
        metadata={"type": "title", "node_index": 0, **md},
        excluded_embed_metadata_keys=[
        "PMID", "url", "published_year","duration_days"
        "pubmed_type", "intervention_units","focus","location"
        "intervention_dosages", "intervention_original_texts"
        ],
        excluded_llm_metadata_keys=["url", "PMID", "focus"],
        text_template = "Metadata:\n{metadata_str}\n-----\nContent:\n{content}"
    )
    all_nodes.append(title_node)

    # Abstract nodes
    abstract_doc = Document(
        text=abstract,
        metadata={"type": "abstract", **md},
        excluded_embed_metadata_keys=[
        "PMID", "url", "published_year","duration_days"
        "pubmed_type", "intervention_units","focus","location"
        "intervention_dosages", "intervention_original_texts"
        ],
        excluded_llm_metadata_keys=["url", "PMID","focus"],
        text_template = 'Metadata:\n{metadata_str}\n-----\nContent:\n{content}'
    )

    abstract_nodes = splitter.get_nodes_from_documents([abstract_doc])
    for i, node in enumerate(abstract_nodes, start=1):
        node.metadata["node_index"] = i
        all_nodes.append(node)

# --------------------------
# Apply Question–Answer Extraction
# --------------------------
pipeline = IngestionPipeline(transformations=[qa_extractor])
qa_enriched_nodes = pipeline.run(nodes=all_nodes)

# --------------------------
# Save nodes both to Pinecone (for vector) and local docstore
# --------------------------
# print("Indexing nodes into Pinecone and persisting locally...")

# index = VectorStoreIndex(
#     all_nodes,
#     storage_context=storage_context,
#     embed_model=embed_model,
#     show_progress=True
# )

# Persist docstore + metadata to disk
        
# --------------------------
# Create a persistent docstore
# --------------------------
docstore = SimpleDocumentStore()

docstore.add_documents(qa_enriched_nodes)

storage_context = StorageContext.from_defaults(
    vector_store=vector_store,
    docstore=docstore
)

storage_context.persist(persist_dir="pubmed_nodes")



Processing papers: 100%|██████████| 178/178 [01:19<00:00,  2.25it/s]
100%|██████████| 531/531 [07:25<00:00,  1.19it/s]


In [20]:
import pprint

pprint.pprint(dict(qa_enriched_nodes[1]))

{'embedding': None,
 'end_char_idx': 1402,
 'excluded_embed_metadata_keys': ['PMID',
                                  'url',
                                  'published_year',
                                  'duration_dayspubmed_type',
                                  'intervention_units',
                                  'focus',
                                  'locationintervention_dosages',
                                  'intervention_original_texts'],
 'excluded_llm_metadata_keys': ['url', 'PMID', 'focus'],
 'id_': '2c388047-e7f0-4726-b766-a0112a3588fa',
 'metadata': {'PMID': 40219102,
              'benefits': ['anti-oxidant',
                           'anti-inflammatory',
                           'antimicrobial',
                           'antidepressant',
                           'anticancer'],
              'biomarker_names': [],
              'biomarker_results': [],
              'biomarker_types': [],
              'conclusion': 'this review underscores the 

In [18]:
from llama_index.core.schema import MetadataMode

print("LLM See's the below format\n",all_nodes[4].get_content(metadata_mode = MetadataMode.LLM))

LLM See's the below format
 [Excerpt from document]
type: abstract
duration_days: not mentioned
sample_size: not mentioned
sample_gender: ['not mentioned']
species: ['animals', 'cell lines']
experimental_model: ['in vitro']
population: rhipicephalus sanguineus sensu lato unfed adults and normal fibroblast cells (bj-1)
study_type: ['not mentioned']
benefits: ['acaricidal activity']
synergies_interactions_positive: ['not mentioned']
synergies_interactions_negative: ['not mentioned']
safety_side_effects: ['cytotoxicity']
usage: ['not mentioned']
purpose: to evaluate the acaricidal activity and cytotoxicity of myrrh, patchouli, and cypress oils and their nanoemulsions against rhipicephalus sanguineus and normal fibroblast cells.
conclusion: the three oils and their nanoemulsions have good acaricidal activity against rhipicephalus sanguineus unfed adults, but further toxicity studies on mammals are needed to ensure safe use.
diseases: []
symptoms: []
keywords: ['rhipicephalus sanguineus', '

In [19]:
print("Embedding Model See's the below format\n",all_nodes[4].get_content(metadata_mode = MetadataMode.EMBED))

Embedding Model See's the below format
 [Excerpt from document]
type: abstract
duration_days: not mentioned
sample_size: not mentioned
sample_gender: ['not mentioned']
species: ['animals', 'cell lines']
experimental_model: ['in vitro']
population: rhipicephalus sanguineus sensu lato unfed adults and normal fibroblast cells (bj-1)
study_type: ['not mentioned']
benefits: ['acaricidal activity']
synergies_interactions_positive: ['not mentioned']
synergies_interactions_negative: ['not mentioned']
safety_side_effects: ['cytotoxicity']
usage: ['not mentioned']
purpose: to evaluate the acaricidal activity and cytotoxicity of myrrh, patchouli, and cypress oils and their nanoemulsions against rhipicephalus sanguineus and normal fibroblast cells.
conclusion: the three oils and their nanoemulsions have good acaricidal activity against rhipicephalus sanguineus unfed adults, but further toxicity studies on mammals are needed to ensure safe use.
diseases: []
symptoms: []
keywords: ['rhipicephalus sa

### 4b. Injecting Embedded Chunks into PineCone

In [9]:
# --------------------------
# 4️ Store nodes in Pinecone on Cloud via LlamaIndex
# --------------------------
index = VectorStoreIndex([], storage_context=storage_context, embed_model=embed_model)
if all_nodes:
    index.insert_nodes(all_nodes, show_progress=True)
else:
    print("WARNING: No nodes to upsert.")

Generating embeddings:   0%|          | 0/531 [00:00<?, ?it/s]

Upserted vectors:   0%|          | 0/531 [00:00<?, ?it/s]

In [10]:
# Get Stats of Vector Index
stats = pinecone_index.describe_index_stats()
stats

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 531}},
 'total_vector_count': 531}

### Debugging: Similarity Search 

In [16]:
# Create a retriever for similarity search
retriever = index.as_retriever(similarity_top_k=5)  # retrieve top 5 similar chunks
query_text = "Long bag-like glandular hairs with one-cell head existing in the intercellular space are found in the cortex of stems, mesophyll and parenchyma of midrib."

results = retriever.retrieve(query_text)
for res in results:
    print("Score:", res.score)
    print("Text:", res.node.text)
    print("PMID:", res.node.metadata.get("PMID"))
    print("Type:", res.node.metadata.get("type"))
    print("-" * 80)


Score: 0.321853638
Text: Long bag-like glandular hairs with one-cell head existing in the intercellular space are found in the cortex of stems, mesophyll and parenchyma of midrib. The glandular hairs with special large one-cell head and with a special short and small one-cell stalk, flat bag-like cuticle above secretory cell are caved in the forms of square, triangle, convex len and round-oids are also found in the surface of leaves and stems. The primary pholem fiber bundles with very thick walls are uncontinuous and arranged with a circle in the transverse section of stems; the perimedullary cells contain needle and prism crystals. Above-mentioned characters are differed from other plants of Labiate family. The old stems are cylinderoid forms, branch angles are usually smaller than 60 degrees, the surface of upper, middle stems and young leaves with purple colour are unusual, the palisade ratio are less than 10(5.05 and 9.26 cultivated seperatedly in Guangzhou and Gaoyao), in Shipai 

### Debugging: Reconstruction the Paper

In [12]:
# Function to reconstruct a paper from nodes
def reconstruct_paper(all_nodes, pmid):
    # Filter nodes belonging to this paper
    paper_nodes = [node for node in all_nodes if str(node.metadata.get("PMID")) == str(pmid)]
    
    # Sort nodes by node_index
    paper_nodes = sorted(paper_nodes, key=lambda x: x.metadata.get("node_index", 0))
    print("Noumber of Nodes:",len(paper_nodes))
    # Concatenate the text
    full_text = "\n".join([node.text for node in paper_nodes])
    
    return full_text

# Example usage
pmid_to_reconstruct = restructured_records[2]['metadata']['PMID']
full_paper_text = reconstruct_paper(all_nodes, pmid_to_reconstruct)

print("Reconstructed Paper Text:")
print(full_paper_text)


Noumber of Nodes: 3
Reconstructed Paper Text:
Unveiling the phyto-restorative potential of ethereal distillates for atopic dermatitis: an advanced therapeutic approach.
Atopic dermatitis is acknowledged as a vital inflammatory disorder associated with the integumentary system of the body and is characterized by the formation of thick reddish-grey scars and erythema formation on skin, prevalent amidst the populace. Numerous synthetic drugs are available for treatment like antihistamines, immunosuppressants, glucocorticoids etc., but contrarily, essential oil therapy is exclusively lime lighted to favour the purpose. The utilization of available engineered drugs, possess the marked adverse effects owing to prolonged duration of therapy and therefore, essential oils are explored well and proved to exhibit the anti-eczematic, anti-inflammatory and antipruritic properties. Ethereal distillates own the assorted and selective therapeutic properties attributable to presence of bioactive compou

### Optional: Delete PineCone Index

In [13]:
# from pinecone import Pinecone
# import os
# from dotenv import load_dotenv
# load_dotenv()

# INDEX_NAME = "pubmed-abstracts"

# # Initialize Pinecone client
# client = Pinecone(api_key=os.getenv("PINECONE_API"))

# try:
#     client.delete_index(name=INDEX_NAME)
#     print("Index deleted")
# except:
#     print("Data base is empty")

### Opitional: Check available Indices

In [14]:
# indexes = client.list_indexes()
# print(f"Available indexes: {indexes.names()}")
# print(f"Current index name: {INDEX_NAME}")

Available indexes: ['pubmed-abstracts']
Current index name: pubmed-abstracts
