# Data Loading and Filtering Records with Focus (Primary or Secondary)

In [1]:
import json
with open("Data/enriched_data.json", "r", encoding="utf-8") as f:
    records = json.load(f)
# Filter JSON entries where focus is primary or secondary
filtered_records = [entry for entry in records if any(f in ["primary", "secondary"] for f in entry["metadata"].get("focus", []))]

# Calculation of Primary and Secondary records %
filtered_records_percent = round(((len(filtered_records)/len(records)) * 100), 2)

print(f"Only {filtered_records_percent}% of entire records are Primary or Secondary ")



Only 60.0% of entire records are Primary or Secondary 


# Data Restructuring

In [2]:
def metadata_restructuring(records):
    restructured_records = []
    for record in records:
        metadata = record.get("metadata", {}).copy()  # copy to avoid mutating original

        # Explicitly ensure top-level fields are part of metadata
        for field in ["root_name", "search_term", "synonyms", "PMID", "pubmed_type"]:
            if field in record:
                metadata[field] = record[field]
        
        restructured_records.append({"metadata": metadata})
    return restructured_records

restructured_records = metadata_restructuring(filtered_records)

# Data Ingestion into PineCone 

In [3]:
import pandas as pd
from llama_index.core import Document, VectorStoreIndex
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.pinecone import PineconeVectorStore
from pinecone import Pinecone
from pinecone import ServerlessSpec
from UPDATED_meta_data_generation import *
from dotenv import load_dotenv
import os
from tqdm import tqdm
load_dotenv()

# --------------------------
# 1 Initialize Pinecone
# --------------------------
INDEX_NAME = "pubmed-abstracts"

# Initialize Pinecone client
client = Pinecone(api_key=os.getenv("PINECONE_API"))

# Define the index specification
spec = ServerlessSpec(cloud="aws", region="us-east-1")

# Create index if it doesn't exist
if INDEX_NAME not in client.list_indexes().names():
    client.create_index(name=INDEX_NAME, dimension=768, metric="cosine",spec=spec)  # PubMedBERT embedding dim=768

# Connect to the index
pinecone_index = client.Index(INDEX_NAME)

# Use your PineconeVectorStore wrapper as before
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)

# --------------------------
# 2 Initialize embedding model and semantic chunker
# --------------------------
embed_model = HuggingFaceEmbedding(model_name="NeuML/pubmedbert-base-embeddings")

splitter = SemanticSplitterNodeParser(
    buffer_size=1,
    breakpoint_percentile_threshold=95,
    embed_model=embed_model
)

all_nodes = []

for idx, row in enumerate(tqdm(restructured_records, desc="Processing papers")):
    md = row["metadata"]  # all metadata including pmid, year, etc.
    paper = fetch_extract_and_abstract(md['PMID'])
    title = paper['title']  # title
    abstract = paper['abstract']  # abstract text
    #print(f">>>>Title:{idx}",title)

    # --------------------------
    # Title Node
    # --------------------------
    title_node = Document(
        text=title,
        metadata={
            "type": "title",
            "node_index": 0,
            **md  # include all metadata fields directly
        }
    )
    all_nodes.append(title_node)

    # --------------------------
    # Abstract Nodes (Semantic Split)
    # --------------------------
    abstract_doc = Document(
        text=abstract,
        metadata={
            "type": "abstract",
            **md  # include all metadata fields directly
        }
    )

    abstract_nodes = splitter.get_nodes_from_documents([abstract_doc])

    # Assign node_index starting from 1 (after title)
    for i, node in enumerate(abstract_nodes, start=1):
        node.metadata["node_index"] = i
        all_nodes.append(node)


# --------------------------
# 4️ Store nodes in Pinecone via LlamaIndex
# --------------------------
index = VectorStoreIndex(
    all_nodes,
    vector_store=vector_store,
    embed_model=embed_model,
    show_progress=True
)


Processing papers: 100%|██████████| 3/3 [00:01<00:00,  2.46it/s]


Generating embeddings:   0%|          | 0/9 [00:00<?, ?it/s]

# Similarity Search Debugging

In [4]:
# Create a retriever for similarity search
retriever = index.as_retriever(similarity_top_k=5)  # retrieve top 5 similar chunks
query_text = " Cedrus, Hesperopeuce, Keteleeria, Pseudolarix, and Tsuga and preliminary inferences on the taxonomy of Pinaceae."

results = retriever.retrieve(query_text)
for res in results:
    print("Score:", res.score)
    print("Text:", res.node.text)
    print("PMID:", res.node.metadata.get("PMID"))
    print("Type:", res.node.metadata.get("type"))
    print("-" * 80)


Score: 0.4733242432320938
Text: Abietoid seed fatty acid compositions--a review of the genera Abies, Cedrus, Hesperopeuce, Keteleeria, Pseudolarix, and Tsuga and preliminary inferences on the taxonomy of Pinaceae.
PMID: 11876259
Type: title
--------------------------------------------------------------------------------
Score: 0.4634513074762053
Text: The seed fatty acid (FA) compositions of Abietoids (Abies, Cedrus, Hesperopeuce, Keteleeria, Pseudolarix, and Tsuga) are reviewed in the present study in conclusion to our survey of Pinaceae seed FA compositions. Many unpublished data are given. Abietoids and Pinoids (Pinus, Larix, Picea, and Pseudotsuga)-constituting the family Pinaceae-are united by the presence of several delta5-olefinic acids, taxoleic (5,9-18:2), pinolenic (5,9,12-18:3), coniferonic (5,9,12,15-1 8:4), keteleeronic (5,11-20:2), and sciadonic (5,11,14-20:3) acids, and of 14-methyl hexadecanoic (anteiso-17:0) acid. These acids seldom occur in angiosperm seeds. The propo

# Reconstruction the Paper

In [5]:
# Function to reconstruct a paper from nodes
def reconstruct_paper(all_nodes, pmid):
    # Filter nodes belonging to this paper
    paper_nodes = [node for node in all_nodes if str(node.metadata.get("PMID")) == str(pmid)]
    
    # Sort nodes by node_index
    paper_nodes = sorted(paper_nodes, key=lambda x: x.metadata.get("node_index", 0))
    print("Noumber of Nodes:",len(paper_nodes))
    # Concatenate the text
    full_text = "\n".join([node.text for node in paper_nodes])
    
    return full_text

# Example usage
pmid_to_reconstruct = restructured_records[0]['metadata']['PMID']
full_paper_text = reconstruct_paper(all_nodes, pmid_to_reconstruct)

print("Reconstructed Paper Text:")
print(full_paper_text)


Noumber of Nodes: 3
Reconstructed Paper Text:
Screening of Turkish anti-ulcerogenic folk remedies for anti-Helicobacter pylori activity.
The anti-Helicobacter pylori effect of the extracts and fractions obtained from seven Turkish plants, which are used in folk medicine for the treatment of gastric ailments including peptic ulcers, were studied against one standard strain and eight clinical isolates of H. pylori by using the agar dilution method. Flowers of Cistus laurifolius and Spartium junceum, cones of Cedrus libani, herbs and flowers of Centaurea solstitialis ssp. 
solstitialis, fruits of Momordica charantia, herbaceous parts of Sambucus ebulus, and flowering herbs of Hypericum perforatum were evaluated in this study. Results showed that all except one extract from six of these plants showed activity against the microorganism with MICs between 1.95 and 250 microg/ml, with S. junceum being the only inactive species. Amongst the active plants the inhibitory properties of C. laurifol

In [6]:
# Get total number of vectors (nodes) in the Pinecone index
stats = pinecone_index.describe_index_stats()
total_nodes = stats['total_vector_count']

print(f"Total nodes in Pinecone index '{INDEX_NAME}': {total_nodes}")

Total nodes in Pinecone index 'pubmed-abstracts': 0


# Optional: Delete PineCone Index

In [30]:
# try:
#     client.delete_index(name=INDEX_NAME)
#     print("Index deleted")
# except:
#     print("Data base is empty")

Index deleted


# Opitional: Check available Indices

In [7]:
# indexes = client.list_indexes()
# print(f"Available indexes: {indexes.names()}")
# print(f"Current index name: {INDEX_NAME}")

Available indexes: ['pubmed-abstracts']
Current index name: pubmed-abstracts
