# Knowledge Graphs - NCCN


## 1. Setup


In [None]:
from dotenv import find_dotenv, load_dotenv
load_dotenv(find_dotenv())

In [7]:
import asyncio
import nest_asyncio

nest_asyncio.apply()

In [1]:
import os

# Read the LlamaCloud API key from the environment (populated from .env above).
LLAMACLOUD_API_KEY = os.getenv("LLAMACLOUD_API_KEY")
if not LLAMACLOUD_API_KEY:
    # Fail fast with an actionable message instead of `len(None)` raising a
    # confusing TypeError on the next line.
    raise RuntimeError(
        "LLAMACLOUD_API_KEY is not set; add it to your .env file or environment."
    )
# Show only the key length as a sanity check, never the key itself.
len(LLAMACLOUD_API_KEY)

52

In [2]:
from llama_index.llms.openai import OpenAI

llm = OpenAI(model="gpt-4.1-mini")

## 2. Loading & Parsing Document


In [3]:
from llama_cloud_services.parse import LlamaParse
# Create the LlamaParse client used to parse the PDF below.
# No explicit parse mode is passed here; only the API key, worker count,
# verbosity and document language are configured.
parser = LlamaParse(
    api_key=LLAMACLOUD_API_KEY,
    num_workers=4,
    verbose=True,
    language="en",
)

In [4]:

# Define the PDF file to parse
pdf_path = "../data/nccn_breast_cancer.pdf"

# Parse the document asynchronously
results = await parser.aparse(pdf_path)

Started parsing the file under job_id 6847207f-9662-48c9-8dcd-113a30fc27b3
.

In [5]:
len(results.pages)

263

In [13]:
[d for d in dir(results) if not d.startswith('_')]

['aget_image_data',
 'aget_image_documents',
 'aget_image_nodes',
 'aget_json',
 'aget_markdown',
 'aget_markdown_documents',
 'aget_markdown_nodes',
 'aget_text',
 'aget_text_documents',
 'aget_text_nodes',
 'aget_xlsx_data',
 'asave_all_images',
 'asave_image',
 'construct',
 'copy',
 'dict',
 'error',
 'file_name',
 'from_orm',
 'get_image_data',
 'get_image_documents',
 'get_image_names',
 'get_image_nodes',
 'get_json',
 'get_markdown',
 'get_markdown_documents',
 'get_markdown_nodes',
 'get_text',
 'get_text_documents',
 'get_text_nodes',
 'get_xlsx_data',
 'is_done',
 'job_id',
 'job_metadata',
 'json',
 'model_computed_fields',
 'model_config',
 'model_construct',
 'model_copy',
 'model_dump',
 'model_dump_json',
 'model_extra',
 'model_fields',
 'model_fields_set',
 'model_json_schema',
 'model_parametrized_name',
 'model_post_init',
 'model_rebuild',
 'model_validate',
 'model_validate_json',
 'model_validate_strings',
 'pages',
 'parse_file',
 'parse_obj',
 'parse_raw',
 'sa

In [18]:
page100 = results.pages[100]
[d for d in dir(page100) if not d.startswith('_')]

['charts',
 'construct',
 'copy',
 'dict',
 'from_orm',
 'height',
 'images',
 'items',
 'json',
 'layout',
 'links',
 'md',
 'model_computed_fields',
 'model_config',
 'model_construct',
 'model_copy',
 'model_dump',
 'model_dump_json',
 'model_extra',
 'model_fields',
 'model_fields_set',
 'model_json_schema',
 'model_parametrized_name',
 'model_post_init',
 'model_rebuild',
 'model_validate',
 'model_validate_json',
 'model_validate_strings',
 'noStructuredContent',
 'noTextContent',
 'page',
 'parse_file',
 'parse_obj',
 'parse_raw',
 'parsingMode',
 'schema',
 'schema_json',
 'status',
 'structuredData',
 'tables',
 'text',
 'triggeredAutoMode',
 'update_forward_refs',
 'validate',
 'width']

In [32]:
page100.text[:1000]

'Printed by Stina Singel on 6/17/2025 3:05:05 AM. For personal use only. Not approved for distribution. Copyright © 2025 National Comprehensive Cancer Network, Inc., All Rights Reserved.\n\n                       NCCN Guidelines Version 4.2025                                                                                                               NCCN Guidelines Index\n                       Invasive Breast Cancer                                                                                                                           Table of Contents\n                                                                                                                                                                                         Discussion\n\n                                 TARGETED THERAPIES AND ASSOCIATED BIOMARKER TESTING\n                       FOR RECURRENT UNRESECTABLE (LOCAL OR REGIONAL) OR STAGE IV (M1) DISEASE\n\n                                                     

In [33]:
page100.md[:1000]

'\nPrinted by Stina Singel on 6/17/2025 3:05:05 AM. For personal use only. Not approved for distribution. Copyright © 2025 National Comprehensive Cancer Network, Inc., All Rights Reserved.\n\n# NCCN Guidelines Version 4.2025\n\n# NCCN Guidelines Index\n\n# Invasive Breast Cancer\n\n# Table of Contents\n\n# Discussion\n\n# TARGETED THERAPIES AND ASSOCIATED BIOMARKER TESTING FOR RECURRENT UNRESECTABLE (LOCAL OR REGIONAL) OR STAGE IV (M1) DISEASE\n\n# REFERENCES\n\n1. Andre F, Ciruelos E, Rubovszky G, et al. Alpelisib for PIK3CA-mutated, hormone receptor-positive advanced breast cancer. N Engl J Med 2019;380:1929-1940.\n2. Turner NC, Oliveira M, Howell SJ, et al. Capivasertib in hormone receptor–positive advanced breast cancer. N Engl J Med 2023;388:2058-2070.\n3. Berton D, Banerjee S, Curigliano G, et al. Antitumor activity of dostarlimab in patients with mismatch repair–deficient (dMMR) tumors: a combined analysis of 2 cohorts in the GARNET study. Poster presented at American Society fo

In [23]:
[p.charts for p in results.pages if p.charts != []]

[]

In [25]:
[p.tables for p in results.pages if p.tables != []]

[]

In [26]:
[p.images for p in results.pages if p.images != []]

[[ImageItem(name='img_p124_1.png', height=167.0, width=486.0, x=23.5500755, y=19.450134299999966, original_width=486, original_height=167, type=None)],
 [ImageItem(name='img_p125_1.png', height=167.0, width=486.0, x=23.5500755, y=19.450134299999966, original_width=486, original_height=167, type=None),
  ImageItem(name='img_p125_2.png', height=729.0, width=729.0, x=221.5500641, y=176.29901130000002, original_width=729, original_height=729, type=None)]]

In [36]:
from llama_index.core.schema import Document
documents = [
    Document(text=page.md)
    for page in results.pages
]
len(documents)

263

In [37]:
documents[0].text[:1000]

'\n# NCCN Clinical Practice Guidelines in Oncology (NCCN Guidelines®)\n\n# Breast Cancer\n\n# Version 4.2025 — April 17, 2025\n\nNCCN.org\n\nNCCN recognizes the importance of clinical trials and encourages participation when applicable and available. Trials should be designed to maximize inclusiveness and broad representative enrollment.\n\nNCCN Guidelines for Patients® available at www.nccn.org/patients\n\nContinue\n\nVersion 4.2025, 4/17/25 © 2025 National Comprehensive Cancer Network® (NCCN®), All rights reserved. NCCN Guidelines® and this illustration may not be reproduced in any form without the express written permission of NCCN.'

## 3. GraphRAGExtractor

The `GraphRAGExtractor` class is designed to extract triples (subject-relation-object) from text and enrich them by adding descriptions for entities and relationships to their properties using an LLM.

This functionality is similar to that of the `SimpleLLMPathExtractor`, but includes additional enhancements to handle entity and relationship descriptions. For guidance on implementation, you may look at similar existing extractors.

Here's a breakdown of its functionality:

**Key Components**:

- llm: The language model used for extraction.
- extract_prompt: A prompt template used to guide the LLM in extracting information.
- parse_fn: A function to parse the LLM's output into structured data.
- max_paths_per_chunk: Limits the number of triples extracted per text chunk.
- num_workers: For parallel processing of multiple text nodes.

**Main Methods**:

- `__call__`: The entry point for processing a list of text nodes.
- `acall`: An asynchronous version of `__call__` for improved performance.
- `_aextract`: The core method that processes each individual node.

**Extraction Process**:

For each input node (chunk of text):

1.  It sends the text to the LLM along with the extraction prompt.
2.  The LLM's response is parsed to extract entities, relationships, descriptions for entities and relations.
3.  Entities are converted into EntityNode objects. Entity description is stored in metadata.
4.  Relationships are converted into Relation objects. Relationship description is stored in metadata.
5.  These are added to the node's metadata under KG_NODES_KEY and KG_RELATIONS_KEY.

NOTE: In the current implementation, we are using only relationship descriptions. In the next implementation, we will utilize entity descriptions during the retrieval stage.


In [8]:
from typing import Any, List, Callable, Optional, Union, Dict
from IPython.display import Markdown, display

from llama_index.core.async_utils import run_jobs
from llama_index.core.indices.property_graph.utils import (
    default_parse_triplets_fn,
)
from llama_index.core.graph_stores.types import (
    EntityNode,
    KG_NODES_KEY,
    KG_RELATIONS_KEY,
    Relation,
)
from llama_index.core.llms.llm import LLM
from llama_index.core.prompts import PromptTemplate
from llama_index.core.prompts.default_prompts import (
    DEFAULT_KG_TRIPLET_EXTRACT_PROMPT,
)
from llama_index.core.schema import TransformComponent, BaseNode
from llama_index.core.bridge.pydantic import BaseModel, Field


class GraphRAGExtractor(TransformComponent):
    """Extract entities, relations and their descriptions from text nodes.

    Uses an LLM and a prompt + output parsing to extract paths (i.e. triples)
    together with entity and relation descriptions from text. The results are
    attached to each node's metadata under ``KG_NODES_KEY`` and
    ``KG_RELATIONS_KEY``.

    Args:
        llm (LLM):
            The language model to use. Falls back to ``Settings.llm``.
        extract_prompt (Union[str, PromptTemplate]):
            The prompt to use for extracting triples. It is formatted with
            ``text`` and ``max_knowledge_triplets``.
        parse_fn (callable):
            A function to parse the output of the language model. It must
            return ``(entities, relationships)`` where each entity is a
            ``(name, type, description)`` tuple and each relationship is a
            ``(source, target, relation, description)`` tuple.
            NOTE(review): the default ``default_parse_triplets_fn`` may not
            return this shape - confirm before relying on the default.
        num_workers (int):
            The number of workers to use for parallel processing.
        max_paths_per_chunk (int):
            The maximum number of paths to extract per chunk.
    """

    llm: LLM
    extract_prompt: PromptTemplate
    parse_fn: Callable
    num_workers: int
    max_paths_per_chunk: int

    def __init__(
        self,
        llm: Optional[LLM] = None,
        extract_prompt: Optional[Union[str, PromptTemplate]] = None,
        parse_fn: Callable = default_parse_triplets_fn,
        max_paths_per_chunk: int = 10,
        num_workers: int = 4,
    ) -> None:
        """Init params."""
        from llama_index.core import Settings

        # Accept a raw template string for convenience.
        if isinstance(extract_prompt, str):
            extract_prompt = PromptTemplate(extract_prompt)

        super().__init__(
            llm=llm or Settings.llm,
            extract_prompt=extract_prompt or DEFAULT_KG_TRIPLET_EXTRACT_PROMPT,
            parse_fn=parse_fn,
            num_workers=num_workers,
            max_paths_per_chunk=max_paths_per_chunk,
        )

    @classmethod
    def class_name(cls) -> str:
        # NOTE(review): differs from the Python class name; left unchanged so
        # anything keyed on this identifier keeps working - confirm intent.
        return "GraphExtractor"

    def __call__(
        self, nodes: List[BaseNode], show_progress: bool = False, **kwargs: Any
    ) -> List[BaseNode]:
        """Extract triples from nodes (synchronous wrapper around ``acall``)."""
        return asyncio.run(
            self.acall(nodes, show_progress=show_progress, **kwargs)
        )

    async def _aextract(self, node: BaseNode) -> BaseNode:
        """Extract entities and relations from a single node.

        The node is mutated in place: extracted ``EntityNode`` and ``Relation``
        objects are appended to whatever was already stored under
        ``KG_NODES_KEY`` / ``KG_RELATIONS_KEY`` in its metadata.

        Returns:
            The same node, with updated metadata.
        """
        assert hasattr(node, "text")

        text = node.get_content(metadata_mode="llm")
        try:
            llm_response = await self.llm.apredict(
                self.extract_prompt,
                text=text,
                max_knowledge_triplets=self.max_paths_per_chunk,
            )
            entities, entities_relationship = self.parse_fn(llm_response)
        except ValueError:
            # Best effort: a response that cannot be parsed yields no triples
            # for this chunk instead of failing the whole batch.
            entities = []
            entities_relationship = []

        # Pop the KG keys first so the copied base metadata does not carry
        # previously extracted nodes/relations into every property dict.
        existing_nodes = node.metadata.pop(KG_NODES_KEY, [])
        existing_relations = node.metadata.pop(KG_RELATIONS_KEY, [])
        base_metadata = node.metadata.copy()

        for entity, entity_type, description in entities:
            # Build a fresh dict per entity so objects never share (and
            # overwrite) one mutable ``properties`` mapping.
            existing_nodes.append(
                EntityNode(
                    name=entity,
                    label=entity_type,
                    properties={
                        **base_metadata,
                        "entity_description": description,
                    },
                )
            )

        for subj, obj, rel, description in entities_relationship:
            # Same reasoning as above: one independent dict per relation.
            existing_relations.append(
                Relation(
                    label=rel,
                    source_id=subj,
                    target_id=obj,
                    properties={
                        **base_metadata,
                        "relationship_description": description,
                    },
                )
            )

        node.metadata[KG_NODES_KEY] = existing_nodes
        node.metadata[KG_RELATIONS_KEY] = existing_relations
        return node

    async def acall(
        self, nodes: List[BaseNode], show_progress: bool = False, **kwargs: Any
    ) -> List[BaseNode]:
        """Extract triples from nodes async, using ``num_workers`` workers."""
        jobs = [self._aextract(node) for node in nodes]

        return await run_jobs(
            jobs,
            workers=self.num_workers,
            show_progress=show_progress,
            desc="Extracting paths from text",
        )

## 4. GraphRAGStore

The `GraphRAGStore` class is an extension of the `Neo4jPropertyGraphStore` class, designed to implement the GraphRAG pipeline. Here's a breakdown of its key components and functions:

The class uses community detection algorithms to group related nodes in the graph and then it generates summaries for each community using an LLM.


In [10]:
import re
import networkx as nx
from graspologic.partition import hierarchical_leiden
from collections import defaultdict

from llama_index.core.llms import ChatMessage
from llama_index.graph_stores.neo4j import Neo4jPropertyGraphStore


class GraphRAGStore(Neo4jPropertyGraphStore):
    """Neo4j property graph store extended with communities and summaries.

    Entities are grouped into communities with ``hierarchical_leiden`` and an
    LLM-generated summary is stored for each community.
    """

    # Class-level defaults only. ``build_communities`` rebinds
    # ``community_summary`` and ``entity_info`` on the instance, so state is
    # never shared between stores through this mutable class attribute.
    community_summary = {}
    entity_info = None
    max_cluster_size = 5
    # Optional LLM used for summaries. When left as None, an OpenAI
    # "gpt-4.1-mini" client is created on first use and then reused.
    summary_llm = None

    def generate_community_summary(self, text):
        """Generate summary for a given text using an LLM.

        Args:
            text: Newline-separated relationship details for one community.

        Returns:
            The summary text with any leading "assistant:" prefix removed.
        """
        if self.summary_llm is None:
            # Create the client once instead of once per community.
            self.summary_llm = OpenAI(model="gpt-4.1-mini")

        messages = [
            ChatMessage(
                role="system",
                content=(
                    "You are provided with a set of relationships from a knowledge graph, each represented as "
                    "entity1->entity2->relation->relationship_description. Your task is to create a summary of these "
                    "relationships. The summary should include the names of the entities involved and a concise synthesis "
                    "of the relationship descriptions. The goal is to capture the most critical and relevant details that "
                    "highlight the nature and significance of each relationship. Ensure that the summary is coherent and "
                    "integrates the information in a way that emphasizes the key aspects of the relationships."
                ),
            ),
            ChatMessage(role="user", content=text),
        ]
        response = self.summary_llm.chat(messages)
        clean_response = re.sub(r"^assistant:\s*", "", str(response)).strip()
        return clean_response

    def build_communities(self):
        """Builds communities from the graph and summarizes them."""
        nx_graph = self._create_nx_graph()
        community_hierarchical_clusters = hierarchical_leiden(
            nx_graph, max_cluster_size=self.max_cluster_size
        )
        self.entity_info, community_info = self._collect_community_info(
            nx_graph, community_hierarchical_clusters
        )
        self._summarize_communities(community_info)

    def _create_nx_graph(self):
        """Converts internal graph representation to NetworkX graph."""
        nx_graph = nx.Graph()
        triplets = self.get_triplets()
        for entity1, relation, entity2 in triplets:
            nx_graph.add_node(entity1.name)
            nx_graph.add_node(entity2.name)
            nx_graph.add_edge(
                relation.source_id,
                relation.target_id,
                relationship=relation.label,
                # Relations stored without a description should not abort
                # the whole build; fall back to an empty description.
                description=relation.properties.get(
                    "relationship_description", ""
                ),
            )
        return nx_graph

    def _collect_community_info(self, nx_graph, clusters):
        """
        Collect information for each node based on their community,
        allowing entities to belong to multiple clusters.

        Returns:
            A tuple ``(entity_info, community_info)``: ``entity_info`` maps a
            node to the list of cluster ids it appears in, ``community_info``
            maps a cluster id to the list of relationship detail strings of
            its nodes.
        """
        entity_info = defaultdict(set)
        community_info = defaultdict(list)

        for item in clusters:
            node = item.node
            cluster_id = item.cluster

            # Update entity_info
            entity_info[node].add(cluster_id)

            for neighbor in nx_graph.neighbors(node):
                edge_data = nx_graph.get_edge_data(node, neighbor)
                if edge_data:
                    detail = f"{node} -> {neighbor} -> {edge_data['relationship']} -> {edge_data['description']}"
                    community_info[cluster_id].append(detail)

        # Convert sets to lists for easier serialization if needed
        return (
            {k: list(v) for k, v in entity_info.items()},
            dict(community_info),
        )

    def _summarize_communities(self, community_info):
        """Generate and store summaries for each community."""
        summaries = {}
        for community_id, details in community_info.items():
            details_text = (
                "\n".join(details) + "."
            )  # Ensure it ends with a period
            summaries[community_id] = self.generate_community_summary(
                details_text
            )
        # Rebind on the instance only after every summary succeeded, so
        # summaries from an earlier build (or another store) never linger.
        self.community_summary = summaries

    def get_community_summaries(self):
        """Returns the community summaries, building them if not already done."""
        if not self.community_summary:
            self.build_communities()
        return self.community_summary

## 5. GraphRAGQueryEngine

The GraphRAGQueryEngine class is a custom query engine designed to process queries using the GraphRAG approach. It leverages the community summaries generated by the GraphRAGStore to answer user queries. Here's a breakdown of its functionality:

Main Components:

- `graph_store`: An instance of `GraphRAGStore`, which contains the community summaries.
- `llm`: A language model (LLM) used for generating and aggregating answers.


In [11]:
from llama_index.core.query_engine import CustomQueryEngine
from llama_index.core.llms import LLM
from llama_index.core import PropertyGraphIndex

import re

class GraphRAGQueryEngine(CustomQueryEngine):
    """Query engine that answers from GraphRAG community summaries.

    Entities related to the query are retrieved from the index, mapped to
    their communities, and each matching community summary is used to produce
    an intermediate answer. The intermediate answers are then aggregated into
    one response by the LLM.
    """

    graph_store: GraphRAGStore
    index: PropertyGraphIndex
    llm: LLM
    similarity_top_k: int = 20

    def custom_query(self, query_str: str) -> str:
        """Process all community summaries to generate answers to a specific query."""

        entities = self.get_entities(query_str, self.similarity_top_k)

        # Fetch summaries first: this lazily builds the communities, which is
        # what populates ``graph_store.entity_info``. Reading ``entity_info``
        # before this call would see ``None`` on a store that was not built.
        community_summaries = self.graph_store.get_community_summaries()
        community_ids = set(
            self.retrieve_entity_communities(
                self.graph_store.entity_info, entities
            )
        )
        community_answers = [
            self.generate_answer_from_summary(community_summary, query_str)
            for community_id, community_summary in community_summaries.items()
            if community_id in community_ids
        ]

        if not community_answers:
            # Nothing relevant was retrieved; do not ask the LLM to
            # "aggregate" an empty list and invent an answer.
            return "No relevant community summaries were found for this query."

        final_answer = self.aggregate_answers(community_answers)
        return final_answer

    def get_entities(self, query_str, similarity_top_k):
        """Return entity names found in the triplets retrieved for the query.

        Retrieved node text is scanned line by line for
        ``subject -> relation -> object`` patterns; subjects and objects are
        collected.
        NOTE(review): the pattern only accepts word characters and whitespace
        in entity names, so names with other punctuation would not match -
        confirm this is acceptable for the data.
        """
        nodes_retrieved = self.index.as_retriever(
            similarity_top_k=similarity_top_k
        ).retrieve(query_str)

        entities = set()
        pattern = (
            r"^(\w+(?:\s+\w+)*)\s*->\s*([a-zA-Z\s]+?)\s*->\s*(\w+(?:\s+\w+)*)$"
        )

        for node in nodes_retrieved:
            matches = re.findall(
                pattern, node.text, re.MULTILINE | re.IGNORECASE
            )

            for subject, _relation, obj in matches:
                entities.add(subject)
                entities.add(obj)

        return list(entities)

    def retrieve_entity_communities(self, entity_info, entities):
        """
        Retrieve cluster information for given entities, allowing for multiple clusters per entity.

        Args:
        entity_info (dict): Dictionary mapping entities to their cluster IDs (list).
            ``None`` or an empty mapping yields an empty result.
        entities (list): List of entity names to retrieve information for.

        Returns:
        List of community or cluster IDs to which an entity belongs.
        """
        if not entity_info:
            return []

        community_ids = set()
        for entity in entities:
            if entity in entity_info:
                community_ids.update(entity_info[entity])

        return list(community_ids)

    def generate_answer_from_summary(self, community_summary, query):
        """Generate an answer from a community summary based on a given query using LLM."""
        prompt = (
            f"Given the community summary: {community_summary}, "
            f"how would you answer the following query? Query: {query}"
        )
        messages = [
            ChatMessage(role="system", content=prompt),
            ChatMessage(
                role="user",
                content="I need an answer based on the above information.",
            ),
        ]
        response = self.llm.chat(messages)
        # str(response) may carry a leading "assistant:" role prefix; drop it.
        cleaned_response = re.sub(r"^assistant:\s*", "", str(response)).strip()
        return cleaned_response

    def aggregate_answers(self, community_answers):
        """Aggregate individual community answers into a final, coherent response."""
        prompt = "Combine the following intermediate answers into a final, concise response."
        messages = [
            ChatMessage(role="system", content=prompt),
            ChatMessage(
                role="user",
                content=f"Intermediate answers: {community_answers}",
            ),
        ]
        final_response = self.llm.chat(messages)
        cleaned_final_response = re.sub(
            r"^assistant:\s*", "", str(final_response)
        ).strip()
        return cleaned_final_response

## 6. Build End to End GraphRAG Pipeline

Now that we have defined all the necessary components, let’s construct the GraphRAG pipeline:

1. Create nodes/chunks from the text.
2. Build a PropertyGraphIndex using GraphRAGExtractor and GraphRAGStore.
3. Construct communities and generate a summary for each community using the graph built above.
4. Create a GraphRAGQueryEngine and begin querying.

### Create nodes/chunks from the text


In [44]:
from llama_index.core.schema import TextNode

nodes = [TextNode(text=document.text) for document in documents]
len(nodes)

263

In [None]:
# from llama_index.core.node_parser import SentenceSplitter

# splitter = SentenceSplitter(
#     chunk_size=1024,
#     chunk_overlap=20,
# )
# nodes = splitter.get_nodes_from_documents(documents)

### Build PropertyGraphIndex using GraphRAGExtractor and GraphRAGStore


In [45]:
# Prompt template for entity/relationship extraction.
# `{max_knowledge_triplets}` and `{text}` are the placeholders that
# GraphRAGExtractor._aextract fills in when it calls the LLM.
# The requested output is a JSON object with "entities" and "relationships"
# lists, which is the shape `parse_fn` below reads.
# NOTE(review): the JSON example contains literal braces; this presumably
# relies on the prompt formatter substituting only known variables - confirm
# before formatting this string with plain `str.format`.
KG_TRIPLET_EXTRACT_TMPL = """
-Goal-
Given a text document, identify all entities and their entity types from the text and all relationships among the identified entities.
Given the text, extract up to {max_knowledge_triplets} entity-relation triplets.

-Steps-
1. Identify all entities. For each identified entity, extract the following information:
- entity_name: Name of the entity, capitalized
- entity_type: Type of the entity
- entity_description: Comprehensive description of the entity's attributes and activities

2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other.
For each pair of related entities, extract the following information:
- source_entity: name of the source entity, as identified in step 1
- target_entity: name of the target entity, as identified in step 1
- relation: relationship between source_entity and target_entity
- relationship_description: explanation as to why you think the source entity and the target entity are related to each other

3. Output Formatting:
- Return the result in valid JSON format with two keys: 'entities' (list of entity objects) and 'relationships' (list of relationship objects).
- Exclude any text outside the JSON structure (e.g., no explanations or comments).
- If no entities or relationships are identified, return empty lists: { "entities": [], "relationships": [] }.

-An Output Example-
{
  "entities": [
    {
      "entity_name": "Albert Einstein",
      "entity_type": "Person",
      "entity_description": "Albert Einstein was a theoretical physicist who developed the theory of relativity and made significant contributions to physics."
    },
    {
      "entity_name": "Theory of Relativity",
      "entity_type": "Scientific Theory",
      "entity_description": "A scientific theory developed by Albert Einstein, describing the laws of physics in relation to observers in different frames of reference."
    },
    {
      "entity_name": "Nobel Prize in Physics",
      "entity_type": "Award",
      "entity_description": "A prestigious international award in the field of physics, awarded annually by the Royal Swedish Academy of Sciences."
    }
  ],
  "relationships": [
    {
      "source_entity": "Albert Einstein",
      "target_entity": "Theory of Relativity",
      "relation": "developed",
      "relationship_description": "Albert Einstein is the developer of the theory of relativity."
    },
    {
      "source_entity": "Albert Einstein",
      "target_entity": "Nobel Prize in Physics",
      "relation": "won",
      "relationship_description": "Albert Einstein won the Nobel Prize in Physics in 1921."
    }
  ]
}

-Real Data-
######################
text: {text}
######################
output:"""

In [None]:
import json


def parse_fn(response_str: str) -> Any:
    """Parse the LLM's JSON answer into entity and relationship tuples.

    The first ``{`` through the last ``}`` of ``response_str`` is treated as
    the JSON payload, so surrounding text is tolerated.

    Args:
        response_str: Raw text returned by the LLM.

    Returns:
        A tuple ``(entities, relationships)`` where ``entities`` is a list of
        ``(entity_name, entity_type, entity_description)`` tuples and
        ``relationships`` is a list of ``(source_entity, target_entity,
        relation, relationship_description)`` tuples. Both lists are empty
        when no JSON object is found or it cannot be decoded. Records that
        are not objects or that lack a required key are skipped, so one
        malformed record does not raise and discard the whole response.
    """
    entity_keys = ("entity_name", "entity_type", "entity_description")
    relation_keys = (
        "source_entity",
        "target_entity",
        "relation",
        "relationship_description",
    )

    def collect(records: Any, keys: tuple) -> list:
        # A missing or null list (e.g. "relationships": null) yields nothing.
        if not isinstance(records, list):
            return []
        return [
            tuple(record[key] for key in keys)
            for record in records
            if isinstance(record, dict) and all(key in record for key in keys)
        ]

    entities = []
    relationships = []

    match = re.search(r"\{.*\}", response_str, re.DOTALL)
    if not match:
        return entities, relationships

    try:
        data = json.loads(match.group(0))
    except json.JSONDecodeError as e:
        print("Error parsing JSON:", e)
        return entities, relationships

    entities = collect(data.get("entities"), entity_keys)
    relationships = collect(data.get("relationships"), relation_keys)
    return entities, relationships


kg_extractor = GraphRAGExtractor(
    llm=llm,
    extract_prompt=KG_TRIPLET_EXTRACT_TMPL,
    max_paths_per_chunk=2,
    parse_fn=parse_fn,
)

## 7. Docker and Neo4j Setup

To launch Neo4j locally, first ensure you have docker installed. Then, you can launch the database with the following docker command.

```
docker run \
    -p 7474:7474 -p 7687:7687 \
    -v $PWD/data:/data -v $PWD/plugins:/plugins \
    --name neo4j-apoc \
    -e NEO4J_apoc_export_file_enabled=true \
    -e NEO4J_apoc_import_file_enabled=true \
    -e NEO4J_apoc_import_file_use__neo4j__config=true \
    -e NEO4JLABS_PLUGINS=\[\"apoc\"\] \
    neo4j:latest
```


From here, you can open the db at http://localhost:7474/. On this page, you will be asked to sign in. Use the default username/password of neo4j and neo4j.

Once you login for the first time, you will be asked to change the password.


In [48]:
import os

from llama_index.graph_stores.neo4j import Neo4jPropertyGraphStore

# Note: used to be `Neo4jPGStore`
# Connection settings can be overridden through NEO4J_USERNAME,
# NEO4J_PASSWORD and NEO4J_URL (e.g. in the .env file loaded during setup).
# SECURITY: the fallback values keep the notebook runnable as before, but a
# password should not live in source control - set NEO4J_PASSWORD, then
# remove the fallback and rotate the credential.
graph_store = GraphRAGStore(
    username=os.getenv("NEO4J_USERNAME", "neo4j"),
    password=os.getenv("NEO4J_PASSWORD", "Salesforce1"),
    url=os.getenv("NEO4J_URL", "bolt://localhost:7687"),
    # database="nccn"
)

In [None]:

from llama_index.core import PropertyGraphIndex

index = PropertyGraphIndex(
    nodes=nodes,
    kg_extractors=[kg_extractor],
    property_graph_store=graph_store,
    show_progress=True,
)

Extracting paths from text: 100%|██████████| 263/263 [06:20<00:00,  1.45s/it]
Generating embeddings: 100%|██████████| 3/3 [00:01<00:00,  1.80it/s]
Generating embeddings: 100%|██████████| 10/10 [00:01<00:00,  9.69it/s]


In [50]:
index.property_graph_store.get_triplets()[10]

[EntityNode(label='Organization', embedding=None, properties={'id': 'National Comprehensive Cancer Network', 'entity_description': 'A not-for-profit alliance of leading cancer centers dedicated to improving the quality, effectiveness, and efficiency of cancer care.', 'triplet_source_id': 'e4b6e5ee-99e9-4bfc-bfeb-cc9e95b42f1c'}, name='National Comprehensive Cancer Network'),
 Relation(label='recommends', source_id='National Comprehensive Cancer Network', target_id='Abemaciclib', properties={'triplet_source_id': 'eb077e13-7ae1-4b28-9c12-bd5cd739b710', 'relationship_description': 'The NCCN publishes guidelines recommending the use of abemaciclib as adjuvant endocrine therapy in eligible invasive breast cancer patients.'}),
 EntityNode(label='Drug', embedding=None, properties={'id': 'Abemaciclib', 'entity_description': 'Abemaciclib is a drug used in combination with fulvestrant for women with hormone receptor-positive, HER2-negative advanced breast cancer who have progressed while receivin

In [51]:
index.property_graph_store.get_triplets()[10][0].properties

{'id': 'National Comprehensive Cancer Network',
 'entity_description': 'A not-for-profit alliance of leading cancer centers dedicated to improving the quality, effectiveness, and efficiency of cancer care.',
 'triplet_source_id': 'e4b6e5ee-99e9-4bfc-bfeb-cc9e95b42f1c'}

In [52]:
index.property_graph_store.get_triplets()[10][1].properties

{'triplet_source_id': 'eb077e13-7ae1-4b28-9c12-bd5cd739b710',
 'relationship_description': 'The NCCN publishes guidelines recommending the use of abemaciclib as adjuvant endocrine therapy in eligible invasive breast cancer patients.'}

### Build communities

This will create communities and generate a summary for each community.


In [64]:
index.property_graph_store.build_communities()

## 8. Create Query Engine


In [65]:
query_engine = GraphRAGQueryEngine(
    graph_store=index.property_graph_store, 
    llm=llm,
    index=index,
    similarity_top_k=20
)

In [66]:
response = query_engine.query("How best to treat breast cancer for patients with HER2?")
display(Markdown(f"{response.response}"))

The National Comprehensive Cancer Network (NCCN) guidelines recommend a multidisciplinary, evidence-based approach for treating HER2-positive breast cancer that centers on HER2-targeted therapies combined with chemotherapy, surgery, and radiation as appropriate. Key elements include:

- **HER2-Targeted Therapy:** Trastuzumab is the cornerstone agent, administered typically for up to one year in the adjuvant setting. Pertuzumab is often added, especially in neoadjuvant and adjuvant regimens for node-positive disease, improving invasive disease-free survival. For metastatic HER2-positive breast cancer, agents such as ado-trastuzumab emtansine (T-DM1), tucatinib (combined with trastuzumab and capecitabine), and fam-trastuzumab deruxtecan-nxki are recommended based on clinical trial evidence.

- **Chemotherapy:** Taxanes, particularly paclitaxel and docetaxel, are commonly combined with HER2-targeted therapies. Anthracycline-based regimens (e.g., doxorubicin and cyclophosphamide) followed by taxanes plus trastuzumab are standard in many cases. Chemotherapy choice may be tailored considering efficacy and side effect profiles.

- **Endocrine Therapy:** For patients with hormone receptor-positive, HER2-positive breast cancer, combining HER2-targeted therapy (e.g., trastuzumab) with endocrine agents such as anastrozole is effective, especially in metastatic settings, as demonstrated in trials like TAnDEM.

- **Surgery and Radiation:** Surgical options include breast-conserving surgery or mastectomy with sentinel lymph node biopsy or axillary dissection as indicated. Radiation therapy is recommended postoperatively based on surgical and pathological findings and can be safely administered concurrently with HER2-targeted agents.

- **Diagnostic and Staging:** Accurate HER2 testing per ASCO/CAP guidelines is essential to confirm HER2 status and guide therapy. Imaging modalities such as FDG-PET/CT are advised for staging, particularly in advanced or recurrent disease.

- **Genetic Testing:** Germline BRCA1/2 mutation testing is recommended in recurrent or metastatic cases to inform potential use of PARP inhibitors, although these are not standard for HER2-positive disease.

In summary, the best treatment for HER2-positive breast cancer involves confirmed HER2 status followed by a tailored, multidisciplinary regimen combining trastuzumab-based HER2-targeted therapies with chemotherapy (commonly taxanes), appropriate endocrine therapy if hormone receptor-positive, and individualized surgical and radiation management, all guided by NCCN evidence-based protocols and clinical trial data to optimize patient outcomes.