# Setup

In [2]:
import os

def env_or_default(name, fallback):
    """Return the value of environment variable ``name``.

    If the variable is not set yet, it is first set to ``fallback`` so that
    later cells reading the environment see the same value. A value that is
    already present in the environment is never overwritten.

    Args:
        name: Name of the environment variable.
        fallback: String stored and returned when ``name`` is unset.

    Returns:
        The effective value of the variable as a string.
    """
    return os.environ.setdefault(name, fallback)


# NebulaGraph connection settings. The fallbacks are the values this notebook
# previously hard-coded; export a variable before starting the kernel to
# override it instead of editing the notebook.
GRAPHD_HOST = env_or_default("GRAPHD_HOST", "127.0.0.1")
GRAPHD_PORT = env_or_default("GRAPHD_PORT", "9669")
NEBULA_USER = env_or_default("NEBULA_USER", "root")
NEBULA_PASSWORD = env_or_default("NEBULA_PASSWORD", "nebula")
# Derived from host and port so the address cannot drift from them.
NEBULA_ADDRESS = env_or_default("NEBULA_ADDRESS", f"{GRAPHD_HOST}:{GRAPHD_PORT}")

# Do not paste the API key into the notebook: export GOOGLE_API_KEY instead.
# An exported key is kept as-is; the empty fallback only applies when no key
# is present in the environment.
GOOGLE_API_KEY = env_or_default("GOOGLE_API_KEY", "")

# NOTE(review): nothing visible in this notebook reads this variable back;
# kept unchanged in case a library consults it -- confirm before removing.
os.environ["DEFAULT_NEBULAGRAPH_NL2CYPHER_PROMPT"] = ""

In [3]:
import os

# Re-read the connection settings from the process environment. A variable
# that is not set comes back as None.
(
    GRAPHD_HOST,
    GRAPHD_PORT,
    NEBULA_USER,
    NEBULA_PASSWORD,
    NEBULA_ADDRESS,
    GOOGLE_API_KEY,
) = (
    os.environ.get(variable_name)
    for variable_name in (
        "GRAPHD_HOST",
        "GRAPHD_PORT",
        "NEBULA_USER",
        "NEBULA_PASSWORD",
        "NEBULA_ADDRESS",
        "GOOGLE_API_KEY",
    )
)

In [4]:
import os

import logging
import sys

# Route INFO-and-above log records to stdout so they show up in cell output.
logging.basicConfig(level=logging.INFO, stream=sys.stdout)

from llama_index.core import Settings


import logging
import sys


from llama_index.llms.gemini import Gemini
from llama_index.embeddings.gemini import GeminiEmbedding


# Two Gemini chat models on the same checkpoint: `llm` (temperature 0) becomes
# the global default below, `query_llm` (temperature 0.5) is handed explicitly
# to the query engines built later in the notebook.
GEMINI_CHAT_MODEL = "models/gemini-pro"
llm = Gemini(model=GEMINI_CHAT_MODEL, temperature=0)
query_llm = Gemini(model=GEMINI_CHAT_MODEL, temperature=0.5)

# Embedding model; unlike the chat models it receives the API key explicitly.
embedding_model = GeminiEmbedding(
    api_key=GOOGLE_API_KEY,
    model_name="models/embedding-001",
    title="this is a document",
)

# Register the global LlamaIndex defaults.
Settings.llm = llm
Settings.chunk_size = 512
Settings.embed_model = embedding_model

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
from llama_index.core import (
    StorageContext,
    VectorStoreIndex,
    SimpleDirectoryReader,
    KnowledgeGraphIndex,
)
from llama_index.graph_stores.nebula import NebulaGraphStore
from IPython.display import Markdown, display

# Create knowledge graph

In [41]:
# Load the source documents from the NIH Alzheimer's data directory.
alzheimers_reader = SimpleDirectoryReader("../data/nih/alzheimers")
documents = alzheimers_reader.load_data()

In [5]:
# NebulaGraph space that holds the knowledge graph.
space_name = "graph_rag_test"

# The three values below are the defaults; they can be omitted when the
# knowledge graph is created from an empty space.
edge_types = ["relationship"]
rel_prop_names = ["relationship"]
tags = ["entity"]

In [18]:
# Schema settings passed both to the graph store and to the index builder.
nebula_schema = {
    "space_name": space_name,
    "edge_types": edge_types,
    "rel_prop_names": rel_prop_names,
    "tags": tags,
}

# Back the storage context with the NebulaGraph store.
graph_store = NebulaGraphStore(**nebula_schema)
storage_context = StorageContext.from_defaults(graph_store=graph_store)

# NOTE: can take a while!
# Builds the knowledge graph index from the loaded documents, asking for at
# most two triplets per chunk.
index = KnowledgeGraphIndex.from_documents(
    documents,
    storage_context=storage_context,
    max_triplets_per_chunk=2,
    **nebula_schema,
)

# Persist and restore the knowledge graph

In [19]:
# Save the index's storage context to ./storage_graph, then list that
# directory to see which files were written.
index.storage_context.persist(persist_dir="./storage_graph")
!ls storage_graph

default__vector_store.json image__vector_store.json
docstore.json              index_store.json


In [8]:
from llama_index.core import load_index_from_storage

# Schema settings passed both to the graph store and to the index loader.
nebula_schema = {
    "space_name": space_name,
    "edge_types": edge_types,
    "rel_prop_names": rel_prop_names,
    "tags": tags,
}

# Re-create the graph store handle and combine it with the stores that were
# persisted under ./storage_graph.
graph_store = NebulaGraphStore(**nebula_schema)
storage_context = StorageContext.from_defaults(
    persist_dir="./storage_graph",
    graph_store=graph_store,
)

# Rebuild the index object from the restored storage context.
index = load_index_from_storage(
    storage_context=storage_context,
    max_triplets_per_chunk=2,
    **nebula_schema,
    verbose=True,
)

INFO:llama_index.core.indices.loading:Loading all indices.


# Query knowledge graph

In [29]:
# Query engine with all-default settings.
query_engine = index.as_query_engine()

# Ask one question and render the answer in bold.
question = "Tell me about Alzheimer's disease"
response = query_engine.query(question)
answer_markup = f"<b>{response}</b>"
display(Markdown(answer_markup))

INFO:llama_index.core.indices.knowledge_graph.retrievers:> Querying with idx: d713e5bf-8fa6-47c6-a27c-30f59f6d0041: {"id_": "c5ccd49e-6386-4de5-96f4-bcf03df0619a", "embedding": null, "metadata"...
INFO:llama_index.core.indices.knowledge_graph.retrievers:> Querying with idx: b557e46a-ee34-4987-8b74-6b4b1d0b1d44: For example, scientists are learning how age-related changes in the brain may...
INFO:llama_index.core.indices.knowledge_graph.retrievers:> Querying with idx: b3dd51cb-ffda-411d-b44a-0eeb54a01905: 89e\u53f6\u5931\u667a\u75c7\u548c\u8840\u7ba1\u6027\u5931\u667a\u75c7\u3002\u...


<b>Alzheimer's disease is a disease that affects the brain and causes memory loss, confusion, and changes in behavior. It is the most common type of dementia, which is a general term for a decline in cognitive function. Alzheimer's disease is caused by changes in the brain that occur over time, and it is not curable. However, there are treatments that can help to slow the progression of the disease and improve symptoms.</b>

In [81]:
# Keyword-based retrieval over the knowledge graph, answered by `query_llm`
# and summarised with the "tree_summarize" response mode.
# NOTE(review): `query_llm` already carries temperature=0.5; confirm that
# `as_query_engine` actually consumes the extra `temperature` keyword.
kg_index_query_engine = index.as_query_engine(
    llm=query_llm,
    retriever_mode="keyword",
    verbose=True,
    response_mode="tree_summarize",
    temperature=0.5,
)

# Run each question in turn and render its answer in bold; `response` ends up
# holding the answer to the last question.
for question in (
    "Tell me about Alzheimer's disease",
    "what causes Alzheimer's disease?",
    "what is the relationship between Alzheimer's disease and dementia?",
):
    response = kg_index_query_engine.query(question)
    display(Markdown(f"<b>{response}</b>"))

[1;3;32mExtracted keywords: ['brain', 'Alzheimer', 'memory', 'decline', "Alzheimer's disease", 'cognitive decline', 'treatment', 'cognitive', 'aging', 'causes', 'dementia', 'loss', 'memory loss', 'symptoms', 'neurodegenerative', 'disease']
INFO:llama_index.core.indices.knowledge_graph.retrievers:> Querying with idx: d713e5bf-8fa6-47c6-a27c-30f59f6d0041: {"id_": "c5ccd49e-6386-4de5-96f4-bcf03df0619a", "embedding": null, "metadata"...
INFO:llama_index.core.indices.knowledge_graph.retrievers:> Querying with idx: b557e46a-ee34-4987-8b74-6b4b1d0b1d44: For example, scientists are learning how age-related changes in the brain may...
INFO:llama_index.core.indices.knowledge_graph.retrievers:> Querying with idx: b3dd51cb-ffda-411d-b44a-0eeb54a01905: 89e\u53f6\u5931\u667a\u75c7\u548c\u8840\u7ba1\u6027\u5931\u667a\u75c7\u3002\u...
[1;3;34mKG context:
The following are knowledge sequence in max depth 2 in the form of directed graph like:
`subject -[predicate]->, object, <-[predicate_next_hop]-, o

<b>Alzheimer's disease is a disease that affects the brain. It is caused by genetic variants and is influenced by multiple genes in combination with lifestyle and environmental factors. Alzheimer's disease is not usually caused by a single genetic cause.</b>

[1;3;32mExtracted keywords: ['etiology', 'genetics', 'environment', 'lifestyle', 'age', 'brain changes', 'brain', 'risk factors', 'Alzheimer', "Alzheimer's disease", 'factors', 'family', 'history', 'causes', 'changes', 'risk', 'family history', 'disease']
INFO:llama_index.core.indices.knowledge_graph.retrievers:> Querying with idx: d713e5bf-8fa6-47c6-a27c-30f59f6d0041: {"id_": "c5ccd49e-6386-4de5-96f4-bcf03df0619a", "embedding": null, "metadata"...
INFO:llama_index.core.indices.knowledge_graph.retrievers:> Querying with idx: b557e46a-ee34-4987-8b74-6b4b1d0b1d44: For example, scientists are learning how age-related changes in the brain may...
INFO:llama_index.core.indices.knowledge_graph.retrievers:> Querying with idx: b3dd51cb-ffda-411d-b44a-0eeb54a01905: 89e\u53f6\u5931\u667a\u75c7\u548c\u8840\u7ba1\u6027\u5931\u667a\u75c7\u3002\u...
[1;3;34mKG context:
The following are knowledge sequence in max depth 2 in the form of directed graph like:
`subject -[predicate]->, object, <-[predica

<b>Alzheimer's disease is caused by genetic variants.</b>

[1;3;32mExtracted keywords: ['Alzheimer', 'relationship', "Alzheimer's disease", 'dementia', 'disease']
INFO:llama_index.core.indices.knowledge_graph.retrievers:> Querying with idx: d713e5bf-8fa6-47c6-a27c-30f59f6d0041: {"id_": "c5ccd49e-6386-4de5-96f4-bcf03df0619a", "embedding": null, "metadata"...
INFO:llama_index.core.indices.knowledge_graph.retrievers:> Querying with idx: b557e46a-ee34-4987-8b74-6b4b1d0b1d44: For example, scientists are learning how age-related changes in the brain may...
INFO:llama_index.core.indices.knowledge_graph.retrievers:> Querying with idx: b3dd51cb-ffda-411d-b44a-0eeb54a01905: 89e\u53f6\u5931\u667a\u75c7\u548c\u8840\u7ba1\u6027\u5931\u667a\u75c7\u3002\u...
[1;3;34mKG context:
The following are knowledge sequence in max depth 2 in the form of directed graph like:
`subject -[predicate]->, object, <-[predicate_next_hop]-, object_next_hop ...`
Alzheimer's disease{name: Alzheimer's disease} <-[relationship:{relationship: Is hallmark of}]- Beta-amyloid plaques

<b>Alzheimer's disease and dementia are related in that Alzheimer's disease is a type of dementia. Dementia is a general term for a decline in cognitive function that is severe enough to interfere with everyday activities. Alzheimer's disease is the most common type of dementia, accounting for 60-80% of cases.</b>

# Chat engine

In [38]:
from llama_index.core.memory import ChatMemoryBuffer

# Conversation memory capped at 5000 tokens.
memory = ChatMemoryBuffer.from_defaults(token_limit=5000)

# Chat engine in "context" mode on top of the knowledge graph index.
chat_engine = index.as_chat_engine(
    chat_mode="context",
    memory=memory,
    verbose=True,
)

# Send the messages one after another through the same engine (and therefore
# the same memory buffer), rendering each reply in bold.
for user_message in (
    "what is Alzheimer's disease?",
    "what causes Alzheimer's disease?",
    "what is the relationship between Alzheimer's disease and dementia?",
):
    response = chat_engine.chat(user_message)
    display(Markdown(f"<b>{response}</b>"))

[1;3;32mExtracted keywords: ['brain', 'Alzheimer', 'disorder', 'disease', 'treatment', 'aging', 'loss', 'memory loss', 'neurodegenerative', 'memory', 'brain disorder', 'risk factors', 'decline', "Alzheimer's disease", 'factors', 'cognitive decline', 'neurodegenerative disease', 'dementia', 'symptoms', 'risk', 'cognitive']
INFO:llama_index.core.indices.knowledge_graph.retrievers:> Querying with idx: d713e5bf-8fa6-47c6-a27c-30f59f6d0041: {"id_": "c5ccd49e-6386-4de5-96f4-bcf03df0619a", "embedding": null, "metadata"...
INFO:llama_index.core.indices.knowledge_graph.retrievers:> Querying with idx: b557e46a-ee34-4987-8b74-6b4b1d0b1d44: For example, scientists are learning how age-related changes in the brain may...
INFO:llama_index.core.indices.knowledge_graph.retrievers:> Querying with idx: b3dd51cb-ffda-411d-b44a-0eeb54a01905: 89e\u53f6\u5931\u667a\u75c7\u548c\u8840\u7ba1\u6027\u5931\u667a\u75c7\u3002\u...
[1;3;34mKG context:
The following are knowledge sequence in max depth 2 in the form

<b>Alzheimer's disease is a disease that affects the brain and causes memory loss, confusion, and changes in behavior. It is the most common type of dementia, which is a general term for a decline in cognitive function.</b>

[1;3;32mExtracted keywords: ['etiology', 'genetics', 'environment', 'lifestyle', 'age', 'brain changes', 'brain', 'risk factors', 'Alzheimer', "Alzheimer's disease", 'factors', 'family', 'history', 'causes', 'changes', 'risk', 'family history', 'disease']
INFO:llama_index.core.indices.knowledge_graph.retrievers:> Querying with idx: d713e5bf-8fa6-47c6-a27c-30f59f6d0041: {"id_": "c5ccd49e-6386-4de5-96f4-bcf03df0619a", "embedding": null, "metadata"...
INFO:llama_index.core.indices.knowledge_graph.retrievers:> Querying with idx: b557e46a-ee34-4987-8b74-6b4b1d0b1d44: For example, scientists are learning how age-related changes in the brain may...
INFO:llama_index.core.indices.knowledge_graph.retrievers:> Querying with idx: b3dd51cb-ffda-411d-b44a-0eeb54a01905: 89e\u53f6\u5931\u667a\u75c7\u548c\u8840\u7ba1\u6027\u5931\u667a\u75c7\u3002\u...
[1;3;34mKG context:
The following are knowledge sequence in max depth 2 in the form of directed graph like:
`subject -[predicate]->, object, <-[predica

<b>The exact cause of Alzheimer's disease is unknown, but it is thought to be caused by a combination of genetic, environmental, and lifestyle factors. Some of the risk factors for Alzheimer's disease include:

* Age: The risk of Alzheimer's disease increases with age.
* Family history: People who have a family history of Alzheimer's disease are more likely to develop the disease themselves.
* Genetics: Certain genes have been linked to an increased risk of Alzheimer's disease.
* Head injury: People who have suffered a head injury are more likely to develop Alzheimer's disease later in life.
* Heart disease: People who have heart disease are more likely to develop Alzheimer's disease.
* Diabetes: People who have diabetes are more likely to develop Alzheimer's disease.
* Obesity: People who are obese are more likely to develop Alzheimer's disease.
* Smoking: People who smoke are more likely to develop Alzheimer's disease.
* Alcohol abuse: People who abuse alcohol are more likely to develop Alzheimer's disease.

It is important to note that not everyone who has one or more of these risk factors will develop Alzheimer's disease. However, these factors can increase the risk of developing the disease.</b>

[1;3;32mExtracted keywords: ['Alzheimer', 'relationship', "Alzheimer's disease", 'dementia', 'disease']
INFO:llama_index.core.indices.knowledge_graph.retrievers:> Querying with idx: d713e5bf-8fa6-47c6-a27c-30f59f6d0041: {"id_": "c5ccd49e-6386-4de5-96f4-bcf03df0619a", "embedding": null, "metadata"...
INFO:llama_index.core.indices.knowledge_graph.retrievers:> Querying with idx: b557e46a-ee34-4987-8b74-6b4b1d0b1d44: For example, scientists are learning how age-related changes in the brain may...
INFO:llama_index.core.indices.knowledge_graph.retrievers:> Querying with idx: b3dd51cb-ffda-411d-b44a-0eeb54a01905: 89e\u53f6\u5931\u667a\u75c7\u548c\u8840\u7ba1\u6027\u5931\u667a\u75c7\u3002\u...
[1;3;34mKG context:
The following are knowledge sequence in max depth 2 in the form of directed graph like:
`subject -[predicate]->, object, <-[predicate_next_hop]-, object_next_hop ...`
Alzheimer's disease{name: Alzheimer's disease} <-[relationship:{relationship: Is hallmark of}]- Beta-amyloid plaques

<b>Alzheimer's disease is the most common type of dementia. Dementia is a general term for a decline in cognitive function that is severe enough to interfere with everyday activities. Alzheimer's disease accounts for 60-80% of all dementia cases.

The symptoms of Alzheimer's disease and dementia can be similar, including memory loss, confusion, and changes in behavior. However, there are some key differences between the two conditions.

* **Alzheimer's disease is a progressive disease**, meaning that the symptoms will gradually worsen over time. Dementia can be caused by a variety of conditions, some of which are reversible.
* **Alzheimer's disease is caused by changes in the brain**, including the accumulation of amyloid plaques and tau tangles. Dementia can be caused by a variety of factors, including Alzheimer's disease, stroke, Parkinson's disease, and Huntington's disease.

There is no cure for Alzheimer's disease or dementia, but there are treatments that can help to manage the symptoms.

**Relationship between Alzheimer's disease and dementia**

Alzheimer's disease is a type of dementia. Dementia is a general term for a decline in cognitive function that is severe enough to interfere with everyday activities. Alzheimer's disease is the most common type of dementia, accounting for 60-80% of all cases.

The symptoms of Alzheimer's disease and dementia can be similar, including memory loss, confusion, and changes in behavior. However, there are some key differences between the two conditions.

* **Alzheimer's disease is a progressive disease**, meaning that the symptoms will gradually worsen over time. Dementia can be caused by a variety of conditions, some of which are reversible.
* **Alzheimer's disease is caused by changes in the brain**, including the accumulation of amyloid plaques and tau tangles. Dementia can be caused by a variety of factors, including Alzheimer's disease, stroke, Parkinson's disease, and Huntington's disease.

There is no cure for Alzheimer's disease or dementia, but there are treatments that can help to manage the symptoms.</b>

# Knowledge graph query engine

In [15]:
from llama_index.core.query_engine import KnowledgeGraphQueryEngine
from llama_index.core.prompts.base import (
    PromptTemplate,
    PromptType,
)

# Prompt template for turning a natural-language question into a NebulaGraph
# Cypher-dialect query.
# Source: llama_index/legacy/query_engine/knowledge_graph_query_engine.py
#
# Placeholders: {schema} (graph schema text) and {query_str} (the question).
# This is a regular (non-raw) string, so each `\\'` below is rendered as `\'`
# in the final prompt text.
# NOTE(review): the few-shot example matches on the entity name
# "Glaucoma's causes"; presumably that is only there to demonstrate quote
# escaping -- confirm it does not bias the generated queries.
DEFAULT_NEBULAGRAPH_NL2CYPHER_PROMPT_TMPL = """
Generate NebulaGraph query from natural language.
Use only the provided relationship types and properties in the schema.
Do not use any other relationship types or properties that are not provided.
Schema:
---
{schema}
---
Note: NebulaGraph speaks a dialect of Cypher, comparing to standard Cypher:

1. it uses double equals sign for comparison: `==` rather than `=`
2. it needs explicit label specification when referring to node properties, i.e.
v is a variable of a node, and we know its label is Foo, v.`foo`.name is correct
while v.name is not.

For example, see this diff between standard and NebulaGraph Cypher dialect:
```diff
< MATCH (p:person)-[:directed]->(m:movie) WHERE m.name = 'The Godfather'
< RETURN p.name;
---
> MATCH (p:`person`)-[:directed]->(m:`movie`) WHERE m.`movie`.`name` == 'The Godfather'
> RETURN p.`person`.`name`;
```
Remember to escape any single quotes in all literal string in the NebulaGraph Cypher dialect query. For example, p.`entity`.`name` == 'Alzheimer's disease' should be p.`entity`.`name` == 'Alzheimer\\'s disease'.

Question: What causes Glaucoma?

NebulaGraph Cypher dialect query: MATCH (p:`entity`)-[:`relationship`]->(m:`entity`) WHERE p.`entity`.`name` == 'Glaucoma\\'s causes' RETURN m.`entity`.`name`;

Question: {query_str}

NebulaGraph Cypher dialect query:
"""
# Wrap the template text as a text-to-graph-query prompt.
DEFAULT_NEBULAGRAPH_NL2CYPHER_PROMPT = PromptTemplate(
    template=DEFAULT_NEBULAGRAPH_NL2CYPHER_PROMPT_TMPL,
    prompt_type=PromptType.TEXT_TO_GRAPH_QUERY,
)

# Query engine that uses the prompt above to synthesise graph queries.
# Note: this rebinds `query_engine`, replacing the engine created earlier.
query_engine = KnowledgeGraphQueryEngine(
    storage_context=storage_context,
    llm=query_llm,
    graph_query_synthesis_prompt=DEFAULT_NEBULAGRAPH_NL2CYPHER_PROMPT,
    verbose=True,
)

In [123]:
# Ask the current `query_engine` one question and render the answer in bold.
question = "Tell me about Alzheimer's disease"
response = query_engine.query(question)
display(Markdown(f"<b>{response}</b>"))

[1;3;33mGraph Store Query:
MATCH (p:`entity`)-[:`relationship`]->(m:`entity`) WHERE p.`entity`.`name` == 'Alzheimer\'s disease' RETURN m.`entity`.`name`;
[0m[1;3;33mGraph Store Response:
{'m.entity.name': ['Genetic variants', 'Doctors', 'Cured', 'Loss of connections between neurons in the brain', '80 genetic areas', 'Proteins', 'Right words', 'Genetic variants', 'Dr. alois alzheimer', 'Environmental factors', 'Lifestyle', 'Multiple genes', 'Earlier stages', 'Brain', 'Memory', 'Memory and cognitive abilities', 'Older adults', 'Dr. alois alzheimer', 'Drugs', 'Cognitive impairment', 'Genetic cause', 'Memory loss', 'No cure', 'Risk', 'Risk factor', 'Clinical trials', 'Dementia', 'Brain disorder', 'Complex', 'Disease', 'Early-stage', 'Most common cause of dementia', 'Most commonly diagnosed form of dementia', 'Neurodegenerative disease', 'Neurodegenerative disorder', 'Seventh leading cause of death in the united states', 'Third leading cause of death', 'Type of dementia', 'Very complex',

<b>Alzheimer's disease is a complex brain disorder that affects memory, thinking, and behavior. It is the most common cause of dementia, a general term for memory loss and other cognitive abilities serious enough to interfere with everyday activities. Alzheimer's disease is a progressive disease, meaning it gets worse over time. There is no cure for Alzheimer's disease, but treatments can help manage the symptoms.</b>

# Knowledge graph RAG query engine

In [16]:
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import KnowledgeGraphRAGRetriever

# Retriever over the knowledge graph; with_nl2graphquery=True also includes
# results from generated Cypher queries, synthesised with the custom prompt.
graph_rag_retriever = KnowledgeGraphRAGRetriever(
    storage_context=storage_context,
    graph_query_synthesis_prompt=DEFAULT_NEBULAGRAPH_NL2CYPHER_PROMPT,
    with_nl2graphquery=True,
    verbose=True,
)

# Wrap the retriever in a query engine that uses the "tree_summarize" response
# mode. Note: this rebinds `query_engine`, replacing the previous engine.
query_engine = RetrieverQueryEngine.from_args(
    graph_rag_retriever,
    verbose=True,
    response_mode="tree_summarize",
)

In [18]:
# Ask the current `query_engine` one question and render the answer in bold.
question = "Tell me about Alzheimer's disease"
response = query_engine.query(question)
display(Markdown(f"<b>{response}</b>"))

[1;3;33mGraph Store Query:
MATCH (p:`entity`)-[:`relationship`]->(m:`entity`) WHERE p.`entity`.`name` == 'Alzheimer\'s disease' RETURN m.`entity`.`name`;
[0m[1;3;33mGraph Store Response:
{'m.entity.name': ['Genetic variants', 'Doctors', 'Cured', 'Loss of connections between neurons in the brain', '80 genetic areas', 'Proteins', 'Right words', 'Genetic variants', 'Dr. alois alzheimer', 'Environmental factors', 'Lifestyle', 'Multiple genes', 'Earlier stages', 'Brain', 'Memory', 'Memory and cognitive abilities', 'Older adults', 'Dr. alois alzheimer', 'Drugs', 'Cognitive impairment', 'Genetic cause', 'Memory loss', 'No cure', 'Risk', 'Risk factor', 'Clinical trials', 'Dementia', 'Brain disorder', 'Complex', 'Disease', 'Early-stage', 'Most common cause of dementia', 'Most commonly diagnosed form of dementia', 'Neurodegenerative disease', 'Neurodegenerative disorder', 'Seventh leading cause of death in the united states', 'Third leading cause of death', 'Type of dementia', 'Very complex',

<b>Alzheimer's disease is a neurodegenerative disease that is the most common cause of dementia. It is a complex disease that affects memory, thinking, and behavior. Alzheimer's disease is caused by a combination of genetic, environmental, and lifestyle factors. There is no cure for Alzheimer's disease, but there are treatments that can help to manage the symptoms.</b>

In [19]:
# Show the metadata attached to the last response (in the output below: the
# original question, the generated graph query and the graph store's reply).
response.metadata

{'ce2b34a7-b92c-47c4-a1ca-1aa609edd5c3': {'query_str': "Tell me about Alzheimer's disease",
  'graph_store_query': "MATCH (p:`entity`)-[:`relationship`]->(m:`entity`) WHERE p.`entity`.`name` == 'Alzheimer\\'s disease' RETURN m.`entity`.`name`;",
  'graph_store_response': {'m.entity.name': ['Genetic variants',
    'Doctors',
    'Cured',
    'Loss of connections between neurons in the brain',
    '80 genetic areas',
    'Proteins',
    'Right words',
    'Genetic variants',
    'Dr. alois alzheimer',
    'Environmental factors',
    'Lifestyle',
    'Multiple genes',
    'Earlier stages',
    'Brain',
    'Memory',
    'Memory and cognitive abilities',
    'Older adults',
    'Dr. alois alzheimer',
    'Drugs',
    'Cognitive impairment',
    'Genetic cause',
    'Memory loss',
    'No cure',
    'Risk',
    'Risk factor',
    'Clinical trials',
    'Dementia',
    'Brain disorder',
    'Complex',
    'Disease',
    'Early-stage',
    'Most common cause of dementia',
    'Most commonly 