In [16]:
import nest_asyncio

nest_asyncio.apply()

In [17]:
# !pip install -U llama-index

In [18]:
from utils import load_env

load_env()

# Designing the Workflow TBD

RAG + Reranking consists of some clearly defined steps

Indexing data, creating an index
Using that index + a query to retrieve relevant text chunks
Rerank the text retrieved text chunks using the original query
Synthesizing a final response
With this in mind, we can create events and workflow steps to follow this process!

In [19]:
# TBD
from llama_index.core.workflow import Event
from llama_index.core.schema import NodeWithScore


class RetrieverEvent(Event):
    """Result of running retrieval"""

    nodes: list[NodeWithScore]


class RerankEvent(Event):
    """Result of running reranking on retrieved nodes"""

    nodes: list[NodeWithScore]

# Workflow components

## local neo4j docker instance


for mac or linux, use below
```bash
docker run \
    -p 7474:7474 -p 7687:7687 \
    -v $PWD/data:/data -v $PWD/plugins:/plugins \
    --name neo4j-apoc \
    -e NEO4J_apoc_export_file_enabled=true \
    -e NEO4J_apoc_import_file_enabled=true \
    -e NEO4J_apoc_import_file_use__neo4j__config=true \
    -e NEO4J_PLUGINS=\[\"apoc\"\] \
    neo4j:latest
```

for windows on anaconda prompt, run below

```bash
docker run ^
   -p 7474:7474 -p 7687:7687 ^
   -v "%CD%/data:/data" -v "%CD%/plugins:/plugins" ^
   --name neo4j-apoc ^
   -e NEO4J_apoc_export_file_enabled=true ^
   -e NEO4J_apoc_import_file_enabled=true ^
   -e NEO4J_apoc_import_file_use__neo4j__config=true ^
   -e NEO4J_PLUGINS="[\"apoc\"]" ^
   neo4j:latest
```
Go see your instance at http://localhost:7474/browser/. Default login and password is 'neo4j'.

You will be asked to change the password, change it to 'password'.


In [20]:
from llama_index.graph_stores.neo4j import Neo4jPGStore

username = "neo4j"
password = "password"
url = "bolt://localhost:7687"

# connect to the graph store
graph_store = Neo4jPGStore(username=username, password=password, url=url)



In [21]:
from llama_index.core.workflow import Event
from llama_index.core.schema import NodeWithScore


class RetrieverEvent(Event):
    """Result of running retrieval"""

    nodes: list[NodeWithScore]


class RerankEvent(Event):
    """Result of running reranking on retrieved nodes"""

    nodes: list[NodeWithScore]

## The Workflow Itself
With our events defined, we can construct our workflow and steps

Note that the workflow automatically validates itself using type annotations, so the type annotations on our steps are very helpful

In [22]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core.response_synthesizers import CompactAndRefine
from llama_index.core.postprocessor.llm_rerank import LLMRerank
from llama_index.core.workflow import (
    Context,
    Workflow,
    StartEvent,
    StopEvent,
    step,
)

from llama_index.core import Document, PropertyGraphIndex
from llama_index.core.indices.property_graph import DynamicLLMPathExtractor
from llama_index.core import Settings

from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding



class GraphRAGWorkflow(Workflow):
    @step(pass_context=True)
    async def ingest_docs(self, ctx: Context, ev: StartEvent) -> StopEvent | None:
        """Ingest the documents"""
        dirpath = ev.get("dirpath")
        if not dirpath:
            return None
        
        docs = SimpleDirectoryReader(dirpath).load_data()

        # store the docs in the global context
        ctx.data["docs"] = docs

        print(f"ingested {len(docs)} docs")
        return StopEvent(result=docs)
        # return StopEvent(result=f"ingested {len(docs)} docs")


    @step(pass_context=True)
    async def index_docs(self, ctx: Context, ev: StartEvent) -> StopEvent | None:
        """Index the documents dynamically in the graph store"""
        max_triplets = ev.get("max_triplets")
        if not max_triplets:
            return None

        allowed_entity_types = ev.get("allowed_entity_types", None)
        allowed_relation_types = ev.get("allowed_relation_types", None)
        allowed_relation_props = ev.get("allowed_relation_props", [])
        allowed_entity_props = ev.get("allowed_entity_props", [])

        llm = OpenAI(temperature=0.0, model="gpt-3.5-turbo")
        embed_model = OpenAIEmbedding(model="text-embedding-3-small")

        Settings.llm = llm
        Settings.chunk_size = 2048
        Settings.chunk_overlap = 20
        
        docs: List[Document] = ctx.data.get("docs", [])
        if docs is None:
            print("No documents to index, ingest some documents first.")
            return None

        kg_extractor = DynamicLLMPathExtractor(
            llm=llm,
            max_triplets_per_chunk=max_triplets,
            num_workers=4,
            allowed_entity_types=allowed_entity_types,
            allowed_relation_types=allowed_relation_types,
            allowed_relation_props=allowed_relation_props,
            allowed_entity_props=allowed_entity_props,
        )

        index = PropertyGraphIndex.from_documents(
            docs,
            llm=llm,
            embed_model=embed_model,
            property_graph_store=graph_store,
            kg_extractors=[kg_extractor],
            show_progress=True,
        )

        ctx.data["index"] = index

        return StopEvent(result=index)


    @step(pass_context=True)
    async def retrieve(self, ctx: Context, ev: StartEvent) -> RetrieverEvent | None:
        "Entry point for RAG, triggered by a StartEvent with `query`."
        query = ev.get("query")
        if not query:
            return None

        print(f"Query the graph database with: {query}")

        # store the query in the global context
        ctx.data["query"] = query

        # get the index from the global context
        index = ctx.data.get("index")
        if index is None:
            print("Index is empty, load some documents before querying!")
            return None

        retriever = index.as_retriever(similarity_top_k=10)
        nodes = retriever.retrieve(query)
        print(f"Retrieved {len(nodes)} nodes.")
        return RetrieverEvent(nodes=nodes)


    @step(pass_context=True)
    async def rerank(self, ctx: Context, ev: RetrieverEvent) -> RerankEvent:
            """Rerank the nodes based on the query"""
            ranker = LLMRerank(
                choice_batch_size=5, top_n=3, llm=OpenAI(model="gpt-4o-mini")
            )
            print(ctx.data.get("query"), flush=True)
            new_nodes = ranker.postprocess_nodes(
                ev.nodes, query_str=ctx.data.get("query")
            )
            print(f"Reranked nodes to {len(new_nodes)}")
            return RerankEvent(nodes=new_nodes)


    @step(pass_context=True)
    async def synthesize(self, ctx: Context, ev: RerankEvent) -> StopEvent:
        """Return a streaming response using reranked nodes."""
        llm = OpenAI(model="gpt-4o-mini")
        summarizer = CompactAndRefine(llm=llm, streaming=True, verbose=True)
        query = ctx.data.get("query")

        response = await summarizer.asynthesize(query, nodes=ev.nodes)
        return StopEvent(result=response)

# Run the workflow

In [28]:
# Ingest the documents
w = GraphRAGWorkflow(timeout=600, verbose=True)
docs = await w.run(dirpath="data/kg_rag_workflow/")

Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 34 0 (offset 0)
Ignoring wrong pointing object 51 0 (offset 0)
Ignoring wrong pointing object 67 0 (offset 0)
Ignoring wrong pointing object 86 0 (offset 0)
Ignoring wrong pointing object 108 0 (offset 0)
Ignoring wrong pointing object 127 0 (offset 0)
Ignoring wrong pointing object 140 0 (offset 0)
Ignoring wrong pointing object 180 0 (offset 0)


Running step index_docs
Step index_docs produced no event
Running step ingest_docs
ingested 46 docs
Step ingest_docs produced event StopEvent
Running step retrieve
Step retrieve produced no event


In [29]:
# Display the first 2 documents
print(f"docs is type {type(docs[0])}")
for doc in docs[:2]:
    print("-" * 50)
    print(doc)

docs is type <class 'llama_index.core.schema.Document'>
--------------------------------------------------
Doc ID: 753e7dff-e7c1-43c6-895b-b71d062d4a35
Text: 139 Archivio Italiano di Urologia e Andrologia 2019; 91, 3REVIEW
Nutraceutical treatment and prevention  of benign prostatic
hyperplasia and prostate cancer  Arrigo F.G. Cicero1, Olta
Allkanjari2, Gian Maria Busetto3, Tommaso Cai4, Gaetano Larganà5,
Vittorio Magri6, Gianpaolo Perletti7, Francesco Saverio Robustelli
Della Cuna8, Giorgio Ivan Russ...
--------------------------------------------------
Doc ID: 3399ef2a-961b-4ddc-aeac-bd7d10ae9e21
Text: Archivio Italiano di Urologia e Andrologia 2019; 91, 3A.F.G.
Cicero, O. Allkanjari, G.M. Busetto, et al.
140INTRODUCTIONTONUTRACEUTICALPRESCRIPTION (Arrigo F.G. Cicero) During
the last years, the pharmaceutical innovation in primary care are
dramatically less frequent and will be even more rare in the next
future. In this context, pre- clinical...


In [30]:
allowed_entity_types=["MEDICINAL_PLANT", "COMPOUND", "SYMPTOM", "TREATMENT", "DISEASE", "STUDY_TYPE"]
allowed_relation_types=["TREATS", "CONTAINS", "ALLEVIATES", "STUDIED_IN", "SIDE_EFFECT_OF", "INTERACTS_WITH"]
allowed_relation_props=["efficacy", "dosage"]
allowed_entity_props=["scientific_name", "common_name", "description"]
    
index = await w.run(
    max_triplets=20, 
    allowed_entity_types=allowed_entity_types, 
    allowed_relation_types=allowed_relation_types, 
    allowed_relation_props=allowed_relation_props, 
    allowed_entity_props=allowed_entity_props, 
    )

Running step index_docs


Parsing nodes: 100%|██████████| 46/46 [00:00<00:00, 700.58it/s]


Running step ingest_docs
Step ingest_docs produced no event
Running step retrieve
Step retrieve produced no event


Extracting and inferring knowledge graph from text:  95%|█████████▍| 55/58 [03:15<00:10,  3.37s/it]

In [None]:
index.property_graph_store.get_triplets(
    entity_names=["saw palmetto", "benign prostatic hyperplasia"]
)[:5]

[[EntityNode(label='COMPOUND', embedding=None, properties={'creation_date': '2024-08-03', 'id': 'benign prostatic hyperplasia', 'last_modified_date': '2024-08-03', 'file_size': 575459, 'file_path': '/Users/michaelkoch/github/posts/data/kg_rag_workflow/bph-review-2021.pdf', 'description': 'MTOPS Research Group', 'file_name': 'bph-review-2021.pdf', 'page_label': '32', 'triplet_source_id': '9707b4f1-e9b2-4f7c-854e-3018d5cb8488', 'file_type': 'application/pdf'}, name='doxazosin'),
  Relation(label='TREATS', source_id='doxazosin', target_id='benign prostatic hyperplasia', properties={}),
  EntityNode(label='DISEASE', embedding=None, properties={'file_size': 575459, 'file_path': '/Users/michaelkoch/github/posts/data/kg_rag_workflow/bph-review-2021.pdf', 'file_name': 'bph-review-2021.pdf', 'file_type': 'application/pdf', 'creation_date': '2024-08-03', 'id': 'doxazosin', 'Pharmacological Effect': 'α1-adrenoceptor antagonist', 'last_modified_date': '2024-08-03', 'description': 'benign prostatic

In [None]:
# run a query 
query = "What are the side effects of treating benign prostatic hyperplasia with saw palmetto?"
result = await w.run(query=query)
async for chunk in result.async_response_gen():
    print(chunk, end="", flush=True)

Running step index_docs
Step index_docs produced no event
Running step ingest_docs
Step ingest_docs produced no event
Running step retrieve
Query the graph database with: What are the side effects of treating benign prostatic hyperplasia with saw palmetto?
Retrieved 9 nodes.
Step retrieve produced event RetrieverEvent
Running step rerank
What are the side effects of treating benign prostatic hyperplasia with saw palmetto?
Reranked nodes to 3
Step rerank produced event RerankEvent
Running step synthesize
Step synthesize produced event StopEvent
The provided information does not specify the side effects of treating benign prostatic hyperplasia with saw palmetto. For detailed information on potential side effects, it would be advisable to consult medical literature or a healthcare professional.

In [None]:
# # Set up global configurations
# llm = OpenAI(temperature=0.0, model="gpt-3.5-turbo")
# embed_model = OpenAIEmbedding(model="text-embedding-3-small")

# Settings.llm = llm
# Settings.chunk_size = 2048
# Settings.chunk_overlap = 20

# allowed_entity_types = ["MEDICINAL_PLANT", "COMPOUND", "SYMPTOM", "TREATMENT", "DISEASE", "STUDY_TYPE"]
# allowed_relation_types= ["TREATS", "CONTAINS", "ALLEVIATES", "STUDIED_IN", "SIDE_EFFECT_OF", "INTERACTS_WITH"]
# allowed_relation_props= ["efficacy", "dosage"]
# allowed_entity_props= ["scientific_name", "common_name", "description"]

# kg_extractor = DynamicLLMPathExtractor(
#     llm=llm,
#     max_triplets_per_chunk=20,
#     num_workers=4,
#     allowed_entity_types=allowed_entity_types,
#     allowed_relation_types=allowed_relation_types,
#     allowed_relation_props=allowed_relation_props,
#     allowed_entity_props=allowed_entity_props,
# )

# index = PropertyGraphIndex.from_documents(
#     [document],
#     llm=llm,
#     embed_model=embed_model,
#     property_graph_store=graph_store,
#     kg_extractors=[kg_extractor],
#     show_progress=True,
# )

In [None]:
index.as_chat_engine().chat(query)

AgentChatResponse(response='The side effects of treating benign prostatic hyperplasia with saw palmetto may include gastrointestinal symptoms such as stomach discomfort, nausea, vomiting, and diarrhea. There are also rare reports of headaches, dizziness, and mild insomnia associated with the use of saw palmetto for BPH treatment.', sources=[ToolOutput(content='The side effects of treating benign prostatic hyperplasia with saw palmetto include potential gastrointestinal symptoms such as stomach discomfort, nausea, vomiting, and diarrhea. Additionally, there may be rare reports of headaches, dizziness, and mild insomnia associated with the use of saw palmetto for BPH treatment.', tool_name='query_engine_tool', raw_input={'input': 'Side effects of treating benign prostatic hyperplasia with saw palmetto'}, raw_output=Response(response='The side effects of treating benign prostatic hyperplasia with saw palmetto include potential gastrointestinal symptoms such as stomach discomfort, nausea, 

In [None]:
retriever = dynamic_index.as_retriever(similarity_top_k=5)
nodes = retriever.retrieve(query)
print(f"Retrieved {len(nodes)} nodes.")



Retrieved 1 nodes.


In [None]:
retriever = index.as_retriever(similarity_top_k=2)
retriever.retrieve(query)

[NodeWithScore(node=TextNode(id_='265cbcca-c3c4-4e82-a444-7a40397b1d3d', embedding=None, metadata={'page_label': '1', 'file_name': 'bph-review-2019.pdf', 'file_path': '/Users/michaelkoch/github/posts/data/kg_rag_workflow/bph-review-2019.pdf', 'file_type': 'application/pdf', 'file_size': 1558509, 'creation_date': '2024-08-03', 'last_modified_date': '2024-08-02'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='70989c0c-b301-4992-b54c-2ee47e87daab', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '1', 'file_name': 'bph-review-2019.pdf', 'file_path': '/Users/michaelkoch/github/posts/data/kg_rag_workflow/bph-review-2019.pdf', 'file_type': 'application/pdf', 'file_size': 1558509, 'creation_date': '