In [28]:
# nest_asyncio patches asyncio so an already-running event loop can be re-entered.
# Presumably needed because the notebook kernel runs its own loop while later
# cells await the workflow -- TODO confirm it is still required.
import nest_asyncio

nest_asyncio.apply()

In [29]:
# !pip install -U llama-index

In [30]:
from utils import load_env

# load_env is a project-local helper from utils (not shown in this notebook).
# Presumably it loads API keys / settings into the environment -- TODO confirm.
load_env()

# Designing the Workflow TBD

RAG + Reranking consists of some clearly defined steps

Indexing data, creating an index
Using that index + a query to retrieve relevant text chunks
Rerank the retrieved text chunks using the original query
Synthesizing a final response
With this in mind, we can create events and workflow steps to follow this process!

In [31]:
# TBD
from llama_index.core.workflow import Event
from llama_index.core.schema import NodeWithScore


class RetrieverEvent(Event):
    """Event carrying the result of running retrieval.

    NOTE(review): no step of the workflow defined in the visible part of this
    notebook emits or consumes this event -- confirm whether it is still needed.
    """

    # Nodes returned by the retrieval step, as NodeWithScore objects.
    nodes: list[NodeWithScore]


class RerankEvent(Event):
    """Event carrying the result of reranking previously retrieved nodes.

    NOTE(review): no step of the workflow defined in the visible part of this
    notebook emits or consumes this event -- confirm whether it is still needed.
    """

    # Nodes kept after reranking, as NodeWithScore objects.
    nodes: list[NodeWithScore]

# Workflow components

## local neo4j docker instance


For macOS or Linux, run:
```bash
docker run \
    -p 7474:7474 -p 7687:7687 \
    -v $PWD/data:/data -v $PWD/plugins:/plugins \
    --name neo4j-apoc \
    -e NEO4J_apoc_export_file_enabled=true \
    -e NEO4J_apoc_import_file_enabled=true \
    -e NEO4J_apoc_import_file_use__neo4j__config=true \
    -e NEO4J_PLUGINS=\[\"apoc\"\] \
    neo4j:latest
```

For Windows (from the Anaconda Prompt), run:

```bash
docker run ^
   -p 7474:7474 -p 7687:7687 ^
   -v "%CD%/data:/data" -v "%CD%/plugins:/plugins" ^
   --name neo4j-apoc ^
   -e NEO4J_apoc_export_file_enabled=true ^
   -e NEO4J_apoc_import_file_enabled=true ^
   -e NEO4J_apoc_import_file_use__neo4j__config=true ^
   -e NEO4J_PLUGINS="[\"apoc\"]" ^
   neo4j:latest
```
Open your instance at http://localhost:7474/browser/. The default username and password are both 'neo4j'.

You will be asked to change the password; set it to 'password' so that it matches the connection settings used below.


In [32]:
import os

from llama_index.graph_stores.neo4j import Neo4jPGStore

# Connection settings for the Neo4j instance.
# Values are read from the environment so real credentials never have to be
# committed with the notebook; the fallbacks match the local docker instance
# described above (user "neo4j", password changed to "password").
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "password")
url = os.environ.get("NEO4J_URI", "bolt://localhost:7687")

# connect to the graph store
graph_store = Neo4jPGStore(username=username, password=password, url=url)



## The Workflow Itself
With our events defined, we can construct our workflow and steps

Note that the workflow automatically validates itself using type annotations, so the type annotations on our steps are very helpful

In [33]:
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.response_synthesizers import CompactAndRefine
from llama_index.core.postprocessor.llm_rerank import LLMRerank
from llama_index.core.workflow import (
    Context,
    Workflow,
    StartEvent,
    StopEvent,
    step,
)

from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

from llama_index.core import SimpleDirectoryReader
from llama_index.readers.file import PDFReader


class GraphRAGWorkflow(Workflow):
    """Workflow with two entry points: ingest PDFs, then build a property graph index.

    Both steps listen for ``StartEvent`` and dispatch on the keyword passed to
    ``run()``:

    * ``run(dirpath=...)`` is handled by ``ingest_docs``.
    * ``run(max_triplets=...)`` is handled by ``index_docs``.

    A step that does not find its keyword returns ``None`` and produces no event.
    Ingested documents are stored in ``ctx.data["docs"]`` so that ``index_docs``
    can read them.
    NOTE(review): this assumes the Context is shared between runs of the same
    workflow instance -- confirm for the installed llama-index version.
    """

    @step(pass_context=True)
    async def ingest_docs(self, ctx: Context, ev: StartEvent) -> StopEvent | None:
        """Load the documents found under ``dirpath``.

        Args:
            ctx: Workflow context; the loaded documents are stored under the
                ``"docs"`` key of ``ctx.data``.
            ev: Start event; only handled when it carries a truthy ``dirpath``.

        Returns:
            ``StopEvent`` whose result is the list of loaded documents, or
            ``None`` when no ``dirpath`` was supplied.
        """
        dirpath = ev.get("dirpath")
        if not dirpath:
            return None

        # Route .pdf files through PDFReader; other extensions fall back to
        # whatever SimpleDirectoryReader does by default.
        docs = SimpleDirectoryReader(
            dirpath, file_extractor={".pdf": PDFReader()}
        ).load_data()

        # Keep the documents in the context: index_docs reads this key, and
        # without this write it would always see an empty list.
        ctx.data["docs"] = docs

        print(f"ingested {len(docs)} docs")
        return StopEvent(result=docs)

    @step(pass_context=True)
    async def index_docs(self, ctx: Context, ev: StartEvent) -> StopEvent | None:
        """Build a property graph index from the previously ingested documents.

        Args:
            ctx: Workflow context; documents are read from ``ctx.data["docs"]``.
            ev: Start event; only handled when it carries a truthy
                ``max_triplets`` (passed to the extractor as
                ``max_triplets_per_chunk``).

        Returns:
            ``StopEvent`` whose result is the ``PropertyGraphIndex``, or ``None``
            when no ``max_triplets`` was supplied.

        Raises:
            ValueError: if no documents have been stored in the context yet.
        """
        max_triplets = ev.get("max_triplets")
        if not max_triplets:
            return None

        # Imported locally so this step does not depend on a later notebook
        # cell having been executed before the workflow is run.
        from llama_index.core import PropertyGraphIndex
        from llama_index.core.indices.property_graph import DynamicLLMPathExtractor

        # These are the objects returned by SimpleDirectoryReader.load_data().
        docs = ctx.data.get("docs", [])
        if not docs:
            # Fail loudly instead of silently building an empty index.
            raise ValueError(
                "No documents found in the workflow context; "
                "run the workflow with `dirpath` before indexing."
            )

        # NOTE(review): `llm` and `embed_model` are read from the notebook's
        # global namespace. `llm` is only assigned in a later cell and
        # `embed_model` is not defined in the visible part of the notebook --
        # confirm both exist before this step runs.
        kg_extractor = DynamicLLMPathExtractor(
            llm=llm,
            max_triplets_per_chunk=max_triplets,
            num_workers=4,
            allowed_entity_types=None,
            allowed_relation_types=None,
            allowed_relation_props=None,
            allowed_entity_props=None,
        )

        # `graph_store` is the Neo4j store created in an earlier cell.
        dynamic_index = PropertyGraphIndex.from_documents(
            docs,
            llm=llm,
            embed_model=embed_model,
            property_graph_store=graph_store,
            kg_extractors=[kg_extractor],
            show_progress=True,
        )

        return StopEvent(result=dynamic_index)

        

# Run the workflow

In [34]:
# Ingest the documents

# Passing `dirpath` makes ingest_docs handle the StartEvent; index_docs returns
# None here because no `max_triplets` is given. The result of the run is the
# list of loaded documents.
w = GraphRAGWorkflow(timeout=120, verbose=True)
docs = await w.run(dirpath="data/kg_rag_workflow/")

Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 34 0 (offset 0)
Ignoring wrong pointing object 51 0 (offset 0)
Ignoring wrong pointing object 67 0 (offset 0)
Ignoring wrong pointing object 86 0 (offset 0)
Ignoring wrong pointing object 108 0 (offset 0)
Ignoring wrong pointing object 127 0 (offset 0)
Ignoring wrong pointing object 140 0 (offset 0)
Ignoring wrong pointing object 180 0 (offset 0)


Running step index_docs
Step index_docs produced no event
Running step ingest_docs
ingested 46 docs
Step ingest_docs produced event StopEvent


In [35]:
# Preview: report the element type, then print the first two documents,
# each preceded by a 50-character dashed rule.
print(f"docs is type {type(docs[0])}")
divider = "-" * 50
for doc in docs[:2]:
    print(divider, doc, sep="\n")

docs is type <class 'llama_index.core.schema.Document'>
--------------------------------------------------
Doc ID: 307571d9-99ad-482a-a9dd-0f5236459522
Text: 139 Archivio Italiano di Urologia e Andrologia 2019; 91, 3REVIEW
Nutraceutical treatment and prevention  of benign prostatic
hyperplasia and prostate cancer  Arrigo F.G. Cicero1, Olta
Allkanjari2, Gian Maria Busetto3, Tommaso Cai4, Gaetano Larganà5,
Vittorio Magri6, Gianpaolo Perletti7, Francesco Saverio Robustelli
Della Cuna8, Giorgio Ivan Russ...
--------------------------------------------------
Doc ID: 8a43c4a8-11a3-4aa2-b94f-384a8e398dfa
Text: Archivio Italiano di Urologia e Andrologia 2019; 91, 3A.F.G.
Cicero, O. Allkanjari, G.M. Busetto, et al.
140INTRODUCTIONTONUTRACEUTICALPRESCRIPTION (Arrigo F.G. Cicero) During
the last years, the pharmaceutical innovation in primary care are
dramatically less frequent and will be even more rare in the next
future. In this context, pre- clinical...


In [36]:
# dynamic_index = w.run(max_triplets=20, timeout=120, verbose=True)

In [37]:
# Use the first ingested document as the sample for the cells that follow.
# (The former bare `len(docs)` / `type(docs[0])` expressions were dropped: only
# the last expression of a cell is displayed, so they had no visible effect.)
document = docs[0]

In [38]:
# Display the raw text of the sample document.
document.text

"139 Archivio Italiano di Urologia e Andrologia 2019; 91, 3REVIEW\nNutraceutical treatment and prevention \nof benign prostatic hyperplasia and prostate cancer \nArrigo F.G. Cicero1, Olta Allkanjari2, Gian Maria Busetto3, Tommaso Cai4, Gaetano Larganà5,\nVittorio Magri6, Gianpaolo Perletti7, Francesco Saverio Robustelli Della Cuna8, Giorgio Ivan Russo5,\nKostantinos Stamatiou9, Alberto Trinchieri10, Annabella Vitalone2\n1 Dip. di Scienze Mediche e Chirurgiche, Alma Mater Studiorum Università di Bologna, Bologna, Italy;\n2 Dipartimento di Farmacologia e Fisiologia “V. Erspamer”, Sapienza, Università di Roma, Roma, Italy;\n3 Department of Urology, Sapienza Università di Roma, Policlinico Umberto I, Roma, Italy;\n4 Department of Urology, Santa Chiara Regional Hospital, Trento, Italy;\n5 Urology Department, University of Catania, Catania, Italy;\n6 Ambulatorio Territoriale di Urologia ed Ecograﬁa Urologica, ASST Nord Milano, Milano, Italy;\n7 Dipartimento di Biotecnologie e Scienze della V

In [42]:
from llama_index.core import Document, PropertyGraphIndex
from llama_index.core.indices.property_graph import (
    SimpleLLMPathExtractor,
    SchemaLLMPathExtractor,
    DynamicLLMPathExtractor,
)
from llama_index.core import Settings

from utils import load_env

load_env()
# Set up global configurations
llm = OpenAI(temperature=0.0, model="gpt-3.5-turbo")

# Register the LLM and chunking parameters on the global Settings object.
Settings.llm = llm
Settings.chunk_size = 2048
Settings.chunk_overlap = 20


# Module-level extractor for this experiment, capped at 20 triplets per chunk.
kg_extractor = DynamicLLMPathExtractor(
    llm=llm,
    max_triplets_per_chunk=20,
    num_workers=4,
    # Let the LLM infer entities and their labels (types) on the fly
    allowed_entity_types=None,
    # Let the LLM infer relationships on the fly
    allowed_relation_types=None,
    # LLM will generate any relation properties, set `None` to skip property generation (will be faster without)
    allowed_relation_props=[],
    # LLM will generate any entity properties, set `None` to skip property generation (will be faster without)
    allowed_entity_props=[],
)

# Build the index from the single sample document only.
# No `property_graph_store` is passed here, so this index does not use the
# Neo4j `graph_store` created earlier -- presumably a default in-memory store
# is used instead; TODO confirm.
dynamic_index = PropertyGraphIndex.from_documents(
    [document],
    llm=llm,
    embed_kg_nodes=False,
    kg_extractors=[kg_extractor],
    show_progress=True,
)

# Export the extracted graph to an HTML file in the working directory.
dynamic_index.property_graph_store.save_networkx_graph(
    name="./DynamicGraph_dev.html"
)

# NOTE(review): these entity names look left over from another example. The
# sample document is a urology review and the recorded output of this cell is
# an empty list -- replace them with entities that occur in the document.
dynamic_index.property_graph_store.get_triplets(
    entity_names=["Barack Obama", "Obama"]
)[:5]

  from .autonotebook import tqdm as notebook_tqdm
Parsing nodes: 100%|██████████| 1/1 [00:00<00:00, 551.59it/s]
Extracting and inferring knowledge graph from text: 100%|██████████| 1/1 [00:18<00:00, 18.38s/it]


[]