In [1]:
import nest_asyncio

# Patch asyncio to permit nested event loops: the notebook kernel already runs
# a loop, and synchronous wrappers around async library calls would otherwise
# fail when invoked from inside it.
nest_asyncio.apply()

In [2]:
# !pip install -U llama-index

In [3]:
from utils import load_env

# NOTE(review): load_env is a project-local helper that is not shown here;
# presumably it loads API credentials (e.g. for OpenAI / LlamaParse) into the
# environment — confirm against utils.py.
load_env()

# Designing the Workflow TBD

RAG + Reranking consists of some clearly defined steps:

1. Indexing data, creating an index
2. Using that index + a query to retrieve relevant text chunks
3. Reranking the retrieved text chunks using the original query
4. Synthesizing a final response

With this in mind, we can create events and workflow steps to follow this process!

In [4]:
# Events for the retrieval and reranking stages (TBD: not yet used by the workflow below)
from llama_index.core.workflow import Event
from llama_index.core.schema import NodeWithScore


class RetrieverEvent(Event):
    """Result of running retrieval.

    Workflow event whose payload is the list of nodes produced by the
    retrieval stage.
    """

    # Nodes returned by retrieval, each wrapped as a NodeWithScore.
    nodes: list[NodeWithScore]


class RerankEvent(Event):
    """Result of running reranking on retrieved nodes.

    Workflow event whose payload is the list of nodes after the reranking
    stage has processed them.
    """

    # Nodes after reranking, each wrapped as a NodeWithScore.
    nodes: list[NodeWithScore]

### The Workflow Itself

With our events defined, we can construct our workflow and steps.

Note that the workflow automatically validates itself using type annotations, so the type annotations on our steps are very helpful.

In [5]:
from llama_index.core.schema import TextNode
from typing import List
import json

# helper functions
def get_text_nodes(json_list: List[dict]) -> List["TextNode"]:
    """Convert parsed page dictionaries into ``TextNode`` objects.

    Args:
        json_list: Page dictionaries. Each one must provide an ``"md"`` entry
            (used as the node text) and a ``"page"`` entry (stored in the
            node metadata under the key ``"page"``).

    Returns:
        One ``TextNode`` per input page, in input order. An empty input
        yields an empty list.

    Raises:
        KeyError: If a page dictionary lacks the ``"md"`` or ``"page"`` key.
    """
    return [
        TextNode(text=page["md"], metadata={"page": page["page"]})
        for page in json_list
    ]


def save_jsonl(data_list, filename):
    """Save a list of dictionaries as JSON Lines.

    Args:
        data_list: Iterable of JSON-serializable items; each becomes one line.
        filename: Path of the file to write. Any existing file is overwritten.

    Raises:
        TypeError: If an item is not JSON-serializable. Items written before
            the failing one remain in the file as complete lines.
        OSError: If the file cannot be opened for writing.
    """
    # Explicit encoding so the result does not depend on the platform locale.
    with open(filename, "w", encoding="utf-8") as file:
        for item in data_list:
            # Serialize the whole item before writing: a non-serializable item
            # then raises without leaving a truncated line in the file.
            file.write(json.dumps(item) + "\n")


def load_jsonl(filename):
    """Load a list of dictionaries from JSON Lines.

    Args:
        filename: Path of a UTF-8 text file holding one JSON document per line.

    Returns:
        The decoded documents, in file order. Blank or whitespace-only lines
        are skipped, so an empty file yields an empty list.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the file cannot be opened for reading.
    """
    data_list = []
    # Explicit encoding so decoding does not depend on the platform locale.
    with open(filename, "r", encoding="utf-8") as file:
        for line in file:
            # json.loads rejects empty input, so tolerate blank lines such as
            # a trailing newline at the end of the file.
            if not line.strip():
                continue
            data_list.append(json.loads(line))
    return data_list

In [6]:
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.response_synthesizers import CompactAndRefine
from llama_index.core.postprocessor.llm_rerank import LLMRerank
from llama_index.core.workflow import (
    Context,
    Workflow,
    StartEvent,
    StopEvent,
    step,
)

from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

from llama_parse import LlamaParse


class GraphRAGWorkflow(Workflow):
    """Workflow that parses a document with LlamaParse and lets you inspect it.

    Both steps accept the ``StartEvent``. Each step looks for its own keyword
    argument and returns ``None`` (producing no event) when it is absent, so
    the argument passed to ``run()`` selects which step does the work:

    * ``run(filepath=...)`` is handled by ``ingest``
    * ``run(num_docs=...)`` is handled by ``display_docs``

    Parsed nodes are kept in ``ctx.data["docs"]``.
    """

    @step(pass_context=True)
    async def ingest(self, ctx: Context, ev: StartEvent) -> StopEvent | None:
        """Ingest the documents using LlamaParse.

        Reads ``filepath`` from the start event, parses that file to markdown
        pages, converts each page to a ``TextNode`` and stores the list in
        ``ctx.data["docs"]``.

        Returns:
            ``None`` when the event carries no ``filepath``; otherwise a
            ``StopEvent`` whose result is a status message (the number of
            ingested docs, or a notice that parsing returned nothing).
        """
        filepath = ev.get("filepath")
        if not filepath:
            return None

        parser = LlamaParse(
            result_type="markdown",
            use_vendor_multimodal_model=True,
            vendor_multimodal_model_name="openai-gpt-4o-mini",
            invalidate_cache=True,
        )
        # Await the async API rather than the blocking get_json_result()
        # wrapper, which would stall the event loop this step runs on.
        json_objs = await parser.aget_json(filepath)
        if not json_objs:
            # An empty result would otherwise surface as an opaque IndexError.
            return StopEvent(result=f"No parse result returned for {filepath}.")

        # Only the first result's pages are used.
        json_list = json_objs[0]["pages"]
        docs = get_text_nodes(json_list)
        ctx.data["docs"] = docs

        return StopEvent(result=f"ingested {len(docs)} docs.")

    @step(pass_context=True)
    async def display_docs(self, ctx: Context, ev: StartEvent) -> StopEvent | None:
        """Display parsed documents.

        Reads ``num_docs`` from the start event and returns the content of up
        to that many nodes from ``ctx.data["docs"]``.

        Returns:
            ``None`` when the event carries no (or a falsy) ``num_docs``.
            Otherwise a ``StopEvent`` whose result is either a message string
            (nothing has been ingested) or a list of ``{"content": str}``
            dictionaries, one per displayed node.
        """
        num_docs = ev.get("num_docs")
        if not num_docs:
            return None

        docs: List[TextNode] = ctx.data.get("docs", [])
        if not docs:
            return StopEvent(result="No documents found. Please ingest documents first.")

        # max() makes a negative num_docs yield an empty list instead of
        # slicing from the end of the list.
        displayed_docs = [
            {"content": doc.get_content(metadata_mode="all")}
            for doc in docs[: max(num_docs, 0)]
        ]

        return StopEvent(result=displayed_docs)

        

# Run the workflow

In [7]:
# Ingest the documents

# Keep the instance in `w`: the display cell further down calls run() on it again.
w = GraphRAGWorkflow(timeout=120, verbose=True)
# Passing `filepath` is what makes the `ingest` step act on the StartEvent.
# The awaited result is the cell's last expression, so it is shown as output.
await w.run(filepath="data/bph-review-2019.pdf")

Running step display_docs
Step display_docs produced no event
Running step ingest
Started parsing the file under job_id 701d6299-d2c6-4e40-9aae-4178a0d1ed3b
Step ingest produced event StopEvent


'ingested 14 docs.'

In [10]:
# Display the first 2 documents
display_result = await w.run(num_docs=2)
for doc in display_result:
    print(doc['content'])
    print("-" * 50)

Running step display_docs
Step display_docs produced event StopEvent
Running step ingest
Step ingest produced no event
page: 1

# Nutraceutical treatment and prevention of benign prostatic hyperplasia and prostate cancer

## Arrigo F.G. Cicero¹, Olla Allkanjari², Gian Maria Busetto³, Tommaso Cai⁴, Gaetano Largana⁵, Vittorio Magri⁶, Gianpaolo Fertiletti⁷, Francesco Saverio Robustelli Della Cuna⁸, Giorgio Ivan Russo⁵, Konstantinos Stamataki⁹, Alberto Trinchieri¹⁰, Annabella Vitalone²

### Summary
During the last years, pharmaceutical innovations in primary care are dramatically less frequent and will be even more rare in the next future. In this context, preclinical and clinical research oriented their interests toward natural compounds efficacy and safety, supporting the development of a new "nutraceutical" science. Medicinal plants, in the form of powders or extracts of them, are continuously used for the treatment of prostate diseases such as benign hyperplasia, prostatitis and chroni