In [6]:
from llama_index.core.workflow import Event
from llama_index.core.schema import NodeWithScore

In [7]:
class RetrieverEvent(Event):
    """Workflow event that carries the nodes produced by retrieval."""

    # Scored nodes handed on to the next workflow step.
    nodes: list[NodeWithScore]

class RerankEvent(Event):
    """Workflow event that carries the nodes kept after post-processing."""

    # Scored nodes that the answer is synthesized from.
    nodes: list[NodeWithScore]

In [8]:
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.response_synthesizers import  CompactAndRefine
from llama_index.core.postprocessor.llm_rerank import LLMRerank
from llama_index.core.workflow import (
    Context,
    Workflow,
    StartEvent,
    StopEvent,
    step
)

In [None]:
# Optional: install the Ollama LLM integration package into the kernel's environment.
# %pip install llama-index-llms-ollama


In [12]:
from llama_index.legacy.llms.ollama import Ollama


In [13]:
# Alternative to the legacy import above (needs the llama-index-llms-ollama package):
# from llama_index.llms.ollama import Ollama

In [None]:
#from llama_index.legacy.readers.file.base import SimpleDirectoryReader


In [14]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Embedding model shared by the whole notebook. The model id is pinned
# explicitly; per the original note, calling HuggingFaceEmbedding() with no
# arguments would load BAAI/bge-small-en instead.
EMBED_MODEL_NAME = "BAAI/bge-small-en-v1.5"

embed_model = HuggingFaceEmbedding(model_name=EMBED_MODEL_NAME)

2025-03-08 22:52:20.349338: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [20]:
from llama_index.core.postprocessor import SimilarityPostprocessor

In [25]:
class RAGWorkflow(Workflow):
    """RAG workflow with two entry modes selected by the ``run`` keyword arguments.

    Both ``ingest`` and ``retrieve`` receive the ``StartEvent``; each one
    returns ``None`` (does nothing) unless its own arguments are present:

    * ``run(dirname=...)``: ``ingest`` builds a vector index from the
      directory and ends the run with that index as the result.
    * ``run(query=..., index=...)``: ``retrieve`` -> ``rerank`` ->
      ``synthesize``; the run ends with the synthesized response.
    """

    @step
    async def ingest(self, ctx: Context, ev: StartEvent) -> StopEvent | None:
        """Build a vector index from the files under ``dirname``.

        Returns:
            ``None`` when the start event has no ``dirname``; otherwise a
            ``StopEvent`` whose result is the newly built index.
        """
        dirname = ev.get("dirname")
        if not dirname:
            return None

        documents = SimpleDirectoryReader(dirname).load_data()
        # Embeddings come from the notebook-level ``embed_model``.
        index = VectorStoreIndex.from_documents(
            documents=documents,
            embed_model=embed_model,
        )
        return StopEvent(result=index)

    @step
    async def retrieve(self, ctx: Context, ev: StartEvent) -> RetrieverEvent | None:
        """Retrieve the two most similar nodes for ``query`` from ``index``.

        The query is stored in the workflow context under ``"query"`` so the
        later steps can read it back.

        Returns:
            ``None`` when the start event has no ``query`` or no ``index``;
            otherwise a ``RetrieverEvent`` with the retrieved nodes.
        """
        query = ev.get("query")
        index = ev.get("index")

        if not query:
            return None

        print(f"query the database with: {query}")

        await ctx.set("query", query)

        if index is None:
            # No event is emitted on this path, so nothing downstream runs.
            # Report the reason instead of stopping silently.
            print("No index supplied; run the workflow with dirname=... first to build one.")
            return None

        retriever = index.as_retriever(similarity_top_k=2)
        nodes = await retriever.aretrieve(query)
        print(f"Retrieved {len(nodes)} nodes")
        return RetrieverEvent(nodes=nodes)

    @step
    async def rerank(self, ctx: Context, ev: RetrieverEvent) -> RerankEvent:
        """Post-process the retrieved nodes and forward the ones that remain.

        Currently this applies a similarity cutoff of 0.7 via
        ``SimilarityPostprocessor`` rather than an LLM-based rerank.
        """
        # LLM-based alternative that was tried here:
        # score_filter = LLMRerank(
        #     choice_batch_size=5, top_n=3, llm=Ollama(model="llama3:latest", request_timeout=120.0)
        # )
        score_filter = SimilarityPostprocessor(similarity_cutoff=0.7)

        # Read the stored query once and reuse it for the log line and the call.
        query = await ctx.get("query", default=None)
        print(query, flush=True)
        new_nodes = score_filter.postprocess_nodes(ev.nodes, query_str=query)
        print(f"Reranked nodes to {len(new_nodes)}")
        return RerankEvent(nodes=new_nodes)

    @step
    async def synthesize(self, ctx: Context, ev: RerankEvent) -> StopEvent:
        """Synthesize an answer to the stored query from the remaining nodes.

        Uses ``CompactAndRefine`` with the Ollama ``llama3:latest`` model and
        ``streaming=True``; the response is returned as the run's result.
        """
        llm = Ollama(model="llama3:latest", request_timeout=120.0)
        summarizer = CompactAndRefine(llm=llm, streaming=True, verbose=True)
        query = await ctx.get("query", default=None)

        response = await summarizer.asynthesize(query, nodes=ev.nodes)
        return StopEvent(result=response)


In [None]:
# !pip uninstall llama-index --yes
# !pip install llama-index --upgrade --no-cache-dir --force-reinstall

In [26]:
w = RAGWorkflow()

# Ingest the documents
index = await w.run(dirname="data")

In [27]:
# Run a query
result = await w.run(query="How was Llama2 trained?", index=index)
async for chunk in result.async_response_gen():
    print(chunk, end="", flush=True)

query the database with: How was Llama2 trained?
Retrieved 2 nodes
How was Llama2 trained?


Reranked nodes to 2
The training of Llama 2 began with the pretraining of Llama 2 using publicly available online sources. This was followed by the creation of an initial version of Llama 2-Chat through the application of supervised fine-tuning. The model was then iteratively refined using Reinforcement Learning with Human Feedback (RLHF) methodologies, specifically through rejection sampling and Proximal Policy Optimization (PPO).

In [4]:
!wget --user-agent "Mozilla" "https://arxiv.org/pdf/2307.09288.pdf" -O "data/llama2.pdf"

--2025-03-08 22:23:02--  https://arxiv.org/pdf/2307.09288.pdf
Resolving arxiv.org (arxiv.org)... 151.101.195.42, 151.101.131.42, 151.101.3.42, ...
Connecting to arxiv.org (arxiv.org)|151.101.195.42|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://arxiv.org/pdf/2307.09288 [following]
--2025-03-08 22:23:18--  http://arxiv.org/pdf/2307.09288
Connecting to arxiv.org (arxiv.org)|151.101.195.42|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13661300 (13M) [application/pdf]
Saving to: ‘data/llama2.pdf’


2025-03-08 22:23:22 (2.99 MB/s) - ‘data/llama2.pdf’ saved [13661300/13661300]

