In [57]:
from mistralai import Mistral
import os
from dotenv import load_dotenv

load_dotenv('../.envrc')



True

In [93]:
import phoenix as px
import llama_index.core

def launch_phoenix():
    px.launch_app()
    llama_index.core.set_global_handler("arize_phoenix")

def close_phoenix():
    px.close_app()


### PDF Files

In [4]:
file_path = '../data/fy2025_budget_statement.pdf'
file_name = 'fy2025_budget_statement.pdf'

#### Convert PDF to text

Use [Mistral OCR API](https://docs.mistral.ai/capabilities/document/) because
- Parse PDF into markdown
- Allow for images in pdf (base64)


In [3]:

MISTRAL_API_KEY = os.environ["MISTRAL_API_KEY"]

client = Mistral(api_key=MISTRAL_API_KEY)


In [5]:

uploaded_pdf = client.files.upload(
    file={
        "file_name": file_name,
        "content": open(file_path, "rb"),
    },
    purpose="ocr"
)
uploaded_pdf

UploadFileOut(id='e215a65b-8ce4-467b-8242-01d7735cb6a6', object='file', size_bytes=614571, created_at=1744378402, filename='fy2025_budget_statement.pdf', purpose='ocr', sample_type='ocr_input', source='upload', num_lines=None)

In [6]:
signed_url = client.files.get_signed_url(file_id=uploaded_pdf.id)
signed_url


FileSignedURL(url='https://mistralaifilesapiprodswe.blob.core.windows.net/fine-tune/ca9f74b9-2aeb-457f-a3ac-a81ac3401e24/e215a65b8ce4467b824201d7735cb6a6.pdf?se=2025-04-12T13%3A33%3A52Z&sp=r&sv=2025-05-05&sr=b&sig=sODIvuBG8YUzEc1Q1Ohyv/Ci3tXqBZmw2fDNv0gzixw%3D')

In [7]:
# Send to Mistral OCR API
ocr_response = client.ocr.process(
            model="mistral-ocr-latest",
            document={
                "type": "document_url",
                "document_url": signed_url.url,
            },
            include_image_base64=False,
        )
        
markdown = '\n\n'.join([page.markdown for page in ocr_response.pages])

In [10]:
ocr_response.pages

[OCRPageObject(index=0, markdown='# BUDGET 2025 SPEECH \n\n## ONWARD TOGETHER FOR A BETTER TOMORROW\n\nA. Introduction ..... 3\nB. Tackling Cost Pressures ..... 9\nC. Advancing our Growth Frontier ..... 17\nEnhancing our Technology and Innovation Engines ..... 19\nStrengthening our Enterprise Ecosystem ..... 24\nTackling Infrastructure and Resource Constraints ..... 30\nD. Equipping Workers Throughout Life ..... 38\nEncouraging Lifelong Learning ..... 39\nSupporting our Enterprises in Workforce Transformation ..... 43\nStrengthening Support for Workers ..... 49\nE. Building a Sustainable City ..... 53\nGreening and Enhancing our Transport Sector ..... 54\nSecuring a Climate-Resilient Future ..... 57\nEmbracing Sustainability as a Lifestyle ..... 60\nF. Nurturing a Caring and Inclusive Society ..... 62\nImproving the Resilience of Vulnerable Families ..... 62\nBuilding a Singapore Made for Families ..... 65\nEnabling Seniors to Age Well ..... 68\nEmpowering Persons with Disabilities ...

### Setup
- Select embedding model
- Connection to vector store
    - Create table if not exists
- Create index pipeline    

#### Load Embedding model

##### OpenAI

In [58]:
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
assert OPENAI_API_KEY is not None


In [59]:
from llama_index.embeddings.openai import OpenAIEmbedding
embedding_model_name = "text-embedding-3-large"
embedding_model_dimensions = 3072
embedding_model = OpenAIEmbedding(api_key=OPENAI_API_KEY, model=embedding_model_name, dimensions=embedding_model_dimensions)

#### Connect to vector store

In [60]:
DB_HOST = os.getenv('DB_HOST')
assert DB_HOST is not None
DB_PORT = os.getenv('DB_PORT')
assert DB_PORT is not None
DB_USER = os.getenv('DB_USER')
assert DB_USER is not None
DB_PASSWORD = os.getenv('DB_PASSWORD')
assert DB_PASSWORD is not None
DB_NAME = os.getenv('DB_NAME')
assert DB_NAME is not None


Create database and table (Initial setup)

In [61]:
from llama_index.vector_stores.postgres import PGVectorStore
table_name = 'budget_2025'

In [62]:
# Change this to True if creating the table for the first time
perform_setup = False
vector_store = PGVectorStore.from_params(
            host=DB_HOST,
            port=DB_PORT,
            database=DB_NAME,
            user=DB_USER,
            password=DB_PASSWORD,
            table_name=table_name,
            perform_setup=perform_setup,
            embed_dim=embedding_model_dimensions,
        )

if perform_setup:
    vector_store._initialize()


In [63]:
from llama_index.core import VectorStoreIndex

vsi = VectorStoreIndex.from_vector_store(
    vector_store=vector_store,
    embed_model=embedding_model
)

#### Create index pipeline

In [26]:
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import MarkdownNodeParser

pipeline = IngestionPipeline(
    transformations=[
        MarkdownNodeParser(),
        embedding_model
    ],
    vector_store=vector_store
)


In [27]:
from llama_index.core.schema import Document

document = Document(text=markdown)
nodes = pipeline.run(documents=[document])
len(nodes)

32

### Test retrieval

In [31]:
retriever = vsi.as_retriever()

nodes = retriever.retrieve("What is the total budget for the year?")


In [34]:
print(nodes[0].text)

# A. Introduction 

Mr Speaker, Sir

1. I move that Parliament approve the financial policy of the Government for the financial year 1 April 2025 to 31 March 2026.
2. 2025 marks our $60^{\text {th }}$ year of independence. It has been a remarkable journey, reflecting the grit and resilience of generations of Singaporeans in building our nation.
3. 60 years ago, the world was very different:
a. It was the height of the Cold War, with the Berlin Wall standing as a stark symbol of division between the United States and the Soviet Union.
b. In Asia, the Cold War spilled over into proxy fights, with devastating wars in Vietnam, Laos, and Cambodia.

4. As a fledgling nation, Singapore faced monumental challenges.
a. Racial tensions were high. Unemployment was rampant.
b. The withdrawal of British forces shortly after independence dealt a heavy blow to our defence and our economy.
c. In a world divided by blocs, we had no hinterland and no natural allies. Survival seemed improbable.
5. Yet in

#### Corrective RAG Workflow 
- (retrieve_context) Question come in, retrieve context and set in state, maybe perform some postprocessing e.g. reranking
- (generate_answer) Answer question from context
- (grade_answer) Grade answer
- (refine_answer) Refine answer if needed
- (provide_answer) Provide answer to user

State to maintain:
- Question
- Context
- Answer
- Grade
- Refined Answer



In [78]:
from llama_index.core.workflow import (
    Event,
    StartEvent,
    StopEvent,
    Workflow,
    step,
)
from llama_index.core.schema import NodeWithScore
from llama_index.core.base.response.schema import RESPONSE_TYPE

In [100]:
from typing import Annotated

class QuestionEvent(StartEvent):
    question: Annotated[str, "Single question"]
    similarity_top_k: Annotated[int, "Number of nodes to retrieve"] = 20
    
class NodesRetrievedEvent(Event):
    retrieved_nodes: Annotated[list[NodeWithScore], "Retrieved nodes"]

class NodesRerankedEvent(Event):
    reranked_nodes: Annotated[list[NodeWithScore], "Reranked nodes"]

# class ContextPopulatedEvent(Event):
#     context: str

class AnswerGeneratedFromContextEvent(StopEvent):
    answer: Annotated[str, "Answer generated from context"]
    
# class AnswerGradedEvent(Event):
#     is_relevant: Annotated[bool, "Whether the answer is relevant to the question"]
    


In [66]:
from llama_index.postprocessor.jinaai_rerank import JinaRerank

JINA_API_KEY = os.getenv('JINA_API_KEY')
assert JINA_API_KEY is not None

postprocessor = JinaRerank(
    top_n=20, model="jina-reranker-v1-base-en", api_key=JINA_API_KEY
)

# Testing
# reranked_nodes = postprocessor.postprocess_nodes(nodes, query_str=question)
# len(reranked_nodes)


In [67]:
from llama_index.core.prompts import PromptTemplate

DEFAULT_ANSWER_GENERATION_PROMPT_TEMPLATE = PromptTemplate(
    template="""Your task is to answer the user's question about Singapore government budget statement based on the context provided. Be detailed and objective in your answer.

    Context: \"\"\"
    {context_str}
    \"\"\"

    User Question: \"\"\"
    {query_str}
    \"\"\"

    """
)

In [107]:
from llama_index.llms.openai import OpenAI
from llama_index.core.workflow import Context
from llama_index.core.response_synthesizers import get_response_synthesizer
from llama_index.core.response_synthesizers.type import ResponseMode
class QAWorkflow(Workflow):
    llm = OpenAI(model="gpt-4o-mini")  # llm to generate answer
    vsi: VectorStoreIndex = VectorStoreIndex.from_vector_store(
        vector_store=vector_store,
        embed_model=embedding_model
    )

    @step
    async def retrieve_nodes(self, ctx: Context, ev: QuestionEvent) -> NodesRetrievedEvent:
        question = ev.question
        # set question in global state
        await ctx.set("question", question)

        retriever = self.vsi.as_retriever(similarity_top_k=ev.similarity_top_k)
        qe = vsi.as_query_engine()
        qe.aquery
        nodes = await retriever.aretrieve(question)
        
        return NodesRetrievedEvent(retrieved_nodes=nodes)

    @step
    async def rerank_nodes(self, ctx: Context, ev: NodesRetrievedEvent) -> NodesRerankedEvent:
        nodes = ev.retrieved_nodes
        question = await ctx.get("question")

        reranked_nodes = postprocessor.postprocess_nodes(nodes, query_str=question)
        return NodesRerankedEvent(reranked_nodes=reranked_nodes)

    @step
    async def generate_answer_from_context(self, ctx: Context, ev: NodesRerankedEvent) -> AnswerGeneratedFromContextEvent:
        nodes = ev.reranked_nodes
        question = await ctx.get("question")
        print(f'in generate_answer_from_context, question: {question}')

        response_synthesizer = get_response_synthesizer(llm=self.llm, response_mode=ResponseMode.COMPACT)
        
        response = await response_synthesizer.asynthesize(query=question, nodes=nodes)
        print(f'in generate_answer_from_context, response: {response}')
        
        return AnswerGeneratedFromContextEvent(answer=response.response)

    # @step
    # async def grade_answer(self, ctx: Context, ev: AnswerGeneratedFromContextEvent) -> AnswerGradedEvent:
    #     answer = ev.answer
    #     question = await ctx.get("question")
    #     print(f'in grade_answer, question: {question}')
        


In [108]:
w = QAWorkflow(timeout=60,verbose=True)
result = await w.run(start_event=QuestionEvent(question="What do i gain if i am an undergraduate student?"))
print(result)


Running step retrieve_nodes
Step retrieve_nodes produced event NodesRetrievedEvent
Running step rerank_nodes
Step rerank_nodes produced event NodesRerankedEvent
Running step generate_answer_from_context
in generate_answer_from_context, question: What do i gain if i am an undergraduate student?
in generate_answer_from_context, response: As an undergraduate student, you will receive a top-up of $500 to your Edusave account or your Post-Secondary Education Account. This top-up can be used to cover approved education-related expenses. Additionally, you may benefit from various support schemes aimed at enhancing your learning experience and financial assistance for your education.
Step generate_answer_from_context produced event AnswerGeneratedFromContextEvent
answer='As an undergraduate student, you will receive a top-up of $500 to your Edusave account or your Post-Secondary Education Account. This top-up can be used to cover approved education-related expenses. Additionally, you may benef

In [92]:
ans: RESPONSE_TYPE = result.answer
print(ans)

The total budget for the year is not explicitly stated in the provided information. However, it mentions expected surpluses for FY2024 and FY2025, which are projected to be \$6.4 billion and \$6.8 billion, respectively. For a specific total budget figure, additional details would be required.


In [105]:
launch_phoenix()

Attempting to instrument while already instrumented


🌍 To view the Phoenix app in your browser, visit http://localhost:6006/
📖 For more information on how to use Phoenix, check out https://docs.arize.com/phoenix


In [96]:
close_phoenix()

#### Agent with tools

For groundedness, maintain a state in the workflow for current context, for the grader agent to check that the answer is grounded in the context.

In [None]:
vsi.as_query_engine()

In [38]:
os.environ["LM_STUDIO_API_BASE"] = "http://172.27.112.1:1234"


In [40]:
import os
from llama_index.llms.litellm import LiteLLM
from llama_index.core.llms import ChatMessage

# set env variable

message = ChatMessage(role="user", content="Hey! how's it going?")

# openai call
llm = LiteLLM("lm_studio/llama-3.2-3b-instruct")



In [48]:
import litellm
# litellm._turn_on_debug()


In [46]:
llm = LiteLLM("ollama/llama3:latest", api_base='http://localhost:11434')

In [49]:
chat_response = llm.chat([message])

[92m23:54:22 - LiteLLM:DEBUG[0m: utils.py:324 - 

[92m23:54:22 - LiteLLM:DEBUG[0m: utils.py:324 - [92mRequest to litellm:[0m
[92m23:54:22 - LiteLLM:DEBUG[0m: utils.py:324 - [92mlitellm.completion(messages=[{'role': 'user', 'content': "Hey! how's it going?"}], stream=False, model='ollama/llama3:latest', temperature=0.1, api_base='http://localhost:11434')[0m
[92m23:54:22 - LiteLLM:DEBUG[0m: utils.py:324 - 

[92m23:54:22 - LiteLLM:DEBUG[0m: litellm_logging.py:422 - self.optional_params: {}
[92m23:54:22 - LiteLLM:DEBUG[0m: utils.py:324 - SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
[92m23:54:22 - LiteLLM:INFO[0m: utils.py:3076 - 
LiteLLM completion() model= llama3:latest; provider = ollama
[92m23:54:22 - LiteLLM:DEBUG[0m: utils.py:3079 - 
LiteLLM: Params passed to completion() {'model': 'llama3:latest', 'functions': None, 'function_call': None, 'temperature': 0.1, 'top_p': None, 'n': None, 'stream': False, 'stream_options': No

In [50]:
chat_response

ChatResponse(message=ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, additional_kwargs={}, blocks=[TextBlock(block_type='text', text="Hi! I'm doing well, thanks for asking. It's great to chat with you. How about you? What's new and exciting in your world?")]), raw=ModelResponse(id='chatcmpl-b04b8dfe-39a5-4313-90d5-5f0242bd9661', created=1744386869, model='ollama/llama3:latest', object='chat.completion', system_fingerprint=None, choices=[Choices(finish_reason='stop', index=0, message=Message(content="Hi! I'm doing well, thanks for asking. It's great to chat with you. How about you? What's new and exciting in your world?", role='assistant', tool_calls=None, function_call=None, provider_specific_fields=None))], usage=Usage(completion_tokens=33, prompt_tokens=10, total_tokens=43, completion_tokens_details=None, prompt_tokens_details=None)), delta=None, logprobs=None, additional_kwargs={})

##### LiteLLM

In [None]:
from litellm import completion
import os

os.environ['LM_STUDIO_API_BASE'] = "http://172.27.112.1:1234"

response = completion(
    model="lm_studio/llama-3.2-3b-instruct", 
    messages=[
        {
            "role": "user",
            "content": "What's the weather like in Boston today in Fahrenheit?",
        }
    ]
)
print(response)