In [38]:
from mistralai import Mistral
import os
from dotenv import load_dotenv

load_dotenv('../.envrc')



True

In [2]:
import phoenix as px
import llama_index.core

def launch_phoenix():
    px.launch_app()
    llama_index.core.set_global_handler("arize_phoenix")

def close_phoenix():
    px.close_app()


### Ingestion Setup for PDF Files
- Parse PDF to text
- Select embedding model
- Connection to vector store
    - Create table if not exists
- Create index pipeline
- Run the pipeline on the parsed text

In [33]:
# file_name = 'fy2025_budget_statement.pdf'
# file_name = 'budget-debate-round-up-speech.pdf'
# file_name = 'fy2025_budget_booklet_english.pdf'
# file_name = 'fy2025_budget_booklet_chinese.pdf'
# file_name = 'fy2025_budget_booklet_malay.pdf'
file_name = 'fy2025_budget_booklet_tamil.pdf'
file_path = f'../data/{file_name}'

#### Convert PDF to text

Use [Mistral OCR API](https://docs.mistral.ai/capabilities/document/) for PDF parsing because of support for
- Parse PDF into markdown
- Allow for images in pdf (base64)


In [21]:
MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")
assert MISTRAL_API_KEY is not None
client = Mistral(api_key=MISTRAL_API_KEY)

In [34]:
uploaded_pdf = client.files.upload(
    file={
        "file_name": file_name,
        "content": open(file_path, "rb"),
    },
    purpose="ocr"
)

signed_url = client.files.get_signed_url(file_id=uploaded_pdf.id)
print(f'signed_url: {signed_url}')

# Send to Mistral OCR API
ocr_response = client.ocr.process(
            model="mistral-ocr-latest",
            document={
                "type": "document_url",
                "document_url": signed_url.url,
            },
            include_image_base64=False,
        )
        
markdown = '\n\n'.join([page.markdown for page in ocr_response.pages])

signed_url: url='https://mistralaifilesapiprodswe.blob.core.windows.net/fine-tune/ca9f74b9-2aeb-457f-a3ac-a81ac3401e24/ce5a4f6f5f06405790b1a51df7ad7543.pdf?se=2025-04-16T08%3A35%3A18Z&sp=r&sv=2025-05-05&sr=b&sig=eLu6UF9z/SZOrnql2z9rZ3Pn9%2BSC1VX1IOKgX/bF63M%3D'


In [None]:
# see the markdown
from IPython.display import Markdown, display

display(Markdown(markdown))


Extract from image (if needed)

In [None]:
# Extract from image (not needed)
# ocr_response = client.ocr.process(
#     model="mistral-ocr-latest",
#     document={
#         "type": "image_url",
#         "image_url": "https://www.mof.gov.sg/docs/librariesprovider3/budget2025/images/resources/fy2025_budget_disbursement_calendar_english.png"
#     }
# )


#### Load Embedding model

In [37]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import re

model_name = "intfloat/multilingual-e5-large"

embed_model = HuggingFaceEmbedding(model_name=model_name)

embedding_model_dimensions = 1024


#### Connect to vector store

In [5]:
DB_HOST = os.getenv('DB_HOST')
assert DB_HOST is not None
DB_PORT = os.getenv('DB_PORT')
assert DB_PORT is not None
DB_USER = os.getenv('DB_USER')
assert DB_USER is not None
DB_PASSWORD = os.getenv('DB_PASSWORD')
assert DB_PASSWORD is not None
DB_NAME = os.getenv('DB_NAME')
assert DB_NAME is not None

DB_URL = f'postgresql+asyncpg://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}'


Create database and table (Initial setup)

In [39]:
from llama_index.vector_stores.postgres import PGVectorStore

table_prefix = 'budget_2025-'
# regex to only have words, numbers, and dashes
# replace / with -
model_name_clean = re.sub(r'[^a-zA-Z0-9\-]', '-', model_name)
table_name = f'{table_prefix}{model_name_clean}'
table_name


'budget_2025-intfloat-multilingual-e5-large'

In [40]:
# Change this to True if creating the table for the first time
perform_setup = False
vector_store = PGVectorStore.from_params(
            host=DB_HOST,
            port=DB_PORT,
            database=DB_NAME,
            user=DB_USER,
            password=DB_PASSWORD,
            table_name=table_name,
            perform_setup=perform_setup,
            embed_dim=embedding_model_dimensions,
        )

if perform_setup:
    vector_store._initialize()
    print(f'Vector store initialized for {table_name}')


In [41]:
from llama_index.core import VectorStoreIndex

vsi = VectorStoreIndex.from_vector_store(
    vector_store=vector_store,
    embed_model=embed_model
)

#### Create index pipeline

In [13]:
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import MarkdownNodeParser
from llama_index.core.node_parser import SemanticSplitterNodeParser

semantic_splitter = SemanticSplitterNodeParser(embed_model=embed_model)

pipeline = IngestionPipeline(
    transformations=[
        MarkdownNodeParser(),
        semantic_splitter,
        embed_model
    ],
    vector_store=vector_store
)


In [36]:
from llama_index.core.schema import Document

metadata = {
    'source_document': file_name
}

document = Document(text=markdown, metadata=metadata)
document.excluded_embed_metadata_keys = metadata.keys()

nodes = await pipeline.arun(documents=[document])
print(f'{len(nodes)} created for {file_name}')

45 created for fy2025_budget_booklet_tamil.pdf


#### Test retrieval

In [42]:
similarity_top_k = 30
retriever = vsi.as_retriever(similarity_top_k=similarity_top_k)


In [43]:
vsi._embed_model

HuggingFaceEmbedding(model_name='intfloat/multilingual-e5-large', embed_batch_size=10, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x7efe1a176750>, num_workers=None, max_length=512, normalize=True, query_instruction=None, text_instruction=None, cache_folder=None, show_progress_bar=False)

In [44]:
question = "What are some housing subsidies provided in the budget?"

In [None]:
# question = "How much is the government topping up to cultural matching fund?"

nodes = retriever.retrieve(question)
print(nodes[0].score)
print(nodes[0].text)


0.8069594377017614
The measures we have taken in recent years, and are taking in this Budget, will help to mitigate the impact of rising costs.
31. But in the longer term, the best way to adjust to higher prices is to grow the economy and increase productivity, so that all Singaporeans can enjoy higher real incomes and better standards of living. And let me turn to our strategies next in these areas.
