### Imports

In [1]:
import tiktoken
from llama_index.readers.file import PDFReader
from llama_index.core import Document
from llama_index.core.extractors import TitleExtractor
from llama_index.core.node_parser import TokenTextSplitter
import os

### Create URL Mapping

In [16]:
# os.listdir returns entries in arbitrary order; sort so the listing is stable across runs.
sorted(os.listdir("../data/budget_statement_annex"))

['annexb1.pdf',
 'annexb2.pdf',
 'annexc1.pdf',
 'annexc2.pdf',
 'annexd1.pdf',
 'annexe1.pdf',
 'annexe2.pdf',
 'annexf1.pdf',
 'annexf2.pdf',
 'annexf3.pdf',
 'annexf4.pdf',
 'annexg1.pdf',
 'annexg2.pdf',
 'annexh1.pdf',
 'annexh2.pdf',
 'annexi1.pdf',
 'budget_booklet_pg6_pg7_calendar.txt',
 'budget_booklet_pg8_household_support.txt',
 'budget_booklet_pg8_individual_support.txt']

In [17]:
# URL prefix shared by every entry in the mapping.
_MOF_PDF_BASE = 'https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/'

# Annex PDFs: the local file name is also the last segment of the URL.
_ANNEX_PDFS = [
    f'annex{suffix}.pdf'
    for suffix in (
        'b1', 'b2', 'c1', 'c2', 'd1', 'e1', 'e2',
        'f1', 'f2', 'f3', 'f4', 'g1', 'g2', 'h1', 'h2', 'i1',
    )
]

# Booklet .txt extracts map to a differently named source PDF.
_BOOKLET_SOURCES = {
    'budget_booklet_pg6_pg7_calendar.txt': 'fy2024_disbursement_calendar_english.pdf',
    'budget_booklet_pg8_household_support.txt': 'fy2024_support_for_singaporeans_english.pdf',
    'budget_booklet_pg8_individual_support.txt': 'fy2024_support_for_singaporeans_english.pdf',
}

# Local file name -> public URL, annexes first, then the booklet extracts.
url_mapping = {
    **{pdf_name: _MOF_PDF_BASE + pdf_name for pdf_name in _ANNEX_PDFS},
    **{txt_name: _MOF_PDF_BASE + source_pdf for txt_name, source_pdf in _BOOKLET_SOURCES.items()},
}

### Testing

In [18]:
# Smoke test: parse a single annex PDF and inspect what the reader returns.
loader = PDFReader()
sample_doc = loader.load_data("../data/budget_statement_annex/annexf2.pdf")
# Displays a list of Document objects carrying 'page_label' and 'file_name' metadata.
sample_doc

[Document(id_='13824515-b5ce-4442-a04d-8cbb549d2358', embedding=None, metadata={'page_label': '1', 'file_name': 'annexf2.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text=' \n1 \nMINISTRY OF FINANCE \nANNEX F-2: MAJULAH PACKAGE \n \nThe Majulah Package aims to provide Singapore Citizens born in 1973 or earlier, especially \nthose born in 1960 to 1973 (“Young Seniors”, currently in their 50s and early 60s ), with an \nadditional boost for their retirement. The Majulah Package comprises the following: \n \n(A) Annual Earn and Save Bonus; \n(B) One-time Retirement Savings Bonus; and \n(C) One-time MediSave Bonus.  \n \nEvery Singaporean born in 1973 or earlier will receive at least one component of the Majulah \nPackage. The Package will benefit about 1.6 million Singaporeans. \n \nThe Majulah Package is estimated to cost $8.2 billion in total  lifetime costs. The Government \nwill set aside $7.5 billion in a new Fund, the Majulah Package Fund, 

In [19]:
# Merge the text of every loaded Document into one string, separated by blank lines.
doc_text = "\n\n".join(page.get_content() for page in sample_doc)

In [20]:
# Wrap the merged text in a single Document; no metadata is supplied at this point.
new_doc = Document(text=doc_text)
new_doc

Document(id_='19aa32eb-efc6-4170-999a-75ef85269e10', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text=' \n1 \nMINISTRY OF FINANCE \nANNEX F-2: MAJULAH PACKAGE \n \nThe Majulah Package aims to provide Singapore Citizens born in 1973 or earlier, especially \nthose born in 1960 to 1973 (“Young Seniors”, currently in their 50s and early 60s ), with an \nadditional boost for their retirement. The Majulah Package comprises the following: \n \n(A) Annual Earn and Save Bonus; \n(B) One-time Retirement Savings Bonus; and \n(C) One-time MediSave Bonus.  \n \nEvery Singaporean born in 1973 or earlier will receive at least one component of the Majulah \nPackage. The Package will benefit about 1.6 million Singaporeans. \n \nThe Majulah Package is estimated to cost $8.2 billion in total  lifetime costs. The Government \nwill set aside $7.5 billion in a new Fund, the Majulah Package Fund, to fund these lifetime \ncosts.  \n \n(A) Earn

In [21]:
# Attach the source file name. The key used here is 'filename', whereas the
# reader's own per-page metadata uses 'file_name'.
new_doc.metadata = {'filename': 'annexf2.pdf'}
new_doc

Document(id_='19aa32eb-efc6-4170-999a-75ef85269e10', embedding=None, metadata={'filename': 'annexf2.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text=' \n1 \nMINISTRY OF FINANCE \nANNEX F-2: MAJULAH PACKAGE \n \nThe Majulah Package aims to provide Singapore Citizens born in 1973 or earlier, especially \nthose born in 1960 to 1973 (“Young Seniors”, currently in their 50s and early 60s ), with an \nadditional boost for their retirement. The Majulah Package comprises the following: \n \n(A) Annual Earn and Save Bonus; \n(B) One-time Retirement Savings Bonus; and \n(C) One-time MediSave Bonus.  \n \nEvery Singaporean born in 1973 or earlier will receive at least one component of the Majulah \nPackage. The Package will benefit about 1.6 million Singaporeans. \n \nThe Majulah Package is estimated to cost $8.2 billion in total  lifetime costs. The Government \nwill set aside $7.5 billion in a new Fund, the Majulah Package Fund, to fund these lifetim

In [22]:
# Pull the document's content back out as a plain string for inspection and token counting.
new_doc_text = new_doc.get_content()

In [23]:
# print() renders the "\n" escapes as real line breaks, which makes the extracted layout readable.
print(new_doc_text)

 
1 
MINISTRY OF FINANCE 
ANNEX F-2: MAJULAH PACKAGE 
 
The Majulah Package aims to provide Singapore Citizens born in 1973 or earlier, especially 
those born in 1960 to 1973 (“Young Seniors”, currently in their 50s and early 60s ), with an 
additional boost for their retirement. The Majulah Package comprises the following: 
 
(A) Annual Earn and Save Bonus; 
(B) One-time Retirement Savings Bonus; and 
(C) One-time MediSave Bonus.  
 
Every Singaporean born in 1973 or earlier will receive at least one component of the Majulah 
Package. The Package will benefit about 1.6 million Singaporeans. 
 
The Majulah Package is estimated to cost $8.2 billion in total  lifetime costs. The Government 
will set aside $7.5 billion in a new Fund, the Majulah Package Fund, to fund these lifetime 
costs.  
 
(A) Earn and Save Bonus (“ESB”) 
 
Singaporeans born in 1973 or earlier will receive the ESB if they meet the following criteria:  
 
(i) Work and have an average monthly income of between $500 and 

In [24]:
# Token count of the sample text under the encoding tiktoken associates with gpt-4o-mini.
tokenizer = tiktoken.encoding_for_model("gpt-4o-mini")
encoded_new_doc = tokenizer.encode(new_doc_text)
len(encoded_new_doc)

1048

In [25]:
# Same count under the encoding tiktoken associates with text-embedding-3-small.
# Rebinds `tokenizer` and `encoded_new_doc`, so their values depend on which cell ran last.
tokenizer = tiktoken.encoding_for_model("text-embedding-3-small")
encoded_new_doc = tokenizer.encode(new_doc_text)
len(encoded_new_doc)

1057

### Iterate through all annexes

In [26]:
# Build one Document per annex file (PDF pages merged, .txt read as-is), tag it with
# its file name and public URL, and print each document's token count.
loader = PDFReader()
tokenizer = tiktoken.encoding_for_model("text-embedding-3-small") # for RAG
documents = []

annex_dir = "../data/budget_statement_annex"

# os.listdir returns names in arbitrary order; sorting keeps the order of `documents`
# and of the printed report identical on every run.
for filename in sorted(os.listdir(annex_dir)):
    file_path = f"{annex_dir}/{filename}"

    if filename.endswith(".pdf"):
        doc_pages = loader.load_data(file_path)
        doc_text = "\n\n".join(d.get_content() for d in doc_pages)
    elif filename.endswith(".txt"):
        # Explicit encoding so the result does not depend on the platform default.
        # NOTE(review): assumes the .txt extracts are stored as UTF-8 — confirm.
        with open(file_path, "r", encoding="utf-8") as f:
            doc_text = f.read()
    else:
        # Anything that is neither a PDF nor a text extract is ignored.
        continue

    # url_mapping[filename] raises KeyError for a file that has no URL registered.
    new_doc = Document(
        text=doc_text,
        metadata={'filename': filename, 'url': url_mapping[filename]},
    )
    new_doc_text = new_doc.get_content()
    encoded_new_doc = tokenizer.encode(new_doc_text)
    print(f"{filename}: {len(encoded_new_doc)} ({new_doc.metadata['url']})")
    documents.append(new_doc)

annexb1.pdf: 3112 (https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/annexb1.pdf)
annexb2.pdf: 1278 (https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/annexb2.pdf)
annexc1.pdf: 598 (https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/annexc1.pdf)
annexc2.pdf: 580 (https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/annexc2.pdf)
annexd1.pdf: 712 (https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/annexd1.pdf)
annexe1.pdf: 1583 (https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/annexe1.pdf)
annexe2.pdf: 850 (https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/annexe2.pdf)
annexf1.pdf: 3098 (https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/annexf1.pdf)
annexf2.pdf: 1057 (https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/annexf2.pdf)
annexf3.pdf: 339 (https://www.mof.gov.sg/docs/librariesprovider3/budget2024/d

In [27]:
# Sanity check: how many documents were collected, followed by the documents themselves.
print(len(documents))
documents

19


[Document(id_='084a2d91-2e03-4ef8-b43f-e3fa71585697', embedding=None, metadata={'filename': 'annexb1.pdf', 'url': 'https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/annexb1.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text=' \n1 \nMINISTRY OF FINANCE \nANNEX B-1: ASSURANCE PACKAGE ENHANCEMENTS \n \nThe Government will enhance the Assurance Package (“AP”) further to provide more support \nto help Singaporeans cope with cost-of-living concerns and economic uncertainties. The \nenhancements made in Budget 2024 will cost $1.9 billion.  \n \nThis Annex elaborates on the following: \n \n(A) Enhancements to the AP; and  \n(B) Illustration of Support for Households from the GST Voucher ( “GSTV”) Scheme \nand the AP, including B2024 AP Enhancements. \n \n(A) Enhancements to the Assurance Package (“AP”) \n \nCommunity Development Council (“CDC”) Vouchers \n \nEvery Singaporean household will receive an additional $600 CDC Vouchers.

In [28]:
# Split the documents into token-bounded nodes with no overlap between chunks.
# NOTE(review): 8191 presumably matches the embedding model's input limit — confirm.
splitter = TokenTextSplitter(
    chunk_size=8191,
    chunk_overlap=0,
    separator=" ",
)
nodes = splitter.get_nodes_from_documents(documents)
# Compare with len(documents) to see whether any document was split into several nodes.
print(len(nodes))
nodes

19


[TextNode(id_='33026210-b82f-4a7e-9be9-33d6922bc79c', embedding=None, metadata={'filename': 'annexb1.pdf', 'url': 'https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/annexb1.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='084a2d91-2e03-4ef8-b43f-e3fa71585697', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'filename': 'annexb1.pdf', 'url': 'https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/annexb1.pdf'}, hash='d522f331fb380ff89ec22c5b95af811c3d5be73486d29411181d9cad4efe93b1')}, text='1 \nMINISTRY OF FINANCE \nANNEX B-1: ASSURANCE PACKAGE ENHANCEMENTS \n \nThe Government will enhance the Assurance Package (“AP”) further to provide more support \nto help Singaporeans cope with cost-of-living concerns and economic uncertainties. The \nenhancements made in Budget 2024 will cost $1.9 billion.  \n \nThis Annex elaborates on the following: \n \n(A) Enhanceme

### Iterate through statement and speech

In [4]:
# Extract the full text of each statement/speech PDF, save it as a .txt file next to
# the PDF, and print its token count under both encodings.
loader = PDFReader()
tokenizer_rag = tiktoken.encoding_for_model("text-embedding-3-small") # for RAG
tokenizer = tiktoken.encoding_for_model("gpt-4o-mini") # for putting the whole document into the model

speech_dir = "../data/budget_statement_and_speech"

# os.listdir returns names in arbitrary order; sort so the report is stable across runs.
for filename in sorted(os.listdir(speech_dir)):
    if not filename.endswith(".pdf"):
        continue

    doc_pages = loader.load_data(f"{speech_dir}/{filename}")
    doc_text = "\n\n".join(d.get_content() for d in doc_pages)
    new_doc = Document(text=doc_text, metadata={'filename': filename})
    new_doc_text = new_doc.get_content()

    # Swap only the trailing extension; str.replace would also rewrite a ".pdf"
    # occurring earlier in the file name.
    txt_name = os.path.splitext(filename)[0] + ".txt"

    # Explicit UTF-8: text extracted from PDFs elsewhere in this notebook contains
    # curly quotes and ligatures, which a non-UTF-8 default encoding may fail to write.
    with open(f"{speech_dir}/{txt_name}", "w", encoding="utf-8") as f:
        f.write(new_doc_text)

    encoded_new_doc = tokenizer.encode(new_doc_text)
    encoded_new_doc_rag = tokenizer_rag.encode(new_doc_text)
    print(f"{filename}: {len(encoded_new_doc_rag)} for RAG, {len(encoded_new_doc)} for GPT-4o-mini")

fy2024_budget_debate_round_up_speech.pdf: 13393 for RAG, 13348 for GPT-4o-mini
fy2024_budget_statement.pdf: 19588 for RAG, 19476 for GPT-4o-mini


### Analyse booklet

In [28]:
# Parse the infographic booklet to see how its pages come out of text extraction.
loader = PDFReader()
doc_pages = loader.load_data("../data/booklet/fy2024_infographic_budget_booklet_english.pdf")
doc_pages

[Document(id_='11fddaca-d976-4b33-bc7e-5b9bdcd50081', embedding=None, metadata={'page_label': '1', 'file_name': 'fy2024_infographic_budget_booklet_english.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='BUILDING OUR\nSHARED FUTURE\nTOGETHER', mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'),
 Document(id_='12379407-0e50-468b-b380-d460faada650', embedding=None, metadata={'page_label': '2', 'file_name': 'fy2024_infographic_budget_booklet_english.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='FOREWORD\nMy Fellow Singaporeans,\n2023 was a challenging year. Geopolitical tensions weighed \nheavily on the global economy. The difﬁcult external \nenvironment resulted in lower economic growth for us.\nWhile we avoided a recession, income growth was not as \nstrong for many Singaporean

Calendar summary of benefits is not parsed properly. Might need to manually create a parseable calendar of benefits.

In [39]:
# Print the extracted text of one page (list index 7, zero-based) to inspect its reading order.
print(doc_pages[7].get_content())

U-Save
$550 to $950
for HDB 
households
2 to 4 months 
offset for HDB 
households
S&CC Rebate
$600 in total
for all Singaporean
households
CDC Vouchers
Apr 2024 – Mar 2025
SUPPORT FOR SINGAPOREANS
For Households
For Individuals
$400 to $2,500 for 
Singaporeans born
in 1973 or earlier
CPF Retirement 
or Special Account
$
50% of tax payable
for the Year of
Assessment 2024, 
capped at $200
Personal Income 
Tax Rebate
Cash
$200 to $2,150 for 
all Singaporeans 
aged 21 and above
CPF MediSave
Account
$100 to $1,650 for all 
Singaporeans
MOHAMED SINGAPORE
CHAN X Y
$200 for all past and 
present national 
servicemen
NS LifeSG Credits
Note: Beneﬁts are subject to the individual’s or household’s eligibility.
8
