In [2]:
import requests
from bs4 import BeautifulSoup
import langchain_text_splitters
import llm_functions 
from openai import OpenAI
from langchain.chains import RetrievalQA



Scrape data from the IRAS website

In [3]:

def _non_empty_texts(elements):
    """Return the stripped text of each element, skipping those with no text."""
    stripped = (element.get_text(strip=True) for element in elements)
    return [text for text in stripped if text]


def scrape_section(section):
    """Collect the text content of one page section into a dict.

    Args:
        section: A parsed element exposing ``find`` and ``find_all``
            (a BeautifulSoup tag when called from ``scrape_page``).

    Returns:
        A dict holding only the parts that have content, inserted in this order:
        ``'title'`` (text of the first ``h2``), ``'paragraphs'`` (texts of all
        ``p`` elements), ``'list_items'`` (texts of all ``li`` elements) and
        ``'tables'`` (one list of rows per table, each row a list of cell
        texts). Empty strings, empty rows and empty tables are dropped, so a
        section without any text yields ``{}``.
    """
    extracted = {}

    # Heading: keep it only when the tag exists and carries visible text.
    heading = section.find('h2')
    heading_text = heading.get_text(strip=True) if heading else ''
    if heading_text:
        extracted['title'] = heading_text

    # Flat text collections, added in a fixed order so the dict layout is stable.
    for key, tag_name in (('paragraphs', 'p'), ('list_items', 'li')):
        texts = _non_empty_texts(section.find_all(tag_name))
        if texts:
            extracted[key] = texts

    # Tables: rows of non-empty header/data cells. Blank cells are skipped,
    # so a row may be shorter than the table's column count.
    parsed_tables = []
    for table in section.find_all('table'):
        parsed_rows = []
        for table_row in table.find_all('tr'):
            cells = _non_empty_texts(table_row.find_all(['td', 'th']))
            if cells:
                parsed_rows.append(cells)
        if parsed_rows:
            parsed_tables.append(parsed_rows)

    if parsed_tables:
        extracted['tables'] = parsed_tables

    return extracted

def scrape_page(url, timeout=30):
    """Download a page and extract every ``<section class="eyd-rte">`` on it.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. ``requests``
            applies no timeout by default, so without one a stalled connection
            would block the notebook indefinitely.

    Returns:
        A list with one ``scrape_section`` dict per matching section, in
        document order. A section with no extractable text contributes ``{}``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Fail loudly instead of parsing an error page

    soup = BeautifulSoup(response.text, 'html.parser')

    # Only sections carrying the 'eyd-rte' class are extracted.
    sections = soup.find_all('section', class_='eyd-rte')
    return [scrape_section(section) for section in sections]



Get the different types of relief

In [4]:
# Presumably the overview page that lists all tax reliefs (TODO confirm).
# base_url in the next cell is this same address with a trailing slash appended.
all_relief_main_url = "https://www.iras.gov.sg/taxes/individual-income-tax/basics-of-individual-income-tax/tax-reliefs-rebates-and-deductions/tax-reliefs"

In [5]:
# Page slugs, one per relief type. Each slug is appended to base_url to form
# the address of the page to scrape, and is also used as the result key.
all_relief_url = [
    "earned-income-relief",
    "spouse-relief-spouse-relief-(disability)",
    "foreign-domestic-worker-levy-(fdwl)-relief",
    "central-provident-fund(cpf)-relief-for-employees",
    "central-provident-fund-(cpf)-relief-for-self-employed-employee-who-is-also-self-employed",
    "nsman-relief-(self-wife-and-parent)",
    "parent-relief-parent-relief-(disability)",
    "grandparent-caregiver-relief",
    "sibling-relief-(disability)",
    "working-mother's-child-relief-(wmcr)",
    "qualifying-child-relief-(qcr)-child-relief-(disability)",
    "life-insurance-relief",
    "course-fees-relief",
    "central-provident-fund-(cpf)-cash-top-up-relief",
    "compulsory-and-voluntary-medisave-contributions",
]

# Common prefix of every relief page; the trailing slash lets a slug be appended directly.
base_url = "https://www.iras.gov.sg/taxes/individual-income-tax/basics-of-individual-income-tax/tax-reliefs-rebates-and-deductions/tax-reliefs/"



In [6]:
# Scrape every relief page; results are keyed by the page slug.
scrapped_all = {}
for relief_ in all_relief_url:
    relief_page_url = f"{base_url}{relief_}"  # build the address once, reuse for log and fetch
    print(f"Scrapping:{relief_page_url}")
    scrapped_ = scrape_page(relief_page_url)
    scrapped_all[relief_] = scrapped_

# The SRS contributions page sits under a different path ("special-tax-schemes"),
# so it cannot be built from base_url and is fetched separately.
special_url = "https://www.iras.gov.sg/taxes/individual-income-tax/basics-of-individual-income-tax/special-tax-schemes/srs-contributions#title4"

scrapped_ = scrape_page(special_url)
scrapped_all["srs-contributions"] = scrapped_



Scrapping:https://www.iras.gov.sg/taxes/individual-income-tax/basics-of-individual-income-tax/tax-reliefs-rebates-and-deductions/tax-reliefs/earned-income-relief
Scrapping:https://www.iras.gov.sg/taxes/individual-income-tax/basics-of-individual-income-tax/tax-reliefs-rebates-and-deductions/tax-reliefs/spouse-relief-spouse-relief-(disability)
Scrapping:https://www.iras.gov.sg/taxes/individual-income-tax/basics-of-individual-income-tax/tax-reliefs-rebates-and-deductions/tax-reliefs/foreign-domestic-worker-levy-(fdwl)-relief
Scrapping:https://www.iras.gov.sg/taxes/individual-income-tax/basics-of-individual-income-tax/tax-reliefs-rebates-and-deductions/tax-reliefs/central-provident-fund(cpf)-relief-for-employees
Scrapping:https://www.iras.gov.sg/taxes/individual-income-tax/basics-of-individual-income-tax/tax-reliefs-rebates-and-deductions/tax-reliefs/central-provident-fund-(cpf)-relief-for-self-employed-employee-who-is-also-self-employed
Scrapping:https://www.iras.gov.sg/taxes/individual-i

Split the dataset and store it in a vector database

In [7]:
from langchain_text_splitters import RecursiveJsonSplitter

# Splitter for nested JSON; max_chunk_size is the target upper bound per chunk.
# NOTE(review): the chunks printed below are far larger than 400 characters.
# Presumably this is because the splitter leaves lists intact unless
# convert_lists=True is passed, and every value in scrapped_all is a list —
# confirm whether one chunk per relief page is the intended granularity.
splitter = RecursiveJsonSplitter(max_chunk_size=400)

# Dict chunks, used only for the inspection printout below.
json_chunks = splitter.split_json(json_data=scrapped_all)

# Document objects built from the same data; these are what get embedded later.
json_docs = splitter.create_documents(texts=[scrapped_all])
for chunk in json_chunks:
     print(chunk)

{'earned-income-relief': [{}, {'title': 'Qualifying for relief', 'paragraphs': ['You will receive Earned Income Relief if you have taxable earned income from any of the following sources in the previous year:'], 'list_items': ['Employment;', 'Pension;\n or', 'Trade, business, profession or vocation.']}, {'title': 'Amount of relief', 'paragraphs': ['The amount of Earned Income Relief is based on your age and taxable earned income (less any allowable expenses) in the previous year.', 'Your age as of 31 Dec of the previous year', '*Maximum amount claimable', 'Below 55', '$1,000', '55 to 59', '$6,000', '60 and above', '$8,000', '* If the amount of taxable earned income is lower than the maximum amount claimable, the relief will be capped at the amount of taxable earned income.', 'For example, if you are 55 years old as at 31 Dec 2023 and have taxable earned income of $5,000 in 2023, you will get Earned Income Relief of $5,000 (instead of $6,000) for the Year of Assessment 2024.'], 'tables'

In [8]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

# Embedding model shared by the vector stores built in this notebook.
embeddings_model = OpenAIEmbeddings(model='text-embedding-3-small')


# Create the vector database
# NOTE(review): if ./vector_db already holds this collection from an earlier
# run, from_documents presumably adds the same documents again — confirm, and
# clear the directory before re-running if so.
# NOTE(review): `vectordb` is not referenced by the cells visible below; the QA
# chain further down builds its own FAISS store from json_docs instead.
vectordb = Chroma.from_documents(
    documents=json_docs,
    embedding=embeddings_model,
    collection_name="json_splitter", # one database can have multiple collections
    persist_directory="./vector_db"
)

In [9]:
# Display the Document objects produced by the splitter to inspect their page_content.
json_docs

[Document(metadata={}, page_content='{"earned-income-relief": [{}, {"title": "Qualifying for relief", "paragraphs": ["You will receive Earned Income Relief if you have taxable earned income from any of the following sources in the previous year:"], "list_items": ["Employment;", "Pension;\\n or", "Trade, business, profession or vocation."]}, {"title": "Amount of relief", "paragraphs": ["The amount of Earned Income Relief is based on your age and taxable earned income (less any allowable expenses) in the previous year.", "Your age as of 31 Dec of the previous year", "*Maximum amount claimable", "Below 55", "$1,000", "55 to 59", "$6,000", "60 and above", "$8,000", "* If the amount of taxable earned income is lower than the maximum amount claimable, the relief will be capped at the amount of taxable earned income.", "For example, if you are 55 years old as at 31 Dec 2023 and have taxable earned income of $5,000 in 2023, you will get Earned Income Relief of $5,000 (instead of $6,000) for th

In [10]:
# from langchain import hub
# from langchain_core.output_parsers import StrOutputParser
# from langchain_core.runnables import RunnablePassthrough


# from langchain_community.vectorstores import FAISS

# # Store splits
# vectorstore = FAISS.from_documents(documents=json_docs, embedding=embeddings_model)

# # See full prompt at https://smith.langchain.com/hub/rlm/rag-prompt
# prompt = ChatPromptTemplate([
#     ("system", "You are a helpful assistant tasked to help Singapore citizens learn more about personal income tax relief \n \
#      If you do not understand the question or do not have sufficient information, reply and say 'I am not sure'. \n \
#      Use a friendly and cheerful tone "),
#     ("human", "{question}")
# ])

# prompt = ChatPromptTemplate([ ("human", "You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use a friendly tone. \
# Question: {question} \
# Context: {context} \
# Answer:")])

# # def format_docs(docs):
# #     return "\n\n".join(doc.page_content for doc in docs)

# def format_docs(docs):
#     return "\n\n".join(doc.page_content for doc in docs)

# llm = ChatOpenAI(model='gpt-4o-mini', temperature=0)

# qa_chain = (
#     {
#         "context": vectorstore.as_retriever()| format_docs,
#         "question": RunnablePassthrough(),
#     }
#     | prompt
#     | llm
#     | StrOutputParser()
# )
  
# qa_chain.invoke( {"question":"What is the NS men relief?"})

In [13]:
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate


# Index the JSON documents in a FAISS vector store used for retrieval.
vectorstore = FAISS.from_documents(documents=json_docs, embedding=embeddings_model)


# Single-message prompt; {context} receives the retrieved documents and
# {question} the user query.
prompt = ChatPromptTemplate([ ("human", "You are an assistant for question-answering tasks.\
                                Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use a friendly tone and you can present the findings in a table or in point form. \
Question: {question} \
Context: {context} \
Answer:")])

# Retrieval QA chain: fetch documents from the FAISS retriever and pass them,
# together with the question, to the chat model through the prompt above.
qa_chain = RetrievalQA.from_chain_type(
    ChatOpenAI(model='gpt-4o-mini', temperature=0),
    retriever=vectorstore.as_retriever(),
    chain_type_kwargs={"prompt": prompt},
)

# Smoke test. Use .invoke (calling the chain object directly is deprecated),
# which also matches how ask_tax_relief_qn queries the chain.
question = "What is working mother relief?"
result = qa_chain.invoke({"query": question})
print(result["result"])

def ask_tax_relief_qn(question):
    """Send ``question`` through ``qa_chain`` and print the answer text.

    Args:
        question: The question, passed to the chain under the ``"query"`` key.

    Returns:
        None. The chain's ``"result"`` entry is written to stdout.
    """
    response = qa_chain.invoke({"query": question})
    answer = response["result"]
    print(answer)



**Working Mother Relief (WMCR)** is a tax relief provided to encourage married women to remain in the workforce after having children. Here’s a summary of the key points regarding WMCR:

### Purpose of WMCR
- **Encourages** married women to stay in the workforce after having children.
- **Promotes** Singapore Citizenship for children.
- **Rewards** families with children who are Singapore Citizens.

### Qualifying Conditions
To claim WMCR for the Year of Assessment 2024, you must meet the following criteria:
- You are a working mother who is married, divorced, or widowed.
- You have taxable earned income from employment, pensions, trade, business, or profession.
- You have maintained a child who is a Singapore Citizen as of December 31, 2023, and meets the conditions under Qualifying Child Relief (QCR) or Child Relief (Disability).

### Amount of Relief
The amount of WMCR you can claim is based on the order of your child in the family:
- **1st Child**: 15% of mother's earned income
- *