# Building an RAG model based chatbot for financial queries.

In [3]:
pip install PyMuPDF rake-nltk pandas numpy scipy



# Step 1

In [4]:
import fitz

pdf_document = 'Sample Financial Statement.pdf'
document = fitz.open(pdf_document)

pdf_text = {}

for page_number in range(document.page_count):
  page = document.load_page(page_number)
  text = page.get_text()
  pdf_text[page_number + 1] = text

document.close()

# for page, text in pdf_text.items():
#   print(f"Text from page {page}:\n{text}\n")

In [5]:
import re

# Function to clean the data
def preprocess_pdf_text(pdf_text):
    processed_text = {}
    for page, text in pdf_text.items():
        text = re.sub(r"\n\s*\n", "\n", text.strip())

        relevant_lines = []
        for line in text.split("\n"):
            if re.search(r"\b(balance sheet|revenue|assets|liabilities|profit|income|expense)\b", line, re.IGNORECASE):
                relevant_lines.append(line)
            elif re.search(r"[\d,]+(\.\d+)?", line):
                relevant_lines.append(line)

        processed_text[page] = "\n".join(relevant_lines)

    return processed_text

cleaned_pdf_text = preprocess_pdf_text(pdf_text)

for page, text in cleaned_pdf_text.items():
    print(f"Cleaned Text from page {page}:\n{text}\n")


Cleaned Text from page 1:
Condensed Consolidated Balance Sheet ……………………………………………………………………………………
1
Condensed Consolidated Statement of Profit and Loss ………………………………………………………………………
2
3
5
1. Overview
1.1 Company overview ……………………………………………………………………………………………………
7
1.2 Basis of preparation of financial statements …………………………………………………………………………
7
1.3 Basis of consolidation …………………………………………………………………………………………………
7
1.4 Use of estimates and judgments ………………………………………………………………………………………
7
1.5 Critical accounting estimates and judgments…………………………………………………………………………
8
2. Notes to the Interim Condensed Consolidated Financial Statements
2.1 Business Combinations ………………………………………………………………………………………………
10
2.2 Property, plant and equipment ………………………………………………………………………………………
12
2.3 Goodwill and other intangible assets…………………………………………………………………………………
14
2.4 Investments ……………………………………………………………………………………………………………
15
2.5 Loans …………………………………………………………………………………………………………………
16
2.6 Other financial assets ………………………………………………………

# Step 2

In [6]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

page_chunks = {}

for page, text in pdf_text.items():
  chunks = text_splitter.split_text(text)
  page_chunks[page] = chunks

for page, chunks in page_chunks.items():
  print(f"Text chunks from page{page}:")
  for i, chunk in enumerate(chunks, start=1):
    print(f"Chunk[{i}:\n{chunk}\n")

Text chunks from page1:
Chunk[1:
Index
Page No.
Condensed Consolidated Balance Sheet ……………………………………………………………………………………
1
Condensed Consolidated Statement of Profit and Loss ………………………………………………………………………
2
Condensed Consolidated Statement of Changes in Equity …………………………………………………………………
3
Condensed Consolidated Statement of Cash Flows …………………………………………………………………………
5
Overview and Notes to the Interim Condensed Consolidated Financial Statements
1. Overview
1.1 Company overview ……………………………………………………………………………………………………
7
1.2 Basis of preparation of financial statements …………………………………………………………………………
7
1.3 Basis of consolidation …………………………………………………………………………………………………
7
1.4 Use of estimates and judgments ………………………………………………………………………………………
7
1.5 Critical accounting estimates and judgments…………………………………………………………………………
8
2. Notes to the Interim Condensed Consolidated Financial Statements
2.1 Business Combinations ………………………………………………………………………………………………
10
2.2 Property, plant and equipment …………………………………………………………

In [7]:
import nltk
nltk.download('stopwords')

import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [8]:
from rake_nltk import Rake

rake = Rake()

page_phrases = {}

for page, text in pdf_text.items():
  rake.extract_keywords_from_text(text)
  phrases = rake.get_ranked_phrases()
  page_phrases[page] = phrases

chunk_phrases = {}
for page, chunks in page_chunks.items():
    for chunk_number, chunk in enumerate(chunks, start=1):
        rake.extract_keywords_from_text(chunk)
        phrases = rake.get_ranked_phrases()
        chunk_phrases[(page, chunk_number)] = phrases

for (page, chunk_number), phrases in chunk_phrases.items():
  print(f"Key phrases from page {page}, chunk {chunk_number}:\n{phrases}\n")

Key phrases from page 1, chunk 1:
['condensed consolidated balance sheet …………………………………………………………………………………… 1 condensed consolidated statement', 'loss ……………………………………………………………………… 2 condensed consolidated statement', 'equity ………………………………………………………………… 3 condensed consolidated statement', 'interim condensed consolidated financial statements 1', 'interim condensed consolidated financial statements 2', '1 business combinations ……………………………………………………………………………………………… 10 2', '1 company overview …………………………………………………………………………………………………… 7 1', 'financial statements ………………………………………………………………………… 7 1', 'cash flows ………………………………………………………………………… 5 overview', 'consolidation ………………………………………………………………………………………………… 7 1', 'judgments ……………………………………………………………………………………… 7 1', 'judgments ………………………………………………………………………… 8 2', '5 critical accounting estimates', 'overview 1', 'equipment ……………………………………………………………………………………… 12', '2 property', '2 basis', '3 basis', 'index page', '4 use', 'estimates', 'profit', 'preparation', 'pla

# Step 3

In [9]:
pip install sentence-transformers



In [10]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')

def get_embedding(phrase):
    embedding = model.encode(phrase)
    return embedding

phrase_embeddings = {}

# Generate embeddings for each phrase
for (page, chunk_number), phrases in chunk_phrases.items():
    embeddings = [get_embedding(phrase) for phrase in phrases]
    phrase_embeddings[(page, chunk_number)] = list(zip(phrases, embeddings))

excel_data = []
for (page, chunk_number), phrases in phrase_embeddings.items():
    for phrase, embedding in phrases:
        excel_data.append({ "Page": page, "Chunk": chunk_number, "Phrase": phrase, "Embedding": embedding })

import pandas as pd
df = pd.DataFrame(excel_data)

# Save to Excel
excel_filename = "phrases_embeddings.xlsx"
df.to_excel(excel_filename, index=False)
print(f"Embeddings saved to {excel_filename}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embeddings saved to phrases_embeddings.xlsx


In [11]:
def extract_phrases_from_query(query):
    rake.extract_keywords_from_text(query)
    return rake.get_ranked_phrases()
# Example query
# query = "What is the gross profit for Q3 2024?"
# # Extract phrases from the query
# query_phrases = extract_phrases_from_query(query)
# Output query phrases
# print(f"Query phrases:\n{query_phrases}\n")

def get_embeddings(phrase):
    embedding = model.encode(phrase)
    return embedding

# query_embeddings = get_embeddings(query_phrases)

import numpy as np
from scipy.spatial.distance import cosine

#Function to calculate cosine similarity
def cosine_similarity(embedding1, embedding2):
    return 1 - cosine(embedding1, embedding2)



Query phrases:
['q3 2024', 'gross profit']



# Step 4

In [12]:
pip install transformers



In [13]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load a pre-trained text generation model
model_name = "EleutherAI/gpt-neo-1.3B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model_2 = AutoModelForCausalLM.from_pretrained(model_name)

def truncate_input(context, query, model, max_new_tokens=300):
    max_input_length = model.config.max_position_embeddings - max_new_tokens
    prompt = f"Answer the following query based on the provided text:\n\n{context}\n\nQuery: {query}\nAnswer:"
    tokenized_input = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_input_length)
    return tokenized_input


# Function to generate a response
def generate_response(context, query, max_new_tokens=300, temperature=0.7):

    inputs = truncate_input(context, query, model_2, max_new_tokens)

    outputs = model_2.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response[len(inputs["input_ids"][0]):].strip()

tokenizer_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/5.31G [00:00<?, ?B/s]

In [14]:
def find_relevant_chunks(query, page_chunks, phrase_embeddings, model):
    # Extract query phrases
    query_phrases = extract_phrases_from_query(query)
    query_embeddings = [get_embedding(phrase) for phrase in query_phrases]

    # Calculate cosine similarities
    chunk_similarities = {}
    for (page, chunk_number), phrases in phrase_embeddings.items():
        similarities = []
        for phrase, embedding in phrases:
            phrase_similarities = [cosine_similarity(embedding, query_embedding) for query_embedding in query_embeddings]
            similarities.append(max(phrase_similarities))

        # Average similarity for the chunk
        average_similarity = np.mean(similarities)
        chunk_similarities[(page, chunk_number)] = average_similarity


    top_chunks = sorted(chunk_similarities.items(), key=lambda x: x[1], reverse=True)[:5]

    selected_chunks = []
    for (page, chunk_number), similarity in top_chunks:
        selected_chunks.append(page_chunks[page][chunk_number - 1])

    return selected_chunks

In [15]:
print("Welcome to the Financial QA Chatbot! Ask your questions or type 'exit' to quit.")
while True:
    query = input("You: ")
    if query.lower() == "exit":
        print("Chatbot: Goodbye!")
        break

    # Find relevant chunks to generate a response
    selected_chunks = find_relevant_chunks(query, page_chunks, phrase_embeddings, model)
    context = "\n\n".join(selected_chunks)
    answer = generate_response(context, query)

    print(f"Chatbot: {answer}")

Welcome to the Financial QA Chatbot! Ask your questions or type 'exit' to quit.
Chatbot: her comprehensive income, net*

2.17 OTHER INCOME, NET
Accounting policy
Foreign currency 
Accounting policy
Functional currency
Transactions and translations
Government grant
Other income for the three months and year ended March 31, 2024 and March 31, 2023 is as follows:
(In ₹ crore)
Particulars
2024 
2023 
2024 
2023 
Interest income on financial assets carried at amortized cost
Tax free bonds and Government bonds
                        31                         36                            131                      149 
Deposit with Bank and others
                      222                       161                            929                      712 
                      318 
231                     —                         —                             —                        —                   — 
                         —                          —                             —   