# **OJKGPT**

This is the Jupyter Notebook for the development of the first version of OJKGPT, built to showcase demo capabilities to the wealth team program. <br>
In this development, we use several data sources for enriching the RAG system of the chatbot. 

In [1]:
import nest_asyncio
# Patch asyncio so an already-running event loop can be re-entered.
# Presumably required because the notebook kernel runs its own loop while the
# evaluation cells further down use `await` — confirm if this cell is ever removed.
nest_asyncio.apply()

In [2]:
from dotenv import load_dotenv
import os
# Load variables from a .env file into the environment; the next cell reads
# the credentials and connection settings with os.getenv.
load_dotenv()

True

In [3]:
# Credentials and connection settings, read from the environment in one pass.
# A value is None when its variable is not set.
api_key, api_version, api_endpoint, uri = (
    os.environ.get(variable)
    for variable in (
        "AZURE_OPENAI_KEY",
        "API_VERSION",
        "AZURE_OPENAI_ENDPOINT",
        "DATABASE_URI",
    )
)

In [4]:
import logging
import sys

# Configure the root logger (level INFO, stream stdout), then attach an
# additional stdout StreamHandler to it.
logging.basicConfig(level=logging.INFO, stream=sys.stdout)
_stdout_handler = logging.StreamHandler(stream=sys.stdout)
logging.getLogger().addHandler(_stdout_handler)

In [5]:
from utils.models_definer import ModelName

# --- Run configuration: everything a reader might tune lives in this cell ---
# Passed to read_documents as `ocr_treshold` in the document-loading cell.
OCR_TRESHOLD = 0.98
# TRESHOLD_CUTOFF = 0.2
# Number of nodes the vector retriever returns per query (similarity_top_k).
TOP_K = 3
# Passed as `num_queries` to the batch evaluation at the end of the notebook.
NUM_BATCH_EVAL = 10
# True: read, parse and store the documents into the vector index.
# False: load the existing index (and the saved evaluation dataset) instead.
STORE = False
# Forwarded as `delete=` to store_vector_index; only used when STORE is True.
DELETE = False
# Gate for every cell in the Evaluation section.
EVAL = True
# Backend used for the chat LLM and the embedding model.
model_name = ModelName.AZURE_OPENAI
# Backend used for the evaluation LLM.
# NOTE(review): the evaluation LLM is created with the same Azure key, version
# and endpoint as the main model while selecting ModelName.OPENAI — confirm
# these credentials are valid for that backend.
model_name_eval = ModelName.OPENAI

## **LLM Model Definition**

In [6]:
from llama_index.core import Settings
from utils.models_definer import get_llm_and_embedding

# Credentials and endpoint settings shared by both model lookups below.
_model_kwargs = dict(api_key=api_key, api_version=api_version, api_endpoint=api_endpoint)

# Main chat LLM and embedding model (backend options: ollama/openai/azure_openai).
llm, embedding_llm = get_llm_and_embedding(model_name=model_name, **_model_kwargs)

# Evaluation LLM: only the first element of the returned pair is kept.
llm_eval = get_llm_and_embedding(model_name=model_name_eval, **_model_kwargs)[0]

# Register the main models on the LlamaIndex global Settings object.
Settings.llm = llm
Settings.embed_model = embedding_llm

## **Loading Documents**

### **Document Reader and Node Parser**

In [7]:
from utils.documents_reader import read_documents
from utils.node_parser import parse_nodes

# Folder containing the source documents.
path = './data'

# Reading and parsing only happen when the index is being (re)built.
# With STORE False, `docs` and `nodes` stay undefined and the index is loaded
# from storage in the next section instead.
if STORE:
    docs = read_documents(path=path, ocr_treshold=OCR_TRESHOLD)
    nodes = parse_nodes(documents=docs, llm=llm)

## **Indexing & Storing**

In [8]:
from utils.index_store import load_vector_index, store_vector_index

if not STORE:
    # Load the existing index from `uri`.
    vector_index = load_vector_index(uri=uri)
else:
    # Build the index from the freshly parsed nodes and store it at `uri`.
    vector_index = store_vector_index(nodes=nodes, embed_model=embedding_llm, delete=DELETE, uri=uri)

Loading the vector index completed.


## **Querying**

### **Retriever**

In [9]:
# Retriever over the vector index that returns the TOP_K most similar nodes per query.
vector_retriever = vector_index.as_retriever(similarity_top_k=TOP_K)

### **Prompt**

In [10]:
from llama_index.core import PromptTemplate

# Question-answering prompt with two placeholders: {context_str} and {query_str}.
# It instructs the model to answer from the supplied context only, in the
# user's language, and to cite regulation_number / file_url from the metadata
# when possible, using the "Answer... / Source: ..." layout shown below.
qa_prompt = """\
Context information is below.
---------------------
{context_str}
---------------------
Given the context information and not prior knowledge, \
answer the query asking about banking compliance in Indonesia. 
Answer the question based on the context information.
ALWAYS ANSWER WITH USER'S LANGUAGE.
Please provide your answer with [regulation_number](file_url) in metadata 
(if possible) in the following format:

---------------------
Answer... \n\n
Source: [regulation_number](file_url) \n
---------------------

Query: {query_str}
Answer: \
"""

# Wrapped as a PromptTemplate so it can be installed on the query engine
# under the "response_synthesizer:text_qa_template" key.
qa_prompt_tmpl = PromptTemplate(qa_prompt)

### **Query Engine**

In [11]:
from llama_index.core.query_engine import RetrieverQueryEngine

# Query engine that combines the vector retriever with the main LLM.
vector_query_engine = RetrieverQueryEngine.from_args(retriever=vector_retriever,llm=llm)

In [12]:
# Replace the engine's text QA template with the custom prompt defined above;
# the other prompts (e.g. the refine template) keep their existing values.
vector_query_engine.update_prompts(
    {"response_synthesizer:text_qa_template": qa_prompt_tmpl}
)

In [13]:
from utils.prompt_display import display_prompt_dict

# Show every prompt currently held by the query engine, to check that the
# custom QA template was applied.
prompts_dict = vector_query_engine.get_prompts()
display_prompt_dict(prompts_dict)

**Prompt Key**: response_synthesizer:text_qa_template<br>**Text:** <br>

Context information is below.
---------------------
{context_str}
---------------------
Given the context information and not prior knowledge, answer the query asking about banking compliance in Indonesia. 
Answer the question based on the context information.
ALWAYS ANSWER WITH USER'S LANGUAGE.
Please provide your answer with [regulation_number](file_url) in metadata 
(if possible) in the following format:

---------------------
Answer... 


Source: [regulation_number](file_url) 

---------------------

Query: {query_str}
Answer: 


<br><br>

**Prompt Key**: response_synthesizer:refine_template<br>**Text:** <br>

The original query is as follows: {query_str}
We have provided an existing answer: {existing_answer}
We have the opportunity to refine the existing answer (only if needed) with some more context below.
------------
{context_msg}
------------
Given the new context, refine the original answer to better answer the query. If the context isn't useful, return the original answer.
Refined Answer: 


<br><br>

In [14]:
# Manual test queries (in Indonesian). Each trailing comment records the
# expected answer verbatim, in the query language, for comparison with the
# chatbot's output.
a = "Apa judul peraturan 7/33/PBI/2005?" # expected: "Pencabutan atas Peraturan Bank Indonesia Nomor 5/17/PBI/2003 tentang Persyaratan dan Tata Cara Pelaksanaan Jaminan Pemerintah terhadap Kewajiban Pembayaran Bank Perkreditan Rakyat"
b = "Kapan surat edaran No. 15/26/DPbS mulai berlaku?" # expected: "1 Agustus 2013"
c = "Siapa nama dan jabatannya yang menandatangani surat dengan nomor 1/SEOJK.04/2013?" # expected: "NURHAIDA, kepala eksekutif pengawas pasar modal"
d = "Saya ingin menyelenggarakan kegiatan pasar modal berikan saya nomor surat yang membahas mengenai hal ini!" # expected: "Peraturan Pemerintah Nomor 12 Tahun 2004"
e = "Berapa persen jaminan moneter pada tanggal 20 Agustus 1958?" # expected: "7,3%"
f = "Surat edaran nomor berapa yang mengatur bank umum syariah dan unit usaha syariah?" # expected: "15/26/DPbS"
g = "Apa kepanjangan dari PAPSI?" # expected: "Pedoman Akuntansi Perbankan Syariah Indonesia"
h = "apa judul peraturan nomor 112/KMK.03/2001?" # expected: "Keputusan Menteri Keuangan tentang Pemotongan Pajak Penghasil Pasal 21 atas Penghasilan berupa Uang Pesangon, Uang Tebusan Pensiun, dan Tunjangan Hari Tua atau Jaminan Hari Tua"
i = "Saya ingin membuat sistem informasi lembaga jasa keuangan, berikan nomor regulasi dari peraturan yang membahas tentang manejemen risiko nya!" # expected: "4/POJK.05/2021"
j = "Apa kepanjangan dari SWDKLLJ?" # expected: "Sumbangan Wajib Dana Kecelakaan Lalu Lintas Jalan"
k = "Berapa nilai SWDKLLJ dari sedan?" # expected: "Rp. 140.000"
l = "Apa latar belakang dari peraturan NOMOR 4/POJK.05/2021?" # expected: an answer given in list form
m = "Apa itu LJKNB?" # expected: "Lembaga Jasa Keuangan Non Bank"
n = "Apakah KMK Nomor 462/KMK.04/1998 masih berlaku" # expected: "tidak" (i.e. no)
o = "Apa itu Uang Pesangon?" # expected: "penghasilan yang dibayarkan oleh pemberi kerja kepada karyawan dengan nama dan dalam bentuk apapun sehubungan dengan berakhirnya masa kerja atau terjadi pemutusan hubungan kerja, termasuk uang penghargaan masa kerja dan uang ganti kerugian"
p = "Apa itu CKPN?" # expected: "Cadangan Kerugian Penurunan Nilai"
q = "Kapan, dimana, dan oleh siapa surat nomor PER- 06/BL/2012 ditetapkan?" # expected: "Surat nomor PER-06/BL/2012 ditetapkan pada tanggal 22 November 2012 di Jakarta oleh Ketua Badan Pengawas Pasar Modal dan Lembaga Keuangan."
r = "Apa kepanjangan PSAK?" # expected: "Pernyataan Standar Akuntansi Keuangan"
s = "Apa itu 'shahibul maal'?" # expected: "Pemilik dana pihak ketiga"
t = "Judul peraturan NOMOR 1 /POJK.05/2018?" # no expected answer recorded

# Query used by the query-engine, chat-engine and single-response evaluation
# cells below; point it at any of the variables above to test another question.
query_str = f

In [15]:
from llama_index.core.response.notebook_utils import display_response

# Run the selected test query through the query engine and render the answer.
# `response_vector` is reused by the single-response evaluation cells later on.
response_vector = vector_query_engine.query(query_str)
display_response(response_vector)

Retrying llama_index.embeddings.openai.base.get_embedding in 0.23367175953038333 seconds as it raised APITimeoutError: Request timed out..


**`Final Response:`** Surat Edaran Bank Indonesia Nomor 15/22/DPbS mengatur Bank Umum Syariah dan Unit Usaha Syariah. 

Source: [15/22/DPbS](https://www.ojk.go.id/id/regulasi/Documents/Pages/SEBI-perihal-Pedoman-Pelaksanaan-Tugas-dan-Tanggung-Jawab-Dewan-Pengawas-Syariah-Bank-Pembiayaan-Rakyat-Syariah/SEBI%2022.pdf)

### **Chat Engine**

In [16]:
from llama_index.core.storage.chat_store import SimpleChatStore
from llama_index.core.memory import ChatMemoryBuffer

# Chat store plus a memory buffer built on top of it. The buffer is created
# with a 10000-token limit and keeps its history under the "user1" key.
chat_store = SimpleChatStore()
memory = ChatMemoryBuffer.from_defaults(
    token_limit=10000,
    chat_store=chat_store,
    chat_store_key="user1",
)

In [17]:
# Alternative agent-based engines, currently disabled:
# from llama_index.agent.openai import OpenAIAgent
# from llama_index.core.agent.legacy.react.base import ReActAgent
from llama_index.core.chat_engine import CondenseQuestionChatEngine

# Chat engine layered on the vector query engine, using the chat memory
# defined above. With verbose=True the saved output shows a
# "Querying with: ..." line containing the question sent to the query engine.
chat_engine = CondenseQuestionChatEngine.from_defaults(
    llm=llm,
    memory=memory,
    query_engine=vector_query_engine,
    verbose=True,
)

In [18]:
from llama_index.core.response.notebook_utils import display_response

# Send the same test query through the chat engine and render the answer.
# `response` is used by the next cell to inspect the retrieved source nodes.
response = chat_engine.chat(message=query_str)
display_response(response)

Querying with: Surat edaran nomor berapa yang mengatur bank umum syariah dan unit usaha syariah?


**`Final Response:`** Surat Edaran Bank Indonesia Nomor 15/22/DPbS mengatur Bank Umum Syariah dan Unit Usaha Syariah. 

Source: [15/22/DPbS](https://www.ojk.go.id/id/regulasi/Documents/Pages/SEBI-perihal-Pedoman-Pelaksanaan-Tugas-dan-Tanggung-Jawab-Dewan-Pengawas-Syariah-Bank-Pembiayaan-Rakyat-Syariah/SEBI%2022.pdf)

In [19]:
from llama_index.core.response.notebook_utils import display_source_node

# Render each source node behind the chat response, including its metadata.
for source_node in response.source_nodes:
    display_source_node(source_node, show_source_metadata=True)

**Node ID:** 5f06cde9-0849-41a6-be45-dc23152598af<br>**Similarity:** 0.9382452368736267<br>**Text:** spesifik, Bank wajib menyampaikan rencana tindak (action plan)  
yang akan dilakukan. 
D. 
Ketent...<br>**Metadata:** {'file_name': 'ojk-sebi-15_26_dpbs-10072013-sebi_perihal_pelaksanaan_pedoman_akuntansi_perbankan_syariah_indonesia_sebi_26_pdf.pdf', 'title': 'Surat Edaran Bank Indonesia perihal Pelaksanaan Pedoman Akuntansi Perbankan Syariah Indonesia', 'sector': 'Perbankan', 'subsector': 'Perbankan Syariah', 'regulation_type': 'SEBI', 'regulation_number': '15/26/DPbS', 'effective_date': '10 Juli 2013', 'file_url': 'https://www.ojk.go.id/id/regulasi/Documents/Pages/SEBI-perihal-Pelaksanaan-Pedoman-Akuntansi-Perbankan-Syariah-Indonesia/SEBI%2026.pdf', 'questions_this_excerpt_can_answer': "1. What is the regulation number and effective date of the Surat Edaran Bank Indonesia regarding the implementation of the Indonesian Islamic Banking Accounting Guidelines?\n2. What is the file URL for the document containing the Surat Edaran Bank Indonesia?\n3. What are the specific requirements for Shariah Banks and Public Accountants in relation to the estimation of collective impairment of financing?\n4. What actions should be taken by Public Accountants if they find that a Shariah Bank is applying collective impairment estimation without experiencing specific loss limitations?\n5. What is the purpose of disclosing information about collective impairment estimation in the Financial Statements' Notes in the Annual Report of Shariah Banks?", 'prev_section_summary': 'The key topics of this section are the formation of loss reserves for financial and non-financial assets by Sharia Banks, the calculation of collective impairment estimation for financing without specific loss data, and the requirement for Sharia Banks to use specific loss data for collective impairment estimation starting from January 1, 2015. 
The entities mentioned in this section are Bank Indonesia and Sharia Banks.', 'section_summary': "The key topics of the section are the implementation of the Indonesian Islamic Banking Accounting Guidelines, specific requirements for Shariah Banks and Public Accountants regarding the estimation of collective impairment of financing, actions to be taken by Public Accountants if a Shariah Bank is applying collective impairment estimation without specific loss limitations, and the purpose of disclosing information about collective impairment estimation in the Financial Statements' Notes in the Annual Report of Shariah Banks. The entities mentioned in the section are Bank Indonesia, Shariah Banks, Public Accountants, and Bank Umum Syariah."}<br>

**Node ID:** ee70edc5-6c48-42dc-8e16-b51bc8f16cde<br>**Similarity:** 0.93742436170578<br>**Text:** No. 15/22/DPbS                                                  Jakarta, 27 Juni 2013 
 
 
 
 
 
...<br>**Metadata:** {'file_name': 'ojk-sebi-15_22_dpbs-27062013-sebi_perihal_pedoman_pelaksanaan_tugas_dan_tanggung_jawab_dewan_pengawas_syariah_bank_pembiayaan_rakyat_syariah_sebi_22_pdf.pdf', 'title': 'Surat Edar Bank Indonesia perihal Pedoman Pelaksanaan Tugas dan Tanggung Jawab Dewan Pengawas Syariah Bank Pembiayaan Rakyat Syariah', 'sector': 'Perbankan', 'subsector': 'BPR,  Perbankan Syariah', 'regulation_type': 'SEBI', 'regulation_number': '15/22/DPbS', 'effective_date': '27 Juni 2013', 'file_url': 'https://www.ojk.go.id/id/regulasi/Documents/Pages/SEBI-perihal-Pedoman-Pelaksanaan-Tugas-dan-Tanggung-Jawab-Dewan-Pengawas-Syariah-Bank-Pembiayaan-Rakyat-Syariah/SEBI%2022.pdf', 'questions_this_excerpt_can_answer': '1. What is the regulation number and effective date of the Surat Edar Bank Indonesia regarding the guidelines for the roles and responsibilities of the Dewan Pengawas Syariah Bank Pembiayaan Rakyat Syariah?\n2. What is the title of the Surat Edar Bank Indonesia that provides guidelines for the roles and responsibilities of the Dewan Pengawas Syariah Bank Pembiayaan Rakyat Syariah?\n3. What is the sector and subsector that this regulation pertains to?\n4. What is the file URL where the full document of the Surat Edar Bank Indonesia can be accessed?\n5. What is the purpose of the Dewan Pengawas Syariah and what are their responsibilities in relation to Bank Pembiayaan Rakyat Syariah?', 'prev_section_summary': "The key topics of this section are the Surat Edaran Bank Indonesia regarding the Monthly Reports of People's Credit Banks. The section provides information about the regulation number (15/20/DKBU) and effective date (22 Mei 2013) of the Surat Edaran. It also mentions the file URL for the PDF document containing the Surat Edaran. Additionally, the section provides contact information for the help desk of Bank Indonesia regarding questions related to the Monthly Report application. 
The section also mentions transitional provisions for the preparation and submission of Monthly Reports before August 2013. Finally, it states that previous Surat Edaran Bank Indonesia documents are revoked and declared invalid with the implementation of the current Surat Edaran. The entities mentioned in the section are Bank Indonesia and BPR (Bank Perkreditan Rakyat).", 'section_summary': 'The key topics of the section are the guidelines for the roles and responsibilities of the Dewan Pengawas Syariah Bank Pembiayaan Rakyat Syariah. The section also mentions the regulation number and effective date of the Surat Edar Bank Indonesia, as well as the sector and subsector that the regulation pertains to. The section provides a file URL where the full document of the Surat Edar Bank Indonesia can be accessed. Additionally, the section briefly mentions the purpose of the Dewan Pengawas Syariah and their responsibilities in relation to Bank Pembiayaan Rakyat Syariah.'}<br>

**Node ID:** ac47321e-d27a-4d10-bdfe-773ef4a0ce08<br>**Similarity:** 0.9360443353652954<br>**Text:** IV. PEMBENTUKAN CADANGAN KERUGIAN 
A.  Bank Syariah wajib membentuk cadangan kerugian penurunan 
...<br>**Metadata:** {'file_name': 'ojk-sebi-15_26_dpbs-10072013-sebi_perihal_pelaksanaan_pedoman_akuntansi_perbankan_syariah_indonesia_sebi_26_pdf.pdf', 'title': 'Surat Edaran Bank Indonesia perihal Pelaksanaan Pedoman Akuntansi Perbankan Syariah Indonesia', 'sector': 'Perbankan', 'subsector': 'Perbankan Syariah', 'regulation_type': 'SEBI', 'regulation_number': '15/26/DPbS', 'effective_date': '10 Juli 2013', 'file_url': 'https://www.ojk.go.id/id/regulasi/Documents/Pages/SEBI-perihal-Pelaksanaan-Pedoman-Akuntansi-Perbankan-Syariah-Indonesia/SEBI%2026.pdf', 'questions_this_excerpt_can_answer': '1. What is the regulation number and effective date of the Surat Edaran Bank Indonesia regarding the implementation of the Indonesian Sharia Banking Accounting Guidelines?\n2. What is the file URL for accessing the SEBI 26 PDF document?\n3. What is the requirement for Sharia Banks to form a loss reserve for financial and non-financial assets?\n4. How should Sharia Banks calculate the collective impairment estimation for financing without specific loss data?\n5. Until when can Sharia Banks apply the collective impairment estimation method, and what method should they use after that date?', 'prev_section_summary': 'The key topics of this section are the implementation of the Indonesian Sharia Banking Accounting Guidelines, specifically regarding the recording of murabahah transactions in Sharia Banks using the proportional method. The section also discusses the types of income and expenses related to murabahah transactions and how they should be recognized and allocated. 
\n\nThe entities mentioned in this section are Bank Indonesia, which issued the Surat Edaran (circular) regarding the implementation of the guidelines, and Sharia Banks, which are required to follow the guidelines for accounting and reporting purposes.', 'section_summary': 'The key topics of this section are the formation of loss reserves for financial and non-financial assets by Sharia Banks, the calculation of collective impairment estimation for financing without specific loss data, and the requirement for Sharia Banks to use specific loss data for collective impairment estimation starting from January 1, 2015. The entities mentioned in this section are Bank Indonesia and Sharia Banks.'}<br>

In [20]:
# Round-trip the chat history: serialize the store to a JSON string, then
# rebuild a SimpleChatStore from that string.
# NOTE(review): .json() / .parse_raw() look like pydantic-v1-style methods —
# confirm they are still supported by the installed llama_index/pydantic versions.
chat_store_string = chat_store.json()
loaded_chat_store = SimpleChatStore.parse_raw(chat_store_string)

## **Evaluation**

In [21]:
import os

# Make sure the output directory for the evaluation dataset exists.
# exist_ok=True makes the call idempotent and avoids the check-then-create
# race of testing os.path.exists() first.
os.makedirs("./json_data", exist_ok=True)

In [22]:
# Prompt passed to generate_question_context_pairs to create evaluation
# questions (in Bahasa Indonesia) from each chunk.
# Placeholders: {context_str} and {num_questions_per_chunk}.
# Compared with the earlier wording, the "quetions" typo is fixed and a stray,
# unbalanced double quote at the end of the prompt is removed.
QA_GENERATE_PROMPT_TMPL = """\
Context information is below.

---------------------
{context_str}
---------------------

Given the context information and not prior knowledge.
generate only questions based on the below query.
generate questions in BAHASA INDONESIA.

You are a Teacher/ Professor. Your task is to setup \
{num_questions_per_chunk} questions for an upcoming \
quiz/examination. The questions should be diverse in nature \
across the document. Restrict the questions to the \
context information provided.
"""

In [23]:
from llama_index.core.evaluation import (
    EmbeddingQAFinetuneDataset,
    generate_question_context_pairs,
)

# Location of the saved question/context evaluation dataset.
_eval_dataset_path = "./json_data/pg_eval_dataset.json"

if EVAL and STORE:
    # Freshly parsed nodes exist: generate two questions per chunk and save them.
    qa_dataset = generate_question_context_pairs(
        nodes=nodes,
        qa_generate_prompt_tmpl=QA_GENERATE_PROMPT_TMPL,
        llm=llm_eval,
        num_questions_per_chunk=2,
    )
    qa_dataset.save_json(_eval_dataset_path)
elif EVAL:
    # No freshly parsed nodes: load the dataset saved by an earlier run.
    qa_dataset = EmbeddingQAFinetuneDataset.from_json(_eval_dataset_path)

### **Retrieval Evaluation**

In [24]:
# Metric names passed to get_retrieval_eval_df in the retrieval evaluation below.
metrics = ["hit_rate", "mrr", "precision", "recall", "ap", "ndcg"]

In [25]:
from utils.evaluator import get_retrieval_eval_df

if EVAL:
    # Evaluate the vector retriever on the QA dataset with the metrics listed above.
    # Top-level await: this only runs inside the notebook's event loop.
    # NOTE(review): the saved output reports 0.0 for every metric. Presumably the
    # node ids in the loaded dataset no longer match the ids in the current
    # index — verify before trusting these numbers.
    vector_retrieval_eval_results = await get_retrieval_eval_df(name=f"Top-{TOP_K} Eval", metrics=metrics, retriever=vector_retriever, qa_dataset=qa_dataset)
    display(vector_retrieval_eval_results)

Unnamed: 0,retrievers,hit_rate,mrr,precision,recall,ap,ndcg
0,Top-3 Eval,0.0,0.0,0.0,0.0,0.0,0.0


### **Response Evaluation**

In [26]:
from llama_index.core.evaluation import (
    FaithfulnessEvaluator,
    RelevancyEvaluator,
)

if EVAL:
    # Both evaluators use the dedicated evaluation LLM (llm_eval), not the chat LLM.
    relevancy_evaluator = RelevancyEvaluator(llm=llm_eval)
    faithfullness_evaluator = FaithfulnessEvaluator(llm=llm_eval)

#### **Faithfulness Evaluator**

Evaluate Response

In [29]:
from utils.evaluator import get_response_eval_df

if EVAL:
    # Run the faithfulness evaluator on the single query-engine response and
    # display the result as a table.
    # NOTE(review): the saved output of this cell is a TypeError about missing
    # 'response' and 'body' arguments. Presumably it was raised while calling
    # the evaluation LLM — confirm the llm_eval backend and credentials.
    vector_faithfullness_eval_result = faithfullness_evaluator.evaluate_response(response=response_vector, query=query_str)
    display(get_response_eval_df(query=query_str, response=response_vector, eval_result=vector_faithfullness_eval_result))

TypeError: __init__() missing 2 required keyword-only arguments: 'response' and 'body'

Evaluate Source Nodes

In [None]:
from utils.evaluator import get_response_eval_sources_df

if EVAL:
    # Run the faithfulness evaluator through get_response_eval_sources_df
    # (presumably one row per source node of the response — confirm against
    # utils.evaluator) and display the table.
    # The result is stored under a faithfulness-specific name: it used to be
    # assigned to `relevancy_eval_sources`, which mislabelled it and was then
    # overwritten by the relevancy results in the Relevancy Evaluator section.
    faithfulness_eval_sources = get_response_eval_sources_df(query=query_str, response=response_vector, evaluator=faithfullness_evaluator)
    display(faithfulness_eval_sources)

#### **Relevancy Evaluator**

Evaluate Response

In [None]:
if EVAL:
    # Run the relevancy evaluator on the single query-engine response.
    relevancy_eval_result = relevancy_evaluator.evaluate_response(
        query=query_str, response=response_vector
    )

    # get_response_eval_df is imported in the Faithfulness "Evaluate Response" cell.
    display(get_response_eval_df(query=query_str, response=response_vector, eval_result=relevancy_eval_result))

Evaluate Source Nodes

In [None]:
from utils.evaluator import get_response_eval_sources_df

if EVAL:
    # Run the relevancy evaluator through get_response_eval_sources_df
    # (presumably one row per source node of the response — confirm against
    # utils.evaluator) and display the table.
    relevancy_eval_sources = get_response_eval_sources_df(query=query_str, response=response_vector, evaluator=relevancy_evaluator)
    display(relevancy_eval_sources)

#### **Batch Evaluator (Faithfulness, Relevancy)**

In [None]:
from llama_index.core.evaluation import BatchEvalRunner
from utils.evaluator import get_batch_eval_results
from utils.evaluator import get_batch_eval_df

if EVAL:
    runner = BatchEvalRunner(
        {
            "faithfulness": faithfullness_evaluator, 
            "relevancy": relevancy_evaluator,
        },
        workers=8,
    )

    vector_eval_results = await get_batch_eval_results(runner=runner, qa_dataset=qa_dataset, query_engine=vector_query_engine, num_queries=NUM_BATCH_EVAL)
    display(get_batch_eval_df(vector_eval_results))