In [88]:
import transformers
import torch
from langchain_community.document_loaders import PyPDFLoader

In [89]:
# Directory holding the input PDF. The trailing slash matters: the full path is
# built by plain string concatenation (FILE_DIR + FILE_NAME).
FILE_DIR = "./data/"
# Name of the PDF to index; also used as the prefix of each chunk's primary key.
FILE_NAME = "bcbsc125.pdf"

# Step 1: read files

In [90]:
def load_files(file_path):
    """
    Read a PDF from disk using PyPDFLoader.

    :param file_path: the path to the PDF file to be loaded
    :return: the documents returned by ``PyPDFLoader(file_path).load()``
    """
    return PyPDFLoader(file_path).load()

In [91]:
# Load the configured PDF; the path is FILE_DIR and FILE_NAME concatenated as-is.
file_content = load_files(FILE_DIR + FILE_NAME)

# Step 2: Text Splitting

In [92]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

def split_text(documents, chunk_size=1000, chunk_overlap=200):
    """
    Split the documents into smaller, overlapping chunks.

    :param documents: the documents to be split
    :param chunk_size: maximum chunk length, measured with ``len`` (default 1000)
    :param chunk_overlap: overlap between consecutive chunks, measured with
                          ``len`` (default 200)
    :return: the list of chunks produced by ``split_documents``
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return text_splitter.split_documents(documents)

# Chunk the loaded documents; despite the singular name, `text` is the list of chunks.
text = split_text(file_content)

# Step 3: Create a vector db to store the contents

In [93]:
from langchain_huggingface import HuggingFaceEmbeddings

# Path of the embedding model; get_embeddings forwards it as `model_name`.
emb_model = "./models/Qwen3-Embedding-0.6B"

def get_embeddings(model_path):
    """
    Build a HuggingFaceEmbeddings instance for the given model.

    :param model_path: the path to the embedding model (forwarded as ``model_name``)
    :return: the constructed HuggingFaceEmbeddings object
    """
    return HuggingFaceEmbeddings(model_name=model_path)

# Instantiate the embedding model once so later cells can reuse it.
emb = get_embeddings(emb_model)

In [94]:
from langchain_chroma import Chroma

In [95]:
# Create primary key for documents
def create_primary_key(documents, file_name=None):
    """
    Assign a primary key to every document chunk.

    Keys have the format "file:page:chunk_id", where chunk_id counts the chunks
    already seen for that page, starting at 0.

    :param documents: the list of documents; each must provide ``metadata['page']``
    :param file_name: file name used as the key prefix; when omitted, the
                      module-level FILE_NAME is used
    :return: list of ``[primary_key, document]`` pairs, in input order
    """
    if file_name is None:
        file_name = FILE_NAME

    # Keep one counter per page instead of resetting whenever the page changes:
    # if a page shows up again later in the list, its numbering continues rather
    # than reissuing ids that were already handed out (which would collide).
    next_chunk_id = {}
    out_documents = []
    for doc in documents:
        page_num = doc.metadata['page']
        chunk_id = next_chunk_id.get(page_num, 0)
        next_chunk_id[page_num] = chunk_id + 1
        out_documents.append([f"{file_name}:{page_num}:{chunk_id}", doc])
    return out_documents

#doc_w_key = create_primary_key(text)

In [96]:
def db_initialize(documents, embedding_model, persist_directory="./chroma_db"):
    """
    Create a Chroma vector store and add the given document chunks to it.

    :param documents: the list of document chunks to store
    :param embedding_model: the embedding model to be used
    :param persist_directory: directory passed to Chroma as ``persist_directory``
                              (defaults to "./chroma_db")
    :return: the Chroma vector store
    """
    # Database initialization
    db = Chroma(embedding_function=embedding_model, persist_directory=persist_directory)

    # Assign a primary key to each document
    doc_w_key = create_primary_key(documents)

    # With no documents there is nothing to add; return the store without
    # calling add_documents on empty lists.
    if not doc_w_key:
        return db

    # Add the documents to the vector store under their primary keys
    ids = [key for key, _ in doc_w_key]
    doc_details = [doc for _, doc in doc_w_key]
    db.add_documents(documents=doc_details, ids=ids)

    return db
# Build the vector store from the chunks and the embedding model.
chroma_db = db_initialize(text, emb)

In [97]:
# Update document
def update_document(db, new_documents):
    """
    Placeholder for updating the Chroma vector store with new documents.

    Not implemented yet: both arguments are ignored and None is returned.

    :param db: the Chroma vector store
    :param new_documents: the list of new documents to be added
    :return: None
    """
    return


# Delete document
def delete_document(db, doc_ids):
    """
    Placeholder for deleting documents from the Chroma vector store by ID.

    Not implemented yet: both arguments are ignored and None is returned.

    :param db: the Chroma vector store
    :param doc_ids: the list of document IDs to be deleted
    :return: None
    """
    return


In [98]:
# Run a similarity search against the store; no extra arguments are passed.
# NOTE(review): "definitoin" in the query is a typo for "definition". It is left
# untouched here because the string is the query text that gets searched.
tt = chroma_db.similarity_search("What is the definitoin of credit risk? and what does bank need to do to mitigate credit risk")

In [99]:
# Show the first result's text and the page number stored in its metadata.
print(tt[0].page_content)
print(tt[0].metadata['page'])

Credit risk management
1
Principles for the Management of Credit Risk
I. Introduction
1. While financial institutions have faced difficulties over the years for a multitude of
reasons, the major cause of serious banking problems continues to be directly related to lax
credit standards for borrowers and counterparties, poor portfolio risk management, or a lack
of attention to changes in economic or other circumstances that can lead to a deterioration in
the credit standing of a bank’s counterparties. This experience is common in both G-10 and
non-G-10 countries.
2. Credit risk is most simply defined as the potential that a bank borrower or
counterparty will fail to meet its obligations in accordance with agreed terms. The goal of
credit risk management is to maximise a bank’s risk-adjusted rate of return by maintaining
credit risk exposure within acceptable parameters. Banks need to manage the credit risk
0


In [100]:
# Inspect the second chunk produced by split_text, followed by its metadata.
print(text[1].page_content)
print(text[1].metadata)

credit risk management is to maximise a bank’s risk-adjusted rate of return by maintaining
credit risk exposure within acceptable parameters. Banks need to manage the credit risk
inherent in the entire portfolio as well as the risk in individual credits or transactions. Banks
should also consider the relationships between credit risk and other risks. The effective
management of credit risk is a critical component of a comprehensive approach to risk
management and essential to the long-term success of any banking organisation.
3. For most banks, loans are the largest and most obvious source of credit risk;
however, other sources of credit risk exist throughout the activities of a bank, including in the
banking book and in the trading book, and both on and off the balance sheet. Banks are
increasingly facing credit risk (or counterparty risk) in various financial instruments other
than loans, including acceptances, interbank transactions, trade financing, foreign exchange
{'producer': 'A

In [101]:
# Number of results returned by the similarity search.
len(tt)

4

In [102]:
from huggingface_hub import snapshot_download


In [103]:
#snapshot_download(repo_id="Qwen/Qwen3-Embedding-0.6B", local_dir="local-models/Qwen3-Embedding-0.6B")