In [None]:
import rag_chatbot.embeddings.embedder as em
from rag_chatbot.data.handler import DataHandler
from rag_chatbot.chunking.text_splitter import chunk_documents
from rag_chatbot.chunking.sample import stratified_sample


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the cleaned complaints table through the project's data registry.
df_clean = DataHandler.from_registry(
    "DATA",
    "interim_dir",
    "complaints_clean.parquet",
).load()

In [4]:
# Draw a sample stratified by product category (target: 12,000 rows).
# NOTE(review): no seed is passed here — confirm stratified_sample fixes its
# own random state, otherwise this sample may differ between runs.
df_sample = stratified_sample(df_clean, group_col="product_category", total_samples=12_000)

In [5]:
# Check the sample's (rows, columns); the row count should be close to the 12,000 requested.
df_sample.shape

(11997, 10)

In [6]:
# Preview the first rows to sanity-check the sampled columns and narrative text.
df_sample.head()

Unnamed: 0,complaint_id,product_category,product,issue,sub_issue,company,state,date_received,clean_narrative,consumer_complaint_narrative
0,3305741,Savings account,Checking account,Managing an account,Banking errors,WELLS FARGO & COMPANY,GA,2019-07-13,consumer financial protection bureau cfpb step...,consumer financial protection bureau cfpb step...
1,5268713,Savings account,Checking account,Problem with a lender or other company chargin...,Transaction was not authorized,TRUIST FINANCIAL CORPORATION,GA,2022-02-28,i called the <masked> customer service number ...,i called the <masked> customer service number ...
2,3744902,Personal loan,Title loan,Charged fees or interest you didn't expect,,Rapid Auto Loans LLC,FL,2020-07-14,rapid auto loan issues aged reports for 4 mont...,rapid auto loan issues aged reports for 4 mont...
3,7958400,Savings account,Checking account,Managing an account,Deposits and withdrawals,CAPITAL ONE FINANCIAL CORPORATION,TX,2023-12-07,"hello first of all, i would like to say that m...","hello first of all, i would like to say that m..."
4,5111585,Savings account,Checking account,Opening an account,Account opened as a result of fraud,WELLS FARGO & COMPANY,CT,2022-01-14,i ve been attempting to resolve issue of missi...,i ve been attempting to resolve issue of missi...


In [7]:
# Drop `clean_narrative` before chunking.
# NOTE(review): in the preview above it appears to duplicate
# `consumer_complaint_narrative` — confirm the two columns are identical.
# errors="ignore" keeps this cell idempotent: re-running it after the column
# has already been removed would otherwise raise a KeyError.
df_sample = df_sample.drop(columns=["clean_narrative"], errors="ignore")

- Stratified sampling was used to preserve each product category's share of the data in the sample, so that smaller categories are not lost to sampling noise, which would otherwise skew retrieval results further toward the dominant categories.

### Chunking and Embedding

##### Chunking documents using `RecursiveCharacterTextSplitter` with `chunk_size=500` and `chunk_overlap=100`

In [None]:
# 2. Chunk the sampled complaints into documents for embedding
docs = chunk_documents(df_sample)

### Embedding + Vector Store (FAISS)

Embedding model: `all-MiniLM-L6-v2`. Key properties:

- Strong semantic performance on short-to-medium-length text
- Lightweight (384-dimensional embeddings)
- Fast CPU inference
- Widely benchmarked and stable
- Well suited to RAG pipelines backed by a FAISS index

- The all-MiniLM-L6-v2 model was selected due to its strong semantic retrieval performance, low dimensionality, and efficient inference, making it well suited for large-scale complaint indexing.

In [None]:
# 3. Embed: compute embeddings for the chunked docs.
# Long-running: the recorded run took ~29 minutes (1203 batches).
embeddings = em.build_embeddings(docs)


Batches: 100%|██████████| 1203/1203 [29:17<00:00,  1.46s/it]


In [None]:

# 4. Index: build a FAISS index from the embeddings.
index = em.build_faiss_index(embeddings)


In [None]:

# 5. Persist: save the index together with the chunked docs.
# NOTE(review): no output path is passed — presumably save_vector_store
# resolves its own destination; confirm where the files land.
em.save_vector_store(index, docs)