In [1]:
import pandas as pd
import os
import re
from tqdm import tqdm


In [2]:
df = pd.read_csv("../data/processed/cleaned_complaints.csv")
print(df.shape)
df.head()


(454469, 11)


Unnamed: 0,Product,Consumer complaint narrative,cleaned_narrative_enhanced,original_length,cleaned_length,reduction_percent,Issue,Sub-issue,Company,State,Date received
0,Credit card,A XXXX XXXX card was opened under my name by a...,a card was opened under my name by a fraudster...,488,442,9.43,Getting a credit card,Card opened without my consent or knowledge,"CITIBANK, N.A.",TX,2025-06-13
1,Checking or savings account,I made the mistake of using my wellsfargo debi...,i made the mistake of using my wellsfargo debi...,555,539,2.88,Managing an account,Deposits and withdrawals,WELLS FARGO & COMPANY,ID,2025-06-13
2,Credit card,"Dear CFPB, I have a secured credit card with c...","dear cfpb, i have a secured credit card with c...",806,801,0.62,"Other features, terms, or problems",Other problem,"CITIBANK, N.A.",NY,2025-06-12
3,Credit card,I have a Citi rewards cards. The credit balanc...,i have a citi rewards cards. the credit balanc...,1199,1157,3.5,Incorrect information on your report,Account information incorrect,"CITIBANK, N.A.",IL,2025-06-12
4,Credit card,b'I am writing to dispute the following charge...,b'i am writing to dispute the following charge...,2908,2824,2.89,Problem with a purchase shown on your statement,Credit card company isn't resolving a dispute ...,"CITIBANK, N.A.",TX,2025-06-09


In [3]:
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"[^a-z0-9\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text
df["cleaned_text"] = df["Consumer complaint narrative"].apply(clean_text)
df[["Consumer complaint narrative", "cleaned_text"]].head()


Unnamed: 0,Consumer complaint narrative,cleaned_text
0,A XXXX XXXX card was opened under my name by a...,a xxxx xxxx card was opened under my name by a...
1,I made the mistake of using my wellsfargo debi...,i made the mistake of using my wellsfargo debi...
2,"Dear CFPB, I have a secured credit card with c...",dear cfpb i have a secured credit card with ci...
3,I have a Citi rewards cards. The credit balanc...,i have a citi rewards cards the credit balance...
4,b'I am writing to dispute the following charge...,bi am writing to dispute the following charges...


In [4]:
SAMPLES_PER_PRODUCT = 5000  

sampled_df = (
    df.groupby("Product", group_keys=False)
      .apply(lambda x: x.sample(
          n=min(len(x), SAMPLES_PER_PRODUCT),
          random_state=42
      ))
)

print(sampled_df.shape)
sampled_df["Product"].value_counts()


  .apply(lambda x: x.sample(


(31497, 12)


Product
Checking or savings account                                5000
Credit card                                                5000
Credit card or prepaid card                                5000
Money transfer, virtual currency, or money service         5000
Payday loan, title loan, or personal loan                  5000
Payday loan, title loan, personal loan, or advance loan    5000
Money transfers                                            1497
Name: count, dtype: int64

In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter



In [6]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50
)


In [7]:
chunks = []
metadatas = []

for idx, row in tqdm(sampled_df.iterrows(), total=len(sampled_df)):
    text = row["cleaned_text"]
    split_texts = text_splitter.split_text(text)
    
    for i, chunk in enumerate(split_texts):
        chunks.append(chunk)
        metadatas.append({
            "complaint_id": row.get("Complaint ID", idx),
            "product": row["Product"],
            "chunk_index": i
        })


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 31497/31497 [00:22<00:00, 1393.23it/s]


In [8]:
len(chunks), len(metadatas)


(90080, 90080)

In [9]:
MAX_CHUNKS = 3000
chunks = chunks[:MAX_CHUNKS]
metadatas = metadatas[:MAX_CHUNKS]

print(len(chunks))


3000


In [10]:
from langchain_huggingface import HuggingFaceEmbeddings

embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)


In [11]:
from langchain_chroma import Chroma

vectorstore = Chroma(
    embedding_function=embedding_model,
    persist_directory="../vectorstore"
)


In [12]:
BATCH_SIZE = 100

for i in tqdm(range(0, len(chunks), BATCH_SIZE)):
    vectorstore.add_texts(
        texts=chunks[i:i+BATCH_SIZE],
        metadatas=metadatas[i:i+BATCH_SIZE]
    )


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [04:26<00:00,  8.90s/it]


In [14]:
query = "I was charged twice on my credit card"
docs = vectorstore.similarity_search(query, k=3)

for doc in docs:
    print(doc.page_content[:200])
    print(doc.metadata)
    print("-" * 40)


there are pending transactions that are considered double charges on my account there is a pending balance of 160000 that are all duplicate charges now my card is in overdraft and i cant use my card f
{'chunk_index': 0, 'product': 'Checking or savings account', 'total_chunks': 1, 'complaint_id': 11074}
----------------------------------------
i checked my account and 2 days later i seen two credits for 4300 and two non chase atm fees of 200 were returned into my account but not the 48000 i called chase to find out why and was told that it 
{'product': 'Checking or savings account', 'chunk_index': 1, 'total_chunks': 6, 'complaint_id': 255129}
----------------------------------------
on xxxx xxxx 2023 a charge was made in xxxx pa for 900 on my direct express debit card and on xxxx xxxx 2023 another charge was made in xxxx pa for 15000 i live in california and no one other than mys
{'chunk_index': 0, 'complaint_id': 122633, 'total_chunks': 2, 'product': 'Checking or savings account'}
----