In [3]:
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm

import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer


In [4]:
DATA_PATH = Path("../data/filtered_complaints.csv")
df = pd.read_csv(DATA_PATH)

print("Loaded:", df.shape)
df.head()


Loaded: (449859, 4)


Unnamed: 0,complaint_id,product_category,product,clean_narrative
0,14069121,Credit Cards,Credit card,a xxxx xxxx card was opened under my name by a...
1,14061897,Savings Accounts,Checking or savings account,i made the mistake of using my wellsfargo debi...
2,14047085,Credit Cards,Credit card,dear cfpb i have a secured credit card with ci...
3,14040217,Credit Cards,Credit card,i have a citi rewards cards the credit balance...
4,13968411,Credit Cards,Credit card,b i am writing to dispute the following charge...


In [5]:
required = {"complaint_id", "product_category", "clean_narrative"}
missing = required - set(df.columns)

if missing:
    raise ValueError(f"Missing columns: {missing}")

df["product_category"].value_counts()


product_category
Credit Cards        189301
Savings Accounts    135745
Money Transfers      98680
Personal Loans       26133
Name: count, dtype: int64

In [6]:
N_TOTAL = 15000   # you can set 10000–15000

def stratified_sample(df, label_col, n_total, seed=42):
    counts = df[label_col].value_counts()
    total = counts.sum()

    targets = (counts / total * n_total).round().astype(int).to_dict()
    drift = n_total - sum(targets.values())
    if drift != 0:
        largest = counts.index[0]
        targets[largest] = max(1, targets[largest] + drift)

    parts = []
    for label, k in targets.items():
        g = df[df[label_col] == label]
        k = min(k, len(g))
        parts.append(g.sample(n=k, random_state=seed))

    return pd.concat(parts, ignore_index=True)

df_sample = stratified_sample(df, "product_category", N_TOTAL)

print("Sample shape:", df_sample.shape)
df_sample["product_category"].value_counts()


Sample shape: (15000, 4)


product_category
Credit Cards        6313
Savings Accounts    4526
Money Transfers     3290
Personal Loans       871
Name: count, dtype: int64

In [7]:
SAMPLE_OUT = Path("../data/processed/sample_15k.csv")
SAMPLE_OUT.parent.mkdir(parents=True, exist_ok=True)

df_sample.to_csv(SAMPLE_OUT, index=False)
print("Saved stratified sample to:", SAMPLE_OUT)


Saved stratified sample to: ..\data\processed\sample_15k.csv


In [8]:
def chunk_text(text, chunk_size=500, chunk_overlap=50):
    if chunk_overlap >= chunk_size:
        raise ValueError("chunk_overlap must be < chunk_size")

    text = str(text).strip()
    if not text:
        return []

    chunks = []
    start = 0
    step = chunk_size - chunk_overlap

    while start < len(text):
        end = min(start + chunk_size, len(text))
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        start += step

    return chunks


In [9]:
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
embedder = SentenceTransformer(EMBED_MODEL)

print("Embedding model loaded:", EMBED_MODEL)


Embedding model loaded: sentence-transformers/all-MiniLM-L6-v2


In [10]:
CHROMA_DIR = Path("../vector_store/sample_chroma")
COLLECTION_NAME = "complaints_sample"

CHROMA_DIR.mkdir(parents=True, exist_ok=True)

client = chromadb.PersistentClient(
    path=str(CHROMA_DIR),
    settings=Settings(anonymized_telemetry=False)
)

collection = client.get_or_create_collection(name=COLLECTION_NAME)

print("ChromaDB ready at:", CHROMA_DIR)
print("Collection:", COLLECTION_NAME)


ChromaDB ready at: ..\vector_store\sample_chroma
Collection: complaints_sample


In [11]:
ids = []
documents = []
metadatas = []

for _, row in tqdm(df_sample.iterrows(), total=len(df_sample), desc="Chunking"):
    complaint_id = str(row["complaint_id"])
    category = str(row["product_category"])
    narrative = str(row["clean_narrative"])

    chunks = chunk_text(narrative, chunk_size=500, chunk_overlap=50)

    for i, ch in enumerate(chunks):
        ids.append(f"{complaint_id}_{i}")
        documents.append(ch)
        metadatas.append({
            "complaint_id": complaint_id,
            "product_category": category,
            "chunk_index": i,
            "total_chunks": len(chunks)
        })

print("Total chunks created:", len(documents))


Chunking: 100%|██████████| 15000/15000 [00:00<00:00, 17650.52it/s]

Total chunks created: 44220





In [12]:
batch_size = 256

for i in tqdm(range(0, len(documents), batch_size), desc="Embedding + Indexing"):
    docs_batch = documents[i:i+batch_size]
    ids_batch  = ids[i:i+batch_size]
    meta_batch = metadatas[i:i+batch_size]

    embeds = embedder.encode(docs_batch, normalize_embeddings=True).tolist()

    collection.add(
        ids=ids_batch,
        documents=docs_batch,
        metadatas=meta_batch,
        embeddings=embeds
    )

print("✅ Stored embeddings in ChromaDB at:", CHROMA_DIR)


Embedding + Indexing: 100%|██████████| 173/173 [15:42<00:00,  5.45s/it]

✅ Stored embeddings in ChromaDB at: ..\vector_store\sample_chroma





In [13]:
query = "Why are people unhappy with credit cards?"
q_emb = embedder.encode([query], normalize_embeddings=True).tolist()[0]

results = collection.query(
    query_embeddings=[q_emb],
    n_results=5,
    include=["documents", "metadatas", "distances"]
)

for i in range(5):
    print(f"\n--- Result {i+1} ---")
    print("Distance:", results["distances"][0][i])
    print("Metadata:", results["metadatas"][0][i])
    print("Text:", results["documents"][0][i][:300], "...")



--- Result 1 ---
Distance: 0.7876946330070496
Metadata: {'state': '', 'total_chunks': 4, 'issue': '', 'chunk_index': 3, 'product_category': 'Credit Cards', 'product': 'Credit card or prepaid card', 'sub_issue': '', 'company': '', 'complaint_id': '3051662', 'date_received': ''}
Text: e not financially savvy or have the ability to get accounts at a bank due to low amounts of money or whatever i feel this is a predatory practice on a population of low income young and or uneducated people that should be protected and not taken advantage of with my 20 00 card i would have 2 00 left ...

--- Result 2 ---
Distance: 0.8176965713500977
Metadata: {'state': '', 'complaint_id': '8046015', 'product': 'Credit card', 'chunk_index': 4, 'sub_issue': '', 'issue': '', 'total_chunks': 5, 'product_category': 'Credit Cards', 'date_received': '', 'company': ''}
Text: tuations to acrue even more debt and there is no financial burden on them in allowing access to the same convenience features that all of the