In [1]:
# Cell 1: Imports
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pandas as pd
import faiss
import os
import pickle
from tqdm import tqdm

# Cell 2: Load Filtered Data
# Read the filtered complaints and keep only the rows that actually carry a
# cleaned narrative; the chunking cell splits that column, so rows with a
# missing value there have nothing to split.
df = pd.read_csv("../data/processed/filtered_complaints.csv").dropna(
    subset=["cleaned_narrative"]
)
# Preview the first rows as the cell's rich output.
df.head()

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID,narrative_length,cleaned_narrative
0,2025-06-13,Credit card,Store credit card,Getting a credit card,Card opened without my consent or knowledge,A XXXX XXXX card was opened under my name by a...,Company has responded to the consumer and the ...,"CITIBANK, N.A.",TX,78230,Servicemember,Consent provided,Web,2025-06-13,Closed with non-monetary relief,Yes,,14069121,91,a xxxx xxxx card was opened under my name by a...
1,2025-06-12,Credit card,General-purpose credit card or charge card,"Other features, terms, or problems",Other problem,"Dear CFPB, I have a secured credit card with c...",Company has responded to the consumer and the ...,"CITIBANK, N.A.",NY,11220,,Consent provided,Web,2025-06-13,Closed with monetary relief,Yes,,14047085,156,dear cfpb i have a secured credit card with ci...
2,2025-06-12,Credit card,General-purpose credit card or charge card,Incorrect information on your report,Account information incorrect,I have a Citi rewards cards. The credit balanc...,Company has responded to the consumer and the ...,"CITIBANK, N.A.",IL,60067,,Consent provided,Web,2025-06-12,Closed with explanation,Yes,,14040217,233,i have a citi rewards cards the credit balance...
3,2025-06-09,Credit card,General-purpose credit card or charge card,Problem with a purchase shown on your statement,Credit card company isn't resolving a dispute ...,b'I am writing to dispute the following charge...,Company has responded to the consumer and the ...,"CITIBANK, N.A.",TX,78413,Older American,Consent provided,Web,2025-06-09,Closed with monetary relief,Yes,,13968411,454,b i am writing to dispute the following charge...
4,2025-06-09,Credit card,General-purpose credit card or charge card,Problem when making payments,Problem during payment process,"Although the account had been deemed closed, I...",Company believes it acted appropriately as aut...,Atlanticus Services Corporation,NY,11212,Older American,Consent provided,Web,2025-06-09,Closed with monetary relief,Yes,,13965746,170,although the account had been deemed closed i ...


In [2]:
# Cell 3: Chunking Narratives
# Split every cleaned narrative into overlapping text chunks and record, for
# each chunk, the product and the DataFrame row it came from.
CHUNK_SIZE = 300     # upper bound on chunk length passed to the splitter
CHUNK_OVERLAP = 50   # overlap between consecutive chunks passed to the splitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

chunks = []     # chunk texts, in row order
metadatas = []  # metadatas[i] describes chunks[i]

# Walk only the two columns that are needed. DataFrame.iterrows() would build
# a full Series for every row, which is needlessly slow on a frame this wide.
for idx, narrative, product in zip(df.index, df["cleaned_narrative"], df["Product"]):
    split_chunks = text_splitter.split_text(narrative)
    chunks.extend(split_chunks)
    # One fresh dict per chunk keeps both lists aligned by position.
    # NOTE: "index" is the row's *label* in `df`, not its position; labels can
    # have gaps because rows were dropped without resetting the index.
    metadatas.extend({"product": product, "index": idx} for _ in split_chunks)

print(f"Total Chunks: {len(chunks)}")

Total Chunks: 373494


In [3]:
# Cell 4: Embedding with MiniLM
# Load the sentence-transformers model and encode every chunk in one call.
# This is the expensive step of the notebook, so the progress bar stays on.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

model = SentenceTransformer(EMBEDDING_MODEL_NAME)
embeddings = model.encode(
    chunks,
    show_progress_bar=True,
)

Batches: 100%|██████████| 11672/11672 [1:15:39<00:00,  2.57it/s]  


In [4]:
# Cell 5: Save Vector Store with FAISS
# Build a flat L2 FAISS index over the embeddings and persist it together with
# the per-chunk metadata and the chunk texts.
VECTOR_STORE_DIR = "../vector_store"

# The index, the metadata list and the chunk list are linked only by position
# (vector i <-> metadatas[i] <-> chunks[i]). Refuse to write anything if they
# have drifted apart, e.g. because an earlier cell was re-run on its own.
n_vectors = embeddings.shape[0]
if not (len(chunks) == len(metadatas) == n_vectors):
    raise ValueError(
        f"Misaligned vector store inputs: {len(chunks)} chunks, "
        f"{len(metadatas)} metadata records, {n_vectors} embeddings."
    )

embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings)

# Save index
os.makedirs(VECTOR_STORE_DIR, exist_ok=True)
faiss.write_index(index, f"{VECTOR_STORE_DIR}/faiss_index.index")

# Save associated metadata
with open(f"{VECTOR_STORE_DIR}/chunk_metadata.pkl", "wb") as f:
    pickle.dump(metadatas, f)

# Save chunks (optional for source reference)
with open(f"{VECTOR_STORE_DIR}/chunks.pkl", "wb") as f:
    pickle.dump(chunks, f)

print("✅ Vector store and metadata saved.")

✅ Vector store and metadata saved.
