In [2]:
import sys
import os

# Make the directory above the notebook's working directory importable.
# The membership check keeps the cell idempotent: re-running it does not
# stack duplicate entries on sys.path.
_parent_dir = os.path.abspath(os.path.join('..'))
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

import pandas as pd
from sklearn.model_selection import train_test_split

# Load the cleaned data from Task 1
df = pd.read_csv('../data/processed/filtered_complaints.csv')

# Draw a stratified sample of up to 15,000 records.
# Stratifying on 'Product' keeps each product's share in the sample equal to
# its share in the full dataset, so a smaller category is not thinned out by
# chance relative to the larger ones.
sample_size = 15000
if len(df) > sample_size:
    df_sample, _ = train_test_split(
        df,
        train_size=sample_size,
        stratify=df['Product'],
        random_state=42,  # fixed seed so the same sample is drawn on every run
    )
else:
    # An integer train_size must be strictly smaller than the number of rows,
    # otherwise train_test_split raises; with a small dataset use all of it.
    df_sample = df.copy()

print(f"Sampled Dataset Shape: {df_sample.shape}")
print("Product Distribution in Sample:\n", df_sample['Product'].value_counts(normalize=True))

Sampled Dataset Shape: (15000, 20)
Product Distribution in Sample:
 Product
Checking or savings account                           0.328733
Credit card or prepaid card                           0.254600
Money transfer, virtual currency, or money service    0.227667
Credit card                                           0.189000
Name: proportion, dtype: float64


In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunking policy: target 500-character pieces (~100 words) in which
# neighbouring chunks share 50 characters, so a sentence cut at a boundary
# still shows up with some of its surrounding context.
# NOTE(review): add_start_index appears to matter only for the splitter's
# document-creating methods, not for split_text() as used below - confirm.
text_splitter = RecursiveCharacterTextSplitter(
    add_start_index=True,
    length_function=len,  # chunk size is measured in characters
    chunk_overlap=50,
    chunk_size=500,
)

# Function to process the dataframe into chunks with metadata
def create_chunks(df, splitter=None):
    """Split each complaint narrative in ``df`` into chunks with trace-back metadata.

    Args:
        df: DataFrame providing the columns 'cleaned_narrative',
            'Complaint ID', 'Product' and 'Company'.
        splitter: Any object exposing ``split_text(text)`` that returns a list
            of strings. When omitted, the module-level ``text_splitter`` is used.

    Returns:
        A list of dicts, one per chunk, in row order. Each dict holds the chunk
        text under 'page_content' and a 'metadata' dict with the keys
        'complaint_id', 'product', 'company' and 'chunk_index' (the chunk's
        position within its own narrative, starting at 0).

    Rows whose narrative is missing (e.g. NaN after a CSV round-trip), not a
    string, or only whitespace are skipped instead of crashing the splitter.
    """
    if splitter is None:
        splitter = text_splitter

    # Walk the four needed columns in lockstep; this avoids building a full
    # Series for every row the way iterrows() does.
    needed = ('cleaned_narrative', 'Complaint ID', 'Product', 'Company')
    rows = zip(*(df[column] for column in needed))

    chunked_docs = []
    for narrative, complaint_id, product, company in rows:
        # Nothing to split for a missing or blank narrative.
        if not isinstance(narrative, str) or not narrative.strip():
            continue

        for i, chunk in enumerate(splitter.split_text(narrative)):
            # Store the text and the metadata needed to trace it back
            chunked_docs.append({
                "page_content": chunk,
                "metadata": {
                    "complaint_id": complaint_id,
                    "product": product,
                    "company": company,
                    "chunk_index": i
                }
            })
    return chunked_docs

# Chunk every sampled complaint, then report the chunk and complaint counts.
docs = create_chunks(df=df_sample)
print(f"Created {len(docs)} total chunks from {len(df_sample)} complaints.")

Created 41457 total chunks from 15000 complaints.


In [4]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

# 1. Initialize the Embedding Model
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model_kwargs = {'device': 'cpu'} # Change to 'cuda' if using a GPU
encode_kwargs = {'normalize_embeddings': False}

embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

# 2. Create and Persist the Vector Store
# This will save the database into the 'vector_store/' folder
persist_directory = '../vector_store/complaints_db'

# Extract text and metadata for Chroma
texts = [doc['page_content'] for doc in docs]
metadatas = [doc['metadata'] for doc in docs]

# Stable per-chunk ids built from the complaint id and the chunk position.
# Without explicit ids every run gets fresh random ones, so re-running this
# cell against the same persist_directory would add a second copy of every
# chunk. NOTE(review): relies on the store upserting on id and on
# 'Complaint ID' being unique per complaint - confirm.
ids = [f"{meta['complaint_id']}-{meta['chunk_index']}" for meta in metadatas]
if len(set(ids)) != len(ids):
    # Colliding ids would be rejected by the store; fall back to the
    # auto-generated ids the call uses when none are supplied.
    ids = None

print("Starting Indexing... this may take 5-10 minutes depending on CPU.")

vector_db = Chroma.from_texts(
    texts=texts,
    embedding=embeddings,
    metadatas=metadatas,
    ids=ids,
    persist_directory=persist_directory
)

print(f"Vector Store saved at {persist_directory}")

Starting Indexing... this may take 5-10 minutes depending on CPU.
Vector Store saved at ../vector_store/complaints_db


In [5]:
# Smoke-test retrieval: fetch the three chunks closest to a sample question
# and show each hit's product plus the first 200 characters of its text.
query = "How do I dispute an unauthorized credit card charge?"
results = vector_db.similarity_search(query, k=3)

for rank, hit in enumerate(results, start=1):
    print(f"\nResult {rank}:")
    print(f"Product: {hit.metadata['product']}")
    preview = hit.page_content[:200]
    print(f"Content: {preview}...")


Result 1:
Product: Credit card or prepaid card
Content: for approimately minutes while someone worked on my issue was told that this is an issue for my bank to fi and was hung up on continues to tell me that i need to call my bank and that because the char...

Result 2:
Product: Credit card or prepaid card
Content: i had an issue with this company beginning in i used my card to purchase something online i then received a bill with an unauthorized charge on it i called the company and disputed the charge they sta...

Result 3:
Product: Credit card
Content: the credit card company on to investigate the charge through their online how can i dispute a charge process when i called their customer service staff according to the instructions of their online pr...
