In [1]:
import pandas as pd

# Read the cleaned complaints file produced by Task 1 and list its columns
complaints_csv = "../data/filtered_complaints.csv"
df = pd.read_csv(complaints_csv)
print(list(df.columns))

['Date received', 'Product', 'Sub-product', 'Issue', 'Sub-issue', 'Consumer complaint narrative', 'Company public response', 'Company', 'State', 'ZIP code', 'Tags', 'Consumer consent provided?', 'Submitted via', 'Date sent to company', 'Company response to consumer', 'Timely response?', 'Consumer disputed?', 'Complaint ID', 'Standardized Product', 'Narrative Length', 'Cleaned Narrative']


In [2]:
# Keep only the id, the raw narrative and the product label, and discard
# rows where any of the three is missing
wanted_columns = ['Complaint ID', 'Consumer complaint narrative', 'Standardized Product']
df = df.loc[:, wanted_columns].dropna()

# Discard narratives that are empty once surrounding whitespace is removed
has_narrative = df['Consumer complaint narrative'].str.strip().ne('')
df = df[has_narrative]

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Initialize the text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,     # experiment with 300, 500, 800
    chunk_overlap=100,  # to preserve context between chunks
    length_function=len
)

# Number of leading narrative characters kept as a preview in each chunk's metadata
PREVIEW_CHARS = 300

# Store chunks and metadata (parallel lists: metadatas[k] describes chunks[k])
chunks = []
metadatas = []

# Iterate over the three columns directly instead of df.iterrows(), which
# builds a Series object for every row and is much slower on large frames.
for complaint_id, product, narrative in zip(
    df['Complaint ID'],
    df['Standardized Product'],
    df['Consumer complaint narrative'],
):
    # The preview is identical for every chunk of a complaint, so build it once.
    # Only add the ellipsis when the narrative was actually cut short; adding it
    # unconditionally would wrongly mark short narratives as truncated.
    preview = narrative[:PREVIEW_CHARS]
    if len(narrative) > PREVIEW_CHARS:
        preview += "..."

    for chunk_text in text_splitter.split_text(narrative):
        chunks.append(chunk_text)
        # A fresh dict per chunk so entries never alias one another
        metadatas.append({
            "complaint_id": complaint_id,
            "product": product,
            "original_text": preview,  # optional preview
        })

# Print the first 3 chunks and their metadata
for sample_number, (sample_text, sample_metadata) in enumerate(
    zip(chunks[:3], metadatas[:3]), start=1
):
    print(f"\n--- Chunk {sample_number} ---")
    print("Text:", sample_text)
    print("Metadata:", sample_metadata)



--- Chunk 1 ---
Text: On XX/XX/XXXX I presented a check for mobile deposit for {$2300.00}. Wells Fargo made {$400.00} available immediately and said that the rest would be available XX/XX/XXXX. I received an email stating the check was held until XX/XX/XXXX and they reversed the available funds. So nothing from the check was available. They state the payee signature does not match. At XXXX XXXX I called Wells Fargo and spoke to a banker and a supervisor and they gave conflicting information. They said that my mothers
Metadata: {'complaint_id': '13999055', 'product': 'Savings Account', 'original_text': 'On XX/XX/XXXX I presented a check for mobile deposit for {$2300.00}. Wells Fargo made {$400.00} available immediately and said that the rest would be available XX/XX/XXXX. I received an email stating the check was held until XX/XX/XXXX and they reversed the available funds. So nothing from the check...'}

--- Chunk 2 ---
Text: spoke to a banker and a supervisor and they gave conflicting

In [4]:
from langchain_huggingface import HuggingFaceEmbeddings

# Load the sentence-transformers checkpoint that is handed to the vector store
# as its embedding function
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from langchain.vectorstores import Chroma

# Create vector store directory
persist_directory = "../vector_store"

# Give every chunk a deterministic, position-based id. Without explicit ids a
# random id is generated for each text, so every re-run of this cell adds a
# second full copy of the data to the persisted collection and similarity
# search starts returning the same chunk several times. With stable ids a
# re-run is expected to overwrite the existing records instead.
# NOTE(review): a store that was already built without ids still contains the
# randomly-keyed records; delete the persist directory once before rebuilding.
chunk_ids = [f"chunk-{position}" for position in range(len(chunks))]

# Build Chroma vector store
vectorstore = Chroma.from_texts(
    texts=chunks,
    embedding=embedding_model,
    metadatas=metadatas,
    ids=chunk_ids,
    persist_directory=persist_directory
)

# Persist to disk
# NOTE(review): recent Chroma releases persist automatically and flag this call
# as deprecated; it is kept so older installations still flush to disk.
vectorstore.persist()

print(" ChromaDB vector store created with", len(chunks), "chunks.",)


 ChromaDB vector store created with 78783 chunks.


  vectorstore.persist()


In [6]:
# Re-open the vector store from its persist directory, as a later session would
vectorstore = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding_model,
)

# Sanity check: fetch the three chunks most similar to a sample complaint
query = "I am being charged on my credit card for something I didn’t authorize"
results = vectorstore.similarity_search(query, k=3)

for hit in results:
    product_label = hit.metadata['product']
    chunk_preview = hit.page_content[:300]  # cap the printed text at 300 characters
    print(f"Product: {product_label}")
    print(f"Chunk: {chunk_preview}")
    print("---")


  vectorstore = Chroma(


Product: Savings Account
Chunk: I keep getting charged for something I did not authorize.
---
Product: Savings Account
Chunk: I keep getting charged for something I did not authorize.
---
Product: Credit Card
Chunk: There is a charge from XX/XX/XXXX on my credit card that I did not authorized or have any knowledge of!
---
