In [6]:
# notebooks/task_2_chunking_embedding_indexing_chromadb.ipynb
import pandas as pd
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma # <--- THIS LINE IS CRUCIAL
from langchain_core.documents import Document
from chromadb.config import Settings
import warnings
warnings.filterwarnings('ignore')
!pip install -qU numpy==1.26.4

# --- Configuration ---
# Paths are relative to the notebook's working directory.
DATA_DIR = '../data'
VECTOR_STORE_DIR = '../vector_store'
FILTERED_DATA_FILE = os.path.join(DATA_DIR, 'filtered_complaints.csv')
CHROMA_PERSIST_DIR = os.path.join(VECTOR_STORE_DIR, 'chroma_db_credi_trust')

# Column holding the complaint text (assumed to be produced by Task 1).
NARRATIVE_COLUMN = 'Consumer complaint narrative'

# Seed and size for the preview sample so re-runs show the same narratives.
SAMPLE_SEED = 42
SAMPLE_SIZE = 5

# Ensure the output directory for the vector store exists.
os.makedirs(VECTOR_STORE_DIR, exist_ok=True)

# Load the cleaned and filtered dataset from Task 1.
# Failures are re-raised instead of calling exit(): inside a notebook exit()
# shuts the kernel down rather than reporting an error on this cell, whereas a
# raised exception stops "Run All" here with a visible traceback.
try:
    df_cleaned = pd.read_csv(FILTERED_DATA_FILE)
except FileNotFoundError:
    print(f"Error: {FILTERED_DATA_FILE} not found. Please ensure Task 1 was completed and the file exists.")
    raise
print(f"Successfully loaded cleaned data from {FILTERED_DATA_FILE}. Shape: {df_cleaned.shape}")

# Every later cell reads the narrative column, so fail early if it is missing.
if NARRATIVE_COLUMN not in df_cleaned.columns:
    print(f"Error: '{NARRATIVE_COLUMN}' not found in the loaded DataFrame. Please check the column name.")
    raise KeyError(NARRATIVE_COLUMN)

# Display a reproducible sample of the cleaned narratives. The sample size is
# capped at the frame length because Series.sample raises when asked for more
# rows than exist (without replacement).
print("\nSample of cleaned narratives:")
preview_count = min(SAMPLE_SIZE, len(df_cleaned))
print(df_cleaned[NARRATIVE_COLUMN].sample(preview_count, random_state=SAMPLE_SEED).tolist())


[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Successfully loaded cleaned data from ../data\filtered_complaints.csv. Shape: (239539, 18)

Sample of cleaned narratives:
['mailed in a NUMBER .00 money order the tracking information shows the payment was received. i was told payments mailed in to the specific address would post when received. my payment has not been posted two workers hung up on me after i began to get irritated with them because my account did not reflect accurate notes because i was in two systems and my name was spelled incorrectly on my initial loan documents.', 'loanmart has been calling me back to back several times a day harassing and threatening me.', 'albert banking sent me a text alert on stating they were mailing me a new debit card when i didnt ask for . a representative locked my account for security reasons when i already verified myself. i have not been helped in anyway. i just keep being told that my situation has been escalated. the representatives have been ignoring my text messages and emails. its 

In [7]:
print("\n--- Applying Text Chunking Strategy ---")

# Chunk geometry, measured in characters (length_function=len below).
FINAL_CHUNK_SIZE = 500
FINAL_CHUNK_OVERLAP = 100

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=FINAL_CHUNK_SIZE,
    chunk_overlap=FINAL_CHUNK_OVERLAP,
    length_function=len,
    separators=["\n\n", "\n", " ", ""]
)

# The complaint identifier falls back to the DataFrame index when the
# 'Complaint ID' column is absent. This is decided once, not per row.
if 'Complaint ID' in df_cleaned.columns:
    complaint_id_values = df_cleaned['Complaint ID']
else:
    complaint_id_values = df_cleaned.index

# Prepare documents with metadata for ChromaDB.
# Iterating the needed columns with zip avoids building a Series per row,
# which is what DataFrame.iterrows() does.
documents = []
for index, text, product, complaint_id in zip(
    df_cleaned.index,
    df_cleaned[NARRATIVE_COLUMN],
    df_cleaned['Product'],
    complaint_id_values,
):
    # Skip missing (NaN), non-string and whitespace-only narratives.
    if not isinstance(text, str) or text.strip() == '':
        continue
    for i, chunk_content in enumerate(text_splitter.split_text(text)):
        documents.append(Document(
            page_content=chunk_content,
            metadata={
                "complaint_id": str(complaint_id),  # Ensure ID is string
                "product": product,
                "chunk_id": i,  # Position of the chunk within its narrative
                "original_index": index  # To trace back to original DataFrame row
            }
        ))

print(f"\nTotal number of documents (chunks) created: {len(documents)}")
if documents:
    print(f"First chunk example:\nContent: {documents[0].page_content}\nMetadata: {documents[0].metadata}")
# Guarded separately: with exactly one chunk, documents[1] would raise IndexError.
if len(documents) > 1:
    print(f"Second chunk example:\nContent: {documents[1].page_content}\nMetadata: {documents[1].metadata}")

# Justification for Chunking Strategy (for your report)
print(f"\nJustification for Chunking Strategy (for your report):")
print(f"Chosen chunk_size: {FINAL_CHUNK_SIZE} characters.")
print(f"Chosen chunk_overlap: {FINAL_CHUNK_OVERLAP} characters.")
print(f"This strategy aims to create chunks that are sufficiently large to encapsulate meaningful context related to a complaint, while remaining small enough for efficient embedding and retrieval. The overlap is crucial to prevent the loss of context that might occur at chunk boundaries, thereby improving the chances of retrieving relevant information even if it spans across two separate chunks.")


--- Applying Text Chunking Strategy ---

Total number of documents (chunks) created: 730562
First chunk example:
Content: a card was opened under my name by a fraudster. i received a notice from that an account was just opened under my name. i reached out to to state that this activity was unauthorized and not me. confirmed this was fraudulent and immediately closed the card. however, they have failed to remove this from the three credit agencies and this fraud is now impacting my credit score based on a hard credit pull done by that was done by a fraudster.
Metadata: {'complaint_id': '14069121', 'product': 'Credit card', 'chunk_id': 0, 'original_index': 0}
Second chunk example:
Content: i made the mistake of using my wellsfargo debit card to depsit funds into atm machine outside their branch. i went into the branch and was told they couldnt help and had to phone the customer service for help. i did this and was told i was helped gave all the info for the time terminal id aact s, was 

In [8]:
print("\n--- Choosing Embedding Model ---")
# Build the embedding model through HuggingFaceEmbeddings, the wrapper used
# in this notebook for sentence-transformers models.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = HuggingFaceEmbeddings(model_name=model_name)

# Emit the selected model followed by the rationale text for the report.
for report_line in (
    f"Chosen embedding model: {model_name}",
    "Brief explanation for model choice:",
    "The 'sentence-transformers/all-MiniLM-L6-v2' model was selected due to its proven balance of computational efficiency and semantic accuracy. It excels at generating high-quality sentence embeddings, which are vital for effective semantic search in the context of customer complaints. Its relatively small size (384-dimensional embeddings) and fast inference speed make it ideal for an internal AI tool that requires quick processing of large volumes of text data without significant hardware overhead. This model effectively captures the nuanced meaning of sentences and short paragraphs, which is essential for identifying subtle complaint trends.",
):
    print(report_line)



--- Choosing Embedding Model ---
Chosen embedding model: sentence-transformers/all-MiniLM-L6-v2
Brief explanation for model choice:
The 'sentence-transformers/all-MiniLM-L6-v2' model was selected due to its proven balance of computational efficiency and semantic accuracy. It excels at generating high-quality sentence embeddings, which are vital for effective semantic search in the context of customer complaints. Its relatively small size (384-dimensional embeddings) and fast inference speed make it ideal for an internal AI tool that requires quick processing of large volumes of text data without significant hardware overhead. This model effectively captures the nuanced meaning of sentences and short paragraphs, which is essential for identifying subtle complaint trends.


In [10]:
print("\n--- Creating and Persisting ChromaDB Vector Store ---")

COLLECTION_NAME = "credi_trust_complaints"
# Number of chunks embedded and written per add_documents call. Kept small so
# progress is visible and each write stays under Chroma's per-call batch limit
# (NOTE(review): that limit depends on the local SQLite build - confirm).
INDEX_BATCH_SIZE = 5000

# `Settings` is imported in the notebook's first cell.
# is_persistent=True is required here: when explicit client settings are
# passed, the persistence flag is taken from them, and its default (False)
# yields an in-memory store even though persist_directory is set.
# NOTE(review): behaviour as understood for chromadb >= 0.4 - confirm against
# the installed versions.
chroma_client_settings = Settings(
    anonymized_telemetry=False,
    is_persistent=True,
)

try:
    print("Initializing ChromaDB...")
    vector_store = Chroma(
        collection_name=COLLECTION_NAME,
        embedding_function=embedding_model,
        persist_directory=CHROMA_PERSIST_DIR,
        client_settings=chroma_client_settings,
    )

    # Index in batches instead of one monolithic from_documents call, so a
    # long run reports progress. Ids are derived from the chunk metadata so
    # that re-running this cell overwrites existing entries rather than
    # appending duplicates under fresh random ids.
    total_chunks = len(documents)
    for batch_start in range(0, total_chunks, INDEX_BATCH_SIZE):
        batch_docs = documents[batch_start:batch_start + INDEX_BATCH_SIZE]
        batch_ids = [
            f"{doc.metadata['original_index']}-{doc.metadata['chunk_id']}"
            for doc in batch_docs
        ]
        vector_store.add_documents(batch_docs, ids=batch_ids)
        indexed_so_far = min(batch_start + INDEX_BATCH_SIZE, total_chunks)
        print(f"Indexed {indexed_so_far}/{total_chunks} chunks")
    print("ChromaDB initialized.")

    print("Persisting the ChromaDB index...")
    # Kept for wrapper versions that still expose an explicit persist();
    # skipped where the method no longer exists.
    if hasattr(vector_store, "persist"):
        vector_store.persist()
    print(f"ChromaDB index persisted locally to: {CHROMA_PERSIST_DIR}")

    print("\nLoading the persisted ChromaDB index for verification...")
    # Reload with the same client settings used to build the store, so both
    # handles refer to the same on-disk database.
    loaded_vector_store = Chroma(
        persist_directory=CHROMA_PERSIST_DIR,
        embedding_function=embedding_model,
        collection_name=COLLECTION_NAME,
        client_settings=chroma_client_settings,
    )
    print("ChromaDB index loaded successfully from disk.")

    # Test a similarity search
    query = "unauthorized transactions on my credit card"
    print("Performing similarity search...")
    results = loaded_vector_store.similarity_search(query, k=5)  # Retrieve top 5 similar documents

    # An empty result from a non-empty corpus means the reloaded collection
    # does not contain what was just indexed.
    if documents and not results:
        raise RuntimeError("Similarity search returned no results; the persisted index appears to be empty.")

    print(f"\nTop 5 results for query: '{query}'")
    for i, doc in enumerate(results):
        print(f"\n--- Result {i + 1} ---")
        print(f"Content (first 200 chars): {doc.page_content[:200]}...")
        print(f"Metadata: {doc.metadata}")
        print(f"Source Complaint ID: {doc.metadata.get('complaint_id')}")
        print(f"Product: {doc.metadata.get('product')}")
        print(f"Chunk ID: {doc.metadata.get('chunk_id')}")

except Exception as e:
    print(f"An error occurred during ChromaDB vector store creation or saving: {e}")
    # Re-raise so the cell is marked as failed instead of continuing silently.
    raise
else:
    # Only report completion when every step above succeeded.
    print("\nTask 2: Text Chunking, Embedding, and Vector Store Indexing (with ChromaDB) completed.")


--- Creating and Persisting ChromaDB Vector Store ---
Initializing ChromaDB...


Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


KeyboardInterrupt: 