In [None]:
# Install necessary libraries directly into the current kernel
%pip install langchain langchain-community langchain-text-splitters sentence-transformers chromadb

Note: you may need to restart the kernel to use updated packages.




In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from tqdm.auto import tqdm

# Setup Paths
ROOT = Path("..").resolve()
PROCESSED_DATA_PATH = ROOT / "data" / "processed" / "filtered_complaints.csv"
VECTOR_STORE_DIR = ROOT / "vector_store"
VECTOR_STORE_DIR.mkdir(parents=True, exist_ok=True)

print(f"Data Path: {PROCESSED_DATA_PATH}")
print(f"Vector Store Path: {VECTOR_STORE_DIR}")

  from .autonotebook import tqdm as notebook_tqdm


Data Path: C:\BackUp\web-projects\tenx\complaint-analysis-rag-week7\data\processed\filtered_complaints.csv
Vector Store Path: C:\BackUp\web-projects\tenx\complaint-analysis-rag-week7\vector_store


### 1. Load Data & Stratified Sampling
Load the filtered dataset and create a stratified sample of ~10,000 complaints to ensure proportional representation across products.

In [7]:
# Load Data
df = pd.read_csv(PROCESSED_DATA_PATH)
print(f"Total records: {len(df)}")

# Stratified Sampling
# We want ~10,000 records.
SAMPLE_SIZE = 10000

# Calculate proportion of each product
product_counts = df['Product'].value_counts(normalize=True)
print("\nOriginal Product Distribution:")
display(product_counts)

# Sample
# We use groupby and sample to maintain proportions
df_sample = df.groupby('Product', group_keys=False).apply(lambda x: x.sample(frac=SAMPLE_SIZE/len(df), random_state=42))

print(f"\nSampled records: {len(df_sample)}")
print("\nSampled Product Distribution:")
display(df_sample['Product'].value_counts(normalize=True))

# Reset index
df_sample = df_sample.reset_index(drop=True)
df_sample.head()

Total records: 477714

Original Product Distribution:


Product
Checking or savings account                           0.293730
Credit card or prepaid card                           0.227473
Money transfer, virtual currency, or money service    0.203444
Credit card                                           0.168860
Payday loan, title loan, or personal loan             0.036084
Bank account or service                               0.031159
Consumer Loan                                         0.019805
Prepaid card                                          0.016311
Money transfers                                       0.003134
Name: proportion, dtype: float64


Sampled records: 10000

Sampled Product Distribution:


  df_sample = df.groupby('Product', group_keys=False).apply(lambda x: x.sample(frac=SAMPLE_SIZE/len(df), random_state=42))


Product
Checking or savings account                           0.2937
Credit card or prepaid card                           0.2275
Money transfer, virtual currency, or money service    0.2034
Credit card                                           0.1689
Payday loan, title loan, or personal loan             0.0361
Bank account or service                               0.0312
Consumer Loan                                         0.0198
Prepaid card                                          0.0163
Money transfers                                       0.0031
Name: proportion, dtype: float64

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID,narrative_length,cleaned_narrative
0,2017-02-24,Bank account or service,Checking account,"Account opening, closing, or management",,"On XXXX XXXX and XXXX XXXX, there were bank ac...",Company has responded to the consumer and the ...,WELLS FARGO & COMPANY,NJ,7002,,Consent provided,Web,2017-02-24,Closed with explanation,No,No,2360804,180.0,bank account opened well fargo name open accou...
1,2017-01-23,Bank account or service,Checking account,"Account opening, closing, or management",,WELLS FARGO CROOKS. Back in XXXX 2016 I closed...,Company has responded to the consumer and the ...,WELLS FARGO & COMPANY,TX,78109,,Consent provided,Web,2017-01-23,Closed with explanation,No,No,2303755,170.0,well fargo crook back closed well fargo checki...
2,2017-03-03,Bank account or service,Checking account,"Account opening, closing, or management",,I did not receive the {$400.00} business accou...,,FIFTH THIRD FINANCIAL CORPORATION,MI,48910,,Consent provided,Web,2017-03-03,Closed with explanation,Yes,No,2370332,141.0,receive business account opening bonus bank ev...
3,2015-10-29,Bank account or service,Checking account,"Account opening, closing, or management",,"Earlier this year, I opened an account with US...",Company believes it acted appropriately as aut...,UNITED SERVICES AUTOMOBILE ASSOCIATION,DC,20020,,Consent provided,Web,2015-11-02,Closed with explanation,Yes,No,1631745,357.0,earlier t year opened account usaa via online ...
4,2017-01-05,Bank account or service,Checking account,"Account opening, closing, or management",,In XXXX I had a bank account with Wells Fargo ...,Company has responded to the consumer and the ...,WELLS FARGO & COMPANY,AZ,85350,"Older American, Servicemember",Consent provided,Web,2017-01-05,Closed with explanation,No,No,2276494,273.0,bank account well fargo bank arizona closed ac...


### 2. Text Chunking
Implement `RecursiveCharacterTextSplitter` to handle long narratives.
We will experiment with `chunk_size` and `chunk_overlap`.
*   **Chunk Size**: 500 characters (approx 100-150 words) is often a good balance for sentence-level embeddings.
*   **Overlap**: 50 characters to maintain context between chunks.

In [8]:
# Initialize Splitter
chunk_size = 500
chunk_overlap = 50

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    length_function=len,
    separators=["\n\n", "\n", ". ", " ", ""]
)

# Function to process dataframe and create chunks
def create_chunks(df, text_col='cleaned_narrative'):
    chunks = []
    
    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Chunking"):
        text = row[text_col]
        if not isinstance(text, str) or not text.strip():
            continue
            
        # Split text
        texts = text_splitter.split_text(text)
        
        # Create chunk records with metadata
        for i, chunk_text in enumerate(texts):
            chunks.append({
                'chunk_id': str(uuid.uuid4()),
                'complaint_id': str(row.get('Complaint ID', 'unknown')),
                'product': row['Product'],
                'text': chunk_text,
                'chunk_index': i,
                'original_index': idx
            })
            
    return pd.DataFrame(chunks)

# Apply chunking
df_chunks = create_chunks(df_sample)

print(f"Created {len(df_chunks)} chunks from {len(df_sample)} documents.")
print(f"Average chunks per document: {len(df_chunks)/len(df_sample):.2f}")
display(df_chunks.head())

Chunking: 100%|██████████| 10000/10000 [00:02<00:00, 3402.25it/s]


Created 18941 chunks from 10000 documents.
Average chunks per document: 1.89


Unnamed: 0,chunk_id,complaint_id,product,text,chunk_index,original_index
0,f7c9c86a-c474-4c7f-9b8b-a925d7e2323c,2360804,Bank account or service,bank account opened well fargo name open accou...,0,0
1,b5ebe23a-5d51-4998-a0a9-7e641285b030,2360804,Bank account or service,fargo pay identity theft protection service sa...,1,0
2,1cb89778-2bbe-4fe4-a160-b58c5e2af24e,2303755,Bank account or service,well fargo crook back closed well fargo checki...,0,1
3,dcbad2ba-6df5-44d6-999f-a711ab2b3d76,2370332,Bank account or service,receive business account opening bonus bank ev...,0,2
4,7688b11a-dcc6-4522-ba43-a571609e11f7,2370332,Bank account or service,addition performing debit transaction proof re...,1,2


### 3. Embedding & Indexing
*   **Model**: `sentence-transformers/all-MiniLM-L6-v2` (Fast, efficient, good performance for semantic search).
*   **Vector Store**: ChromaDB (Persistent).

In [9]:
# Initialize Embedding Model
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = SentenceTransformer(model_name)

# Generate Embeddings
# We can encode in batches for efficiency
batch_size = 32
texts = df_chunks['text'].tolist()
embeddings = embedding_model.encode(texts, batch_size=batch_size, show_progress_bar=True)

print(f"Generated embeddings shape: {embeddings.shape}")

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Batches: 100%|██████████| 592/592 [03:50<00:00,  2.57it/s]


Generated embeddings shape: (18941, 384)


In [11]:
# Initialize ChromaDB Client
chroma_client = chromadb.PersistentClient(path=str(VECTOR_STORE_DIR))

# Create or Get Collection
collection_name = "complaint_embeddings"
collection = chroma_client.get_or_create_collection(name=collection_name)
print(f"Loaded or created collection: {collection_name}")

# Prepare data for ChromaDB
ids = df_chunks['chunk_id'].tolist()
documents = df_chunks['text'].tolist()
metadatas = df_chunks[['complaint_id', 'product', 'chunk_index']].to_dict(orient='records')

# Add to collection (in batches to avoid hitting limits)
batch_size = 5000
total_chunks = len(ids)

for i in tqdm(range(0, total_chunks, batch_size), desc="Indexing"):
    end_idx = min(i + batch_size, total_chunks)
    
    collection.add(
        ids=ids[i:end_idx],
        embeddings=embeddings[i:end_idx].tolist(),
        metadatas=metadatas[i:end_idx],
        documents=documents[i:end_idx]
    )

print(f"Successfully indexed {collection.count()} chunks in ChromaDB.")

Loaded or created collection: complaint_embeddings


Indexing: 100%|██████████| 4/4 [00:26<00:00,  6.52s/it]

Successfully indexed 18941 chunks in ChromaDB.





### Summary of Task 2

In this notebook, we successfully transitioned from raw text data to a semantic search-ready vector store.

**Key Achievements:**
1.  **Stratified Sampling**: We created a representative sample of ~10,000 complaints, ensuring that the distribution of products (Credit Cards, Mortgages, etc.) matches the original dataset. This prevents bias towards the most frequent classes while keeping the dataset manageable for development.
2.  **Text Chunking**: We implemented a `RecursiveCharacterTextSplitter` strategy with a chunk size of 500 characters and an overlap of 50 characters. This breaks down long narratives into semantically meaningful units that fit within the context window of embedding models.
3.  **Embedding Generation**: We utilized the `sentence-transformers/all-MiniLM-L6-v2` model to convert text chunks into 384-dimensional dense vector embeddings. This model was chosen for its balance of speed and semantic accuracy.
4.  **Vector Store Indexing**: We successfully indexed the generated embeddings along with their metadata (Complaint ID, Product, Chunk Index) into a persistent **ChromaDB** collection named `complaint_embeddings`.

**Outcome:**
The `vector_store/` directory now contains a fully populated vector database. This database serves as the retrieval backbone for the RAG (Retrieval-Augmented Generation) system we will build in the next task. We can now query this store to find complaints semantically similar to a user's question.