In [1]:
import sys
import os
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from chromadb import Client
from chromadb.config import Settings
import numpy as np
from chromadb.config import Settings
from chromadb import Client
sys.path.append(os.path.join(os.path.dirname(os.path.dirname(os.getcwd())), 'src'))

In [2]:
#Initialize components
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    length_function=len,
    separators=["\n\n", "\n", " "]
)
embedder = SentenceTransformer("all-MiniLM-L6-v2")

In [4]:
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Initialize Chroma client using the new API
client = chromadb.PersistentClient(path="./vector_store")  

# Create or get a collection
collection = client.get_or_create_collection(
    name="complaint_embeddings",
    metadata={"hnsw:space": "cosine"}
)

# Optional: text splitter and embedding setup
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    length_function=len,
    separators=["\n\n", "\n", " "]
)
embedder = SentenceTransformer("all-MiniLM-L6-v2")


In [5]:
#Load data in chunks
def load_data_chunk(file_path, chunk_size=10000):
    """Load data in chunks to avoid memory issues"""
    return pd.read_csv(file_path, chunksize=chunk_size)

In [8]:
# Process data
data_path = os.path.join(os.path.dirname(os.getcwd()), 'data', 'complaints.csv')
chunks = load_data_chunk(data_path)

for i, chunk_df in enumerate(chunks):
    print(f"Processing chunk {i + 1}")
    
    # Filter to keep only necessary columns
    chunk_df = chunk_df[['Complaint ID', 'Product', 'Consumer complaint narrative']]
    
    # Skip rows without narrative
    chunk_df = chunk_df[chunk_df['Consumer complaint narrative'].notna()]
    
    # Split text
    all_chunks = []
    for _, row in chunk_df.iterrows():
        metadata = row.to_dict()
        text = row['Consumer complaint narrative']
        chunks = splitter.split_text(text)
        all_chunks.extend([{
            "text": chunk,
            "metadata": metadata
        } for chunk in chunks])
    
    # Extract texts and metadata
    texts = [chunk['text'] for chunk in all_chunks]
    metadatas = [chunk['metadata'] for chunk in all_chunks]
    
    # Generate embeddings in batches
    batch_size = 32
    embeddings = []
    for j in range(0, len(texts), batch_size):
        batch = texts[j:j + batch_size]
        batch_embeddings = embedder.encode(batch, show_progress_bar=False)
        embeddings.extend(batch_embeddings)
    
    # Store in vector database
    ids = [str(i) for i in range(len(embeddings))]
    collection.add(
        embeddings=embeddings,
        metadatas=metadatas,
        ids=ids
    )
    
    print(f"Chunk {i + 1} processed successfully")

Processing chunk 1
Chunk 1 processed successfully
Processing chunk 2
Chunk 2 processed successfully
Processing chunk 3
Chunk 3 processed successfully
Processing chunk 4
Chunk 4 processed successfully
Processing chunk 5
Chunk 5 processed successfully
Processing chunk 6
Chunk 6 processed successfully
Processing chunk 7
Chunk 7 processed successfully
Processing chunk 8
Chunk 8 processed successfully
Processing chunk 9
Chunk 9 processed successfully
Processing chunk 10
Chunk 10 processed successfully
Processing chunk 11
Chunk 11 processed successfully
Processing chunk 12
Chunk 12 processed successfully
Processing chunk 13
Chunk 13 processed successfully
Processing chunk 14
Chunk 14 processed successfully
Processing chunk 15
Chunk 15 processed successfully
Processing chunk 16
Chunk 16 processed successfully
Processing chunk 17
Chunk 17 processed successfully
Processing chunk 18
Chunk 18 processed successfully
Processing chunk 19
Chunk 19 processed successfully
Processing chunk 20
Chunk 20 p

InternalError: ValueError: Batch size of 6531 is greater than max batch size of 5461

In [None]:
 # Split text
 
for _, row in chunk_df.iterrows():
        metadata = row.to_dict()
        text = row['consumer_complaint_narrative']
        chunks = splitter.split_text(text)
        all_chunks.extend([{
            "text": chunk,
            "metadata": metadata
        } for chunk in chunks])

In [None]:
# Extract texts and metadata
texts = [chunk['text'] for chunk in all_chunks]
metadatas = [chunk['metadata'] for chunk in all_chunks]

In [None]:
 # Generate embeddings in batches
batch_size = 32
embeddings = []
for j in range(0, len(texts), batch_size):
        batch = texts[j:j + batch_size]
        batch_embeddings = embedder.encode(batch, show_progress_bar=False)
        embeddings.extend(batch_embeddings)


In [None]:
# Store in vector database
ids = [str(i) for i in range(len(embeddings))]
collection.add(
        embeddings=embeddings,
        metadatas=metadatas,
        ids=ids
    )
print(f"Chunk {i + 1} processed successfully")

In [None]:
# Find similar complaints
results = collection.query(
    query_embeddings=[query_embedding],
    n_results=5
)

In [None]:
# Display results
for i, result in enumerate(results):
    print(f"\nResult {i + 1}:")
    print(f"Similarity Score: {result['scores'][0][i]:.4f}")
    print(f"Product: {result['metadatas'][0][i]['product']}")
    print(f"Complaint: {result['documents'][0][i]}")
    print("-" * 80)