In [1]:
import os
import pandas as pd
import numpy as np
import pickle
import torch
import faiss

from tqdm import tqdm
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Input / output locations (relative paths).
DATA_PATH = "../data/filtered_complaints.csv"
VECTOR_STORE_DIR = "../vector_store"

# Text splitting: chunk size and the overlap shared by consecutive chunks,
# both handed to RecursiveCharacterTextSplitter.
CHUNK_SIZE = 1_000
CHUNK_OVERLAP = 250

# Embedding: SentenceTransformer model name and the encode() batch size.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
BATCH_SIZE = 32

In [3]:
# -------------------------
# Load data
# -------------------------
print("[+] Loading dataset...")
df = pd.read_csv(DATA_PATH)

# Fail fast if any column the chunking step reads unconditionally is absent,
# instead of hitting a KeyError inside the loop. "Complaint ID" is optional:
# the chunking step falls back to the row index when it is missing.
required_columns = ("cleaned_narrative", "Product")
missing_columns = [col for col in required_columns if col not in df.columns]
assert not missing_columns, f"Missing required column(s): {missing_columns}"

# -------------------------
# Initialize components
# -------------------------
print("[+] Initializing text splitter and model...")
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP
)

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so the
# notebook still runs (more slowly) on machines without CUDA instead of failing.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"[+] Using device: {device}")

model = SentenceTransformer(EMBEDDING_MODEL_NAME, device=device)

# -------------------------
# Chunk narratives and collect metadata
# -------------------------
print("[+] Chunking text and collecting metadata...")
all_chunks = []
metadata_list = []
skipped_rows = 0

# Walk the needed columns in lockstep rather than using iterrows(), which
# builds a Series object for every row.
# "Complaint ID" is optional; without it the row index is the identifier.
complaint_ids = df["Complaint ID"] if "Complaint ID" in df.columns else df.index
rows = zip(complaint_ids, df["Product"], df["cleaned_narrative"])

for complaint_id, product, text in tqdm(rows, total=len(df)):
    # Empty CSV cells are loaded by pandas as NaN (a float), which is not text
    # that can be split; skip those rows instead of crashing mid-run.
    if not isinstance(text, str):
        skipped_rows += 1
        continue

    # all_chunks and metadata_list grow together, so entry i of one always
    # describes entry i of the other.
    for chunk in text_splitter.split_text(text):
        all_chunks.append(chunk)
        metadata_list.append({
            "complaint_id": complaint_id,
            "product": product,
            "text": chunk
        })

if skipped_rows:
    print(f"[!] Skipped {skipped_rows} rows without a text narrative")
print(f"[+] Total chunks: {len(all_chunks)}")

# -------------------------
# Generate embeddings (batch, GPU)
# -------------------------
# Stop here with a clear message when there is nothing to embed; otherwise the
# failure only surfaces later, when the embedding dimension is read from an
# empty result.
if not all_chunks:
    raise ValueError("No text chunks to embed; check the input narratives.")

print("[+] Generating embeddings...")
# Encode every chunk in input order so row i of `embeddings` corresponds to
# all_chunks[i]. The result is requested as a NumPy array.
embeddings = model.encode(
    all_chunks,
    batch_size=BATCH_SIZE,
    show_progress_bar=True,
    convert_to_numpy=True,
    device=device
)

# -------------------------
# Build FAISS index
# -------------------------
print("[+] Building FAISS index...")
# FAISS operates on C-contiguous float32 matrices. Coerce explicitly so a
# different encoder dtype/layout cannot break index.add(); no copy is made when
# the array already has that layout.
index_vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
assert index_vectors.ndim == 2, (
    f"Expected a 2-D embedding matrix, got shape {index_vectors.shape}"
)

# One row per chunk, one column per embedding dimension.
embedding_dim = index_vectors.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(index_vectors)

# -------------------------
# Save index and metadata
# -------------------------
# Create the output directory if needed; an existing one is reused.
os.makedirs(VECTOR_STORE_DIR, exist_ok=True)

index_path = os.path.join(VECTOR_STORE_DIR, "faiss_index.bin")
metadata_path = os.path.join(VECTOR_STORE_DIR, "metadata.pkl")

# Persist the vector index and the per-chunk metadata list side by side.
faiss.write_index(index, index_path)
with open(metadata_path, "wb") as metadata_file:
    pickle.dump(metadata_list, metadata_file)

print(f"[✓] Saved FAISS index and metadata for {len(embeddings)} chunks.")

[+] Loading dataset...
[+] Initializing text splitter and model...
[+] Using device: cuda
[+] Chunking text and collecting metadata...


100%|██████████| 248617/248617 [02:50<00:00, 1460.55it/s]


[+] Total chunks: 449658
[+] Generating embeddings...


Batches: 100%|██████████| 14052/14052 [27:25<00:00,  8.54it/s]


[+] Building FAISS index...
[✓] Saved FAISS index and metadata for 449658 chunks.
