# Task 2

In [None]:
# 📦 Install dependencies (run once)
!pip install -q langchain-huggingface langchain-community sentence-transformers faiss-cpu

# Import libraries
import pandas as pd
import numpy as np
import os
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
import time

# Load only the first SAMPLE_SIZE complaints for faster processing.
# NOTE: this is a head-of-file slice, not a random sample. Passing ``nrows``
# lets pandas stop parsing after SAMPLE_SIZE rows instead of reading the whole
# CSV into memory and then throwing away everything past the head.
SAMPLE_SIZE = 20000  # Sample size here
df = pd.read_csv('../data/filtered_complaints.csv', nrows=SAMPLE_SIZE)
print(f"Loaded {len(df)} complaints (sample)")

# Build the text splitter from an explicit settings mapping.
# Chunk length is measured with ``len`` (i.e. characters): at most 512 per
# chunk, with a configured overlap of 50. The separator list runs from
# paragraph breaks down to the empty string.
splitter_settings = dict(
    chunk_size=512,
    chunk_overlap=50,
    length_function=len,
    separators=["\n\n", "\n", ". ", " ", ""],
)
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

# Chunk texts and prepare metadata.
# ``documents`` and ``metadatas`` are parallel lists: entry i of each one
# describes the same chunk.
documents, metadatas = [], []
skipped = 0
for _, row in df.iterrows():
    narrative = row['clean_narrative']
    # Empty CSV cells come back from pandas as NaN (a float), which the
    # splitter cannot process; skip such rows instead of crashing mid-run.
    if not isinstance(narrative, str):
        skipped += 1
        continue
    for chunk in text_splitter.split_text(narrative):
        documents.append(chunk)
        metadatas.append({
            # ``.get`` tolerates a missing "Complaint ID" column (-> None).
            "complaint_id": row.get("Complaint ID", None),
            "product": row["Product"],
            "word_count": len(chunk.split())
        })

if skipped:
    print(f"Skipped {skipped} complaints without a text narrative")
print(f"Created {len(documents)} chunks from {len(df)} complaints")
# Avoid ZeroDivisionError when the DataFrame is empty.
avg_chunks = len(documents) / len(df) if len(df) else 0.0
print(f"Average chunks per complaint: {avg_chunks:.2f}")

# Initialize embeddings (use the GPU when available, otherwise the CPU)
import torch  # pulled in as a dependency of sentence-transformers

model_name = "sentence-transformers/all-MiniLM-L6-v2"
# A hard-coded 'cuda' device raises on CPU-only runtimes, so pick the device
# at run time and fall back to the CPU when no CUDA device is visible.
embedding_device = 'cuda' if torch.cuda.is_available() else 'cpu'
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs={'device': embedding_device},
    # Normalization is requested from the encoder for every embedding.
    encode_kwargs={'normalize_embeddings': True}
)

# Batch processing to monitor progress.
# With no chunks the loop below never runs and ``vector_store`` would stay
# None, only to fail later with an opaque AttributeError when it is used.
# Fail here instead, with a message that names the real problem.
if not documents:
    raise ValueError("No chunks to index: `documents` is empty")

batch_size = 10000
batched_documents = [documents[i:i+batch_size] for i in range(0, len(documents), batch_size)]
batched_metadatas = [metadatas[i:i+batch_size] for i in range(0, len(metadatas), batch_size)]

vector_store = None

for i, (docs, metas) in enumerate(zip(batched_documents, batched_metadatas)):
    print(f"\n📦 Processing batch {i+1}/{len(batched_documents)}... ({len(docs)} chunks)")
    start = time.time()
    # Embed this batch and build a standalone index for it.
    batch_vs = FAISS.from_texts(texts=docs, embedding=embeddings, metadatas=metas)
    print(f"⏱️ Batch {i+1} took {(time.time() - start)/60:.2f} minutes")

    # The first batch becomes the store; later batches are merged into it.
    if vector_store is None:
        vector_store = batch_vs
    else:
        vector_store.merge_from(batch_vs)

# Persist the merged index under the Google Drive path below, creating the
# target directory first if it does not exist yet, then report the result.
vector_store_path = "/content/drive/MyDrive/vector_store/faiss_index_sample"
os.makedirs(name=vector_store_path, exist_ok=True)
vector_store.save_local(vector_store_path)

print(f"\n✅ Sample vector store saved to: {vector_store_path}")
indexed_total = vector_store.index.ntotal
print(f"Total vectors in index: {indexed_total}")


Loaded 20000 complaints (sample)
Created 39664 chunks from 20000 complaints
Average chunks per complaint: 1.98

📦 Processing batch 1/4... (10000 chunks)
⏱️ Batch 1 took 0.17 minutes

📦 Processing batch 2/4... (10000 chunks)
⏱️ Batch 2 took 0.16 minutes

📦 Processing batch 3/4... (10000 chunks)
⏱️ Batch 3 took 0.17 minutes

📦 Processing batch 4/4... (9664 chunks)
⏱️ Batch 4 took 0.18 minutes

✅ Sample vector store saved to: /content/drive/MyDrive/vector_store/faiss_index_sample
Total vectors in index: 39664
