# Each requirement is quoted: an unquoted ">" is interpreted by the shell as
# output redirection, which drops the version constraint and creates stray files.
pip install --upgrade "huggingface-hub>=0.15.1" "transformers>=4.30" "accelerate>=0.18" "sentence-transformers>=2.2.2"

In [None]:
import os
import json
from glob import glob
from sentence_transformers import SentenceTransformer
import faiss
import pickle

# 1. Load Q&A data
data_dir = r"C:\Users\15278\CHATBOT\data_crawl-20250703T175549Z-1-001\data_crawl"


def load_qa_pairs(paths):
    """Build one text per (question, answer) pair found in the given JSON files.

    Each file must be a JSON object whose "data" key holds a list of items.
    Every item contributes one text per entry of its "answer_texts" list,
    prefixed with the item's "question_text". A missing or null question is
    treated as an empty string, and missing or null answers as an empty list.

    Args:
        paths: Iterable of JSON file paths, read in the given order.

    Returns:
        A list of strings, each a "Q: " line followed by an "A: " line.

    Raises:
        KeyError: If a file has no top-level "data" key.
    """
    pairs = []
    for path in paths:
        with open(path, "r", encoding="utf-8") as f:
            records = json.load(f)["data"]
        for item in records:
            # "or" also covers explicit JSON nulls, which .get() defaults do not.
            question = item.get("question_text") or ""
            for answer in item.get("answer_texts") or []:
                pairs.append(f"Q: {question}\nA: {answer}")
    return pairs


# glob() returns matches in arbitrary order; sorting keeps the order of
# all_texts (and therefore of the index rows built from it) stable across runs.
json_files = sorted(glob(os.path.join(data_dir, "quora_all_scraped_*_extracted.json")))
all_texts = load_qa_pairs(json_files)

print(f" Loaded Q&A pairs: {len(all_texts)}")

# 2. Use complete text directly (no chunking)
texts = all_texts

# Stop before loading the model when nothing was loaded: encoding an empty
# list does not yield a 2-D array, so embeddings.shape[1] below would fail
# with an unhelpful IndexError.
if not texts:
    raise ValueError("No Q&A pairs were loaded; check data_dir and the JSON file pattern.")

# 3. Load English Embedding model
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# 4. Generate embeddings & build FAISS index
# One embedding row per text, in the same order as `texts`.
embeddings = model.encode(texts, show_progress_bar=True, convert_to_numpy=True)
dimension = embeddings.shape[1]  # vector size, taken from the encoded output
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

# 5. Persist the results: the original texts as a pickle and the FAISS index
#    in its own file, so a later cell can reload both.
chunks_path = "qa_chunks.pkl"
index_path = "qa_index.faiss"

with open(chunks_path, "wb") as chunks_file:
    pickle.dump(texts, chunks_file)

faiss.write_index(index, index_path)
print(" Vector database built and saved successfully!")

In [None]:
import faiss
import pickle
import numpy as np

# 1. Load index
index = faiss.read_index("qa_index.faiss")

# 2. Load text chunks (qa_chunks.pkl)
# NOTE: unpickling can execute arbitrary code; only load a qa_chunks.pkl that
# was produced by this notebook or comes from a trusted source.
with open("qa_chunks.pkl", "rb") as f:
    qa_chunks = pickle.load(f)

# 3. Check if index and text chunks count match
# Raised explicitly instead of using `assert`, which is skipped under `python -O`.
if index.ntotal != len(qa_chunks):
    raise ValueError(
        f"Index count ({index.ntotal}) and text chunks count ({len(qa_chunks)}) do not match"
    )

# 4. Get first 10 vectors + corresponding texts
print("First 10 vectors in index and their corresponding texts:\n")

# Cap the preview at what is actually available so a small index (or a short
# chunk list) does not fail on a missing entry.
preview_count = min(10, index.ntotal, len(qa_chunks))

for i in range(preview_count):
    vector = index.reconstruct(i)  # Get the i-th stored vector
    text = qa_chunks[i]            # Get the text stored at the same position
    print(f"=== Entry {i} ===")
    print(f"[Vector length]: {len(vector)}")
    print(f"[Text content]: {text}\n")

In [None]:
import random
import numpy as np
import faiss
import pickle

# Assuming texts and embeddings are already prepared
# Guard against stale notebook state: every text needs exactly one embedding
# row, otherwise the index-based selection below would pair the wrong items.
if len(texts) != len(embeddings):
    raise ValueError(
        f"texts ({len(texts)}) and embeddings ({len(embeddings)}) have different lengths"
    )

# 1. Shuffle index order
# A dedicated, seeded generator makes the split reproducible between runs
# without touching the global `random` state.
split_seed = 42
indices = list(range(len(texts)))
random.Random(split_seed).shuffle(indices)

# 2. Split by ratio
train_ratio = 0.8
train_size = int(len(texts) * train_ratio)  # rounded down
train_indices = indices[:train_size]
test_indices = indices[train_size:]

# 3. Get train and test texts and vectors separately
# The same index lists select both texts and embedding rows, so position i in
# each split's texts matches position i in that split's embeddings.
train_texts = [texts[i] for i in train_indices]
test_texts = [texts[i] for i in test_indices]
train_embeddings = embeddings[train_indices]
test_embeddings = embeddings[test_indices]

# 4. Save texts
# Write each split to its own pickle file: training texts first, then test texts.
for split_path, split_texts in (
    ("train_texts.pkl", train_texts),
    ("test_texts.pkl", test_texts),
):
    with open(split_path, "wb") as out_file:
        pickle.dump(split_texts, out_file)

# 5. Build and save corresponding FAISS indices
# Both indices use the vector size of the full embedding matrix.
dimension = embeddings.shape[1]
train_index = faiss.IndexFlatL2(dimension)
test_index = faiss.IndexFlatL2(dimension)

# Fill each index with its split's vectors and write it to disk, train first.
for split_index, split_vectors, index_path in (
    (train_index, train_embeddings, "train_index.faiss"),
    (test_index, test_embeddings, "test_index.faiss"),
):
    split_index.add(split_vectors)
    faiss.write_index(split_index, index_path)

train_count = len(train_texts)
test_count = len(test_texts)
print(f"Training set size: {train_count}, Test set size: {test_count}")
print(" Splitting completed and saved!")