In [2]:
import json
import numpy as np
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import faiss

In [None]:
file_path = r"outputs/normalized_output.json"
documents = []  # placeholder; replaced by the parsed content below

# Read the whole export as text, dropping surrounding whitespace.
with open(file_path, mode="r", encoding="utf-8") as f:
    raw = f.read().strip()

# The text is parsed as the body of a JSON array: drop any trailing
# comma(s), then wrap it in brackets so json.loads sees a single value.
raw = "[" + raw.rstrip(",") + "]"
documents = json.loads(raw)

In [None]:
# Flatten one level of nesting if needed.
# Each top-level entry is handled on its own: list entries are expanded in
# place and every other entry is kept as-is. Unlike peeking at documents[0],
# this works for an empty list and for a mix of lists and plain records.
documents = [
    item
    for entry in documents
    for item in (entry if isinstance(entry, list) else [entry])
]

print("Flattened documents:", len(documents))

Flattened documents: 537812


In [4]:
# Text chunker: pieces of at most 800 characters, with 100 characters
# shared between neighbouring pieces.
splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)

In [5]:
# Embedding model
# Only the model is created here. `all_embeddings` and `metadata` are
# (re)created by the cells that actually fill them, so re-running this cell
# to reload the model no longer wipes previously computed results.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

In [None]:
BATCH_SIZE = 512  # encode once at least this many chunks are queued

metadata_file = "outputs/metadata.jsonl"

all_embeddings = []  # one numpy array per encoded batch
batch_texts = []     # chunk strings waiting to be encoded
batch_meta = []      # metadata rows, aligned 1:1 with batch_texts


def _flush_batch(meta_out):
    """Encode the queued chunks, write their metadata rows, and reset the queues.

    Args:
        meta_out: Open text file handle that receives one JSON object per line.
    """
    embeddings = model.encode(
        batch_texts,
        batch_size=128,
        convert_to_numpy=True,
        show_progress_bar=False,
    )
    all_embeddings.append(embeddings)

    # Rows are written in the same order as the vectors were produced, so
    # line N of the JSONL file describes row N of the stacked embeddings.
    for m in batch_meta:
        meta_out.write(json.dumps(m) + "\n")

    batch_texts.clear()
    batch_meta.clear()


# Processing
# Opening with "w" truncates any previous run's file; the handle stays open
# for the whole loop instead of being reopened for every batch.
with open(metadata_file, "w", encoding="utf-8") as meta_out:
    for i, doc in enumerate(documents, start=1):
        # Render the record's fields as "key: value" pairs in one string.
        text = " ".join(f"{k}: {v}" for k, v in doc["data"].items())

        for chunk in splitter.split_text(text):
            batch_texts.append(chunk)
            batch_meta.append({
                "chunk_text": chunk,
                "record_id": doc["id"],
                "source_name": doc["source_name"],
                "timestamp": doc["timestamp"]
            })

        # Checked once per record, so a batch may exceed BATCH_SIZE slightly.
        if len(batch_texts) >= BATCH_SIZE:
            _flush_batch(meta_out)

        if i % 5000 == 0:
            print(f"Completed {i} records")

    # Flush remaining
    if batch_texts:
        _flush_batch(meta_out)
print("Completed")

# Save embeddings
if not all_embeddings:
    # np.vstack([]) fails with an opaque message; state the real cause.
    raise ValueError(
        "No chunks were produced from `documents`; nothing to embed or save."
    )

final_embeddings = np.vstack(all_embeddings).astype("float32")
np.save("outputs/embeddings.npy", final_embeddings)

In [None]:
# Read the JSONL metadata back into memory: one parsed object per line.
with open("outputs/metadata.jsonl", "r", encoding="utf-8") as f:
    metadata = [json.loads(line) for line in f]

In [None]:
# Create FAISS index
# `all_embeddings` is a Python list of per-batch arrays and has no `.shape`;
# the stacked float32 matrix produced by the embedding cell is
# `final_embeddings`. (After a kernel restart it can be restored with
# np.load("outputs/embeddings.npy").)
embedding_matrix = np.ascontiguousarray(final_embeddings, dtype="float32")

# Row i of the matrix must correspond to metadata[i], otherwise search hits
# cannot be mapped back to their source chunk.
if len(metadata) != embedding_matrix.shape[0]:
    raise ValueError(
        f"Metadata rows ({len(metadata)}) do not match embedding rows "
        f"({embedding_matrix.shape[0]}); re-run the embedding and metadata cells."
    )

dimension = embedding_matrix.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embedding_matrix)

# Save FAISS index
faiss.write_index(index, "vector_index.faiss")

# Save metadata as normal JSON (optional but useful)
with open("outputs/metadata.json", "w", encoding="utf-8") as f:
    json.dump(metadata, f, indent=2)

print(f"Stored {index.ntotal} embeddings in FAISS")

Stored 2541581 embeddings in FAISS


In [None]:
# Reload the saved index from disk and report its size and dimensionality.
index = faiss.read_index("vector_index.faiss")
print(f"Number of vectors: {index.ntotal}")
print(f"Vector dimension: {index.d}")

Number of vectors: 2541581
Vector dimension: 384
