In [9]:
import pdfplumber, json
from pathlib import Path

def chunk_text(text, chunk_size=400, overlap=80):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
    chunk repeats the last ``overlap`` words of the previous one.

    Args:
        text: Input string; it is tokenised with ``str.split()``.
        chunk_size: Maximum number of words per chunk (must be > 0).
        overlap: Number of words shared by neighbouring chunks
            (must satisfy ``0 <= overlap < chunk_size``).

    Returns:
        A list of chunk strings (words re-joined with single spaces).
        Empty or whitespace-only input yields an empty list.

    Raises:
        ValueError: If ``chunk_size``/``overlap`` would make the window
            stand still or move backwards (previously an infinite loop).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(' '.join(words[start:start + chunk_size]))
        # Once a window reaches the last word, any further window would
        # contain only words already covered by this one, so stop here.
        if start + chunk_size >= len(words):
            break
    return chunks

def extract_text(file_path):
    """Return the plain-text content of a document.

    Files whose suffix is ``.pdf`` (case-insensitive) are opened with
    pdfplumber and read page by page; every page contributes its text
    followed by a newline. Any other file is read as UTF-8 text, with
    undecodable bytes silently dropped.

    Args:
        file_path: A ``pathlib.Path`` pointing at the document.

    Returns:
        The extracted text as a single string.
    """
    if file_path.suffix.lower() != ".pdf":
        return file_path.read_text(encoding="utf-8", errors="ignore")

    with pdfplumber.open(file_path) as pdf:
        # A page without extractable text (e.g. a scanned image) can make
        # extract_text() return None; treat that as an empty page rather
        # than failing on `None + "\n"`.
        return "".join((page.extract_text() or "") + "\n" for page in pdf.pages)

docs_path = Path("docs")
chunks = []

# Path.iterdir() yields entries in arbitrary order; sorting makes the chunk
# order (and therefore everything derived from it) reproducible across runs.
for file in sorted(docs_path.iterdir()):
    # Only regular .txt/.pdf files are ingested; a directory that merely
    # has a matching suffix must not be handed to extract_text().
    if not file.is_file() or file.suffix.lower() not in {".txt", ".pdf"}:
        continue
    text = extract_text(file)
    # chunk_id is the chunk's position within its own document.
    for i, c in enumerate(chunk_text(text)):
        chunks.append({"doc_id": file.name, "chunk_id": i, "text": c})

# Save chunks
# Create the output directory with pathlib: `os` is not imported in this
# cell, so os.makedirs() raises NameError on a fresh kernel.
Path("data").mkdir(parents=True, exist_ok=True)
# One JSON object per line (JSON Lines); ensure_ascii=False keeps non-ASCII
# characters readable instead of \u-escaping them.
with open("data/chunks.jsonl", "w", encoding="utf-8") as f:
    for c in chunks:
        f.write(json.dumps(c, ensure_ascii=False) + "\n")

print("Documents ingested into chunks!")

Documents ingested into chunks!


In [11]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import json

# Read the chunk records back from disk: one JSON document per line.
with open("data/chunks.jsonl", "r", encoding="utf-8") as f:
    chunks = list(map(json.loads, f))

# Only the raw text of each record is needed for embedding.
texts = [record["text"] for record in chunks]

# Embed
# Fail early with a clear message instead of an opaque axis/index error
# further down when there is nothing to embed.
if not texts:
    raise ValueError("No chunks to embed: data/chunks.jsonl is empty.")

model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(texts, convert_to_numpy=True)

# L2-normalise each row so that the inner product used by the index below
# equals cosine similarity. Rows with zero norm are left as zero vectors
# rather than being turned into NaN by a division by zero.
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
norms[norms == 0] = 1.0
embeddings /= norms

# Build FAISS index
# Exact (brute-force) inner-product index over the normalised vectors.
dim = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(embeddings)

# Persist the FAISS index next to the chunk metadata it was built from;
# both files are written into the "index" directory.
import os
os.makedirs("index", exist_ok=True)

faiss.write_index(index, "index/faiss.index")

with open("index/meta.json", "w", encoding="utf-8") as f:
    # Serialise first, then write: same bytes as dumping straight to the file.
    f.write(json.dumps(chunks, ensure_ascii=False))

print("FAISS index built successfully!")


FAISS index built successfully!
