**Install Dependencies**

In [None]:
# OCR & image/pdf handling
# !pip install -q pytesseract pdf2image pillow opencv-python

# !apt-get install -y poppler-utils tesseract-ocr > /dev/null

# If you're using Tesseract OCR on Windows:
# Manually install Tesseract OCR: https://github.com/UB-Mannheim/tesseract/wiki
# Then point pytesseract at the executable, either by setting the TESSERACT_CMD
# environment variable or by relying on the machine-specific fallback path below.
import os

import pytesseract

# Machine-specific install location; only used when it actually exists on this machine.
_TESSERACT_FALLBACK = r"C:\Users\Fleming Siow\AppData\Local\Programs\Tesseract-OCR\tesseract.exe"

# Resolution order: explicit environment variable, then the fallback path.
# If neither applies, pytesseract's own default command is left untouched so the
# notebook still runs wherever the tesseract binary is discoverable without an override.
_tesseract_cmd = os.environ.get("TESSERACT_CMD")
if not _tesseract_cmd and os.path.isfile(_TESSERACT_FALLBACK):
    _tesseract_cmd = _TESSERACT_FALLBACK
if _tesseract_cmd:
    pytesseract.pytesseract.tesseract_cmd = _tesseract_cmd

# Embedding & LangChain
# !pip install -q langchain langchain-community faiss-cpu unstructured

In [None]:
# !pip install -U langchain-huggingface

Collecting langchain-huggingface
  Downloading langchain_huggingface-0.1.2-py3-none-any.whl.metadata (1.3 kB)
Downloading langchain_huggingface-0.1.2-py3-none-any.whl (21 kB)
Installing collected packages: langchain-huggingface
Successfully installed langchain-huggingface-0.1.2


In [None]:
# !pip install -U ipywidgets
# !jupyter nbextension enable --py widgetsnbextension

Collecting ipywidgets
  Downloading ipywidgets-8.1.6-py3-none-any.whl.metadata (2.4 kB)
Collecting widgetsnbextension~=4.0.14 (from ipywidgets)
  Downloading widgetsnbextension-4.0.14-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab_widgets~=3.0.14 (from ipywidgets)
  Downloading jupyterlab_widgets-3.0.14-py3-none-any.whl.metadata (4.1 kB)
Downloading ipywidgets-8.1.6-py3-none-any.whl (139 kB)
Downloading jupyterlab_widgets-3.0.14-py3-none-any.whl (213 kB)
Downloading widgetsnbextension-4.0.14-py3-none-any.whl (2.2 MB)
   ---------------------------------------- 0.0/2.2 MB ? eta -:--:--
   -------------------------------------- - 2.1/2.2 MB 10.7 MB/s eta 0:00:01
   ---------------------------------------- 2.2/2.2 MB 9.5 MB/s eta 0:00:00
Installing collected packages: widgetsnbextension, jupyterlab_widgets, ipywidgets
Successfully installed ipywidgets-8.1.6 jupyterlab_widgets-3.0.14 widgetsnbextension-4.0.14


Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: ok


**Imports & Setup**

In [2]:
import os
import json
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
from langchain.vectorstores import FAISS
from langchain.schema import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

from tqdm import tqdm

**Define Folder Paths**

In [3]:
# Base folder holding all categories, relative to the notebook's working directory.
# Kept in agreement with the "../data" root that the metadata-loading cell reads from.
BASE_DIR = "../data"

# Your content categories (one sub-folder name per category)
CATEGORIES = [
    "penalties",
    "forms",
    "notices",
    "guidelines",
    "permits",
    "acts",
    "user_uploads"
]


**Load Metadata**

In [4]:
import os
import json
from glob import glob
from pathlib import Path

# Root of the document tree, relative to the notebook's working directory.
base_dir = "../data"

# Sub-folders of base_dir that are scanned for documents, in processing order.
doc_folders = [
    "forms",
    "penalties",
    "notices",
    "guidelines",
    "permits",
    "acts",
    "user_uploads",
]

# Helper to load metadata per folder
def load_metadata(folder):
    """Return the metadata records of ``folder`` keyed by their filename.

    Reads ``<base_dir>/<folder>/<folder>_metadata.json`` (UTF-8). The parsed JSON
    is iterated and every entry is indexed by its ``"filename"`` value.

    Args:
        folder: Name of a sub-folder of ``base_dir``.

    Returns:
        A dict mapping each entry's ``"filename"`` to the full entry, or an
        empty dict when the metadata file does not exist.
    """
    meta_path = os.path.join(base_dir, folder, f"{folder}_metadata.json")

    # No metadata file for this folder: nothing to index.
    if not os.path.exists(meta_path):
        return {}

    with open(meta_path, "r", encoding="utf-8") as handle:
        records = json.load(handle)

    by_filename = {}
    for record in records:
        by_filename[record["filename"]] = record
    return by_filename

# Collect files and metadata
# File extensions the extraction step can handle (compared case-insensitively).
supported_suffixes = {".pdf", ".png", ".jpg", ".jpeg"}

file_index = []
for folder in doc_folders:
    full_folder = os.path.join(base_dir, folder)
    metadata = load_metadata(folder)

    # glob() yields entries in arbitrary, filesystem-dependent order; sorting keeps
    # file_index (and everything derived from it) reproducible across runs.
    for fpath in sorted(glob(f"{full_folder}/*")):
        if Path(fpath).suffix.lower() not in supported_suffixes:
            continue

        fname = os.path.basename(fpath)
        # Files without a metadata record get an empty dict.
        meta = metadata.get(fname, {})
        file_index.append({
            "filepath": fpath,
            "filename": fname,
            "folder": folder,
            "metadata": meta
        })

print(f"Discovered {len(file_index)} documents with metadata.")

Discovered 24 documents with metadata.


**Extract Text from PDFs and Images**

In [5]:
from pdf2image import convert_from_path
from PIL import Image
import pytesseract
import mimetypes

def extract_text_from_pdf(pdf_path, dpi=200):
    """OCR every page of a PDF and return the combined text.

    The PDF is rendered to page images with ``convert_from_path`` and each page
    image is passed to ``pytesseract.image_to_string``. Page texts are joined
    with a blank line between them and the result is stripped.

    Args:
        pdf_path: Path of the PDF file to read.
        dpi: Rendering resolution handed to ``convert_from_path``. Defaults to
            200, the value previously hard-coded here.

    Returns:
        The extracted text, or ``""`` if anything goes wrong (the error is
        printed with a ``[PDF ERROR]`` prefix instead of being raised).
    """
    try:
        pages = convert_from_path(pdf_path, dpi=dpi)
        page_texts = [pytesseract.image_to_string(page) for page in pages]
        return "\n\n".join(page_texts).strip()
    except Exception as e:
        # Deliberate best-effort: one unreadable file must not abort the batch.
        print(f"[PDF ERROR] {pdf_path}: {e}")
        return ""

def extract_text_from_image(image_path):
    """OCR a single image file and return its text.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The stripped output of ``pytesseract.image_to_string``, or ``""`` if
        anything goes wrong (the error is printed with an ``[IMG ERROR]``
        prefix instead of being raised).
    """
    try:
        # Open via a context manager so the underlying file handle is released
        # as soon as OCR finishes, rather than lingering until garbage collection.
        with Image.open(image_path) as img:
            return pytesseract.image_to_string(img).strip()
    except Exception as e:
        # Deliberate best-effort: one unreadable file must not abort the batch.
        print(f"[IMG ERROR] {image_path}: {e}")
        return ""

**Combine Text and Metadata for Embedding**

In [None]:
# If you're on Windows, note that pdf2image relies on Poppler:
# Install Poppler manually: https://github.com/oschwartz10612/poppler-windows/releases
# Then add the full path of the bin/ folder inside Poppler (e.g. C:\Program Files\poppler\Library\bin) to the Path environment variable.

In [6]:
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

# Collects one string per successfully parsed document (metadata header + extracted text).
docs_with_metadata = []

# When True, files are processed concurrently through a ThreadPoolExecutor; when False,
# they are processed one at a time (multithreaded OCR/PDF parsing is slightly risky for Tesseract concurrency).
USE_THREADING = True  # Set to False to fall back to the sequential loop

def process_file(item):
    """Extract one file's text and prepend a plain-text metadata header.

    Args:
        item: A ``file_index`` entry with the keys ``"filepath"``,
            ``"filename"``, ``"folder"`` and ``"metadata"`` (a dict).

    Returns:
        The header lines, a blank line, then the extracted text; or ``None``
        when the extension is not handled or no text was extracted.
    """
    source_path = item["filepath"]
    suffix = Path(source_path).suffix.lower()

    # Pick the extractor by file type; anything else is skipped.
    if suffix == ".pdf":
        body = extract_text_from_pdf(source_path)
    elif suffix in (".png", ".jpg", ".jpeg"):
        body = extract_text_from_image(source_path)
    else:
        return None

    # Extraction failed or produced nothing.
    if not body:
        return None

    # Build the header: fixed file facts first, then optional metadata fields
    # that fall back to 'unknown' when absent.
    meta = item["metadata"]
    header_lines = [
        f"Filename: {item['filename']}",
        f"Folder: {item['folder']}",
    ]
    optional_fields = (
        ("Agency", "agency"),
        ("Type", "type"),
        ("Category", "category"),
        ("Date", "doc_date"),
    )
    for label, key in optional_fields:
        header_lines.append(f"{label}: {meta.get(key, 'unknown')}")
    header_lines.append(f"Summary: {meta.get('summary', 'No summary available')}")

    return "\n".join(header_lines) + "\n\n" + body

# Run process_file over every indexed file, concurrently or sequentially.
if USE_THREADING:
    with ThreadPoolExecutor() as pool:
        # executor.map yields results in input order, so document order is preserved.
        parsed_results = list(
            tqdm(pool.map(process_file, file_index), total=len(file_index))
        )
else:
    parsed_results = [process_file(entry) for entry in tqdm(file_index)]

# Keep only the files that produced text (process_file returns None otherwise).
docs_with_metadata.extend(parsed for parsed in parsed_results if parsed)

print(f"Parsed {len(docs_with_metadata)} documents successfully.")


  0%|          | 0/24 [00:00<?, ?it/s]

100%|██████████| 24/24 [06:26<00:00, 16.09s/it]

Parsed 24 documents successfully.





**Chunk & Prepare Documents for Indexing**

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter configuration: chunks of at most 500 characters (measured with len)
# and a 50-character overlap; separators are listed from coarsest to finest.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
    chunk_size=500,
    chunk_overlap=50,
    length_function=len,
)

# Split every parsed document and flatten the pieces into one list.
text_chunks = []
for parsed_doc in tqdm(docs_with_metadata, desc="Splitting into chunks"):
    text_chunks.extend(text_splitter.split_text(parsed_doc))

print(f"Prepared {len(text_chunks)} chunks for embedding.")


Splitting into chunks: 100%|██████████| 24/24 [00:00<00:00, 1330.24it/s]

Prepared 3410 chunks for embedding.





In [8]:
# Chunk the OCR + metadata docs
# NOTE(review): this walks the same docs_with_metadata with the same text_splitter
# as the text_chunks cell, so image_chunks looks like a duplicate of text_chunks.
# Confirm whether an image-only subset was intended.
image_chunks = [
    piece
    for parsed_doc in tqdm(docs_with_metadata, desc="Splitting image / OCR docs")
    for piece in text_splitter.split_text(parsed_doc)
]

print(f"Total OCR chunks: {len(image_chunks)}")

Splitting image / OCR docs: 100%|██████████| 24/24 [00:00<00:00, 1351.22it/s]

Total OCR chunks: 3410





**Embed Chunks & Store with FAISS**

In [None]:
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


def _build_and_save_index(chunks, target_dir):
    """Embed ``chunks`` with ``embedding_model``, save the FAISS store to ``target_dir`` and return it."""
    store = FAISS.from_texts(texts=chunks, embedding=embedding_model)
    store.save_local(target_dir)
    return store


# FAISS index for text
faiss_text_store = _build_and_save_index(
    text_chunks, "../vectorstores/faiss_index_multimodal/faiss_docs"
)

# FAISS index for image+OCR
faiss_image_store = _build_and_save_index(
    image_chunks, "../vectorstores/faiss_index_multimodal/faiss_images"
)

print("Saved FAISS text and image vectorstores.")

Saved FAISS text and image vectorstores.
