In [5]:
%reload_ext autoreload
%autoreload 2

In [6]:
import os
from openai import AsyncOpenAI
import asyncio
from typing import List
from concurrent.futures import ProcessPoolExecutor, as_completed
from tqdm import tqdm
import json
import logging
from dotenv import load_dotenv
import fitz
from pinecone import Pinecone

from src.chat_with_jfk_files.constants import PDF_BASE_DIR


load_dotenv()
client = AsyncOpenAI()


# Prepare Data

In [3]:
def filter_pdfs(min_pages: int, size_cutoff_percentage: float) -> list:
    """
    Filter PDFs based on the number of pages and size.

    :param base_dir: The directory to filter PDFs from.
    :param min_pages: The minimum number of pages a PDF must have to be included.
    :return: A list of filtered PDF paths.
    """
    filename_to_path = {}
    for folder in os.listdir(PDF_BASE_DIR):
        if folder == '.DS_Store':
            continue
        folder_path = os.path.join(PDF_BASE_DIR, folder)
        for file in os.listdir(folder_path):
            if file.endswith('.pdf'):
                pdf_path = os.path.join(folder_path, file)
                basename = os.path.basename(pdf_path)

                # we only want unique filenames
                if basename not in filename_to_path:
                    filename_to_path[basename] = pdf_path
    
    pdf_paths = list(filename_to_path.values())

    filtered_pdf_paths = []
    for pdf_path in pdf_paths:
        try:
            doc = fitz.open(pdf_path)
            page_count = len(doc)

            if page_count >= min_pages:
                filtered_pdf_paths.append(pdf_path)

        except Exception as e:
            print(f"Error processing {pdf_path}: {e}")

    pdf_sizes = [(pdf, os.path.getsize(pdf)) for pdf in filtered_pdf_paths]
    pdf_sizes.sort(key=lambda x: x[1])
    cutoff_index = int(len(pdf_sizes) * size_cutoff_percentage)
    filtered_pdf_paths = [pdf for pdf, _ in pdf_sizes[cutoff_index:]]

    return filtered_pdf_paths

filtered_pdfs = filter_pdfs(min_pages=5, size_cutoff_percentage=0.2)
total_size_bytes = sum(os.path.getsize(f) for f in filtered_pdfs)
total_size_gb = total_size_bytes / 1024 ** 3

print(f"Total files: {len(filtered_pdfs)}")
print(f"Total size: {total_size_gb:.2f} GB")

Total files: 40
Total size: 0.14 GB


# OCR

In [None]:
from src.chat_with_jfk_files.ocr import extract_text_from_pdf

def process_pdfs_parallel(paths: list, output_file: str, max_workers: int = 4):
    """
    Process a list of PDFs in parallel with logging, progress tracking, and incremental saving.
    
    :param paths: List of PDF file paths to process.
    :param output_file: Path to the JSON file for saving results.
    :param max_workers: Number of parallel workers.
    """
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        future_to_path = {executor.submit(extract_text_from_pdf, path): path for path in paths}
        for future in tqdm(as_completed(future_to_path), total=len(paths), desc="Processing PDFs"):
            path = future_to_path[future]
            try:
                text = future.result()
                parent_folder = os.path.basename(os.path.dirname(path))
                filename = os.path.basename(path)
                result = {"path": os.path.join(parent_folder, filename), "text": text}

                # append to file
                with open(output_file, "a") as f:
                    f.write(json.dumps(result) + "\n")

            except Exception as e:
                print(f"Error processing {path}: {e}")

output_jsonl_file = "ocr_output.jsonl"
process_pdfs_parallel(filtered_pdfs, output_jsonl_file, max_workers=5)

Processing PDFs:  98%|█████████▊| 39/40 [03:58<00:06,  6.10s/it]


# Create Chunks

In [7]:
def create_chunks(documents: list, min_chunk_length: int = 2500) -> list:
    """
    Create chunks of text from a list of documents

    document.keys() -> ['path', 'text']

    :param documents: List of documents to chunk
    :param min_chunk_length: The minimum length of a chunk in characters
    :return: A list of chunks
    """

    chunks = []
    for document in documents:
        current_chunk = ""
        
        
        doctext = document['text']
        parts = doctext.split('\n\n')
        
        
        
        for part in parts:
            part = part.strip()
            if not part:
                continue
            if current_chunk:
                if len(current_chunk) + len(part) + 1 > min_chunk_length:
                    chunks.append({"path": document["path"], "chunk": current_chunk})
                    current_chunk = part
                else:
                    current_chunk += " " + part
            else:
                current_chunk = part
        if current_chunk:
            chunks.append({"path": document["path"], "chunk": current_chunk})
    return chunks


with open("ocr_output.jsonl", "r", encoding="utf-8") as f:
    documents = [json.loads(line) for line in f]
chunks = create_chunks(documents)
print(len(chunks))

397


# Create Embeddings

In [8]:
from src.chat_with_jfk_files import llm

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s")
logging.getLogger("httpx").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)
logging.getLogger("openai").setLevel(logging.WARNING)



async def embed_chunk(chunk: dict) -> dict:
    """
    Embed a single chunk.

    :param chunk: The chunk to embed
    :return: dict
    """
    response = await llm.embed(chunk["chunk"])
    return {
        "path": chunk["path"],
        "chunk": chunk["chunk"],
        "embedding": response
    }

async def process_batch(batch: List[dict], file_path: str) -> None:
    """
    Process a batch of chunks and append embeddings to a file.

    :param batch: A list of chunks to embed
    :param file_path: The file to append results to
    """
    embeddings = await asyncio.gather(*(embed_chunk(chunk) for chunk in batch))
    with open(file_path, 'a') as file:
        for record in embeddings:
            file.write(json.dumps(record) + '\n')

async def create_embeddings_in_batches(chunks: List[dict], batch_size: int, file_path: str) -> None:
    """
    Create embeddings for chunks in batches and write to a file.

    :param chunks: A list of chunks to embed
    :param batch_size: The number of chunks to process in each batch
    :param file_path: The file to save embeddings to
    """
    total_chunks = len(chunks)
    for start in range(0, total_chunks, batch_size):
        end = start + batch_size
        batch = chunks[start:end]
        remaining = total_chunks - end
        logging.info(f"Processing batch {start // batch_size + 1} ({start} to {end - 1})... "
                     f"{remaining} chunks remaining out of {total_chunks}.")
        await process_batch(batch, file_path)


await create_embeddings_in_batches(chunks, batch_size=100, file_path="embeddings.jsonl")

INFO:root:Processing batch 1 (0 to 99)... 297 chunks remaining out of 397.
INFO:root:Processing batch 2 (100 to 199)... 197 chunks remaining out of 397.
INFO:root:Processing batch 3 (200 to 299)... 97 chunks remaining out of 397.
INFO:root:Processing batch 4 (300 to 399)... -3 chunks remaining out of 397.


# Upsert Embeddings to Pinecone

In [9]:
import os
import json
import logging
from typing import List
from pinecone import Pinecone

logging.basicConfig(level=logging.INFO)

def upsert_to_pinecone(
    embeddings: List[dict], 
    upsert_batch_size: int = 100,
    namespace: str = "jfk-docs-2025-master"
) -> object:
    """
    Upsert embeddings to Pinecone in batches with logging.
    
    :param embeddings: List of embedding dictionaries.
    :param upsert_batch_size: Number of embeddings per upsert batch.
    :param namespace: Pinecone namespace.
    :return: Pinecone index object.
    """
    logging.info("Initializing Pinecone client.")
    pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
    index = pc.Index("chat-with-jfk-files")
    
    total_embeddings = len(embeddings)
    logging.info(f"Starting upsert of {total_embeddings} embeddings with batch size {upsert_batch_size}.")
    
    for batch_start in range(0, total_embeddings, upsert_batch_size):
        batch = embeddings[batch_start : batch_start + upsert_batch_size]
        vectors = [
            {
                "id": f"chunk-{str(batch_start + j).zfill(6)}",
                "values": emb['embedding'],
                "metadata": {"text": emb['chunk'], "path": emb['path']}
            }
            for j, emb in enumerate(batch)
        ]
        batch_num = batch_start // upsert_batch_size + 1
        logging.info(f"Upserting batch {batch_num} "
                     f"with {len(batch)} embeddings (indices {batch_start} to {batch_start + len(batch) - 1}).")
        index.upsert(vectors=vectors, namespace=namespace)
    
    logging.info("Upsert complete.")
    return index

with open("embeddings.jsonl", "r", encoding="utf-8") as f:
    embeddings = [json.loads(line) for line in f]

index = upsert_to_pinecone(embeddings)
index.describe_index_stats()

INFO:root:Initializing Pinecone client.
INFO:pinecone_plugin_interface.logging:Discovering subpackages in _NamespacePath(['/Users/jamievoynow/Desktop/code/chat-with-jfk-files/.venv/lib/python3.11/site-packages/pinecone_plugins'])
INFO:pinecone_plugin_interface.logging:Looking for plugins in pinecone_plugins.inference
INFO:pinecone_plugin_interface.logging:Installing plugin inference into Pinecone
INFO:pinecone_plugin_interface.logging:Discovering subpackages in _NamespacePath(['/Users/jamievoynow/Desktop/code/chat-with-jfk-files/.venv/lib/python3.11/site-packages/pinecone_plugins'])
INFO:pinecone_plugin_interface.logging:Looking for plugins in pinecone_plugins.inference
INFO:root:Starting upsert of 397 embeddings with batch size 100.
INFO:root:Upserting batch 1 with 100 embeddings (indices 0 to 99).
INFO:root:Upserting batch 2 with 100 embeddings (indices 100 to 199).
INFO:root:Upserting batch 3 with 100 embeddings (indices 200 to 299).
INFO:root:Upserting batch 4 with 97 embeddings (i

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'epstein': {'vector_count': 580},
                'jfk-docs': {'vector_count': 28386},
                'jfk-docs-2025': {'vector_count': 9200}},
 'total_vector_count': 38166}

In [None]:
index.describe_index_stats()