In [8]:
# IPython autoreload in mode 2: imported modules are reloaded before each cell executes.
%reload_ext autoreload
%autoreload 2

In [9]:
import os
from openai import OpenAI
import numpy as np
import asyncio
from typing import List
from pdf2image import convert_from_path
import pytesseract
from concurrent.futures import ThreadPoolExecutor
import polars as pl
from dotenv import load_dotenv

# Load variables from a .env file into the process environment before the client is built.
load_dotenv()
# No API key is passed explicitly; presumably OpenAI() picks up OPENAI_API_KEY
# from the environment populated above — TODO confirm.
client = OpenAI()

In [10]:
def extract_text_from_pdf(path: str) -> str:
    """Return the OCR text of every page of the PDF at ``path``.

    The PDF is converted to a sequence of page images with
    ``convert_from_path``; each image is passed through
    ``pytesseract.image_to_string`` and the per-page strings are concatenated
    in page order with no separator added between pages.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The concatenated OCR output, or ``""`` when no pages are produced.
    """
    pages = convert_from_path(path)
    page_texts = [pytesseract.image_to_string(page) for page in pages]
    return "".join(page_texts)

def process_pdfs_parallel(directory: str, max_workers: int = 4, max_files: int = 500) -> List[str]:
    """OCR the PDFs in ``directory`` concurrently and return their texts.

    File names ending in ``.pdf`` are selected, sorted by name, and capped at
    ``max_files``. Sorting makes the selection and the order of the returned
    texts reproducible (``os.listdir`` returns entries in arbitrary order), and
    the cap is applied *after* filtering so that non-PDF entries do not count
    against it.

    Args:
        directory: Directory to scan (not recursive).
        max_workers: Number of worker threads running ``extract_text_from_pdf``.
        max_files: Maximum number of PDFs to process.

    Returns:
        One text per processed PDF, in sorted file-name order.

    Raises:
        Any exception raised by ``extract_text_from_pdf`` for a file propagates
        when the results are collected.
    """
    pdf_names = sorted(name for name in os.listdir(directory) if name.endswith(".pdf"))
    pdf_files = [os.path.join(directory, name) for name in pdf_names[:max_files]]

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map yields results in input order, so texts line up with pdf_files.
        return list(executor.map(extract_text_from_pdf, pdf_files))

# Directory holding the source PDFs. Set JFK_PDF_DIR (an entry in .env works,
# since load_dotenv() runs in the imports cell) to point somewhere else; the
# fallback is the path this notebook was originally run with.
PDF_DIR = os.environ.get("JFK_PDF_DIR", '/Users/jamievoynow/Desktop/jfk_pdfs')
documents = process_pdfs_parallel(PDF_DIR, max_workers=4)

In [11]:
def create_chunks(documents: list, min_chunk_length: int = 250) -> list:
    """Group paragraphs into chunks of at least ``min_chunk_length`` characters.

    Each document is split on blank lines (``'\\n\\n'``). Paragraphs are
    accumulated, joined by a single space, until the accumulated length plus
    the next paragraph reaches ``min_chunk_length``; that paragraph is included
    and the chunk is emitted. Whatever remains after the last document is
    emitted as a final, possibly shorter, chunk.

    Blank or whitespace-only paragraphs are skipped, so they neither add stray
    spaces inside a chunk nor produce empty-string chunks.

    NOTE: the accumulator is not reset between documents, so a chunk may
    combine the tail of one document with the start of the next.

    Args:
        documents: Iterable of document texts (strings).
        min_chunk_length: Length threshold, in characters, at which a chunk is
            closed. The joining spaces are not counted toward it.

    Returns:
        List of non-empty chunk strings with surrounding whitespace stripped.
    """
    chunks = []
    buffer = ""

    for document in documents:
        for part in document.split('\n\n'):
            if not part.strip():
                # Runs of 3+ newlines produce empty parts; ignore them.
                continue

            candidate = f"{buffer} {part}" if buffer else part
            if len(buffer) + len(part) >= min_chunk_length:
                chunks.append(candidate.strip())
                buffer = ""
            else:
                buffer = candidate

    tail = buffer.strip()
    if tail:
        chunks.append(tail)

    return chunks


# Chunk the extracted document texts using the default 250-character minimum.
chunks = create_chunks(documents)

In [12]:
from pathlib import Path

async def fetch_batch_embeddings(
    texts: List[str],
    model: str = "text-embedding-3-small"
) -> List[List[float]]:
    """Fetch embeddings for a single batch of texts.

    The module-level ``client`` is synchronous, so the request is run in a
    worker thread via ``asyncio.to_thread`` to avoid blocking the event loop.

    Args:
        texts: Batch of input strings, sent in one request.
        model: Name of the embedding model to use.

    Returns:
        One embedding per entry of ``response.data``, in the order the
        response lists them. An empty ``texts`` returns ``[]`` without issuing
        a request.
    """
    if len(texts) == 0:
        # Nothing to embed; skip the round-trip rather than send an empty input.
        return []

    response = await asyncio.to_thread(
        client.embeddings.create,
        input=texts,
        model=model,
    )
    return [item.embedding for item in response.data]

async def get_batch_embeddings_async(
    texts: List[str],
    model: str = "text-embedding-3-small",
    batch_size: int = 16
) -> np.ndarray:
    """Embed ``texts`` in batches of ``batch_size``, running the batches concurrently.

    ``texts`` is sliced into consecutive batches, one ``fetch_batch_embeddings``
    call is made per batch, and all calls are awaited together with
    ``asyncio.gather``. ``gather`` returns results in the order the awaitables
    were passed, so the flattened output lines up with ``texts``.

    Args:
        texts: Strings to embed.
        model: Name of the embedding model, forwarded to each batch call.
        batch_size: Number of texts per request; must be positive.

    Returns:
        A NumPy array built from the flattened list of embeddings (an empty
        array when ``texts`` is empty).

    Raises:
        ValueError: If ``batch_size`` is not positive. Previously a negative
            value produced an empty range and silently returned no embeddings.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    batches = [
        texts[start:start + batch_size]
        for start in range(0, len(texts), batch_size)
    ]
    results = await asyncio.gather(
        *(fetch_batch_embeddings(batch, model) for batch in batches)
    )
    return np.array([embedding for batch in results for embedding in batch])


async def save_embeddings(filepath: Path, texts: List[str], embeddings: np.ndarray):
    """Write texts and their embeddings to a Parquet file.

    Missing parent directories of ``filepath`` are created first. A Polars
    DataFrame with a ``text`` column and an ``embedding`` column (each
    embedding converted with ``.tolist()``) is then built and written with
    ``write_parquet`` in a worker thread.

    Args:
        filepath: Destination of the Parquet file.
        texts: Values for the ``text`` column.
        embeddings: Iterable of array-like rows, one per text.
    """
    target = Path(filepath)
    target.parent.mkdir(parents=True, exist_ok=True)

    vectors = []
    for vector in embeddings:
        vectors.append(vector.tolist())

    frame = pl.DataFrame({"text": texts, "embedding": vectors})
    await asyncio.to_thread(frame.write_parquet, str(target))

async def load_embeddings(filepath: Path) -> pl.DataFrame:
    """Read a Parquet file into a Polars DataFrame.

    The read is delegated to ``pl.read_parquet`` in a worker thread so the
    event loop is not blocked while the file is loaded.

    Args:
        filepath: Location of the Parquet file; passed on as a string.

    Returns:
        The DataFrame returned by ``pl.read_parquet``.
    """
    parquet_path = str(filepath)
    frame = await asyncio.to_thread(pl.read_parquet, parquet_path)
    return frame


# Embed every chunk. The bare `await` relies on the notebook's support for
# top-level await; it is not valid in a plain Python script.
# NOTE(review): this call runs on every execution of the cell, even when the
# Parquet file written below already exists — consider a cache check.
embeddings = await get_batch_embeddings_async(
    chunks, model="text-embedding-3-small", batch_size=16
)

# Persist chunk texts and their vectors; the relative path resolves against the
# kernel's current working directory.
output_file = Path("src/chat_with_jfk_files/embeddings.parquet")
await save_embeddings(embeddings=embeddings, texts=chunks, filepath=output_file)

# Read the file back into a Polars DataFrame.
df = await load_embeddings(output_file)
