In [2]:
# Before you start, make sure you have installed the required pgvector extension
# on MacOS you can use brew to install the postgres app which included the extension
# $ brew install --cask postgres-unofficial
# create a new database with the name `vondel`

# Install required libraries
%pip install psycopg2-binary openai numpy



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.1[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:

import psycopg2
import numpy as np
import os
from openai import OpenAI

from dotenv import load_dotenv

load_dotenv()

OpenAI.api_key = os.getenv("OPENAI_API_KEY")

client = OpenAI()


In [5]:
def connect_to_db():
    conn = psycopg2.connect(
        dbname=os.getenv("DB_NAME"),
        user=os.getenv("DB_USER"),
        password=os.getenv("DB_PASSWORD"),
        host=os.getenv("DB_HOST"),
        port=os.getenv("DB_PORT")
    )
    return conn

def initialize_db(conn):
    cur = conn.cursor()
    cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")
    cur.execute("""
    CREATE TABLE IF NOT EXISTS pages (
        id SERIAL PRIMARY KEY,
        content TEXT,
        embedding VECTOR
    );
    """)
    conn.commit()
    cur.close()

def insert_document(conn, content, embedding):
    cur = conn.cursor()
    cur.execute(
        "INSERT INTO pages (content, embedding) VALUES (%s, %s)",
        (content, embedding.tolist())
    )
    conn.commit()
    cur.close()
    
def generate_embedding(text):
    response = client.embeddings.create(input=text, model="text-embedding-3-small")
    return np.array(response.data[0].embedding)


conn = connect_to_db()
initialize_db(conn)

# Get all OCR files from the ocr-step-2 directory
improved_files = [f'./ocr-step-2/{file}' for file in os.listdir('./ocr-step-2') if file.endswith('.txt')]

# Generate embeddings for all files, inset in database
for file in improved_files:
    with open(file, 'r') as f:
        content = f.read()
        embedding = generate_embedding(content)
        insert_document(conn, content, embedding)
        print(f"Inserted {file} into the database")

conn.close()


Inserted ./ocr-step-2/page_127.txt into the database
Inserted ./ocr-step-2/page_133.txt into the database
Inserted ./ocr-step-2/page_24.txt into the database
Inserted ./ocr-step-2/page_30.txt into the database
Inserted ./ocr-step-2/page_31.txt into the database
Inserted ./ocr-step-2/page_25.txt into the database
Inserted ./ocr-step-2/page_132.txt into the database
Inserted ./ocr-step-2/page_126.txt into the database
Inserted ./ocr-step-2/page_130.txt into the database
Inserted ./ocr-step-2/page_124.txt into the database
Inserted ./ocr-step-2/page_118.txt into the database
Inserted ./ocr-step-2/page_33.txt into the database
Inserted ./ocr-step-2/page_27.txt into the database
Inserted ./ocr-step-2/page_26.txt into the database
Inserted ./ocr-step-2/page_32.txt into the database
Inserted ./ocr-step-2/page_119.txt into the database
Inserted ./ocr-step-2/page_125.txt into the database
Inserted ./ocr-step-2/page_131.txt into the database
Inserted ./ocr-step-2/page_109.txt into the database
I