In [1]:
import io

import psycopg2
import pytesseract
from pdf2image import convert_from_path

from db import Database

# Handle to the project-local Database wrapper; the cells below call
# db.execute(...) on it to create the table and store the pages.
db = Database()

In [2]:
# Schema of the per-page store: one row per PDF page, identified by a unique
# page_number, with the page image as JPEG bytes and the raw OCR text.
# ocr_improved and markdown are not written by the initial OCR import.
# IF NOT EXISTS lets this cell be re-run without failing on an existing table.
pages_table_ddl = """
    CREATE TABLE IF NOT EXISTS pages (
        id SERIAL PRIMARY KEY,
        page_number INTEGER UNIQUE,
        image_jpeg BYTEA,
        ocr_text TEXT,
        ocr_improved TEXT,
        markdown TEXT
    )
"""

db.execute(pages_table_ddl)

In [3]:
# Source PDF, given relative to the notebook's working directory.
pdf_path = './input-data/vond001brie02_01.pdf'
# Convert the PDF into `pages`; the next cell walks it item by item, OCRs each
# one and stores it as a JPEG.
# NOTE(review): pdf2image is expected to return one in-memory image per PDF
# page here — confirm memory use is acceptable for large documents.
pages = convert_from_path(pdf_path)


In [4]:
# OCR every rendered page and upsert it into the `pages` table.
# Page numbers are 1-based, so enumerate starts at 1.
for page_number, page in enumerate(pages, start=1):
    # Use brew install tesseract-lang to install additional languages
    text = pytesseract.image_to_string(page)

    # Encode through the public Image.save API instead of the private
    # _repr_jpeg_ display hook, so an encoding failure raises here rather
    # than risking an invalid value being stored.
    jpeg_buffer = io.BytesIO()
    page.save(jpeg_buffer, format='JPEG')
    image = jpeg_buffer.getvalue()

    # Upsert keyed on page_number: re-running the cell overwrites the stored
    # image and OCR text of an existing page instead of failing on the
    # UNIQUE constraint.
    db.execute("""
        INSERT INTO pages (page_number, image_jpeg, ocr_text)
        VALUES (%s, %s, %s)
        ON CONFLICT (page_number) DO UPDATE
        SET image_jpeg = EXCLUDED.image_jpeg,
            ocr_text = EXCLUDED.ocr_text
    """, (page_number, psycopg2.Binary(image), text))

    print(f'Written page {page_number}.')

print(f'Done! Written {len(pages)} pages.')
    


Written page 1.
Written page 2.
Written page 3.
Written page 4.
Written page 5.
Written page 6.
Written page 7.
Written page 8.
Written page 9.
Written page 10.
Written page 11.
Written page 12.
Written page 13.
Written page 14.
Written page 15.
Written page 16.
Written page 17.
Written page 18.
Written page 19.
Written page 20.
Written page 21.
Written page 22.
Written page 23.
Written page 24.
Written page 25.
Written page 26.
Written page 27.
Written page 28.
Written page 29.
Written page 30.
Written page 31.
Written page 32.
Written page 33.
Written page 34.
Written page 35.
Written page 36.
Written page 37.
Written page 38.
Written page 39.
Written page 40.
Written page 41.
Written page 42.
Written page 43.
Written page 44.
Written page 45.
Written page 46.
Written page 47.
Written page 48.
Written page 49.
Written page 50.
Written page 51.
Written page 52.
Written page 53.
Written page 54.
Written page 55.
Written page 56.
Written page 57.
Written page 58.
Written page 59.
Writte

## Improving results with GPT-4 Vision

In [3]:
import base64
from llm import LLM
from db import Database

# LLM client; process_image() below sends its requests through client.generate(...).
client = LLM()
# Database handle used below to read the stored pages and write back the improved text.
db = Database()

def encode_image(binary_image):
    """Return the base64 encoding of ``binary_image`` as a text string.

    Args:
        binary_image: Raw image data as a bytes-like object.

    Returns:
        The base64 representation, decoded to ``str``.
    """
    encoded_bytes = base64.b64encode(binary_image)
    return str(encoded_bytes, 'utf-8')

def process_image(base64_image, text_path):
    """Ask the LLM for an improved transcription of a single page.

    Builds a prompt made of the extraction rules followed by the page's
    existing OCR text, and sends it together with the page image (embedded as
    a base64 JPEG data URL) in one user message through ``client.generate``.

    Args:
        base64_image: Base64-encoded JPEG data of the page, as text.
        text_path: The raw OCR text of the page. Despite the name this is the
            text itself, not a file path; the parameter name is kept so
            existing callers keep working.

    Returns:
        Whatever ``client.generate`` returns for this request.
    """
    # Bind the argument locally so the prompt uses the text that was passed
    # in, rather than depending on a module-level ``ocr_text`` variable that
    # happens to exist when the function is called.
    ocr_text = text_path

    prompt = f"""
    You are an advanced text extraction and improvement assistant. Your task is to accurately extract OCR text from the provided image.

    - Include only the relevant text from the documents: the addressee, content of the letter, and any metadata within the letter itself.
    - Exclude footnotes. Footnotes are at the bottom of the page, typically starting with a number and separated by a line break. Do not remove any other text.
    - Only insert five asterisks (*****) on a new line when there is a visible centered horizontal line separating individual letters in the original document. Do not add asterisks at the end of a page if there is no horizontal line present.
    - Do not add any special characters or markings at the end of a page unless specifically indicated in the original document.
    - Always visually confirm the presence of a horizontal line in the image before inserting asterisks in the extracted text.
    - Preserve old Dutch spelling and grammar, for example, keep "Sijn" instead of "Zijn".
    - Use „ for opening quotation marks and " for closing quotation marks.
    - Replace ,, with „

    Below is the OCR text of the page. Use the image to improve the quality of the OCR text. Do not correct any mistakes if they are also incorrect in the source image.

    Respond only with the improved text from the letter.
    
    {ocr_text}
    """

    # One user message with two parts: the text prompt and the page image.
    return client.generate(
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}"
                        }
                    }
                ],
            }
        ],
    )
    
# Optional reset: uncomment the statement below to clear the ocr_improved
# column so that every selected page is processed again.
# db.execute("""
#     UPDATE pages
#     SET ocr_improved = NULL
# """)
    
# Retrieve the pages that still need an improved transcription.
# The query selects page_number >= 23 and skips rows that already have
# ocr_improved set, so re-running this cell only handles the remaining pages.
# NOTE(review): this comment previously said the first 21 pages are not
# letters, while the query starts at 23 — confirm the intended cutoff.
db.execute("""
    SELECT page_number, image_jpeg, ocr_text
    FROM pages
    WHERE page_number >= 23 AND ocr_improved IS NULL
    ORDER BY page_number ASC
""")

rows = db.fetchall()

# Show how much work is pending before starting the LLM calls.
print('Retrieved pages...', len(rows))
print('Page numbers:', [row[0] for row in rows])

# Process the pages one by one, in ascending page order.
for row in rows:
    # Column order matches the SELECT above.
    page_number, image, ocr_text = row
    base64_image = encode_image(image)
    improved_text = process_image(base64_image, ocr_text)
    
    # Write the result back right after each page is processed.
    db.execute("""
        UPDATE pages
        SET ocr_improved = %s
        WHERE page_number = %s
    """, (improved_text, page_number))
    
    print(f'Processed page {page_number}.')
        
print('Done!')
    
    

Retrieved pages... 139
Page numbers: [34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172]
Generating response with model gpt-4o-2024-05-13
Processed page 34.
Generating response with model gpt-4o-2024-05-13
Processed page 35.
Generating response with model gpt-4o-2024-05-13
Processed page 36.
Generating response with model gpt-4o-2024-05-13
Processed page 37.
Generating response with model gpt-4o-2024-05-13
Processed pa