In [5]:
from llm import LLM
from db import Database

# Shared handles used by the cells below: `client` is called to generate the
# metadata response, `db` runs the SQL statements against the letters table.
client = LLM()
db = Database()


In [6]:
# Add the extracted-metadata columns to the `letters` table. The IF EXISTS /
# IF NOT EXISTS guards mean the statement does nothing for a missing table or
# for columns that are already present, so this cell can be re-run.
db.execute("""
    ALTER TABLE IF EXISTS letters
    ADD COLUMN IF NOT EXISTS sender TEXT,
    ADD COLUMN IF NOT EXISTS recipient TEXT,
    ADD COLUMN IF NOT EXISTS subject TEXT,
    ADD COLUMN IF NOT EXISTS send_date DATE,
    ADD COLUMN IF NOT EXISTS location TEXT,
    ADD COLUMN IF NOT EXISTS geolocation FLOAT[],
    ADD COLUMN IF NOT EXISTS language TEXT,
    ADD COLUMN IF NOT EXISTS summary TEXT
""")


In [7]:
import json
from datetime import datetime

def prompt_for_metadata(text):
    """Request metadata for a single letter from the LLM client.

    Args:
        text: The letter text. It is appended verbatim to the end of the
            user prompt.

    Returns:
        Whatever ``client.generate`` returns for the system + user message
        pair built here.
    """
    system_message = """
    You are an expert in reading and analyzing 17th-century letters. Your task is to extract metadata from the given letter and return it in a strictly JSON format. Your entire response must be valid, parsable JSON. Do not include any explanatory text outside the JSON structure. Don't start with ```json or end with ```.
    """

    # NOTE(review): the "language" line below embeds unescaped double quotes
    # ("en", "nl"), so the template itself is not valid JSON. Left unchanged
    # here because editing the prompt may change model output.
    user_template = """
    Analyze the following 17th-century letter and extract in this exact JSON format:
    {
      "sender": "The person or entity sending the letter. Use the full name if this can be determined.",
      "recipient": "The person or entity receiving the letter. Use the full name if this can be determined.",
      "subject": "The main subject or purpose of the letter, written in modern English.",
      "send_date": "The date the letter was sent (YYYY-MM-DD format if possible)",
      "location": "The place where the letter was written",
      "geolocation": "The approximate latitude and longitude of the location, if it can be determined (format: [latitude, longitude])",
      "language": "The language in which the letter is written. In ISO 639-1 format (e.g., "en" for English, "nl" for Dutch)",
      "summary": "A brief summary of the letter's content in modern English, suitable for a general audience"
    }
    
    If any field cannot be determined, use null as the value. Don't use "Unknown" or similar placeholders.

    Here's the text to analyze:

    """

    # System instructions first, then the template with the letter appended.
    conversation = [
        {'role': 'system', 'content': system_message},
        {'role': 'user', 'content': user_template + text},
    ]
    return client.generate(messages=conversation)
    
def validate_and_normalize_metadata(response_content):
    """Parse the model's JSON reply and coerce its fields into storable values.

    Args:
        response_content: Raw reply from the model, expected to be the text of
            a JSON object.

    Returns:
        The parsed dict, with ``send_date`` replaced by a ``datetime.date`` and
        ``geolocation`` replaced by a two-element list of floats. Either field
        is set to ``None`` when it is missing a usable value. Returns ``None``
        (after printing an error) when the reply is not a JSON object or does
        not contain every expected key.
    """
    try:
        result = json.loads(response_content)
    except (json.JSONDecodeError, TypeError):
        # TypeError covers a reply that is not text at all (e.g. None).
        print("Error: Invalid JSON response")
        return None

    # Valid JSON may still be null, a number, a string or a list; only an
    # object can carry the metadata fields.
    if not isinstance(result, dict):
        print("Error: JSON response is not an object")
        return None

    expected_keys = ['sender', 'recipient', 'subject', 'send_date', 'location', 'geolocation', 'language', 'summary']

    if not all(key in result for key in expected_keys):
        print("Error: JSON is missing one or more required fields")
        return None

    # Convert send_date to a date object; anything that is empty, not a
    # string, or not in YYYY-MM-DD form becomes None.
    raw_date = result['send_date']
    try:
        result['send_date'] = datetime.strptime(raw_date, '%Y-%m-%d').date() if raw_date else None
    except (TypeError, ValueError):
        result['send_date'] = None

    # Ensure geolocation is a [latitude, longitude] pair of floats (the order
    # requested in the prompt); anything else becomes None.
    raw_geo = result['geolocation']
    normalized_geo = None
    if isinstance(raw_geo, list) and len(raw_geo) == 2:
        try:
            normalized_geo = [float(raw_geo[0]), float(raw_geo[1])]
        except (TypeError, ValueError, OverflowError):
            # TypeError: null/nested element; ValueError: non-numeric string;
            # OverflowError: integer too large for a float.
            normalized_geo = None
    result['geolocation'] = normalized_geo

    return result

def extract_metadata(text):
    """Ask the model for a letter's metadata and return the validated result.

    Passes ``text`` to ``prompt_for_metadata`` and hands the reply to
    ``validate_and_normalize_metadata``, returning its result unchanged.
    """
    return validate_and_normalize_metadata(prompt_for_metadata(text))

In [8]:
from psycopg2.extras import Json

# Select the letters that have text but no extracted subject yet.
db.execute("""
    SELECT id, markdown, start_page, subject 
    FROM letters
    WHERE markdown IS NOT NULL AND subject IS NULL
    ORDER BY id ASC
""")

letters = db.fetchall()

print(f"Processing {len(letters)} letters")

# Metadata keys in the order of the placeholders in the UPDATE statement.
metadata_columns = ('sender', 'recipient', 'subject', 'send_date',
                    'location', 'geolocation', 'language', 'summary')

for letter in letters:
    # Only the first two selected columns (id, markdown) are used.
    letter_id, letter_text = letter[0], letter[1]

    print(f"Processing letter {letter_id}")
    print(f"First 2 lines of the letter: {letter_text.splitlines()[:2]}")

    extracted_info = extract_metadata(letter_text)
    print(f"Extracted metadata: {extracted_info}")

    # A None result means the reply could not be validated; skip this letter
    # and leave its row untouched.
    if extracted_info is None:
        print(f"Failed to extract metadata for letter {letter_id}")
        continue

    update_params = tuple(extracted_info[column] for column in metadata_columns) + (letter_id,)

    db.execute("""
        UPDATE letters
        SET sender = %s,
            recipient = %s,
            subject = %s,
            send_date = %s,
            location = %s,
            geolocation = %s,
            language = %s,
            summary = %s
        WHERE id = %s
    """, update_params)

    print(f"Successfully processed letter {letter_id}")

print("Finished processing letters")

Processing 5 letters
Processing letter 9
First 2 lines of the letter: ['## BRIEF VAN P. C. HOOFT AAN DE KAMER IN LIEFDE BLOEJENDE UIT FLORENCE', '']
Generating response with model gpt-4o-2024-05-13
Extracted metadata: {'sender': 'P. C. Hooft', 'recipient': 'De Kamer in Liefde Bloeijende', 'subject': 'Praise of Dutch literary achievements and longing for the homeland', 'send_date': datetime.date(1600, 1, 1), 'location': 'Florence', 'geolocation': [43.7696, 11.2558], 'language': 'nl', 'summary': 'P. C. Hooft writes to the Kamer in Liefde Bloeijende from Florence, praising the literary achievements in Holland and expressing a longing for his homeland. He mentions the high quality of poetry and intellectual discourse in Amsterdam, comparing it favorably to Italy.'}
Successfully processed letter 9
Processing letter 10
First 2 lines of the letter: ['## VONDEL AAN VAERLAER', '']
Generating response with model gpt-4o-2024-05-13
Extracted metadata: {'sender': 'Joost van den Vondel', 'recipient'