In [16]:
import os
import fitz  # PyMuPDF
from openai import OpenAI
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Retrieve API key from environment variables
openai_api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=openai_api_key)

# Maps "page_<n>" (1-based page number) to the raw message content returned
# by the model for that page.
responses = {}

# Specify the folder containing the PDF
folder_path = "invoices"
filename = "faktura_hjulmand.pdf"
path = os.path.join(folder_path, filename)

# System instruction sent with every page. It is identical for all pages,
# so it is built once here instead of inside the loop.
SYSTEM_PROMPT = (
    "Retrieve invoice_number, date_of_issue, seller_info, client_info, "
    "invoice_items_table, invoice_summary_table. Response must be in JSON format."
)

# JSON schema describing the invoice fields requested from the model.
# Loop-invariant, so it is defined once and reused for every request.
# NOTE(review): no "strict" flag is set on the json_schema — confirm whether
# best-effort schema adherence is intended.
INVOICE_RESPONSE_FORMAT = {
    "type": "json_schema",
    "json_schema": {
        "name": "invoice_schema",
        "schema": {
            "type": "object",
            "properties": {
                "invoice_number": {
                    "description": "The invoice number",
                    "type": "string"
                },
                "date_of_issue": {
                    "description": "The date the invoice was issued",
                    "type": "string"
                },
                "seller_info": {
                    "description": "Information about the seller",
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "company": {"type": "string"},
                            "address": {"type": "string"},
                            "zip_code": {"type": "string"},
                            "cvr_number": {"type": "string"}
                        }
                    }
                },
                "client_info": {
                    "description": "Information about the client",
                    "type": "string"
                },
                "invoice_items_table": {
                    "description": "Table of items listed in the invoice",
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "item_description": {"type": "string"},
                            "quantity": {"type": "string"},
                            "unit_price": {"type": "string"},
                            "total_price": {"type": "string"}
                        },
                        "required": ["item_description", "quantity", "unit_price", "total_price"]
                    }
                },
                "invoice_summary_table": {
                    "description": "Summary table of the invoice totals",
                    "type": "object",
                    "properties": {
                        "subtotal": {"type": "string"},
                        "tax": {"type": "string"},
                        "total": {"type": "string"}
                    },
                    "required": ["subtotal", "tax", "total"]
                }
            },
            "additionalProperties": False
        }
    }
}

# Open the PDF inside a context manager so the document is closed once all
# pages have been processed, even if a request raises part-way through.
# `doc` is therefore closed after this cell finishes.
with fitz.open(path) as doc:
    # Loop through each page and extract text
    for page_num, page in enumerate(doc):
        text = page.get_text()

        # Send extracted text to OpenAI for JSON response
        response = client.chat.completions.create(
            model="gpt-4o-2024-08-06",
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": text}
            ],
            response_format=INVOICE_RESPONSE_FORMAT
        )

        # Store the first choice's message content under a 1-based page key
        responses[f"page_{page_num + 1}"] = response.choices[0].message.content

# Output the dictionary with page responses
responses

{'page_1': '{"invoice_number":"45698","date_of_issue":"09/09/2024","seller_info":[{"company":"Hjulmand Aps","address":"Landevejsgade 2","zip_code":"8412","cvr_number":"1216423"}],"client_info":"Kundenr. 568498","invoice_items_table":[{"item_description":"Software","quantity":"2.0","unit_price":"1000.0","total_price":"2000.0"},{"item_description":"Konsulent","quantity":"10.0","unit_price":"50.0","total_price":"500.0"}],"invoice_summary_table":{"subtotal":"2500.0","tax":"612.5","total":"3112.5"}}'}


In [17]:
import json

def _strip_code_fence(text):
    """Return ``text`` without a surrounding Markdown code fence.

    Removes a leading "```" (optionally followed by the language tag "json")
    and a trailing "```" when present, plus surrounding whitespace. Text that
    is not fenced is returned unchanged apart from whitespace trimming.

    This replaces ``text.strip('```json').strip('```')``: ``str.strip`` treats
    its argument as a *set of characters*, not as a prefix/suffix, so that
    expression could also remove legitimate leading/trailing characters from
    {`, j, s, o, n} and did not remove a fence preceded or followed by
    whitespace.
    """
    cleaned = text.strip()
    if cleaned.startswith("```"):
        cleaned = cleaned[3:]
        # Drop the optional language tag that follows the opening fence.
        if cleaned[:4].lower() == "json":
            cleaned = cleaned[4:]
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3]
    return cleaned.strip()


# Get the raw content from the response dictionary
raw_content = responses['page_1']

# Remove any potential code block markers and whitespace (if they exist)
json_content = _strip_code_fence(raw_content)

# Load the JSON content into a Python dictionary
parsed_json = json.loads(json_content)

parsed_json

{'invoice_number': '45698',
 'date_of_issue': '09/09/2024',
 'seller_info': [{'company': 'Hjulmand Aps',
   'address': 'Landevejsgade 2',
   'zip_code': '8412',
   'cvr_number': '1216423'}],
 'client_info': 'Kundenr. 568498',
 'invoice_items_table': [{'item_description': 'Software',
   'quantity': '2.0',
   'unit_price': '1000.0',
   'total_price': '2000.0'},
  {'item_description': 'Konsulent',
   'quantity': '10.0',
   'unit_price': '50.0',
   'total_price': '500.0'}],
 'invoice_summary_table': {'subtotal': '2500.0',
  'tax': '612.5',
  'total': '3112.5'}}