In [12]:
# imports
import os, re, json, logging
from typing import List, Dict
import fitz          
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv

# Find the ESRS cross-reference table

In [13]:
# ---------- Page discovery utilities ----------
# Minimum number of code matches a page must contain to count as part of the
# table; tweak if a report is sparse.
MIN_CODES = 5

# Case-insensitive pattern for ESRS-style codes: a prefix (BP, GOV, SBM, IRO,
# E1-E5, S1-S4, G1-G3), an optional dash character, then one or two digits.
CODE_RE = re.compile(
    r'\b(?:BP|GOV|SBM|IRO|E[1-5]|S[1-4]|G[1-3])[-–‐]?\d{1,2}\b',
    flags=re.IGNORECASE,
)


def code_cnt(txt: str) -> int:
    """Return the number of non-overlapping ``CODE_RE`` matches found in ``txt``."""
    return sum(1 for _match in CODE_RE.finditer(txt))

def find_esrs_pages(pdf_path: str) -> List[int]:
    """
    Return a **1-based** list of pages that most likely hold the ESRS
    cross-reference table.

    Two heuristics are tried in order:

    1. Take the *last* page whose text contains "esrs table of contents"
       (case-insensitive) and has at least ``MIN_CODES`` code matches, then
       extend forward over the directly following pages that also reach
       ``MIN_CODES``.
    2. Otherwise, group all pages reaching ``MIN_CODES`` into contiguous
       runs and return the run with the highest
       ``run length * total code count`` (the first such run wins a tie).

    Parameters
    ----------
    pdf_path : str
        Path of the PDF to scan; passed to ``fitz.open``.

    Returns
    -------
    List[int]
        Consecutive 1-based page numbers, or ``[]`` when no page reaches
        ``MIN_CODES``.
    """
    # Read every page exactly once and keep only what the heuristics need.
    # The context manager closes the document even if text extraction raises.
    counts: List[int] = []
    has_toc_title: List[bool] = []
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text = page.get_text()
            counts.append(code_cnt(text))
            has_toc_title.append("esrs table of contents" in text.lower())

    n_pages = len(counts)
    qualifies = [count >= MIN_CODES for count in counts]

    # 1) "ESRS table of contents" segment: start at the last titled page and
    #    keep going while the following pages are still dense with codes.
    toc_hits = [i for i in range(n_pages) if has_toc_title[i] and qualifies[i]]
    if toc_hits:
        start = toc_hits[-1]
        end = start + 1
        while end < n_pages and qualifies[end]:
            end += 1
        return [p + 1 for p in range(start, end)]         # 1-based return

    # 2) Densest contiguous block anywhere in the PDF.
    blocks, i = [], 0
    while i < n_pages:
        if not qualifies[i]:
            i += 1
            continue
        start = i
        while i < n_pages and qualifies[i]:
            i += 1
        blocks.append(range(start, i))
    if not blocks:
        return []

    # Score favours runs that are both long and rich in codes.
    best = max(blocks, key=lambda r: len(r) * sum(counts[p] for p in r))
    return [p + 1 for p in best]                          # 1-based list


In [14]:
import fitz  # also imported in the first cell; kept so this cell runs on its own

# Sanity check: print the start of the raw text on the page expected to hold
# the ESRS cross-reference table. The context manager closes the document
# when the block exits, including when opening or indexing the page fails.
# Update path to local Reports directory
with fitz.open("Reports/PhilipsFullAnnualReport2024-English.pdf") as doc:
    text = doc[242].get_text()  # zero-indexed, i.e. page 243 in 1-based numbering
print(text[:1000])  # show first 1000 characters

8.7 
ESRS cross-reference table
ESRS 2 
General disclosures
Disclosure Requirement BP-1 – General basis for preparation of sustainability
Sustainability statement: General basis for preparation
169 Limited Assurance
Disclosure Requirement BP-2 – Disclosures in relation to specific circumstances
Sustainability statement: General basis for preparation
169 Limited Assurance
Disclosure Requirement GOV-1 – The role of the administrative, management and 
supervisory bodies
SFDR/BRR
Environmental, Social and Governance: ESG governance
Supervisory Board report: composition, diversity and self-evaluation
Other Board-related matters: Diversity
40
64
268
Limited Assurance
Disclosure Requirement GOV-2 – Information provided to and sustainability matters 
addressed by the undertaking’s administrative, management and supervisory bodies
Environmental, Social and Governance: ESG governance
40 Limited Assurance
Disclosure Requirement GOV-3 - Integration of sustainability-related performance in incentiv

In [15]:
# Run the page-discovery heuristic on the report and print the 1-based page
# numbers it selects.
pdf_path = "Reports/PhilipsFullAnnualReport2024-English.pdf"
pages = find_esrs_pages(pdf_path)
print("ESRS pages:", pages)

ESRS pages: [243, 244, 245, 246, 247]


# LLM API

In [16]:
# ---------- LLM extraction ----------
# Load environment variables (python-dotenv reads them from a .env file when
# one is present) before looking up the API key.
load_dotenv()

# The key comes from the environment and is never hard-coded in the notebook.
api_key = os.environ.get('OPENAI_API_KEY')
client = OpenAI(api_key=api_key)

def extract_page_items(text: str, page_num: int, client: OpenAI) -> List[Dict]:
    """
    Call the model once for a single page of text and return list[dict].

    Parameters
    ----------
    text : str
        Raw text of one PDF page; embedded verbatim in the prompt.
    page_num : int
        Page number reported to the model and stored on every returned item
        under the ``"source_page"`` key.
    client : OpenAI
        Client used for the ``chat.completions.create`` call.

    Returns
    -------
    List[Dict]
        The dict entries of the ``"items"`` array in the model's JSON reply,
        each with ``"source_page"`` set to ``page_num``. Returns ``[]`` when
        the request fails, the reply is not valid JSON, or it holds no
        ``"items"`` list. This function is best-effort: problems are logged,
        not raised.
    """
    prompt = f"""You are an ESRS disclosure analyzer. Extract **all** ESRS disclosures
from the text below (page {page_num}).  Return JSON like:
{{"items":[{{"code":"","title":"","page_reference":""}}]}}.

Text:
{text}"""
    try:
        resp = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"},
            temperature=0,  # extraction task: keep re-runs as reproducible as possible
        )
        data = json.loads(resp.choices[0].message.content)
    except Exception as e:
        # Deliberately broad: one failing page must not abort the whole run.
        logging.error("LLM error on page %s: %s", page_num, e)
        return []

    raw_items = data.get("items") if isinstance(data, dict) else None
    if not isinstance(raw_items, list):
        logging.warning("LLM reply for page %s has no 'items' list", page_num)
        return []

    # Keep well-formed entries only, so one stray string or number in the
    # array no longer discards every valid item extracted from the page.
    items = [
        {**itm, "source_page": page_num}
        for itm in raw_items
        if isinstance(itm, dict)
    ]
    dropped = len(raw_items) - len(items)
    if dropped:
        logging.warning(
            "Dropped %d malformed item(s) from LLM reply on page %s",
            dropped, page_num,
        )
    return items


In [17]:
pdf_path = "Reports/PhilipsFullAnnualReport2024-English.pdf"

# Locate the candidate table pages (1-based page numbers).
pages = find_esrs_pages(pdf_path)
print("Chosen pages:", pages)

# One model call per selected page. The context manager closes the PDF once
# all page texts have been read, even if a page lookup raises.
items: List[Dict] = []
with fitz.open(pdf_path) as doc:
    for p in pages:
        text = doc[p - 1].get_text()    # convert back to 0-based for PyMuPDF
        items.extend(extract_page_items(text, p, client))

# Show results in-notebook
if items:
    df = pd.DataFrame(items)
    display(df)                         # rich DataFrame rendering in the notebook
else:
    print("No ESRS references found.")

Chosen pages: [243, 244, 245, 246, 247]


Unnamed: 0,code,title,page_reference,source_page
0,BP-1,General basis for preparation of sustainability,169,243
1,BP-2,Disclosures in relation to specific circumstances,169,243
2,GOV-1,"The role of the administrative, management and...","40, 64, 268",243
3,GOV-2,Information provided to and sustainability mat...,40,243
4,GOV-3,Integration of sustainability-related performa...,"76, 80",243
5,GOV-4,Statement on due diligence,170,243
6,GOV-5,Risk management and internal controls over sus...,"40, 58",243
7,SBM-1,"Strategy, business model and value chain","54, 170",243
8,SBM-2,Interests and views of stakeholders,"54, 170",243
9,SBM-3,"Material impacts, risks and opportunities and ...","54, 170, 174",243
