In [1]:
import os
import re
import fitz  # PyMuPDF
import pandas as pd
import openai
from typing import List, Dict
from dotenv import load_dotenv
from openai import OpenAI
import json, logging

load_dotenv()  # loads environment variables from .env file
client = OpenAI(api_key="input_your_api_key_here")  # replace with your OpenAI API key


# --- Config ---
CATEGORY = "DR"
MAX_PAGE_COLS = 6
REPORTS_DIR = "Reports"
CODE_RE = re.compile(r'\b(?:BP|GOV|SBM|IRO|E[1-5]|S[1-4]|G[1-3])[-\u2013\u2014\u2010]?\d{1,2}\b', re.I)
MIN_CODES = 5

def code_cnt(txt: str) -> int:
    return len(CODE_RE.findall(txt))

def find_esrs_pages(pdf_path: str) -> List[int]:
    doc = fitz.open(pdf_path)
    toc_hits = [i for i, p in enumerate(doc)
                if "esrs table of contents" in p.get_text().lower() and code_cnt(p.get_text()) >= MIN_CODES]
    if toc_hits:
        start = toc_hits[-1]
        pages = [start]
        for j in range(start + 1, len(doc)):
            if code_cnt(doc[j].get_text()) >= MIN_CODES:
                pages.append(j)
            else:
                break
        return [p + 1 for p in pages]

    qualifies = [code_cnt(p.get_text()) >= MIN_CODES for p in doc]
    blocks, i = [], 0
    while i < len(doc):
        if not qualifies[i]: i += 1; continue
        start = i
        while i < len(doc) and qualifies[i]: i += 1
        blocks.append(range(start, i))
    if not blocks:
        return []

    best = max(blocks, key=lambda r: len(r) * sum(code_cnt(doc[p].get_text()) for p in r))
    return [p + 1 for p in best]

def clean_page_refs(ref_str: str, current_page: int) -> str:
    toks = re.split(r"[, \s]+", ref_str.strip())
    refs = []
    for t in toks:
        if t.isdigit():
            n = int(t)
            if n == current_page or n < 10:
                continue
            refs.append(str(n))
    return ", ".join(refs)

def get_company_name(pdf_path: str) -> str:
    filename = os.path.basename(pdf_path).lower()
    clean_name = (filename.replace('-', ' ').replace('_', ' ').replace('.pdf', '')
                  .replace('annual', '').replace('report', '').replace('group', '').replace('integrated', '').strip())
    first_word = clean_name.split()[0].title()
    return first_word if len(first_word) > 2 else os.path.splitext(os.path.basename(pdf_path))[0]

def extract_page_items(text: str, page_num: int, client: OpenAI, company: str) -> pd.DataFrame:
    prompt = f"""You are an ESRS disclosure analyzer. Extract **all** ESRS disclosures from the text below (page {page_num}).
    Return JSON exactly like: {{\"items\":[{{\"code\":\"\",\"title\":\"\",\"page_reference\":\"\"}}]}}

Text:\n{text}"""
    try:
        resp = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"},
        )
        data = json.loads(resp.choices[0].message.content)
        items: List[Dict] = data.get("items", [])
    except Exception as e:
        logging.error(f"LLM error on page {page_num}: {e}")
        items = []

    if not items:
        return pd.DataFrame(columns=["name", "category", "variable", "value"] + [f"Page_ref{i}" for i in range(1, MAX_PAGE_COLS + 1)])

    df = pd.DataFrame(items)
    df["page_reference"] = df["page_reference"].apply(lambda x: ", ".join(map(str, x)) if isinstance(x, (list, tuple)) else str(x or ""))
    df["page_reference"] = df["page_reference"].apply(lambda s: clean_page_refs(s, page_num))
    df = df.groupby("code", as_index=False, sort=False).agg({"page_reference": ", ".join})
    df["value"] = (df["page_reference"].str.len() > 0).astype(int)
    df = df.assign(name=company, category=CATEGORY, variable=df.get("code")).drop(columns=["title"], errors="ignore")
    pages = df["page_reference"].str.replace(r"\s+", " ", regex=True).str.strip().str.split(r"[,\s]+", expand=True).rename(columns=lambda i: f"Page_ref{i + 1}")
    for i in range(pages.shape[1] + 1, MAX_PAGE_COLS + 1):
        pages[f"Page_ref{i}"] = ""
    df = pd.concat([df.drop(columns=["page_reference"]), pages], axis=1)
    return df.fillna("")

def process_reports_folder():
    summary = []
    pdf_files = [f for f in os.listdir(REPORTS_DIR) if f.endswith(".pdf")]
    for pdf_file in pdf_files:
        pdf_path = os.path.join(REPORTS_DIR, pdf_file)
        print(f"\nProcessing: {pdf_file}")
        try:
            company = get_company_name(pdf_path)
            pages = find_esrs_pages(pdf_path)
            if not pages:
                print("❌ No ESRS pages found")
                continue
            print(f"✅ ESRS pages: {pages}")
            doc = fitz.open(pdf_path)
            dfs = [extract_page_items(doc[p - 1].get_text(), p, client, company) for p in pages]
            full_table = pd.concat(dfs, ignore_index=True)
            full_table["source"] = pdf_file
            summary.append(full_table)
        except Exception as e:
            print(f"❌ Error processing {pdf_file}: {e}")
    if summary:
        all_data = pd.concat(summary, ignore_index=True)
        all_data.to_csv("all_esrs_results.csv", index=False)
        print("\n✅ Finished. Output saved to all_esrs_results.csv")

# Run the batch processor
process_reports_folder()



Processing: 2024 Adyen Annual Report.pdf
✅ ESRS pages: [112, 113]


ERROR:root:LLM error on page 112: Error code: 401 - {'error': {'message': 'Incorrect API key provided: input_yo***********here. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}
ERROR:root:LLM error on page 113: Error code: 401 - {'error': {'message': 'Incorrect API key provided: input_yo***********here. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}



Processing: annual-report-adidas-ar24.pdf


KeyboardInterrupt: 