In [1]:
import pandas as pd
import fitz
import re
from pathlib import Path
import unicodedata

In [16]:
def normalize_whitespace(text):
    return re.sub(r'\s+', ' ', text).strip()

def normalize_unicode(text):
    return unicodedata.normalize("NFKC", text)

def parse_page_range(text):
    text = str(text)
    numbers = set()
    for part in re.split(r"[;,]", text):
        part = part.strip()
        if re.match(r"^\d+\s*[-–]\s*\d+$", part):
            start, end = map(int, re.split(r"[-–]", part))
            numbers.update(range(start, end + 1))
        elif part.isdigit():
            numbers.add(int(part))
    return numbers

def extract_pdf_text_by_code_and_section(pdf_path):
    pdf_path = Path(pdf_path)
    pdf_stem = pdf_path.stem

    table_path = Path("../2_output/standardized_merged_by_company") / f"{pdf_stem}_standardized_full.csv"
    if not table_path.exists():
        raise FileNotFoundError(f"Cannot find the standardized table at: {table_path}")

    df = pd.read_csv(table_path)
    df.columns = df.columns.str.strip()
    df["Code"] = df["Code"].astype(str).str.strip()
    df["Relevant Pages"] = df["Relevant Pages"].astype(str).str.strip()
    df["Section Reference"] = df["Section Reference"].astype(str).str.strip()
    df["Page Range"] = df["Page Range"].astype(str).str.strip()

    doc = fitz.open(pdf_path)
    output = []

    for i, row in df.iterrows():
        code = row["Code"]
        valid_code = True

        rel_pages_raw = row["Relevant Pages"]
        section_raw = row["Section Reference"]
        page_range_set = parse_page_range(row.get("Page Range", ""))
        EXPANSION_RANGE = 2

        total_pages = len(doc)
        cutoff_page = int(total_pages * 0.8)

        # ✅ Relevant pages logic
        used_pages = set()
        if rel_pages_raw and pd.notna(rel_pages_raw):
            page_nums = [int(p) for p in re.findall(r"\d+", rel_pages_raw)]
            for p in page_nums:
                if 20 < p <= cutoff_page:
                    for n in range(p - EXPANSION_RANGE, p + EXPANSION_RANGE + 1):
                        if 1 <= n <= cutoff_page:
                            used_pages.add(n)
        for page_num in sorted(used_pages):
            raw_text = doc[page_num - 1].get_text()
            page_text = normalize_whitespace(normalize_unicode(raw_text))
            output.append({
                "Matched Code": code if valid_code else None,
                "Relevant Pages (Raw)": rel_pages_raw,
                "Used Pages": page_num,
                "Section Reference": None,
                "Extracted Text": page_text.strip()
            })

        if section_raw and pd.notna(section_raw):
            section_list = [s.strip() for s in section_raw.split(",") if s.strip()]
            for sec in section_list:
                matched_pages = set()
                for page_num in range(21, total_pages + 1):
                    if page_num in page_range_set:
                        continue
                    text = doc[page_num - 1].get_text()
                    if re.search(rf"\b{re.escape(sec)}\b", text):
                        print(f"[Section Match] Row {i}, Code='{code}', Section='{sec}', Triggered Page={page_num}")
                        if 20 < page_num <= cutoff_page:
                            for n in range(page_num - EXPANSION_RANGE, page_num + EXPANSION_RANGE + 1):
                                if 1 <= n <= cutoff_page:
                                    matched_pages.add(n)

                if not matched_pages:
                    print(f"[Section Miss] Row {i}, Code='{code}', Section='{sec}' — no matches found.")

                for page_num in sorted(matched_pages):
                    raw_text = doc[page_num - 1].get_text()
                    page_text = normalize_whitespace(normalize_unicode(raw_text))
                    output.append({
                        "Matched Code": code if valid_code else None,
                        "Relevant Pages (Raw)": rel_pages_raw,
                        "Used Pages": page_num,
                        "Section Reference": sec,
                        "Extracted Text": page_text.strip()
                    })

    return pd.DataFrame(output)

In [21]:
pdf_path = "../0_data/pdfs/01UK_total_HR.pdf" #insert the PDF of interest
result_df = extract_pdf_text_by_code_and_section(pdf_path)

[Section Miss] Row 0, Code='nan', Section='nan' — no matches found.
[Section Miss] Row 1, Code='BP-1', Section='nan' — no matches found.
[Section Miss] Row 2, Code='BP-2', Section='nan' — no matches found.
[Section Miss] Row 3, Code='E1', Section='nan' — no matches found.
[Section Miss] Row 4, Code='E1-1', Section='nan' — no matches found.
[Section Miss] Row 5, Code='E1-2', Section='nan' — no matches found.
[Section Miss] Row 6, Code='E1-3', Section='nan' — no matches found.
[Section Miss] Row 7, Code='E1-4', Section='nan' — no matches found.
[Section Miss] Row 8, Code='E1-6', Section='nan' — no matches found.
[Section Miss] Row 9, Code='GOV-1', Section='nan' — no matches found.
[Section Miss] Row 10, Code='GOV-2', Section='nan' — no matches found.
[Section Miss] Row 11, Code='GOV-4', Section='nan' — no matches found.
[Section Miss] Row 12, Code='GOV-5', Section='nan' — no matches found.
[Section Miss] Row 13, Code='IRO-1', Section='nan' — no matches found.
[Section Miss] Row 14, Code=

In [22]:
result_df
#mind that matched code = nan is the first row that always have the missing code

Unnamed: 0,Matched Code,Relevant Pages (Raw),Used Pages,Section Reference,Extracted Text
0,,"11,12,15,18,20,21,33,34,35,36,40,41,43,45,48,4...",19,,Fabien De Jonge : The Rentel and SeaMade wind ...
1,,"11,12,15,18,20,21,33,34,35,36,40,41,43,45,48,4...",20,,Next generations deserve new heroes who change...
2,,"11,12,15,18,20,21,33,34,35,36,40,41,43,45,48,4...",21,,"Valérie Van Brabant : In 2024, we continued th..."
3,,"11,12,15,18,20,21,33,34,35,36,40,41,43,45,48,4...",22,,* The main objective of the Corporate Sustaina...
4,,"11,12,15,18,20,21,33,34,35,36,40,41,43,45,48,4...",23,,"Looking beyond our core business, we were very..."
...,...,...,...,...,...
465,e,1634878899100114,112,,Annual report 2024 - CFE 112 Number of employe...
466,e,1634878899100114,113,,Annual report 2024 - CFE 113 3.1.11. S1-14 Hea...
467,e,1634878899100114,114,,Annual report 2024 - CFE 114 3.2.1. SBM2 Inter...
468,e,1634878899100114,115,,Annual report 2024 - CFE 115 3.2.5. S2-3 Proce...


In [23]:
#preview of the text
for i, row in result_df.iterrows():
    print(f"\n--- Code: {row['Matched Code']} | Page: {row['Used Pages']} | Section: {row['Section Reference']} ---\n")
    print(row["Extracted Text"][:1000])  # preview first 1000 chars


--- Code: nan | Page: 19 | Section: None ---

Fabien De Jonge : The Rentel and SeaMade wind farms, in which Green Offshore holds 12.5% and 8.75% respectively, were faced with less favourable weather conditions than in 2023. Furthermore, unlike 2023, the price of electricity remained well below the guaran­ teed price. Combined green energy production from the two parks reached 2.8 Twh in 2024. In Vietnam, Deep C Holding saw its industrial land sales decline to 80 hectares (127 hectares in 2023), partly due to the introduction of new real estate sales laws, which have led to delays in industrial land sales. It is worth noting that park service activities performed very well in 2024, posting a significant increase in sales and operating income. Via GreenStor, CFE continues to innovate in the battery farm market. GreenStor has a 38% stake in BSTOR, a company that co-develops battery farms in Belgium. The first 10 MW farm has been operational since the end of 2021. Construction of a second

In [24]:
#save the text in the folder extracted_text
text_only = result_df["Extracted Text"]

company_name = Path(pdf_path).stem.strip()
save_path = Path("../2_output/extracted_text") / f"{company_name}_extracted_text_only.txt"
save_path.parent.mkdir(parents=True, exist_ok=True)

text_only.to_csv(save_path, index=False, header=False)
print(f"✅ Saved extracted text only to: {save_path}")

✅ Saved extracted text only to: ../2_output/extracted_text/01UK_total_HR_extracted_text_only.txt


Text includes all codes so IF INTERESTRED IN THE SPECIFIC CODE MODIFY THE valid_code for instance replace valid_code = True with
valid_code = bool(re.search(r"\b(ESRS\s*)?E[-\s]?\d+\b", code, re.IGNORECASE))
        if not valid_code and i != 0:
            continue
if interested in codes with pattern of E