In [7]:
# Ensure you're on the correct kernel (e.g. 'esg-py311')
import sys
from pathlib import Path
import pandas as pd

# Locate the repo root by climbing from the current working directory until a
# folder containing '2_output/extracted_tables' is found.
# If auto-detection fails, set REPO_ROOT manually to the folder that contains
# both '1_code' and '2_output'.
REPO_ROOT = Path().resolve()
while not (REPO_ROOT / '2_output' / 'extracted_tables').exists():
    if REPO_ROOT.parent == REPO_ROOT:
        # A path that is its own parent is the filesystem root: nothing left to climb.
        raise RuntimeError("Couldn't locate '2_output/extracted_tables' in any parent")
    REPO_ROOT = REPO_ROOT.parent
print(f"Using REPO_ROOT = {REPO_ROOT}")

# Make the '1_code' folder importable. The membership check keeps the cell
# idempotent: re-running it no longer appends a duplicate entry to sys.path.
code_dir = str(REPO_ROOT / '1_code')
if code_dir not in sys.path:
    sys.path.append(code_dir)

from stage2B import run   # Stage2B entry point; later cells call run(input_dir, output_dir, threshold=...)


Using REPO_ROOT = /Users/valeriiaklynna/Documents/GitHub/bachelor-thesis-group32-new/bachelor-thesis-group32-folder


In [8]:
# Input side: the directory into which Stage2A dumped its extracted tables,
# one subfolder per report (e.g. 'vinci-2024-universal-registration-document').
extracted_root = REPO_ROOT.joinpath('2_output', 'extracted_tables')
assert extracted_root.exists(), f"No such folder: {extracted_root}"

# Output side: the directory that receives the filtered tables of all reports.
# It is created, together with any missing parents, when it does not exist yet.
filtered_root = REPO_ROOT.joinpath('2_output', 'filtered_tables')
filtered_root.mkdir(parents=True, exist_ok=True)

In [9]:
# Show which report folders exist so the name below can be copied exactly.
available = [entry.name for entry in filter(Path.is_dir, extracted_root.iterdir())]
print("Available reports:", available)

# Name of the report subfolder under 'extracted_tables' to process.
# It has to match one of the names printed above character for character.
report_name = '01UK_total_HR'

# Work out where this report is read from and where its results are written.
input_dir = extracted_root.joinpath(report_name)
assert input_dir.exists(), f"No such folder: {input_dir}"
output_dir = filtered_root.joinpath(report_name)

# Run Stage2B non-interactively on this single report.
run(input_dir, output_dir, threshold=10.0)
print(f"✅ {report_name} → {output_dir.resolve()}")

Available reports: ['Volkswagen', 'RFA_ELO_2024_EN-1', 'Commerzbank_Group_Annual_Report_2024', 'cez-group-annual-financial-report-2024', 'upm-annual-report-2024', 'Carlsberg Group_2024 Annual Report', 'brenntag_annual-report-2024_en', 'Amorim_RC24_EN', 'Cenergy-Annual_Report_2024', 'Annual-Report-HoldCo-2024-Web', 'AB InBev 2024 Annual Report FINAL_Interactive', 'DHL-Group-2024-Annual-Report', 'annual-report-2024-data', 'Orsted Annual Report 2024', 'annual-report-2024-equinor-compressed', '01UK_total_HR', '24-VUB-Vyrocna-sprava-ENG', 'consolidated-annual-report-endesa-2024', 'sopra_steria_urd_2024_en_opti', 'ferrovial-integrated-annual-report-2024-1', 'ad_annual-report_2024_interactive', 'f3bb40f8-e502-409f-b3b3-a61e2954531b', 'adidas-ar24', '2024 ING Groep NV annual report', '2025-04-23-Eramet-URD-2024-EN', 'EXOR 2024 Annual Report', 'Wolters_Kluwer_2024_Annual_Report', 'FiskarsGroup_Annual_Report_2024', 'eri_euroapi2024_urd_en_basse-definition_31march_19h28', 'Evonik_Financial_and_Su

In [10]:
# OPTIONAL: batch mode.
# Run Stage2B for every PDF listed in the Excel file instead of a single report.

In [11]:
# Read the Excel sheet that lists the report PDFs and reduce each non-missing
# 'pdf_path' entry to its final path component without the suffix; duplicates
# are dropped so every report name appears once.
excel_path = REPO_ROOT.joinpath("0_data", "table_extraction_accuracy2.xlsx")
sheet_name = "suggested weights"
pdf_df = pd.read_excel(excel_path, sheet_name=sheet_name)
pdf_names = (
    pdf_df["pdf_path"]
    .dropna()
    .apply(lambda pdf_path: Path(pdf_path).stem)
    .unique()
)

In [12]:
# Process every report name in pdf_names. A report without an extracted-tables
# folder is skipped with a warning, and an exception raised while handling one
# report is printed so that the remaining reports still get processed.
for report_name in pdf_names:
    input_dir = extracted_root.joinpath(report_name)
    output_dir = filtered_root.joinpath(report_name)

    if input_dir.exists():
        try:
            print(f"\n🔍 Running Stage2B for: {report_name}")
            run(input_dir, output_dir, threshold=10.0)
            print(f"✅ Saved filtered tables to: {output_dir}")
        except Exception as err:
            print(f"❌ Failed on {report_name}: {err}")
    else:
        print(f"⚠️ Skipping {report_name}: no extracted tables found.")


🔍 Running Stage2B for: 01UK_total_HR
Page 121: saved page_121_tbl02_Camelot-Stream_conf97.csv, score=273.18
Page 122: saved page_122_tbl01_Camelot-Stream_conf77.csv, score=105.45
✅ Saved filtered tables to: /Users/valeriiaklynna/Documents/GitHub/bachelor-thesis-group32-new/bachelor-thesis-group32-folder/2_output/filtered_tables/01UK_total_HR

🔍 Running Stage2B for: 2.2 BoD Report 2024 EN -14.03.2025_FINAL
Page 201: saved page_201_tbl01_Camelot-Stream_conf73.csv, score=29.84
Page 202: saved page_202_tbl01_Camelot-Stream_conf71.csv, score=39.86
Page 203: saved page_203_tbl01_Camelot-Lattice_conf99.csv, score=17.11
Page 207: saved page_207_tbl01_Camelot-Stream_conf58.csv, score=7.71
✅ Saved filtered tables to: /Users/valeriiaklynna/Documents/GitHub/bachelor-thesis-group32-new/bachelor-thesis-group32-folder/2_output/filtered_tables/2.2 BoD Report 2024 EN -14.03.2025_FINAL

🔍 Running Stage2B for: 2024 Adyen Annual Report
Page 111: saved page_111_tbl01_Camelot-Stream_conf58.csv, score=15.90