In [2]:
import json
import time
from pathlib import Path

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

In [3]:
# Setup headless browser
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Load the page and snapshot its HTML into `soup`
url = "https://xbrl.efrag.org/e-esrs/esrs-set1-2023.html"
# Fixed pause between navigation and reading page_source.
# NOTE(review): presumably this gives client-side rendering time to finish —
# confirm 3 s is enough, or replace with an explicit wait on the target element.
PAGE_LOAD_WAIT_SECONDS = 3
try:
    driver.get(url)
    time.sleep(PAGE_LOAD_WAIT_SECONDS)
    soup = BeautifulSoup(driver.page_source, "html.parser")
finally:
    # Always shut the browser down, even if navigation or parsing raised;
    # otherwise the Chrome/chromedriver processes are left running.
    driver.quit()

In [6]:
# Walk the direct children of the "anx_I" container and stream each recognised
# element (section heading, subsection heading, or numbered point) to a JSON
# array on disk, one object per line.
output_path = Path("scraped_text/esrs_streamed.json")

# Locate the container BEFORE opening the output file, so a page without it
# fails with a clear error instead of leaving a truncated, invalid JSON file.
main_container = soup.find("div", class_="eli-container", id="anx_I")
if main_container is None:
    raise ValueError(
        'Could not find <div class="eli-container" id="anx_I"> in the parsed page; '
        "the page may not have finished loading or its structure has changed."
    )

# Make sure the target directory exists on a fresh checkout.
output_path.parent.mkdir(parents=True, exist_ok=True)

with open(output_path, "w", encoding="utf-8") as f:
    f.write("[\n")
    first = True  # tracks whether a separating comma is needed before the next object

    # One child is either a section title, a subsection title, or a normal point.
    # Text nodes have name None and match neither branch, so they are skipped.
    for item in main_container.children:
        item_extracted = None

        if item.name == "p":  # section or subsection heading
            # Text that sits directly inside the <p>, not inside a nested tag
            direct_text = item.find(string=True, recursive=False)
            direct_text = direct_text.strip() if direct_text else ""

            # The heading title is taken from the first <span> inside the <p>
            useful_child = item.find("span")
            child_text = useful_child.get_text(strip=True) if useful_child else ""

            if direct_text.endswith('.'):  # trailing dot on the number -> section
                item_extracted = {
                    "class_type_id": f"p_{item.get('id', '')}",
                    "section_num": direct_text,
                    "section_title": child_text
                }
            else:  # anything else -> subsection
                item_extracted = {
                    "class_type_id": f"p_{item.get('id', '')}",
                    "subsection_num": direct_text,
                    "subsection_title": child_text
                }
        elif item.name == "a" and item.has_attr("id"):  # a single item/point (smallest unit)
            # First text fragment is used as the item number; the rest is its content
            full_text = item.get_text(separator="\n", strip=True)
            texts = full_text.split('\n')
            item_num = texts[0] if texts else ''
            item_content = " ".join(texts[1:]) if len(texts) > 1 else ""

            item_extracted = {
                "class_type_id": f'a_{item["id"]}',
                "item_num": item_num,
                "item_content": item_content
            }

        if item_extracted:  # only write if something was extracted
            if not first:  # every object after the first is preceded by a comma
                f.write(",\n")
            else:
                first = False
            f.write(json.dumps(item_extracted, ensure_ascii=False))

    f.write("\n]")

# Report the path that was actually written (the old message named a different folder).
print(f"✅ Streamed JSON written to {output_path}")

✅ Streamed JSON written to text/esrs_streamed.json
