In [1]:
import json
import time
from pathlib import Path

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# Set up a headless Chrome browser (chromedriver is resolved via webdriver_manager).
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Load the page and parse the resulting HTML into `soup`.
# The browser is shut down in `finally` so that a failed page load or parse
# does not leave an orphaned Chrome/chromedriver process behind.
url = "https://xbrl.efrag.org/e-esrs/esrs-set1-2023.html"
try:
    driver.get(url)
    # Fixed pause before reading the DOM — presumably to let client-side
    # rendering finish; TODO confirm 3 s is sufficient on slow connections.
    time.sleep(3)
    soup = BeautifulSoup(driver.page_source, "html.parser")
finally:
    driver.quit()

In [None]:
# Stream every heading and numbered point found directly under the "anx_I"
# container of `soup` into a JSON array on disk, one record per line.
#
# Record shapes written:
#   section    -> {"class_type_id", "section_num", "section_title"}
#   subsection -> {"class_type_id", "subsection_num", "subsection_title"}
#   point      -> {"class_type_id", "item_num", "item_content"}
count = 0
output_path = Path("scraped_text/esrs_streamed.json")
# open(..., "w") does not create missing folders, so make sure the target exists.
output_path.parent.mkdir(parents=True, exist_ok=True)

# Locate the container BEFORE opening (and thereby truncating) the output file,
# so a page that failed to render does not wipe a previously good export.
main_container = soup.find("div", class_="eli-container", id="anx_I")
if main_container is None:
    raise RuntimeError(
        'Could not find <div class="eli-container" id="anx_I"> in the page; '
        "the page may not have finished loading or its layout has changed."
    )

with open(output_path, "w", encoding="utf-8") as f:
    f.write("[\n")
    first = True

    # One child is either a section title, a subsection title, or a normal point;
    # any other child matches neither branch below and is skipped.
    for item in main_container.children:
        item_extracted = None

        if item.name == "p":  # section or subsection heading
            # Text sitting directly inside the <p> (not in nested tags) is the numbering.
            direct_text = item.find(string=True, recursive=False)
            direct_text = direct_text.strip() if direct_text else ""

            # The first <span> inside the <p> carries the heading title.
            useful_child = item.find("span")
            child_text = useful_child.get_text(strip=True) if useful_child else ""

            # Numbering ending in "." (e.g. "1.") marks a section; anything else
            # (e.g. "1.1", or an empty string) is treated as a subsection.
            level = "section" if direct_text.endswith(".") else "subsection"
            item_extracted = {
                "class_type_id": f"p_{item.get('id', '')}",
                f"{level}_num": direct_text,
                f"{level}_title": child_text,
            }
        elif item.name == "a" and item.has_attr("id"):  # a single item/point (smallest unit)
            # First text fragment is the point number, the remainder is its content.
            texts = item.get_text(separator="\n", strip=True).split("\n")
            item_extracted = {
                "class_type_id": f'a_{item["id"]}',
                "item_num": texts[0],
                "item_content": " ".join(texts[1:]),
            }

        if item_extracted:  # only write if something was extracted
            if not first:
                f.write(",\n")  # separator between JSON array elements
            first = False
            f.write(json.dumps(item_extracted, ensure_ascii=False))
            count += 1

    f.write("\n]")

# Report only after the file has been closed, and name the path actually written.
print(f"✅ Streamed JSON written to {output_path.as_posix()}, {count} items")

✅ Streamed JSON written to text/esrs_streamed,1514items.json


In [None]:
# Sample records with the same keys the streaming cell above emits
# (class_type_id plus section_*/subsection_*/item_* fields).
# NOTE(review): presumably pasted from esrs_streamed.json for reference — confirm.
# Evaluated as Python, this cell is just a tuple of dict literals.
{"class_type_id": 
 "p_d1e429-3-1", "subsection_num": "", 
 "subsection_title": "Objective"},
{"class_type_id": "a_63", "item_num": "1.", "item_content": "The objective of European Sustainability Reporting Standards (ESRS) is to specify the sustainability information that an undertaking shall disclose in accordance with Directive 2013/34/EU of the European Parliament and of the Council ( 1 ) , as amended by Directive (EU) 2022/2464 of the European Parliament and of the Council. ( 2 ) Reporting in accordance with ESRS does not exempt undertakings from other obligations laid down in Union law."},
{"class_type_id": "a_64", "item_num": "2.", "item_content": "Specifically, ESRS specify the information that an undertaking shall disclose about its material impacts , risks and opportunities in relation to environmental, social, and governance sustainability matters. ESRS do not require undertakings to disclose any information on environmental, social and governance topics covered by ESRS when the undertaking has assessed the topic in question as non-material (See Appendix E of this Standard “ Flowchart for determining disclosures to be included ”).The information disclosed in accordance with ESRS enables users of the sustainability statement to understand the undertaking’s material impacts on people and environment and the material effects of sustainability matters on the undertaking’s development, performance and position."},
{"class_type_id": "a_65", "item_num": "3.", "item_content": "The objective of this Standard (ESRS 1) is to provide an understanding of the architecture of ESRS, the drafting conventions and fundamental concepts used, and the general requirements for preparing and presenting sustainability information in accordance with Directive 2013/34/EU, as amended by Directive (EU) 2022/2464."},
{"class_type_id": "p_d1e500-3-1", "section_num": "1.", "section_title": "Categories of ESRS Standards, reporting areas and drafting conventions"},
{"class_type_id": "p_d1e509-3-1", "subsection_num": "1.1", "subsection_title": "Categories of ESRS standards"},
{"class_type_id": "a_66", "item_num": "4.", "item_content": "There are three categories of ESRS: (a) cross-cutting standards; (b) topical standards (Environmental, Social and Governance standards); and (c) sector-specific standards. Cross-cutting standards and topical standards are sector-agnostic, meaning that they apply to all undertakings regardless of which sector or sectors the undertaking operates in."},


