In [4]:
import json
import time
from pathlib import Path

from bs4 import BeautifulSoup
from bs4 import Tag
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

In [5]:
# Fetch the ESRS page with headless Chrome and parse the rendered HTML into `soup`.

# Fixed pause between navigation and reading `page_source`.
# NOTE(review): presumably gives the page time to finish rendering — confirm 3 s is enough.
PAGE_LOAD_WAIT_SECONDS = 3

# Setup headless browser
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Load the page; always shut the browser down, even if navigation or parsing fails,
# so no Chrome/chromedriver process is left running.
url = "https://xbrl.efrag.org/e-esrs/esrs-set1-2023.html"
try:
    driver.get(url)
    time.sleep(PAGE_LOAD_WAIT_SECONDS)
    soup = BeautifulSoup(driver.page_source, "html.parser")
finally:
    driver.quit()

In [6]:
# Walk the "anx_I" container of the parsed page and stream every extracted record
# (section / subsection titles, numbered paragraphs, appendix entries) into a JSON array.

OUTPUT_PATH = Path("scraped_text/esrs_streamed.json")


def _split_num_and_content(tag):
    """Return (first text fragment, remaining fragments joined with spaces) for `tag`."""
    fragments = tag.get_text(separator="\n", strip=True).split("\n")
    return fragments[0], " ".join(fragments[1:])


def _extract_heading(p_tag):
    """Build the record for a top-level <p>: a section or a subsection title.

    The text directly under the <p> is used as the number and the first <span>
    as the title. A number ending in "." marks a section; anything else is
    treated as a subsection.
    """
    number = p_tag.find(string=True, recursive=False)  # direct text only, not descendants
    number = number.strip() if number else ""
    span = p_tag.find("span")
    title = span.get_text(strip=True) if span else ""

    level = "section" if number.endswith(".") else "subsection"
    return {
        "class_type_id": f"p_{p_tag.get('id', '')}",
        f"{level}_num": number,
        f"{level}_title": title,
    }


def _extract_paragraph(a_tag, prefix):
    """Build the record for an <a id=...> paragraph; `prefix` namespaces the id ("a" or "ar")."""
    number, content = _split_num_and_content(a_tag)
    return {
        "class_type_id": f'{prefix}_{a_tag["id"]}',
        "item_num": number,
        "item_content": content,
    }


def _iter_appendix_records(appendix_div):
    """Yield the records of one appendix <div> (e.g. "Appendix A").

    The appendix content is read from the first Tag child of the <div>. A header
    record is emitted from its "oj-doc-ti" titles, followed by one record per
    direct <p> / <a id=...> child.
    """
    body = next((c for c in appendix_div.children if isinstance(c, Tag)), None)
    if body is None:
        return

    # Only <p> whose class list is exactly ["oj-doc-ti"] count as appendix titles,
    # e.g. "Appendix A" and "Application Requirements".
    titles = [
        p.get_text(strip=True)
        for p in body.find_all("p", class_="oj-doc-ti")
        if p.get("class") == ["oj-doc-ti"]
    ]
    if titles:
        yield {
            "class_type_id": "ar_section_no_id",
            "item_num": titles[0],
            # A second title line may be absent; fall back to "" instead of raising IndexError.
            "item_content": titles[1] if len(titles) > 1 else "",
        }

    for child in body.children:
        if child.name == "p":
            if child.get("class") == ["oj-doc-ti"]:
                continue  # title lines were already emitted in the header record
            text = child.get_text(strip=True)
            # A <p> containing an italic span is a pre-paragraph note;
            # any other <p> is recorded as a subsection title.
            if child.find("span", class_="oj-italic"):
                kind = "ar_pre_para_note_no_id"
            else:
                kind = "ar_subsection_no_id"
            yield {"class_type_id": kind, "item_num": text, "item_content": text}
        elif child.name == "a" and child.has_attr("id"):
            yield _extract_paragraph(child, "ar")


def _iter_records(container):
    """Yield every record found among the direct children of `container`, in document order."""
    for node in container.children:
        if node.name == "p":
            yield _extract_heading(node)
        elif node.name == "a" and node.has_attr("id"):
            yield _extract_paragraph(node, "a")
        elif node.name == "div":
            yield from _iter_appendix_records(node)


# NOTE(review): not read anywhere in this cell; retained in case a later cell relies on it.
current_id = None

main_container = soup.find("div", class_="eli-container", id="anx_I")
if main_container is None:
    # Fail before touching the output file so a previous good export is not truncated.
    raise ValueError('Could not find <div class="eli-container" id="anx_I"> in the page')

OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)

count = 0
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    f.write("[\n")
    for count, record in enumerate(_iter_records(main_container), start=1):
        if count > 1:
            f.write(",\n")  # separator goes before every record except the first
        f.write(json.dumps(record, ensure_ascii=False))
    f.write("\n]")

print(f"✅ Streamed JSON written to {OUTPUT_PATH},{count} items")

✅ Streamed JSON written to text/esrs_streamed.json,2312 items
