In [13]:
import json

In [14]:
with open("scraped_text/esrs_streamed.json", "r", encoding="utf-8") as f:
    scraped_text = json.load(f)

# Now `data` is a Python list of dictionaries
print(type(scraped_text))       # <class 'list'>
print(scraped_text[0])          # First item

<class 'list'>
{'class_type_id': 'a_0', 'item_num': 'ANNEX I to Commission Delegated Regulation (EU) 2023/2772 supplementing Directive', 'item_content': '2013/34/EU of the European Parliament and of the Council as regards sustainability reporting standards, published in the Official Journal of the European Union on 22 December 2023 and including the corrigendum published on 18 April 2024. European sustainability reporting standards (ESRS)'}


In [15]:
tagged_text = []

In [16]:
# revise chapter title's key
chapter_names = [
    "ESRS 1",
    "ESRS 2",
    "ESRS E1",
    "ESRS E2",
    "ESRS E3",
    "ESRS E4",
    "ESRS E5",
    "ESRS S1",
    "ESRS S2",
    "ESRS S3",
    "ESRS S4",
    "ESRS G1",
    "Acronyms",
    "Defined Terms"
]

In [18]:
# Tag the text with paragarph_num, paragraph_content, title_num, title_text, type, location_chapter, location_section, location_subsection
# Output the whole tagged text
current_chapter = 'Introduction'
current_section_num = ''
current_subsection_num = ''


for item in scraped_text:
    if item.get('subsection_title') in chapter_names: # is a chapter title
        new_item = {
            'content': item['subsection_title'],
            'metadata':{
                'type': 'title_chapter', # 'content' or 'title'
                'location_chapter': item['subsection_title'],
                'location_section': '',
                'location_subsection': '',
            }
        }
        current_chapter = item['subsection_title']
        current_section_num = ''
        current_subsection_num = ''
        tagged_text.append(new_item)

    elif 'section_num' in item: # is a section title
        new_item = {
            'content': item['section_title'],
            'metadata':{
                'title_num': item['section_num'],
                'type': 'title_section',
                'location_chapter': current_chapter,
                'location_section': item['section_num'],
                'location_subsection': '',
            }
        }
        current_section_num = item.get('section_num')
        current_subsection_num = ''
        tagged_text.append(new_item)

    elif 'subsection_num' in item:
        new_item = {
            'content': item['subsection_title'],
            'metadata':{
                'title_num': item['subsection_num'],
                'type': 'title_subsection',
                'location_chapter': current_chapter,
                'location_section': current_section_num,
                'location_subsection': item['subsection_num'],
            }
        }
        current_subsection_num = item.get('subsection_num')
        current_subsection_name = item.get('subsection_title')
        tagged_text.append(new_item)

    elif 'item_num' in item:
        new_item = {
            'content': item['item_content'],
            'metadata':{
                'paragraph_num': item['item_num'],
                'type': 'paragraph',
                'location_chapter': current_chapter,
                'location_section': current_section_num,
                'location_subsection': current_subsection_num,
            }
        }
        tagged_text.append(new_item)
    else:
        new_item['error'] = 'je ne comprends pas'
        new_item = new_item | item

        tagged_text.append(new_item)



# Save to a JSON file
with open('tagged_text/tagged_text_all.json', 'w', encoding='utf-8') as f:
    f.write("[\n")
    first = True
    for item in tagged_text:
        if not first: # no need of comma for the first item in json
            f.write(",\n") # add the comma for json
        else:
            first = False
        json.dump(item, f, ensure_ascii=False)
    f.write("]\n")

print("JSON file 'tagged_text_all.json' has been created.")
    

JSON file 'tagged_text_all.json' has been created.


In [6]:
length_list = []
# Output tagged text for each chapter
for chapter_x in chapter_names:
    tagged_text_chapter_x = []
    
    for item in tagged_text:
        if item.get('location_chapter') == chapter_x:
            tagged_text_chapter_x.append(item)
    with open(f'tagged_text/tagged_text_{chapter_x}.json', 'w', encoding='utf-8') as f:
        for item in tagged_text_chapter_x:
            json.dump(item, f, ensure_ascii=False)
            f.write('\n')
    length_list.append(len(tagged_text_chapter_x))

    print(f"JSON file 'tagged_text_{chapter_x}.json' has been created, {len(tagged_text_chapter_x)} items.")

print(f'Summation of items:{sum(length_list)}')
    



    

JSON file 'tagged_text_ESRS 1.json' has been created, 0 items.
JSON file 'tagged_text_ESRS 2.json' has been created, 0 items.
JSON file 'tagged_text_ESRS E1.json' has been created, 0 items.
JSON file 'tagged_text_ESRS E2.json' has been created, 0 items.
JSON file 'tagged_text_ESRS E3.json' has been created, 0 items.
JSON file 'tagged_text_ESRS E4.json' has been created, 0 items.
JSON file 'tagged_text_ESRS E5.json' has been created, 0 items.
JSON file 'tagged_text_ESRS S1.json' has been created, 0 items.
JSON file 'tagged_text_ESRS S2.json' has been created, 0 items.
JSON file 'tagged_text_ESRS S3.json' has been created, 0 items.
JSON file 'tagged_text_ESRS S4.json' has been created, 0 items.
JSON file 'tagged_text_ESRS G1.json' has been created, 0 items.
JSON file 'tagged_text_Acronyms.json' has been created, 0 items.
JSON file 'tagged_text_Defined Terms.json' has been created, 0 items.
Summation of items:0
