In [1]:
import csv
import json
import os

import pandas as pd

In [2]:
# Read the scraped ESRS items from disk into `scraped_text`.
with open("scraped_text/esrs_streamed.json", mode="r", encoding="utf-8") as f:
    scraped_text = json.loads(f.read())

# Sanity check: show the container type, then the first scraped item.
print(type(scraped_text), scraped_text[0], sep="\n")

<class 'list'>
{'class_type_id': 'a_0', 'item_num': 'ANNEX I to Commission Delegated Regulation (EU) 2023/2772 supplementing Directive', 'item_content': '2013/34/EU of the European Parliament and of the Council as regards sustainability reporting standards, published in the Official Journal of the European Union on 22 December 2023 and including the corrigendum published on 18 April 2024. European sustainability reporting standards (ESRS)'}


In [3]:
# Accumulator for the tagged rows (rebuilt from scratch by the CSV cell).
tagged_text = list()

In [4]:
# Chapter titles. An item whose 'subsection_title' equals one of these strings
# is treated as a chapter heading, and the same list drives the per-chapter
# item counts.
chapter_names = [
    "ESRS 1", "ESRS 2",
    "ESRS E1", "ESRS E2", "ESRS E3", "ESRS E4", "ESRS E5",
    "ESRS S1", "ESRS S2", "ESRS S3", "ESRS S4",
    "ESRS G1",
    "Acronyms", "Defined Terms",
]

In [5]:
# # Produce a JSON file (this cell is disabled; every line is commented out)
# # Tag the text with paragraph_num, paragraph_content, title_num, title_text, type, location_chapter, location_section, location_subsection
# # Output the whole tagged text
# current_chapter = 'Introduction'
# current_section_num = ''
# current_subsection_num = ''


# for item in scraped_text:
#     if item.get('subsection_title') in chapter_names: # is a chapter title
#         new_item = {
#             'content': item['subsection_title'],
#             'metadata':{
#                 'type': 'title_chapter', # 'content' or 'title'
#                 'location_chapter': item['subsection_title'],
#                 'location_section': '',
#                 'location_subsection': '',
#             }
#         }
#         current_chapter = item['subsection_title']
#         current_section_num = ''
#         current_subsection_num = ''
#         tagged_text.append(new_item)

#     elif 'section_num' in item: # is a section title
#         new_item = {
#             'content': item['section_title'],
#             'metadata':{
#                 'title_num': item['section_num'],
#                 'type': 'title_section',
#                 'location_chapter': current_chapter,
#                 'location_section': item['section_num'],
#                 'location_subsection': '',
#             }
#         }
#         current_section_num = item.get('section_num')
#         current_subsection_num = ''
#         tagged_text.append(new_item)

#     elif 'subsection_num' in item:
#         new_item = {
#             'content': item['subsection_title'],
#             'metadata':{
#                 'title_num': item['subsection_num'],
#                 'type': 'title_subsection',
#                 'location_chapter': current_chapter,
#                 'location_section': current_section_num,
#                 'location_subsection': item['subsection_num'],
#             }
#         }
#         current_subsection_num = item.get('subsection_num')
#         current_subsection_name = item.get('subsection_title')
#         tagged_text.append(new_item)

#     elif 'item_num' in item:
#         new_item = {
#             'content': item['item_content'],
#             'metadata':{
#                 'paragraph_num': item['item_num'],
#                 'type': 'paragraph',
#                 'location_chapter': current_chapter,
#                 'location_section': current_section_num,
#                 'location_subsection': current_subsection_num,
#             }
#         }
#         tagged_text.append(new_item)
#     else:
#         new_item['error'] = 'je ne comprends pas'
#         new_item = new_item | item

#         tagged_text.append(new_item)



# # Save to a JSON file
# with open('tagged_text/tagged_text_all.json', 'w', encoding='utf-8') as f:
#     f.write("[\n")
#     first = True
#     for item in tagged_text:
#         if not first: # no need of comma for the first item in json
#             f.write(",\n") # add the comma for json
#         else:
#             first = False
#         json.dump(item, f, ensure_ascii=False)
#     f.write("]\n")

# print("JSON file 'tagged_text_all.json' has been created.")
    

In [6]:
# Produce a csv
#
# Walks the scraped items in order, tracks the chapter / section / subsection
# each one falls under, turns every item into a flat row, and writes the rows
# to tagged_text/tagged_text_all.csv.


def _strip_trailing_dot(number):
    """Return `number` without its final '.', leaving any other ending intact.

    An unconditional `number[0:-1]` would also chop the last character of
    numbers that do not end with a dot.
    """
    return number[:-1] if number.endswith('.') else number


def _make_row(label, content, row_type, *, chapter, section='', subsection='',
              subsection_title='', title_num='', paragraph_num='',
              pre_para_note=''):
    """Build one output row carrying every column listed in `csv_columns`."""
    return {
        'label': label,
        'content': content,
        'type': row_type,
        'title_num': title_num,
        'paragraph_num': paragraph_num,
        'location_chapter': chapter,
        'location_section': section,
        'location_subsection': subsection,
        'location_subsection_title': subsection_title,
        'pre_para_note': pre_para_note,
    }


# Initialize trackers and output lists
current_chapter = 'Introduction'
current_section_num = ''
current_subsection_num = ''
current_subsection_title = ''
current_pre_note = ''
tagged_text = []
# Bound before the loop so a TABLE OF CONTENTS item that appears ahead of the
# first chapter heading cannot raise NameError.
table_of_contents = []

# Process items
for item in scraped_text:
    # Rows located under a 'TABLE OF CONTENTS' subsection are collected
    # separately and are not written to the CSV.
    # NOTE(review): presumably the scraper stores that heading text in
    # 'subsection_num' -- confirm against the scraped data.
    in_toc = current_subsection_num == 'TABLE OF CONTENTS'

    if item.get('subsection_title') in chapter_names:  #1 Chapter title
        current_chapter = item['subsection_title']
        current_section_num = ''
        current_subsection_num = ''
        current_subsection_title = ''
        current_pre_note = ''
        tagged_text.append(_make_row(
            current_chapter,
            current_chapter,
            'title_chapter',
            chapter=current_chapter,
        ))
        # Each chapter starts with a fresh table-of-contents list.
        table_of_contents = []

    elif 'section_num' in item:  #2 Section title
        section_title = item['section_title']
        current_section_num = item['section_num']
        current_subsection_num = ''
        current_subsection_title = ''
        current_pre_note = ''
        tagged_text.append(_make_row(
            f"{current_chapter}({current_section_num})",
            section_title,
            'title_section',
            chapter=current_chapter,
            section=current_section_num,
            title_num=current_section_num,
        ))

    elif 'subsection_num' in item:  #3 Subsection title
        subsection_num = item['subsection_num']
        subsection_title = item['subsection_title']
        # NOTE(review): this drops the last TWO characters, although the
        # original comment only mentioned removing a trailing '.'. Kept
        # unchanged -- confirm against the real 'subsection_num' format.
        short_subsection_num = subsection_num[0:-2]
        tagged_text.append(_make_row(
            f"{current_chapter}({current_section_num})({short_subsection_num})",
            subsection_title,
            'title_subsection',
            chapter=current_chapter,
            section=current_section_num,
            subsection=subsection_num,
            subsection_title=subsection_title,
            title_num=subsection_num,
        ))
        current_subsection_num = subsection_num
        current_subsection_title = subsection_title
        current_pre_note = ''

    elif item.get("class_type_id") == "ar_pre_para_note_no_id":  #4 Pre-paragraph note
        # Remember the note so the paragraphs that follow can carry it.
        current_pre_note = item['item_content']
        new_item = _make_row(
            f"{current_chapter}({current_section_num})({current_subsection_num})(-{current_pre_note})",
            current_pre_note,
            'pre_para_note',
            chapter=current_chapter,
            section=current_section_num,
            subsection=current_subsection_num,
            subsection_title=current_subsection_title,
        )
        (table_of_contents if in_toc else tagged_text).append(new_item)

    elif 'item_num' in item:  #5 Paragraph content
        new_item = _make_row(
            # Exclude the '.' at the end (when there is one) for readability.
            f"{current_chapter} - {_strip_trailing_dot(item['item_num'])}",
            item['item_content'],
            'paragraph',
            chapter=current_chapter,
            section=current_section_num,
            subsection=current_subsection_num,
            subsection_title=current_subsection_title,
            paragraph_num=item['item_num'],
            pre_para_note=current_pre_note,
        )
        (table_of_contents if in_toc else tagged_text).append(new_item)

    else:  #6 Unrecognized format
        # Keep the raw item as text so nothing is silently dropped.
        tagged_text.append(_make_row(
            "Unrecognized",
            str(item),
            'unrecognized',
            chapter=current_chapter,
            section=current_section_num,
            subsection=current_subsection_num,
            subsection_title=current_subsection_title,
        ))

# Write to CSV
csv_columns = ['label', 'content', 'type',
               'location_chapter', 'location_section', 'location_subsection',
               'location_subsection_title', 'pre_para_note', 'title_num', 'paragraph_num']

# Create the output folder on a fresh checkout instead of failing in open().
os.makedirs('tagged_text', exist_ok=True)

with open('tagged_text/tagged_text_all.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
    writer.writeheader()
    writer.writerows(tagged_text)

print("CSV file 'tagged_text_all.csv' has been created.")

CSV file 'tagged_text_all.csv' has been created.


In [7]:
# Count the tagged rows that belong to each chapter and print the totals.
# Rows whose 'location_chapter' is not in chapter_names (e.g. those still
# tagged with the initial 'Introduction' chapter) are not part of the sum.
length_list = []
# Output tagged text for each chapter
for chapter_x in chapter_names:
    tagged_text_chapter_x = [
        row for row in tagged_text
        if row.get('location_chapter') == chapter_x
    ]
    # with open(f'tagged_text/tagged_text_{chapter_x}.json', 'w', encoding='utf-8') as f:
    #     for item in tagged_text_chapter_x:
    #         json.dump(item, f, ensure_ascii=False)
    #         f.write('\n')
    length_list.append(len(tagged_text_chapter_x))

    # The chapter name is quoted on both sides (the old message had a stray
    # closing apostrophe only).
    print(f"Chapter '{chapter_x}' has {len(tagged_text_chapter_x)} items.")

print(f'Summation of items:{sum(length_list)}')
    



    

Chapter_ESRS 1' has 258 items.
Chapter_ESRS 2' has 169 items.
Chapter_ESRS E1' has 244 items.
Chapter_ESRS E2' has 106 items.
Chapter_ESRS E3' has 91 items.
Chapter_ESRS E4' has 116 items.
Chapter_ESRS E5' has 109 items.
Chapter_ESRS S1' has 285 items.
Chapter_ESRS S2' has 121 items.
Chapter_ESRS S3' has 120 items.
Chapter_ESRS S4' has 117 items.
Chapter_ESRS G1' has 221 items.
Chapter_Acronyms' has 0 items.
Chapter_Defined Terms' has 0 items.
Summation of items:1957
