<a href="https://colab.research.google.com/github/tubagokhan/ADGM/blob/main/ParsingIntermediateFormatV3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [128]:
from google.colab import drive
# Mount Google Drive at /content/drive; the input and output paths used later
# in this notebook are located under that mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [129]:
import json
import uuid

def generate_unique_identifier():
    """Return a newly generated version-4 UUID rendered as a string."""
    fresh_uuid = uuid.uuid4()
    return str(fresh_uuid)

def get_document_id(filename):
    """Look up the numeric DocumentID assigned to a document base name.

    Args:
        filename: Base name of the document (no directory, no extension),
            exactly as it appears in the table below.

    Returns:
        The integer DocumentID, or None when the name is not in the table.
    """
    known_documents = {
        'AML_VER09.211223': 1,
        'CIB_VER04.030220': 2,
        'COBS_VER15.150823': 3,
        'FEES_VER16.181223': 4,
        'FP_VER01.110319': 5,
        'FUNDS_VER08.040723': 6,
        'GEN_VER08.181223': 7,
        'GLO_VER19.181223': 8,
        'IFR_VER07.181223': 9,
        'MIR_VER07.181223': 10,
        'MKT_VER08.181223': 11,
        'PIN_VER05.181223': 12,
        'PRU_VER13.181223': 13,
        'BRR Regulations (December 2018)': 14,
        'CRS Regulations 2017 (Consolidated_October 2023) v6': 15,
        'Foreign Tax Account Compliance Regulations 2022': 16,
        'FSMR (Consolidated_December 2023)': 17,
        # NOTE(review): 'â€“' looks like a mis-decoded en dash; confirm that
        # this key matches the real file name before relying on it.
        'Guidance â€“ Regulatory Framework for Fund Managers of Venture Capital Funds (VER03.181223)': 18,
        'Guidance - Virtual Asset Activities in ADGM (VER05.181223)': 19,
        'ADGM_Guidance_-_Application_of_English_Laws': 20,
        'API - Guidance Note_Final 14 October 2019 Eng': 21,
        'CMC_VER03.270922': 22,
        'CONF_VER03.18042019': 23,
        'Draft Guidance - FSRA Guiding Principles for Virtual Assets Regulation and Supervision (IA)': 24,
        'Environmental Social and Governance Disclosures Guidance_VER01.040723': 25,
        'FinTech RegLab Guidance_VER01.31082016': 26,
        'GPM VER03.120623': 27,
        'Guidance - Continuous Disclosure_VER01.280922': 28,
        'Guidance - Digital Securities Offerings and Virtual  Assets under the Financial Services and Markets Regulations_240220': 29,
        'Guidance - Disclosure Requirements for Mining Reporting Entities_VER01.280922': 30,
        'Guidance - Disclosure Requirements for Petroleum Reporting Entities_VER01.280922': 31,
        'Guidance - Private Credit Funds_VER01.040523': 32,
        'Guidance  Regulation of Digital Securities Activities in ADGM_240224': 33,
        'Guidance - Regulation of Spot Commodities Activities in ADGM (VER02.181223)': 34,
        'Guidance_Regulatory Framework for PFP and Multilateral Trading Facilities dealing with Private Capital Markets (VER02.181223)': 35,
        'SFWG_Guidance on Principles for the Effective Management of Climate-related Financial Risks': 36,
        'Supplementary Guidance  Authorisation of Digital Investment Management (Robo-advisory) Activities': 37,
        'Supplementary Guidance OTCLPs (VER02.181223)': 38,
        'Sustainable Finance Supplementary Guidance_VER01.040723': 39,
        'UAE_CRS_Guidance_Notes_17 June 2020 (002)': 40,
    }
    # dict.get falls back to None for names that are not registered.
    return known_documents.get(filename)

# Prefixes that mark a line as the heading of a new passage. A line whose
# first character is a digit is also treated as a heading (see
# _starts_new_passage).
_PASSAGE_PREFIXES = (
    'PART ',
    'Part ',
    'APP',
    'Schedule',
    'BANK RECOVERY AND RESOLUTION REGULATIONS 2018',
    'COMMON REPORTING STANDARD REGULATIONS 2017',
    'FOREIGN ACCOUNT TAX COMPLIANCE REGULATIONS 2022',
    'FINANCIAL SERVICES AND MARKETS REGULATIONS 2015',
    'APPENDIX',
    'Definitions',
    'Disclaimer',
    'ANNEX',
    'A.',
    'B.',
    'C.',
    'D.',
    'SECTION',
)


def _starts_new_passage(line):
    """Return True when *line* begins with a digit or a known heading prefix."""
    return line[:1].isdigit() or line.startswith(_PASSAGE_PREFIXES)


def _append_to_passage(entry, line):
    """Append the stripped *line* to ``entry['Passage']``; do nothing if *entry* is None."""
    if entry is not None:
        entry['Passage'] += '\n' + line.strip()


def process_file(filename, input_file_path, output_file_path):
    """Split a standardized text document into passages and write them as JSON.

    The input is read line by line. A line that starts a new passage (see
    ``_starts_new_passage``) is split on its first tab into a PassageID and
    the initial passage text; every other line is appended to the passage
    currently being built. Lines between ``/Table Start`` and ``/Table End``,
    between ``/Figure Start`` and ``/Figure End``, or between two lines
    containing ``""`` are always appended to the current passage, even when
    they look like headings.

    Lines that occur before the first heading have no passage to belong to
    and are dropped.

    Args:
        filename: Document base name used to look up the DocumentID.
        input_file_path: Path of the text file to parse.
        output_file_path: Path of the JSON file to write. The output is a
            list of objects with the keys ``ID``, ``DocumentID``,
            ``PassageID`` and ``Passage``.

    Raises:
        ValueError: If no DocumentID is registered for *filename*.
    """
    data = []
    current_entry = None
    inside_table = False
    inside_figure = False
    inside_special_case = False

    # Determine DocumentID from input file name
    document_id = get_document_id(filename)
    if document_id is None:
        raise ValueError("DocumentID not found for the file name provided.")

    # Explicit encoding so decoding does not depend on the platform default.
    with open(input_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if '/Table Start' in line:
                inside_table = True
                _append_to_passage(current_entry, line)
                print('Table')
                continue
            if '/Table End' in line:
                inside_table = False
                _append_to_passage(current_entry, line)
                continue

            if '/Figure Start' in line:
                inside_figure = True
                _append_to_passage(current_entry, line)
                print('Figure')
                continue
            if '/Figure End' in line:
                inside_figure = False
                _append_to_passage(current_entry, line)
                continue

            if '""' in line:
                # The same marker opens and closes a special-case region, so
                # each marker line flips the state. (Previously the closing
                # branch was unreachable and the region never ended.)
                inside_special_case = not inside_special_case
                _append_to_passage(current_entry, line)
                if inside_special_case:
                    print('Special Case')
                continue

            if inside_table or inside_figure or inside_special_case:
                # Inside a protected region nothing is treated as a heading.
                _append_to_passage(current_entry, line)
            elif _starts_new_passage(line):
                parts = line.split('\t', 1)
                context_id = parts[0].strip()
                text = parts[1].strip() if len(parts) > 1 else ''

                # Close the passage that was being built before opening a new one.
                if current_entry is not None:
                    data.append(current_entry)

                current_entry = {
                    'ID': generate_unique_identifier(),
                    'DocumentID': document_id,
                    'PassageID': context_id,
                    'Passage': text,
                }
            else:
                _append_to_passage(current_entry, line)

    # Flush the last passage, which no following heading has closed.
    if current_entry is not None:
        data.append(current_entry)

    with open(output_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, indent=4)


# Base name of the document to convert; process_file raises ValueError when
# get_document_id has no DocumentID for this name.
filename = 'UAE_CRS_Guidance_Notes_17 June 2020 (002)'
# The input text file and the output JSON file share the document's base name.
input_path = f'/content/drive/Othercomputers/MBZUAI/MBZUAI/ADGM-Project/ADGM-Docs/StandardizedDocs/FSRA Guidance/{filename}.txt'
output_path = f'/content/drive/Othercomputers/MBZUAI/MBZUAI/ADGM-Project/ADGM-Docs/StandardizedDocs/JsonFormat/{filename}.json'
# Parse the text file into passages and write them to output_path.
process_file(filename, input_path, output_path)


Table


In [130]:
import json

def check_the_context_id(json_file_path):
    """Print the PassageID of every entry stored in a passages JSON file.

    Args:
        json_file_path: Path to a JSON file holding a list of objects, each
            with a 'PassageID' key.
    """
    with open(json_file_path, 'r') as handle:
        entries = json.load(handle)

    # One PassageID per line, in file order, for a quick visual check.
    for record in entries:
        print(record['PassageID'])

# List the PassageIDs in the JSON file at output_path; pass a different path to inspect another file.
check_the_context_id(output_path)

1.
2.
3.
4.
5.
5.1
5.2
6.
6.1.
6.2.
7.
7.A.
7.A.1.
7.A.2.
7.A.3.
7.B.
7.B.1.
7.B.2.
SECTION
SECTION I
SECTION I.A.
SECTION I.A.1.
SECTION I.A.2.
SECTION I.A.3.
SECTION I.A.4.
SECTION I.A.5.
SECTION I.A.6.
SECTION I.A.7.
SECTION I.B.
SECTION I.C.
SECTION I.D.
SECTION I.E.
SECTION I.F.
SECTION I.G.
SECTION I.H.
SECTION II
SECTION II.A.
SECTION II.B.
SECTION II.C.
SECTION II.D.
SECTION II.E.
SECTION II.F.
SECTION III
SECTION III.A.
SECTION III.B.
SECTION III.B.1.
SECTION III.B.2.
SECTION III.B.3.
SECTION III.B.4.
SECTION III.B.5.
SECTION III.B.6.
SECTION III.C.
SECTION III.C.1.
SECTION III.C.2.
SECTION III.C.3.
SECTION III.C.4.
SECTION III.C.5.
SECTION III.C.6.
SECTION III.C.7.
SECTION III.C.8.
SECTION III.C.9.
SECTION III.D.
SECTION III.E.
SECTION IV
SECTION IV.A.
SECTION IV.B.
SECTION IV.C.
SECTION V
SECTION V.A.
SECTION V.B.
SECTION V.C.
SECTION V.C.1.
SECTION V.C.2.
SECTION V.D.
SECTION V.D.1.
SECTION V.D.2.
SECTION V.D.3.
SECTION VI
SECTION VI.A.
SECTION VI.A.1.
SECTION VI.A.2.
SECTI