# In [10]:
import os
import json
import re
import unicodedata

# Directory containing JSON files
INPUT_DIRECTORY = "StructuredRegulatoryDocumentsJson"
OUTPUT_PATH = "filtered_passages_ready_for_reference_extraction.json"

# Regular expressions to extract various references.
#
# Every group below is non-capturing, i.e. written as (?:...). With capturing
# groups, re.findall() returns the group contents instead of the matched text
# (e.g. '.1' or '' for "Section 5.1"), which loses the reference itself.
# Non-capturing groups keep the same matching behaviour while letting
# findall() return the whole matched reference.
PART_PATTERN = re.compile(r'\bPart\s\d+', re.IGNORECASE)
SECTION_PATTERN = re.compile(r'\bSection\s\d+(?:\.\d+)*', re.IGNORECASE)
SUBSECTION_PATTERN = re.compile(r'subsection\s\d+(?:\([a-zA-Z0-9]+\))*', re.IGNORECASE)
RULE_PATTERN = re.compile(r'\bRule\s\d+(?:\.\d+)*(?:\([a-zA-Z0-9]+\))*', re.IGNORECASE)
CHAPTER_PATTERN = re.compile(r'\bChapter\s\d+(?:,\sRule\s\d+(?:\.\d+)*)?', re.IGNORECASE)
CATEGORY_PATTERN = re.compile(r'Category\s\d+[A-Z]*', re.IGNORECASE)
APP_SECTION_PATTERN = re.compile(r'APP\d+\.\w+\.\d+(?:\.\d+)*', re.IGNORECASE)
GUIDANCE_PATTERN = re.compile(r'Guidance(?:\.\d+)*', re.IGNORECASE)
GUIDANCE_NOTE_PATTERN = re.compile(r'Guidance\snote\s\d+', re.IGNORECASE)
IFRS_PATTERN = re.compile(r'International Financial Reporting Standards', re.IGNORECASE)
FEDERAL_LAW_PATTERN = re.compile(r'Federal Law No\. \d+ of \d+', re.IGNORECASE)
DECREE_PATTERN = re.compile(r'Federal Decree by Law No\. \d+ of \d+', re.IGNORECASE)
CABINET_DECISION_PATTERN = re.compile(r'Cabinet Decision No\. \(\d+\) of \d+', re.IGNORECASE)
INTERNATIONAL_STANDARD_PATTERN = re.compile(r'(?:FATF|Basel Committee|Wolfsberg Group|Network for Greening the Financial System)', re.IGNORECASE)
PARAGRAPH_PATTERN = re.compile(r'paragraph\s\d+\s+of\s+Chapter\s\d+', re.IGNORECASE)
SCHEDULE_PATTERN = re.compile(r'Chapter\s\d+\s+of\s+Schedule\s\d+\s+of\s+FSMR', re.IGNORECASE)
GEN_PATTERN = re.compile(r'GEN\s\d+\.\d+\.\d+', re.IGNORECASE)
FEES_PATTERN = re.compile(r'FEES\s\d+(?:\.\d+)*', re.IGNORECASE)
FUNDS_PATTERN = re.compile(r'FUNDS\sRules\s\d+\.\d+\.\d+\([a-z]\)', re.IGNORECASE)
SECTION_SUBSECTION_PATTERN = re.compile(r'(?:section|subsection|sub-paragraph)\s(?:\d+|\([a-zA-Z0-9]+\))(?:\s*to\s*\d+)?', re.IGNORECASE)
FSMR_PATTERN = re.compile(r'\bSection\s\d+\s+of\s+the\sFSMR', re.IGNORECASE)
MKT_RULE_PATTERN = re.compile(r'under\sMKT\sRule\s\d+(?:\.\d+)*(?:\([a-zA-Z0-9]+\))*', re.IGNORECASE)

def count_words(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    tokens = text.split()
    word_total = len(tokens)
    return word_total

def clean_passage_text(passage_text):
    """Return ``passage_text`` with every '/Table Start ... /Table End' span removed.

    The match is non-greedy and spans newlines, so each table block is removed
    individually while the text between tables is kept.
    """
    table_span = re.compile(r'/Table Start.*?/Table End', re.DOTALL)
    return table_span.sub('', passage_text)

def process_tuple(reference_tuple):
    """
    Builds one space-separated string from the non-empty elements of a tuple.
    """
    non_empty_parts = [part for part in reference_tuple if part]
    return ' '.join(non_empty_parts)

def extract_references(passage_text):
    """
    Extracts the unique regulatory references mentioned in a passage.

    Table blocks are removed from the passage first, then every reference
    pattern is applied to the remaining text.

    Args:
        passage_text: Raw passage text.

    Returns:
        A list of the distinct, non-empty matched reference strings, in the
        order in which they were first found (patterns are applied in the
        order listed below).
    """
    passage_text = clean_passage_text(passage_text)

    patterns = (
        PART_PATTERN,
        SECTION_PATTERN,
        SUBSECTION_PATTERN,
        RULE_PATTERN,
        CHAPTER_PATTERN,
        CATEGORY_PATTERN,
        APP_SECTION_PATTERN,
        GUIDANCE_PATTERN,
        GUIDANCE_NOTE_PATTERN,
        IFRS_PATTERN,
        FEDERAL_LAW_PATTERN,
        DECREE_PATTERN,
        CABINET_DECISION_PATTERN,
        INTERNATIONAL_STANDARD_PATTERN,
        PARAGRAPH_PATTERN,
        SCHEDULE_PATTERN,
        GEN_PATTERN,
        FEES_PATTERN,
        FUNDS_PATTERN,
        SECTION_SUBSECTION_PATTERN,
        FSMR_PATTERN,
        MKT_RULE_PATTERN,
    )

    # A dict is used as an insertion-ordered set so that the result order is
    # deterministic (a plain set of strings has no stable order across runs).
    unique_references = {}
    for pattern in patterns:
        # finditer() + group(0) always yields the full matched text.
        # findall() would instead return only the contents of a pattern's
        # groups whenever the pattern contains any, turning "Rule 3" into an
        # empty string and "Section 5.1" into ".1".
        for match in pattern.finditer(passage_text):
            reference = match.group(0).strip()
            if reference:
                unique_references.setdefault(reference, None)

    return list(unique_references)

def normalize_unicode(text):
    """Return an ASCII-only version of ``text``.

    The text is NFKD-normalised, then any character that cannot be encoded
    as ASCII is dropped.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('ascii')

def main():
    """
    Filters passages that mention at least one recognised reference.

    Reads every ``.json`` file in INPUT_DIRECTORY, keeps the first item seen
    for each "ID", skips passages longer than 200 words, and writes the
    passages for which a reference is found to OUTPUT_PATH as a JSON list of
    objects with the keys "ID", "DocumentID", "PassageID" and "Passage"
    (the passage text is ASCII-normalised).
    """
    # Derive the directory from OUTPUT_PATH; a bare filename has no directory
    # component, in which case there is nothing to create.
    output_directory = os.path.dirname(OUTPUT_PATH)
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)

    filtered_data = []
    seen_source_ids = set()  # Set to keep track of unique SourceIDs

    # os.listdir() returns entries in arbitrary order; sorting makes both the
    # output order and the choice of which duplicate ID is kept reproducible.
    for filename in sorted(os.listdir(INPUT_DIRECTORY)):
        if not filename.endswith(".json"):
            continue

        filepath = os.path.join(INPUT_DIRECTORY, filename)
        # Only the load needs the file handle; close it before processing.
        with open(filepath, "r", encoding="utf-8") as json_file:
            document = json.load(json_file)
        if not document:
            continue

        for item in document:
            source_id = item.get("ID")

            # Skip if SourceID is not unique
            if source_id in seen_source_ids:
                continue
            seen_source_ids.add(source_id)

            # A missing or null "Passage" is treated as an empty passage.
            source_passage = item.get("Passage") or ""
            if count_words(source_passage) > 200:
                continue

            references = extract_references(source_passage)
            source_passage = normalize_unicode(source_passage)
            references = [normalize_unicode(ref) for ref in references]

            if any(ref in source_passage for ref in references):
                filtered_data.append({
                    "ID": source_id,
                    "DocumentID": item.get("DocumentID"),
                    "PassageID": item.get("PassageID"),
                    "Passage": source_passage,
                })

    with open(OUTPUT_PATH, "w", encoding="utf-8") as jsonfile:
        json.dump(filtered_data, jsonfile, ensure_ascii=False, indent=4)

# Run the filtering pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
