# Run the Python script from the parent folder to get the chunks

Run the cell below FROM THE PARENT FOLDER to get the chunks. Copy and paste this code into a notebook or script that is run from the parent folder. If you run it from this folder, the code won't work.

This code depends on mas_preprocessor.py, which is in the same directory as this file.

In [None]:
from mas_preprocessor import MASPreprocessor

# Instantiate the preprocessor and collect the chunks it returns.
preprocessor = MASPreprocessor() 
chunks = preprocessor.get_chunks()

This code writes the chunks to a JSON file (running this cell overwrites the JSON file that already exists in the directory).

In [None]:
import json

# Write the chunks as indented UTF-8 JSON; mode "w" replaces any existing
# mas_chunks.json, and ensure_ascii=False keeps non-ASCII characters readable.
with open("mas_chunks.json", "w", encoding="utf-8") as f:
    json.dump(chunks, f, ensure_ascii=False, indent=2)

# Notebook Code

- THERE'S NO NEED TO RUN ANY OF THE CODE IN THIS SECTION.

- IT'S HERE FOR REFERENCE ONLY.
- THIS CODE INCLUDES INLINE DOCUMENTATION AND LETS YOU SEE THE CHUNKS IN A PRETTIER FORMAT.

- THE CODE ABOVE IS SUFFICIENT TO GET THE CHUNKS AND JSON.

In [2]:
import fitz  # PyMuPDF
import os
import re
from typing import List, Dict

In [3]:
def print_chunks_pretty(chunks: List[Dict], limit: int = None):
    """Print chunks in a readable, separator-delimited layout.

    Args:
        chunks: Chunk dicts. Each needs ``id``, ``text`` and a ``metadata``
            dict with ``part_id``, ``part_title``, ``section_id`` and
            ``section_title``; ``metadata["references"]`` is optional.
        limit: Maximum number of chunks to print. ``None`` prints every
            chunk and ``0`` prints nothing.
    """
    rule = "=" * 80
    # Slicing with None already yields the whole list, so no truthiness test
    # is needed; testing truthiness would treat limit=0 as "no limit".
    for number, chunk in enumerate(chunks[:limit], start=1):
        meta = chunk["metadata"]
        print(rule)
        print(f"Chunk {number}:")
        print(f"ID        : {chunk['id']}")
        print(f"Part      : {meta['part_id']} - {meta['part_title']}")
        print(f"Section   : {meta['section_id']}")
        # The title line is omitted entirely when the title is empty.
        if meta["section_title"]:
            print(f"Title     : {meta['section_title']}")
        print("Text:")
        print(chunk["text"])
        references = meta.get("references")
        if references:
            print("\nReferences:")
            for ref in references:
                print(f"  - {ref}")
        print(rule)
        print("\n")

## Chunking MAS Notice 626

In [4]:
def chunk_mas_notice(pdf_path: str, start_page: int = 0, end_page: int = None) -> List[Dict]:
    """Split the MAS Notice 626 PDF into one chunk per numbered section.

    The plain-text lines of each page are scanned in order:

    * A bold line that is not all-uppercase and not number-only is remembered
      as the title for the next section and is not added to any text.
    * A line consisting of a number (optionally followed by one capital
      letter) whose next line is all-uppercase starts a new part.
    * A line starting with an id such as ``6.1`` or ``6.11A`` starts a new
      section. Its title is the remembered bold line, or else the title of
      the previous chunk when that chunk belongs to the same part.
    * Any other line is appended to the open section, if there is one.

    Args:
        pdf_path: Path of the PDF to read.
        start_page: Index of the first page to process.
        end_page: Index one past the last page to process; ``None`` means
            the end of the document.

    Returns:
        A list of chunk dicts with ``id``, ``text`` and ``metadata``
        (``part_id``, ``part_title``, ``section_id``, ``section_title``,
        ``regulation`` and an empty ``references`` list).
    """
    chunks = []
    section_id_pattern = re.compile(r"^(\d+[A-Z]?\.\d+[A-Z]?)\b")
    part_id_pattern = re.compile(r"^\d+[A-Z]?$")
    number_only_pattern = re.compile(r"^\d+$")

    current_part_id = None
    current_part_title = None
    current_section_id = None
    current_section_title = None
    current_text = []

    last_bold_line = None

    def flush_section():
        # Emit the open section, if any; reads the enclosing parser state.
        if current_section_id and current_text:
            chunks.append({
                "id": f"mas-notice-626-{current_section_id}",
                "text": " ".join(current_text).strip(),
                "metadata": {
                    "part_id": current_part_id,
                    "part_title": current_part_title,
                    "section_id": current_section_id,
                    "section_title": current_section_title or "",
                    "regulation": "MAS Notice 626",
                    "references": []
                }
            })

    # The context manager closes the document even if parsing raises.
    with fitz.open(pdf_path) as doc:
        last_page = len(doc) if end_page is None else end_page

        for page_num in range(start_page, last_page):
            page = doc[page_num]

            # Step 1: record, per distinct line text, whether its first
            # occurrence in the page layout contains a bold span.
            bold_by_text = {}
            for block in page.get_text("dict")["blocks"]:
                for layout_line in block.get("lines", []):
                    spans = layout_line.get("spans", [])
                    text_combined = " ".join(
                        span["text"].strip() for span in spans if span["text"].strip()
                    )
                    if not text_combined:
                        continue
                    bold_by_text.setdefault(
                        text_combined,
                        any("Bold" in span["font"] for span in spans),
                    )

            # Step 2: walk the plain-text extraction line by line.
            plain_lines = page.get_text("text").splitlines()

            for i, line in enumerate(plain_lines):
                stripped = line.strip()

                # Potential section title: bold, not all-caps, not number-only.
                if (
                    bold_by_text.get(stripped, False)
                    and stripped != stripped.upper()
                    and not number_only_pattern.match(stripped)
                ):
                    last_bold_line = stripped
                    continue

                # Part header: id line followed by an all-uppercase title line.
                if (
                    part_id_pattern.match(stripped)
                    and i + 1 < len(plain_lines)
                    and plain_lines[i + 1].strip().isupper()
                ):
                    flush_section()
                    current_part_id = stripped
                    current_part_title = plain_lines[i + 1].strip()
                    current_section_id = None
                    current_section_title = None
                    current_text = []
                    last_bold_line = None
                    continue

                # Section id at the start of the line opens a new chunk.
                match = section_id_pattern.match(stripped)
                if match:
                    flush_section()
                    current_section_id = match.group(1)

                    if last_bold_line:
                        # A bold line seen since the last section is the title.
                        current_section_title = last_bold_line
                        last_bold_line = None
                    else:
                        # Otherwise inherit the previous chunk's title when
                        # that chunk is in the same part.
                        current_section_title = (
                            chunks[-1]["metadata"]["section_title"]
                            if chunks and chunks[-1]["metadata"]["part_id"] == current_part_id
                            else ""
                        )

                    current_text = [stripped]
                    continue

                if current_section_id:
                    current_text.append(stripped)

    flush_section()
    return chunks


In [5]:
# Run merged logic with section ID + section title detection.
# Forward slashes work on every OS; a backslash before "M" would be an
# invalid string escape sequence.
chunks_mas_notice = chunk_mas_notice("mas_documents/MAS Notice 626 dated 28 March 2024.pdf")


In [6]:
# Preview the first 5 Notice chunks.
print_chunks_pretty(chunks_mas_notice, limit = 5)

Chunk 1:
ID        : mas-notice-626-1.1
Part      : 1 - INTRODUCTION
Section   : 1.1
Text:
1.1 This Notice is issued under section 16 of the Financial Services and Markets Act 2022 (“FSM Act”) and applies to all banks in Singapore, as defined in section 2 of the Banking Act 1970 (“BA”).


Chunk 2:
ID        : mas-notice-626-1.2
Part      : 1 - INTRODUCTION
Section   : 1.2
Text:
1.2 This Notice shall take effect from 1 April 2024.


Chunk 3:
ID        : mas-notice-626-2.1
Part      : 2 - DEFINITIONS
Section   : 2.1
Text:
2.1 For the purposes of this Notice – “AML/CFT” means anti-money laundering and countering the financing of terrorism; “Authority” means the Monetary Authority of Singapore; “bank” means a bank in Singapore, as defined in section 2 of the BA; “beneficial owner”, in relation to a customer of a bank, means the natural person who ultimately owns or controls the customer or the natural person on whose behalf a transaction is conducted or business relations are established, 

## Chunking Fair Dealing Guidelines

In [7]:
def chunk_fair_dealing(pdf_path: str, start_page: int = 5) -> List[Dict]:
    """Split the Fair Dealing Guidelines PDF into section and practice chunks.

    Each layout block of a page is flattened to one string and classified,
    in this order:

    * ``"<n> Fair Dealing Outcome ..."`` starts a new part; the next block
      containing a bold span is appended to the part title.
    * ``"<n>.<m> <title>"`` is remembered as a section title candidate.
    * ``"<n>.<m>.<k> ..."`` starts a section chunk. The remembered title is
      adopted when it starts with the section's ``<n>.<m>`` prefix;
      otherwise the previously adopted title is kept.
    * ``"Good practice <n>.<m>"`` / ``"Poor practice <n>.<m>"`` starts a
      practice chunk that carries the current section title.
    * Anything else is appended to the chunk that is currently open.

    Args:
        pdf_path: Path of the PDF to read.
        start_page: Index of the first page to process.

    Returns:
        A list of chunk dicts with ``id``, ``text`` and ``metadata``. A
        missing section title is stored as ``""`` rather than ``None``.
    """
    chunks = []

    # Patterns
    part_pattern = re.compile(r"^(\d+)\s+Fair Dealing Outcome", re.IGNORECASE)
    section_title_pattern = re.compile(r"^(\d+\.\d+)\s+(.+)")
    section_id_pattern = re.compile(r"^(\d+\.\d+\.\d+)\b")
    practice_start_pattern = re.compile(r"^(Good|Poor) practice (\d+\.\d+)", re.IGNORECASE)

    current_part_id = None
    current_part_title = None
    current_section_title = None
    current_text = []
    temp_title_tracker = None
    expecting_part_subtitle = False
    active_chunk = None

    def flush_active_chunk():
        # Finalise the open chunk's text and store it.
        nonlocal active_chunk, current_text
        if active_chunk:
            active_chunk["text"] = " ".join(current_text).strip()
            chunks.append(active_chunk)
            active_chunk = None
            current_text = []

    def start_new_chunk(section_id, title=""):
        # Build an empty chunk bound to the current part.
        return {
            "id": f"mas-fair-dealing-{section_id}",
            "text": "",
            "metadata": {
                "part_id": current_part_id,
                "part_title": current_part_title,
                "section_id": section_id,
                # No title seen yet is stored as "" rather than None.
                "section_title": title or "",
                "regulation": "MAS Fair Dealing Guidelines",
                "references": []
            }
        }

    # The context manager closes the document even if parsing raises.
    with fitz.open(pdf_path) as doc:
        for page_num in range(start_page, len(doc)):
            page = doc[page_num]
            blocks = [b for b in page.get_text("dict")["blocks"] if "lines" in b]

            for block in blocks:
                line_text = " ".join(
                    span["text"].strip()
                    for line in block["lines"]
                    for span in line.get("spans", [])
                    if span["text"].strip()
                ).strip()

                if not line_text:
                    continue

                # Detect Part Header
                match_part = part_pattern.match(line_text)
                if match_part:
                    flush_active_chunk()
                    current_part_id = match_part.group(1)
                    current_part_title = line_text
                    expecting_part_subtitle = True
                    continue

                # Detect subtitle after part. The flag stays set until a
                # block with a bold span is found.
                if expecting_part_subtitle:
                    if any(
                        "bold" in span["font"].lower()
                        for line in block["lines"]
                        for span in line["spans"]
                    ):
                        current_part_title += " " + line_text
                        expecting_part_subtitle = False
                        continue

                # Section title (e.g., 4.2 Title)
                if section_title_pattern.match(line_text):
                    temp_title_tracker = line_text.strip()
                    continue

                # Section ID (e.g., 4.2.1)
                match_section = section_id_pattern.match(line_text)
                if match_section:
                    flush_active_chunk()
                    section_id = match_section.group(1)
                    section_prefix = ".".join(section_id.split(".")[:2])
                    if temp_title_tracker and temp_title_tracker.startswith(section_prefix):
                        current_section_title = temp_title_tracker
                    active_chunk = start_new_chunk(section_id, current_section_title)
                    current_text = [line_text]
                    continue

                # Practice block (Good or Poor)
                match_practice = practice_start_pattern.match(line_text)
                if match_practice:
                    flush_active_chunk()
                    kind, number = match_practice.groups()
                    practice_id = f"{kind.lower()}-practice-{number}"
                    active_chunk = start_new_chunk(practice_id, current_section_title)
                    current_text = [line_text]
                    continue

                # Regular paragraph inside active chunk
                if active_chunk:
                    current_text.append(line_text)

    flush_active_chunk()
    return chunks

In [8]:
# Chunk the Fair Dealing Guidelines PDF; start_page keeps its default.
chunks_fair_dealing = chunk_fair_dealing("mas_documents/Fair Dealing Guidelines 30 May 2024.pdf")

In [9]:
# Preview the first 10 Fair Dealing chunks.
print_chunks_pretty(chunks_fair_dealing, limit =10)

Chunk 1:
ID        : mas-fair-dealing-1.1.1
Part      : 1 - 1 Fair Dealing Outcome One Customers have confidence that they deal with financial institutions where fair dealing is central to the corporate culture.
Section   : 1.1.1
Title     : 1.1 Rationale
Text:
1.1.1 The way financial institutions manufacture, select, market and distribute financial products and services will affect customers’ financial decisions. Financial institutions should therefore ensure that their business functions are cognisant of the influence on customers’ decisions, and have a strong emphasis on fair dealing in their corporate culture. Customers are better able to trust and rely on FIs that consistently emphasise fair dealing in all aspects of their business.


Chunk 2:
ID        : mas-fair-dealing-1.1.2
Part      : 1 - 1 Fair Dealing Outcome One Customers have confidence that they deal with financial institutions where fair dealing is central to the corporate culture.
Section   : 1.1.2
Title     : 1.1 Rati

## Chunking MAS Notice 626 Guidelines

In [10]:
def chunk_mas_guidelines(pdf_path: str, start_page: int = 2) -> List[Dict]:
    """Split the Guidelines to MAS Notice 626 PDF into one chunk per section.

    All non-empty plain-text lines from ``start_page`` onwards are collected,
    skipping lines that contain one of the running page-header phrases. The
    lines are then parsed in order:

    * A digits-only line followed by a line that starts with an uppercase
      letter and has more than one word starts a new part. Until one is
      found, the part defaults to ``"1"`` / ``"Introduction"``.
    * A line of at most 8 words that starts with an uppercase letter and is
      followed by a section id within the next 3 lines is a section title.
      It is dropped from the text; when it equals the part title the section
      title is left empty.
    * A line starting with ``<n>-<m>`` (but not ``<n>-<m>-<k>``) opens a new
      section chunk.
    * Every other line, including short capitalised lines that are not
      followed by a section id, is appended to the open section.

    Args:
        pdf_path: Path of the PDF to read.
        start_page: Index of the first page to process.

    Returns:
        A list of chunk dicts with ``id``, ``text`` and ``metadata``.
    """
    chunks = []

    section_id_pattern = re.compile(r"^(\d+-\d+)\b")
    subsection_pattern = re.compile(r"^(\d+-\d+-\d+)\b")
    part_id_pattern = re.compile(r"^\d+$")
    page_header_phrases = [
        "GUIDELINES TO MAS NOTICE 626",
        "COUNTERING THE FINANCING OF TERRORISM"
    ]

    def is_page_header(text):
        # Case-insensitive containment test against the running headers.
        lowered = text.lower()
        return any(header.lower() in lowered for header in page_header_phrases)

    # Everything needed is read up front, so the document can be closed
    # before parsing starts.
    all_lines = []
    with fitz.open(pdf_path) as doc:
        for page_num in range(start_page, len(doc)):
            page_lines = (
                raw.strip() for raw in doc[page_num].get_text("text").splitlines()
            )
            all_lines.extend(
                line for line in page_lines if line and not is_page_header(line)
            )

    # Step 1: Pre-extract part positions
    part_positions = {}
    for i in range(len(all_lines) - 1):
        if part_id_pattern.match(all_lines[i]):
            next_line = all_lines[i + 1]
            if (
                next_line[0].isupper()
                and len(next_line.split()) > 1
                and not is_page_header(next_line)
            ):
                part_positions[i] = (all_lines[i], next_line)

    # Step 2: Set initial fallback part manually
    last_seen_part_id = "1"
    last_seen_part_title = "Introduction"

    # Section parsing
    current_section_id = None
    current_section_title = None
    inherited_section_title = ""
    current_text = []
    potential_title = None

    def flush():
        # Emit the open section (if any) and reset the per-section state.
        nonlocal current_section_id, current_section_title, inherited_section_title, current_text
        if current_section_id and current_text:
            section_title_to_use = current_section_title or inherited_section_title or ""
            # A title identical to the part title carries no information.
            if section_title_to_use == last_seen_part_title:
                section_title_to_use = ""
            chunks.append({
                "id": f"mas-guidelines-626-{current_section_id}",
                "text": " ".join(current_text).strip(),
                "metadata": {
                    "part_id": last_seen_part_id,
                    "part_title": last_seen_part_title,
                    "section_id": current_section_id,
                    "section_title": section_title_to_use,
                    "regulation": "MAS Guidelines to Notice 626",
                    "references": []
                }
            })
        inherited_section_title = ""
        current_section_id = None
        current_section_title = None
        current_text = []

    # Step 3: Parse and assign part info as we go
    for i, line in enumerate(all_lines):
        if i in part_positions:
            flush()  # flush previous section before switching part
            last_seen_part_id, last_seen_part_title = part_positions[i]
            inherited_section_title = ""  # Reset inherited section title on new part

        # Section title candidate: short, capitalised, not an id line.
        is_title_candidate = (
            not section_id_pattern.match(line)
            and not subsection_pattern.match(line)
            and not part_id_pattern.match(line)
            and len(line.split()) <= 8
            and line[0].isupper()
        )
        if is_title_candidate and any(
            section_id_pattern.match(following)
            for following in all_lines[i + 1:i + 4]
        ):
            if line == last_seen_part_title:
                potential_title = ""
                inherited_section_title = ""
            else:
                potential_title = line
            continue
        # A candidate with no section id close behind it is ordinary body
        # text (e.g. the short last line of a paragraph), so it falls
        # through and stays in the chunk instead of being discarded.

        # Section ID
        match_section = section_id_pattern.match(line)
        if match_section and not subsection_pattern.match(line):
            flush()
            current_section_id = match_section.group(1)
            current_section_title = potential_title or ""
            inherited_section_title = current_section_title if current_section_title else ""
            potential_title = None
            current_text = [line]
            continue

        if current_section_id:
            current_text.append(line)

    flush()
    return chunks



In [11]:
# Chunk the Guidelines to MAS Notice 626 PDF; start_page keeps its default.
chunks_mas_guidelines = chunk_mas_guidelines("mas_documents/Guidelines to MAS Notice 626 March 2024 - Final.pdf")

In [12]:
# Preview up to 100 Guidelines chunks.
print_chunks_pretty(chunks_mas_guidelines, limit = 100)

Chunk 1:
ID        : mas-guidelines-626-1-1
Part      : 1 - Introduction
Section   : 1-1
Text:
1-1 These Guidelines provide guidance to all banks on the requirements in MAS Notice 626 on Prevention of Money Laundering and Countering the Financing of Terrorism – Banks (“the Notice”). These Guidelines should be read in conjunction with the Notice.


Chunk 2:
ID        : mas-guidelines-626-1-2
Part      : 1 - Introduction
Section   : 1-2
Text:
1-2 The expressions used in these Guidelines have the same meanings as those found in the Notice, except where expressly defined in these Guidelines or where the context otherwise requires. For the purposes of these Guidelines, a reference to “CDD measures” shall mean the measures as required by paragraphs 6, 7 and 8 of the Notice.


Chunk 3:
ID        : mas-guidelines-626-1-3
Part      : 1 - Introduction
Section   : 1-3
Text:
1-3 The degree of observance with these Guidelines by a bank may have an impact on the Authority’s overall risk assessment o

### Reference for MAS Notice

In [13]:
def add_references_to_notice(notice_chunks: List[Dict]) -> List[Dict]:
    """Fill ``metadata["references"]`` of each Notice chunk from its text.

    The text is scanned for "paragraph(s) ..." mentions. A dotted number
    such as ``6.1`` or ``6.11A`` refers to the chunk with that section id. A
    bare number such as ``6`` refers to every chunk of that part. A range
    such as ``6.1 to 6.5`` within one part refers to each whole-numbered
    section in between. Only ids that exist in ``notice_chunks`` are kept,
    and a chunk never references itself.

    Args:
        notice_chunks: Notice chunk dicts with ``id``, ``text`` and a
            ``metadata`` dict (``part_id`` is read when present).

    Returns:
        The same list; each chunk's ``metadata["references"]`` is replaced
        in place by a sorted list of chunk ids.
    """
    import re

    id_set = {c["id"] for c in notice_chunks}
    part_map = {}
    for c in notice_chunks:
        pid = c["metadata"].get("part_id")
        if pid:
            part_map.setdefault(pid, []).append(c["id"])

    # Compiled once here instead of on every chunk.
    list_re = re.compile(
        r"paragraphs?\s+((?:\d+(?:\.\d+[A-Z]?)?|\d+)(?:[ ,and]+(?:\d+(?:\.\d+[A-Z]?)?|\d+))*)",
        re.IGNORECASE,
    )
    list_split_re = re.compile(r"[,\s]+and\s+|,\s*|\s+and\s+")
    dotted_number_re = re.compile(r"\d+\.\d+[A-Z]?$")
    plain_section_re = re.compile(r"paragraphs?\s+(\d+\.\d+[A-Z]?)(?!\(|[A-Z])\b", re.IGNORECASE)
    bracketed_section_re = re.compile(r"paragraphs?\s+(\d+\.\d+[A-Z]?)\([a-z]\)", re.IGNORECASE)
    # The lookahead rejects a following digit as well as "-" and ".<digit>".
    # Without it the engine backtracks inside a multi-digit number, so
    # "paragraph 12.3" would match "1" and pull in the whole of Part 1.
    whole_part_re = re.compile(r"paragraphs?\s+(\d+)(?![\d-]|\.\d)", re.IGNORECASE)
    range_re = re.compile(r"paragraphs?\s+(\d+)\.(\d+)\s+to\s+(\d+)\.(\d+)", re.IGNORECASE)

    def get_references(text: str, chunk_id: str) -> List[str]:
        refs = set()

        # (1) paragraphs 6, 7 and 8 or 6.1, 6.2 and 6.3 FIRST
        for match in list_re.findall(text):
            for num in list_split_re.split(match):
                num = num.strip()
                if not num:
                    continue
                if dotted_number_re.match(num):
                    ref_id = f"mas-notice-626-{num}"
                    if ref_id in id_set:
                        refs.add(ref_id)
                elif num in part_map:
                    refs.update(part_map[num])

        # (2) paragraph x.y or x.yA (no bracket)
        for match in plain_section_re.findall(text):
            ref_id = f"mas-notice-626-{match}"
            if ref_id in id_set:
                refs.add(ref_id)

        # (3) paragraph x.yA(a) -> only match x.yA
        for match in bracketed_section_re.findall(text):
            ref_id = f"mas-notice-626-{match}"
            if ref_id in id_set:
                refs.add(ref_id)

        # (4) paragraph x (single number -> full part)
        for match in whole_part_re.findall(text):
            if match in part_map:
                refs.update(part_map[match])

        # (5) paragraphs x.y to x.z (ignore letters for range)
        for part1, start, part2, end in range_re.findall(text):
            if part1 == part2:
                for i in range(int(start), int(end) + 1):
                    ref_id = f"mas-notice-626-{part1}.{i}"
                    if ref_id in id_set:
                        refs.add(ref_id)

        # A chunk that mentions its own paragraph is not a reference.
        refs.discard(chunk_id)
        return sorted(refs)

    for chunk in notice_chunks:
        chunk["metadata"]["references"] = get_references(chunk["text"], chunk["id"])

    return notice_chunks


In [14]:
# Fill in the references of every Notice chunk; the function updates the
# chunks in place and returns the same list.
chunks_mas_notice_with_reference = add_references_to_notice(chunks_mas_notice)

In [15]:
# Preview the first 5 Notice chunks, now including their references.
print_chunks_pretty(chunks_mas_notice_with_reference, limit = 5)

Chunk 1:
ID        : mas-notice-626-1.1
Part      : 1 - INTRODUCTION
Section   : 1.1
Text:
1.1 This Notice is issued under section 16 of the Financial Services and Markets Act 2022 (“FSM Act”) and applies to all banks in Singapore, as defined in section 2 of the Banking Act 1970 (“BA”).


Chunk 2:
ID        : mas-notice-626-1.2
Part      : 1 - INTRODUCTION
Section   : 1.2
Text:
1.2 This Notice shall take effect from 1 April 2024.


Chunk 3:
ID        : mas-notice-626-2.1
Part      : 2 - DEFINITIONS
Section   : 2.1
Text:
2.1 For the purposes of this Notice – “AML/CFT” means anti-money laundering and countering the financing of terrorism; “Authority” means the Monetary Authority of Singapore; “bank” means a bank in Singapore, as defined in section 2 of the BA; “beneficial owner”, in relation to a customer of a bank, means the natural person who ultimately owns or controls the customer or the natural person on whose behalf a transaction is conducted or business relations are established, 

### Reference for MAS Guidelines

In [16]:
def add_references_to_guidelines(guideline_chunks: List[Dict], notice_chunks: List[Dict]) -> List[Dict]:
    """Fill ``metadata["references"]`` of each Guidelines chunk.

    References come from two sources:

    * The chunk's own section id with ``-`` replaced by ``.`` (Guidelines
      ``6-1`` points at Notice ``6.1``), when that Notice chunk exists.
    * "paragraph(s) ..." mentions in the section title and text:
      dotted numbers (``4.1``, ``6.11A``, ``4.1(a)``) refer to that Notice
      section; bare numbers (``6``) refer to every Notice chunk of that
      part; lists (``6, 7 and 8``) are expanded item by item; ranges
      (``6.1 to 6.5``) within one part are expanded to whole-numbered
      sections; dashed ids (``4-1``, ``4-1-2``) refer to Guidelines
      section ``4-1``.

    Only ids that exist are kept, and a chunk never references itself.

    Args:
        guideline_chunks: Guidelines chunk dicts with ``id``, ``text`` and a
            ``metadata`` dict holding ``section_id`` (``section_title`` is
            read when present).
        notice_chunks: Notice chunk dicts with ``id`` and a ``metadata``
            dict (``part_id`` is read when present).

    Returns:
        ``guideline_chunks``; each chunk's ``metadata["references"]`` is
        replaced in place by a sorted list of chunk ids.
    """
    id_set_notice = {c["id"] for c in notice_chunks}
    id_set_guideline = {c["id"] for c in guideline_chunks}
    part_map_notice = {}
    for c in notice_chunks:
        part_id = c["metadata"].get("part_id")
        if part_id:
            part_map_notice.setdefault(part_id, []).append(c["id"])

    # One Notice-style number: "6", "6.1" or "6.11A". The lookahead stops the
    # regex engine from backtracking to a shorter number ("12.3" -> "1",
    # "6.11(a)" -> "6.1") and from reading the head of a dashed Guidelines
    # id ("4-1") as a part number. Only the word "paragraph(s)" is matched
    # case-insensitively, so the section-letter suffix stays uppercase-only.
    number = r"\d+(?:\.\d+[A-Z]?)?(?![\d-]|\.\d)"
    # Accepts ", ", ", and " and " and " between list items, so both
    # "6, 7 and 8" and "6, 7, and 8" are recognised.
    separator = r"(?:\s*,\s*(?:and\s+)?|\s+and\s+)"
    notice_list_re = re.compile(rf"(?i:paragraphs?)\s+({number}(?:{separator}{number})*)")
    notice_number_re = re.compile(r"\d+(?:\.\d+[A-Z]?)?")
    notice_range_re = re.compile(r"(?i:paragraphs?)\s+(\d+)\.(\d+)\s+to\s+(\d+)\.(\d+)")
    # "4-1" and "4-1-2" both resolve to Guidelines section 4-1; the lookahead
    # keeps "4-12" from being cut down to "4-1".
    guideline_id_re = re.compile(r"(?i:paragraphs?)\s+(\d+-\d+)(?!\d)")

    def get_references(chunk: Dict) -> List[str]:
        refs = set()

        section_id = chunk["metadata"]["section_id"]
        section_title = chunk["metadata"].get("section_title", "")
        text = chunk["text"]
        full_text = f"{section_title} {text}"

        # Match MAS Notice section by direct conversion
        dot_section_id = section_id.replace("-", ".")
        direct_id = f"mas-notice-626-{dot_section_id}"
        if direct_id in id_set_notice:
            refs.add(direct_id)

        # paragraph 4.1 / 4.1(a) / 6 / paragraphs 6, 7 and 8
        for number_list in notice_list_re.findall(full_text):
            for num in notice_number_re.findall(number_list):
                if "." in num:
                    ref_id = f"mas-notice-626-{num}"
                    if ref_id in id_set_notice:
                        refs.add(ref_id)
                else:
                    refs.update(part_map_notice.get(num, []))

        # paragraph 4-1 / paragraph 4-1-2 -> Guidelines section 4-1
        for match in guideline_id_re.findall(full_text):
            ref_id = f"mas-guidelines-626-{match}"
            if ref_id in id_set_guideline:
                refs.add(ref_id)

        # paragraphs 6.1 to 6.5
        for part1, start, part2, end in notice_range_re.findall(full_text):
            if part1 == part2:
                for i in range(int(start), int(end) + 1):
                    ref_id = f"mas-notice-626-{part1}.{i}"
                    if ref_id in id_set_notice:
                        refs.add(ref_id)

        # A chunk that mentions its own paragraph is not a reference.
        refs.discard(chunk["id"])
        return sorted(refs)

    # Apply to all
    for chunk in guideline_chunks:
        chunk["metadata"]["references"] = get_references(chunk)

    return guideline_chunks

In [17]:
# Fill in the references of every Guidelines chunk, resolving them against
# both the Guidelines and the Notice chunks.
chunks_mas_guidelines_with_reference = add_references_to_guidelines(chunks_mas_guidelines, chunks_mas_notice)

In [18]:
# Preview the first 10 Guidelines chunks, now including their references.
print_chunks_pretty(chunks_mas_guidelines_with_reference, limit = 10)

Chunk 1:
ID        : mas-guidelines-626-1-1
Part      : 1 - Introduction
Section   : 1-1
Text:
1-1 These Guidelines provide guidance to all banks on the requirements in MAS Notice 626 on Prevention of Money Laundering and Countering the Financing of Terrorism – Banks (“the Notice”). These Guidelines should be read in conjunction with the Notice.

References:
  - mas-notice-626-1.1


Chunk 2:
ID        : mas-guidelines-626-1-2
Part      : 1 - Introduction
Section   : 1-2
Text:
1-2 The expressions used in these Guidelines have the same meanings as those found in the Notice, except where expressly defined in these Guidelines or where the context otherwise requires. For the purposes of these Guidelines, a reference to “CDD measures” shall mean the measures as required by paragraphs 6, 7 and 8 of the Notice.

References:
  - mas-notice-626-1.2
  - mas-notice-626-6.1
  - mas-notice-626-6.10
  - mas-notice-626-6.11
  - mas-notice-626-6.11A
  - mas-notice-626-6.11B
  - mas-notice-626-6.11C
  -

## Combining the Chunks of the 3 Documents

In [19]:
# Concatenate the three chunk lists: Notice, then Guidelines, then Fair Dealing.
chunks_all_mas_documents = chunks_mas_notice_with_reference + chunks_mas_guidelines_with_reference + chunks_fair_dealing

In [21]:
def append_title(chunks: List[Dict]) -> List[Dict]:
    """Prefix each chunk's text with its part and section header lines.

    The header is ``"Part <part_id>. <part_title>"`` on one line and
    ``"Section <section_id>. <section_title>"`` on the next, followed by the
    original text. A chunk whose text already starts with its header is left
    unchanged, so applying the function again (for example by re-running a
    notebook cell) does not stack headers.

    Args:
        chunks: Chunk dicts with ``text`` and a ``metadata`` dict holding
            ``part_id``, ``part_title``, ``section_id`` and
            ``section_title``.

    Returns:
        The same list; the chunks are modified in place.
    """
    for chunk in chunks:
        meta = chunk["metadata"]
        header = (
            f"Part {meta['part_id']}. {meta['part_title']}\n"
            # A missing title renders as empty instead of the word "None".
            f"Section {meta['section_id']}. {meta['section_title'] or ''}\n"
        )
        if not chunk["text"].startswith(header):
            chunk["text"] = f"{header}{chunk['text']}"
    return chunks


In [22]:
# Prefix every chunk's text with its part and section header lines.
chunks_all_mas_documents = append_title(chunks_all_mas_documents)

In [23]:
# Display the combined chunks as the cell output.
chunks_all_mas_documents

[{'id': 'mas-notice-626-1.1',
  'text': 'Part 1. INTRODUCTION\nSection 1.1. \n1.1 This Notice is issued under section 16 of the Financial Services and Markets Act 2022 (“FSM Act”) and applies to all banks in Singapore, as defined in section 2 of the Banking Act 1970 (“BA”).',
  'metadata': {'part_id': '1',
   'part_title': 'INTRODUCTION',
   'section_id': '1.1',
   'section_title': '',
   'regulation': 'MAS Notice 626',
   'references': []}},
 {'id': 'mas-notice-626-1.2',
  'text': 'Part 1. INTRODUCTION\nSection 1.2. \n1.2 This Notice shall take effect from 1 April 2024.',
  'metadata': {'part_id': '1',
   'part_title': 'INTRODUCTION',
   'section_id': '1.2',
   'section_title': '',
   'regulation': 'MAS Notice 626',
   'references': []}},
 {'id': 'mas-notice-626-2.1',
  'text': 'Part 2. DEFINITIONS\nSection 2.1. \n2.1 For the purposes of this Notice – “AML/CFT” means anti-money laundering and countering the financing of terrorism; “Authority” means the Monetary Authority of Singapore;

In [36]:
import json

# Write the combined chunks as indented UTF-8 JSON; mode "w" replaces any
# existing mas_chunks.json.
with open("mas_chunks.json", "w", encoding="utf-8") as f:
    json.dump(chunks_all_mas_documents, f, ensure_ascii=False, indent=2)