In [None]:
# Mount Google Drive at /content/drive so files stored there can be accessed
# through the local filesystem (Colab-only; requires the google.colab package).
# NOTE(review): force_remount=True presumably re-mounts even when a mount
# already exists -- confirm against the Colab documentation.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Main
import os
import re
import json

# Check if the document is a "Nghị định"
def is_nghi_dinh(document_data):
    """Tell whether a document's name marks it as a "Nghị định".

    The "name" value is split on '-'; the document qualifies when there are
    more than two pieces and the first two are "nghi" and "dinh"
    (compared case-insensitively).

    Args:
        document_data: Container that may hold a "name" entry.

    Returns:
        True when the name matches the rule above, False otherwise
        (including when there is no "name" entry).
    """
    if "name" not in document_data:
        return False

    pieces = document_data["name"].split('-')
    if len(pieces) <= 2:
        return False

    first, second = pieces[0], pieces[1]
    return first.lower() == "nghi" and second.lower() == "dinh"

# Clean text by removing excessive whitespace and newlines
def clean_text(text):
    """Collapse every run of whitespace into a single space and trim the ends.

    Args:
        text: The string to normalise.

    Returns:
        The text with each whitespace run (spaces, tabs, newlines, carriage
        returns, ...) replaced by one space and no leading/trailing space.
    """
    # `\s` already covers "\r" and "\n", so a single `\s+` substitution does
    # the whole job; afterwards no run of two or more whitespace characters
    # can remain, which makes a second collapsing pass unnecessary.
    return re.sub(r'\s+', ' ', text).strip()

# Clean and structure the extracted data
def cleaned_structured_data(structured_data):
    """Return a whitespace-normalised copy of the extracted structure.

    The "Header" string and the "title" of every item in the "Phần",
    "Chương", "Mục", "Tiểu Mục" and "Điều" lists are passed through
    `clean_text`. The input is left untouched: each item is rebuilt as a new
    dict instead of being modified in place (a plain `dict.copy()` is shallow
    and would share the item dicts with the caller's data).

    Args:
        structured_data: Dict with a "Header" string and the five section
            lists named above, each holding dicts with a "title" entry.

    Returns:
        A new dict with the same keys and cleaned text.

    Raises:
        KeyError: If "Header", one of the section lists, or an item's
            "title" is missing.
    """
    cleaned_data = structured_data.copy()
    cleaned_data["Header"] = clean_text(structured_data["Header"])

    for section in ["Phần", "Chương", "Mục", "Tiểu Mục", "Điều"]:
        # `{**item, "title": ...}` keeps the original key order while
        # replacing only the title in a fresh dict.
        cleaned_data[section] = [
            {**item, "title": clean_text(item["title"])}
            for item in structured_data[section]
        ]

    return cleaned_data

# Extract structured data from the input JSON
def _segment_end(matches, index, fallback):
    """Return where the segment opened by matches[index] stops.

    That is the start offset of the following match, or `fallback` when
    matches[index] is the last one.
    """
    if index + 1 < len(matches):
        return matches[index + 1].start()
    return fallback


def extract_data(input_data):
    """Split a document's passage into a header and its structural units.

    The passage is scanned with regular expressions for "Phần", "Chương",
    "Mục" and "Điều" headings that follow a blank line ("\\n\\n").

    * The header is everything before the first such heading.
    * "Điều" entries that occur before any "\\n\\nChương", "\\n\\nMỤC" or
      "\\n\\nPhần" marker are recorded as standalone (empty parent fields).
    * If "Phần" headings exist, each part is searched for numbered chapters
      ("<number>. <title>"); chapters are handed to `extract_muc_and_dieu`,
      parts without chapters to `extract_dieu`.
    * Otherwise "Chương" headings are located in the whole passage and each
      chapter is handed to `extract_muc_and_dieu`.

    Args:
        input_data: Mapping with a "passage" entry holding the document text.

    Returns:
        Dict with the keys "Header" (str) and "Phần", "Chương", "Mục",
        "Tiểu Mục", "Điều" (lists of dicts). "Tiểu Mục" is never filled here.

    Raises:
        KeyError: If `input_data` has no "passage" entry.
    """
    passage = input_data["passage"]
    result = {
        "Header": "",
        "Phần": [],
        "Chương": [],
        "Mục": [],
        "Tiểu Mục": [],
        "Điều": []
    }

    # Extract the header
    header_pattern = re.compile(r"^(.+?)(?=\n\n(Phần|Chương|Mục|Điều thứ|Điều)|$)", re.DOTALL)
    header_match = header_pattern.search(passage)
    if header_match:
        result["Header"] = header_match.group(1).strip()

    # Extract standalone "Điều": those located before the first structural
    # marker. The marker position is computed once instead of re-slicing and
    # re-scanning the preceding text for every match.
    dieu_pattern_alone = re.compile(r"\n\n(?:Điều thứ|Điều)\s+([0-9]+[abcd]?)[\.:]?\s+(.*?)(?=\n\n(Điều thứ|Điều|Mục|MỤC|Chương)|$)", re.DOTALL)
    marker_hits = [passage.find(marker) for marker in ("\n\nChương", "\n\nMỤC", "\n\nPhần")]
    first_marker = min((hit for hit in marker_hits if hit != -1), default=len(passage))

    for dieu_match in dieu_pattern_alone.finditer(passage):
        # Matches arrive in increasing position, so the first one past the
        # marker ends the standalone region.
        if dieu_match.start() > first_marker:
            break
        result["Điều"].append({
            "number": dieu_match.group(1).strip(),
            "title": dieu_match.group(2).strip(),
            # Same keys as the entries produced by extract_dieu, so every
            # "Điều" record has a uniform schema.
            "Phần": "",
            "Chương": "",
            "Mục": ""
        })

    # Check for "Phần"
    part_pattern = re.compile(r"\n\nPhần\s*([0-9IVXLCDM]+):?\s*(.*?)(?=\n\n(Phần|Chương|Điều|Điều thứ)|$)", re.DOTALL)
    part_matches = list(part_pattern.finditer(passage))

    if part_matches:
        # Inside a part, chapters appear as "<number>. <title>".
        chapter_pattern = re.compile(r"\n\n([0-9IVX]+)\.\s*(.*?)(?=\n\n[0-9]+\.\s*|$)", re.DOTALL)

        for part_index, part_match in enumerate(part_matches):
            part_number = part_match.group(1).strip()
            result["Phần"].append({
                "number": part_number,
                "title": part_match.group(2).strip()
            })

            part_end = _segment_end(part_matches, part_index, len(passage))
            part_text = passage[part_match.start():part_end]
            chapter_matches = list(chapter_pattern.finditer(part_text))

            if not chapter_matches:
                extract_dieu(result, part_text, part_number)
                continue

            for chapter_index, chapter_match in enumerate(chapter_matches):
                chapter_number = chapter_match.group(1).strip()
                result["Chương"].append({
                    "number": chapter_number,
                    "title": chapter_match.group(2).strip(),
                    "Phần": part_number
                })

                # Offsets are relative to part_text, which the chapter
                # matches were found in.
                chuong_end = _segment_end(chapter_matches, chapter_index, len(part_text))
                extract_muc_and_dieu(
                    result,
                    part_text[chapter_match.start():chuong_end],
                    part_number,
                    chapter_number,
                )

    else:
        chapter_pattern = re.compile(r"\n\nChương\s*([0-9IVXLCDM]+):?\s*(.*?)(?=\n\n(MỤC|Mục|Điều|Điều thứ|Chương)|$)", re.DOTALL)
        chapter_matches = list(chapter_pattern.finditer(passage))

        for chapter_index, chapter_match in enumerate(chapter_matches):
            chapter_number = chapter_match.group(1).strip()
            result["Chương"].append({
                "number": chapter_number,
                "title": chapter_match.group(2).strip(),
                "Phần": ""
            })

            chuong_end = _segment_end(chapter_matches, chapter_index, len(passage))
            extract_muc_and_dieu(
                result,
                passage[chapter_match.start():chuong_end],
                "",
                chapter_number,
            )

    return result

# Extract "Mục" and "Điều"
def extract_muc_and_dieu(result, text, part_number, chapter_number):
    """Record the "Mục" sections of a chapter and the "Điều" they contain.

    Every "Mục"/"MỤC" heading found in `text` is appended to
    `result["Mục"]`, and the text from that heading up to the next one (or
    the end of `text`) is passed to `extract_dieu`. When no heading is
    found, the whole text is passed to `extract_dieu` with no section number.

    Args:
        result: Accumulator dict; its "Mục" list (and, through
            `extract_dieu`, its "Điều" list) is appended to in place.
        text: Text of one chapter.
        part_number: Number of the enclosing "Phần" ("" if none).
        chapter_number: Number of the enclosing "Chương".

    Returns:
        None.
    """
    # The lookahead lists both spellings ("Mục" and "MỤC") so that an
    # upper-case section immediately followed by another one is not absorbed
    # into the first section's title.
    muc_pattern = re.compile(r"\n\n(?:Mục|MỤC)\s*([0-9]+)[\.:]?\s*(.*?)(?=\n\n(?:Điều thứ|Điều|Mục|MỤC|Chương)|$)", re.DOTALL)
    muc_matches = list(muc_pattern.finditer(text))

    if not muc_matches:
        extract_dieu(result, text, part_number, chapter_number)
        return

    for muc_index, muc_match in enumerate(muc_matches):
        muc_number = muc_match.group(1).strip()
        result["Mục"].append({
            "number": muc_number,
            "title": muc_match.group(2).strip(),
            "Phần": part_number,
            "Chương": chapter_number
        })

        # A section runs until the next section heading or the end of text.
        if muc_index + 1 < len(muc_matches):
            muc_end = muc_matches[muc_index + 1].start()
        else:
            muc_end = len(text)

        extract_dieu(result, text[muc_match.start():muc_end], part_number, chapter_number, muc_number)

# Extract "Điều"
def extract_dieu(result, text, part_number, chapter_number="", muc_number=""):
    """Append every "Điều" found in `text` to `result["Điều"]`.

    An article is a blank-line-preceded "Điều" or "Điều thứ" heading followed
    by a number (optionally suffixed with a, b, c or d); its "title" is all
    text up to the next article/section/chapter heading or the end of `text`.

    Args:
        result: Accumulator dict whose "Điều" list is extended in place.
        text: Text to scan.
        part_number: Value stored under "Phần" for each article.
        chapter_number: Value stored under "Chương" for each article.
        muc_number: Value stored under "Mục" for each article.

    Returns:
        None.
    """
    article_re = re.compile(r"\n\n(?:Điều thứ|Điều)\s+([0-9]+[abcd]?)[\.:]?\s+(.*?)(?=\n\n(Điều thứ|Điều|Mục|MỤC|Chương)|$)", re.DOTALL)

    articles = [
        {
            "number": found.group(1).strip(),
            "title": found.group(2).strip(),
            "Phần": part_number,
            "Chương": chapter_number,
            "Mục": muc_number,
        }
        for found in article_re.finditer(text)
    ]

    # Touch the accumulator only when there is something to add.
    if articles:
        result["Điều"].extend(articles)

# Paths for input and output directories
input_folder = '/content/drive/MyDrive/contexts/'
output_folder = '/content/drive/MyDrive/Structured_Nghị Định/'

# Make sure the destination exists; opening a file for writing inside a
# missing directory would otherwise raise FileNotFoundError.
os.makedirs(output_folder, exist_ok=True)

# Convert every "Nghị định" JSON document in the input folder into its
# structured form and write it under the same file name in the output folder.
for filename in os.listdir(input_folder):
    input_file = os.path.join(input_folder, filename)
    if not (os.path.isfile(input_file) and filename.endswith('.json')):
        continue

    print(f"Processing file: {input_file}")

    if os.path.getsize(input_file) > 10 * 1024 * 1024:  # Skip files larger than 10MB
        print(f"File {input_file} is too large to process.")
        continue

    # One malformed or mis-encoded file should not abort the whole batch.
    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            input_data = json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError) as err:
        print(f"Skipped unreadable JSON file: {input_file} ({err})")
        continue

    if not is_nghi_dinh(input_data):
        print(f"Skipped non-Nghị định document: {input_file}")
        continue

    structured_data = extract_data(input_data)
    structured_data = cleaned_structured_data(structured_data)

    output_file = os.path.join(output_folder, filename)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(structured_data, f, ensure_ascii=False, indent=4)

    print(f"Structured data saved to: {output_file}")