# In [None]:
import re
from pathlib import Path
from typing import Dict, List, Optional, Union
from pydantic import BaseModel
import os
def extract_name(filepath):
    """Return the base file name of *filepath* without a trailing ``.txt``.

    Only a lowercase ``.txt`` suffix is removed; any other name is returned
    unchanged (e.g. ``"中华人民共和国民法典.txt"`` -> ``"中华人民共和国民法典"``).
    """
    suffix = '.txt'
    # Drop any directory components, keeping just the final path segment.
    base = os.path.basename(filepath)
    if base.endswith(suffix):
        return base[:-len(suffix)]
    return base
class LawStructure(BaseModel):
    """Pydantic model with one optional entry per heading level of a statute.

    Each of the three fields is an optional ``str -> str`` mapping that
    defaults to ``None``.

    NOTE(review): this model is not referenced by the other code visible in
    this file; ``parse_civil_code`` builds plain dicts with the keys
    ``'序号'`` and ``'标题'`` per level — presumably this model mirrors that
    shape; confirm before relying on it.
    """
    # 编 ("part") level heading.
    编: Optional[Dict[str, str]] = None
    # 章 ("chapter") level heading.
    章: Optional[Dict[str, str]] = None
    # 节 ("section") level heading.
    节: Optional[Dict[str, str]] = None

def parse_civil_code(file_path: str) -> List[Dict]:
    """Parse a Chinese statute text file into a list of article records.

    The file is scanned line by line. Lines of the form ``第N编/章/节 <title>``
    update the current heading context; a line of the form ``第N条 <text>``
    starts a new article; every other non-empty line is appended to the
    article currently being collected (lines before the first article are
    ignored).

    Args:
        file_path: Path of the UTF-8 text file. The file name without its
            ``.txt`` suffix (via ``extract_name``) is recorded as ``law_type``.

    Returns:
        One dict per article, in file order::

            {
                'page_content': <article text, continuation lines joined by '\\n'>,
                'metadata': {
                    'law_type': <str>,
                    'article_no': <numeral exactly as written, e.g. '一百二十'>,
                    'structure': {<'编'|'章'|'节'>: {'序号': ..., '标题': ...}} or None,
                },
            }

    Raises:
        OSError: If the file cannot be read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # 'utf-8-sig' reads plain UTF-8 as well, but also drops a leading BOM
    # that would otherwise keep the first line from matching any pattern.
    text = Path(file_path).read_text(encoding='utf-8-sig')
    law_type = extract_name(file_path)

    numeral = r'[一二三四五六七八九十百千零〇\d]+'
    # A single pattern decides the heading level from the character that
    # follows the numeral. Testing "'编' in line" first (as a substring) would
    # misroute headings such as "第二节 章程", whose *title* contains the
    # character of a higher level.
    heading_re = re.compile(rf'第({numeral})([编章节])\s*(.*)')
    article_re = re.compile(rf'第({numeral})条\s*(.*)')
    fullwidth_space_re = re.compile(r'\u3000+')

    current_level = {'编': None, '章': None, '节': None}
    # Lower levels that become stale once a heading of the given level starts.
    stale_levels = {'编': ('章', '节'), '章': ('节',), '节': ()}

    articles = []
    current_article = None  # record being collected, without 'page_content' yet
    body_lines = []         # text lines belonging to current_article

    def flush():
        """Finalize the article being collected, if any, and store it."""
        if current_article is not None:
            current_article['page_content'] = '\n'.join(body_lines).strip()
            articles.append(current_article)

    for raw_line in text.split('\n'):
        # Collapse runs of full-width spaces so "第一条　内容" splits cleanly.
        line = fullwidth_space_re.sub(' ', raw_line.strip())
        if not line:
            continue

        heading = heading_re.match(line)
        if heading:
            number, level, title = heading.groups()
            current_level[level] = {'序号': number, '标题': title.strip()}
            for lower in stale_levels[level]:
                current_level[lower] = None
            continue

        article = article_re.match(line)
        if article:
            flush()
            # Snapshot the heading context so later headings do not leak
            # into articles that were already emitted.
            structure = {
                level: info.copy()
                for level, info in current_level.items()
                if info
            }
            current_article = {
                'page_content': '',
                'metadata': {
                    'law_type': law_type,
                    'article_no': article.group(1),
                    'structure': structure if structure else None,
                },
            }
            body_lines = [article.group(2).strip()]
        elif current_article is not None:
            # Continuation of the current article (items, extra paragraphs).
            body_lines.append(line)

    # The last article has no following "第N条" line to trigger a flush.
    flush()
    return articles
import json

# Statute text files to parse; articles are concatenated in this order.
law_files = [
    "中华人民共和国刑法.txt",
    "中华人民共和国民法典.txt",
    "中华人民共和国宪法.txt",
]

articles = []
try:
    for law_file in law_files:
        articles.extend(parse_civil_code(law_file))
except Exception as e:
    # Top-level boundary of the script: report the failure and skip the save
    # below so an existing articles.json is not overwritten with partial data.
    print(f"解析失败: {str(e)}")
else:
    print(f"成功解析 {len(articles)} 个条文")
    if articles:
        # Show article 1234 when it exists; clamp to the last article so a
        # smaller corpus does not fail with IndexError.
        sample_index = min(1234, len(articles) - 1)
        print("示例条文：", articles[sample_index])

    # Save the parsed articles as human-readable (non-ASCII-escaped) JSON.
    with open("articles.json", "w", encoding="utf-8") as f:
        json.dump(articles, f, ensure_ascii=False, indent=2)