In [1]:
# Legal Reasoning Project, NCCU (2025)
# 1_graph_builder.ipynb: Build a knowledge graph from the structured OSH (occupational safety and health) documents, for use in the subsequent training work.

In [2]:
# Revised by Claude Sonnet
# Prerequisite: manually upload osh_doc_merged.json before running this notebook

In [3]:
import json
import hashlib
import re
from typing import Dict, List, Tuple, Optional, Any, Set
import networkx as nx
from collections import defaultdict
import argparse
import os
from pathlib import Path

In [4]:
# Configuration: Ctrl + F the following keywords:
## "input directory setting"
## "output directory setting"

In [5]:
# ========================================
# Semantic edge type definitions
# ========================================
# Each constant is the string stored in an edge's `relation` attribute.

# === Attribution path ===
EDGE_HAS_CAUSE = "HAS_CAUSE"              # Links an incident to its cause chain; _build_causal_chain adds it as (last cause) -> Incident
EDGE_ENABLED_BY = "ENABLED_BY"            # Cause -> Cause, added in the order Basic -> Indirect -> Direct
EDGE_LEADS_TO = "LEADS_TO"                # Cause -> Violation (reasoning bridge)
EDGE_VIOLATES = "VIOLATES_LAW"            # Violation -> Regulation (legal mapping)

# === Association path ===
EDGE_INVOLVES = "INVOLVES_OBJECT"         # Incident -> Medium
EDGE_IS_SUBCLASS = "IS_SUBCLASS_OF"       # Between adjacent levels of the Medium hierarchy
EDGE_REGULATED_BY = "REGULATED_BY"        # Medium -> Regulation (needs external data; not created by the code visible in this section)

# === Attributes and context ===
EDGE_OCCURS_IN = "OCCURS_IN"              # Incident -> Industry
EDGE_HAS_TYPE = "HAS_INCIDENT_TYPE"       # Incident -> IncidentType
EDGE_APPLIES_TO = "APPLIES_TO_INCIDENT"   # Regulation -> Incident (regulation applicability; not created by the code visible in this section)

In [6]:
class OccupationalSafetyKnowledgeGraph:
    """
    職業安全法律推論知識圖譜建構器 V2

    設計目標:
    - 為大型語言模型 (LLM) 提供結構化的法律推理訓練數據
    - 支援圖神經網路 (GNN) 的異質圖學習
    - 保留完整的因果推理鏈與法律適用邏輯
    - 避免捷徑學習 (Shortcut Learning) 與數據污染
    """

    def __init__(self, strict_mode: bool = True, enable_semantic: bool = True):
        """
        Create a builder that starts from an empty directed multigraph.

        Args:
            strict_mode: When True, labels rejected by the keyword and
                regulation checks are recorded in ``filtered_nodes`` and
                summarised in the build log / report.
            enable_semantic: Stored on the instance. In the methods visible
                in this section it is only echoed by ``build_graph``'s log line.
        """
        # Bookkeeping used for analysis and debugging.
        self.filtered_nodes = []                      # (label, node_type, reason) tuples
        self.regulation_frequency = defaultdict(int)  # regulation node id -> citation count
        self.stats = defaultdict(int)                 # construction counters by category

        self.enable_semantic = enable_semantic
        self.strict_mode = strict_mode

        # MultiDiGraph: directed, and parallel edges between the same pair are kept.
        self.graph = nx.MultiDiGraph()

    def _generate_id(self, text: str, prefix: str = "") -> str:
        """生成穩定且唯一的節點 ID"""
        if not text:
            return f"{prefix}_UNKNOWN_{hash(text)}"
        hash_val = hashlib.md5(text.encode('utf-8')).hexdigest()[:12]
        return f"{prefix}_{hash_val}"

    def _clean_text(self, text: str) -> str:
        """基礎文本清洗"""
        if not text:
            return ""
        # 移除多餘空白與換行
        text = re.sub(r'\s+', ' ', text.strip())
        # 移除引號 (避免 JSON 問題)
        text = text.replace('"', '').replace("'", '')
        return text

    def _is_valid_node(self, label: str, node_type: str = "") -> bool:
        """
        [關鍵改動] 數據衛生檢查 (Data Sanitization)

        過濾規則:
        1. 長度過短 (< 2 字元)
        2. LLM 拒絕回答的標記
        3. 無意義的通用詞
        4. 特定節點類型的額外檢查
        """
        if not label or len(label.strip()) < 2:
            return False

        label_lower = label.lower().strip()

        # 通用無效關鍵字
        invalid_keywords = [
            "抱歉", "無法提供", "無法提取", "沒有提供",
            "sorry", "none", "unknown", "n/a", "null",
            "無", "未知", "無資料", "無此資料"
        ]

        if any(kw in label_lower for kw in invalid_keywords):
            if self.strict_mode:
                self.filtered_nodes.append((label, node_type, "invalid_keyword"))
            return False

        # 針對特定節點類型的檢查
        if node_type == "Regulation":
            # 法規必須包含法律名稱或條文號碼
            if not re.search(r'(法|條|規則|辦法|標準|第\d+)', label):
                if self.strict_mode:
                    self.filtered_nodes.append((label, node_type, "invalid_regulation"))
                return False

        if node_type == "Cause":
            # 原因不應該只是單一動詞或過於簡短
            if len(label.strip()) < 5:
                return False

        return True

    def _parse_cause_analysis(self, cause_text: str) -> Dict[str, str]:
        """
        解析三層原因結構 (瑞士乳酪模型)

        Returns:
            {
                'direct': '直接原因文本',
                'indirect': '間接原因文本',
                'basic': '基本原因文本'
            }
        """
        causes = {'direct': '', 'indirect': '', 'basic': ''}

        if not cause_text:
            return causes

        # Pattern 1: 直接原因
        patterns = [
            r'[(\(\[]?一[)\)\]]?[、\s]*直接原因[:\s]+(.*?)(?=[(\(\[]?二[)\)\]]|$)',
            r'直接原因[:\s]+(.*?)(?=間接原因|基本原因|$)'
        ]
        for pattern in patterns:
            match = re.search(pattern, cause_text, re.DOTALL)
            if match:
                causes['direct'] = self._clean_text(match.group(1))
                break

        # Pattern 2: 間接原因
        patterns = [
            r'[(\(\[]?二[)\)\]]?[、\s]*間接原因[:\s]+(.*?)(?=[(\(\[]?三[)\)\]]|$)',
            r'間接原因[:\s]+(.*?)(?=基本原因|$)'
        ]
        for pattern in patterns:
            match = re.search(pattern, cause_text, re.DOTALL)
            if match:
                causes['indirect'] = self._clean_text(match.group(1))
                break

        # Pattern 3: 基本原因
        patterns = [
            r'[(\(\[]?三[)\)\]]?[、\s]*基本原因[:\s]+(.*?)$',
            r'基本原因[:\s]+(.*?)$'
        ]
        for pattern in patterns:
            match = re.search(pattern, cause_text, re.DOTALL)
            if match:
                causes['basic'] = self._clean_text(match.group(1))
                break

        return causes

    def _extract_violations_from_summary(self, summary: str) -> List[str]:
        """
        從 cause_summary 提取個別違規項目

        Example:
            Input: "「勞工未使用安全帶、雇主未設置安全衛生管理員」"
            Output: ["勞工未使用安全帶", "雇主未設置安全衛生管理員"]
        """
        if not summary:
            return []

        # 移除引號
        clean = summary.strip('「」『』""\'\'')

        # 以頓號或逗號分割
        violations = re.split(r'[、,]', clean)

        # 過濾與清洗
        result = []
        for v in violations:
            v = self._clean_text(v)
            if self._is_valid_node(v, "Violation"):
                result.append(v)

        return result

    def _parse_regulations(self, reg_text: str) -> List[Tuple[str, str]]:
        """
        解析法規文字,返回 [(法規名稱, 條文號)] 列表

        Example:
            Input: "職業安全衛生設施規則第228條暨職業安全衛生法第6條第1項"
            Output: [
                ("職業安全衛生設施規則", "第228條"),
                ("職業安全衛生法", "第6條第1項")
            ]
        """
        if not reg_text:
            return []

        regulations = []

        # 先以逗號或頓號分割
        segments = re.split(r'[,、;]', reg_text)

        for seg in segments:
            # 處理「暨」連接的多個法規
            sub_regs = re.split(r'暨', seg)

            for sub in sub_regs:
                sub = self._clean_text(sub)
                if not sub:
                    continue

                # 提取法規名稱與條文
                # Pattern: XXX法/規則/辦法/標準 + 第X條...
                pattern = r'(.*?(?:法|規則|辦法|標準))\s*(第.*?)(?:$|[,、])'
                match = re.search(pattern, sub)

                if match:
                    law_name = match.group(1).strip()
                    article = match.group(2).strip()

                    # 驗證有效性
                    if self._is_valid_node(law_name, "Regulation"):
                        regulations.append((law_name, article))
                else:
                    # 無法精確解析,但如果看起來是法規,整段作為法規名稱
                    if re.search(r'(法|規則|辦法|標準)', sub):
                        if self._is_valid_node(sub, "Regulation"):
                            regulations.append((sub, ""))

        return regulations

    def _semantic_similarity(self, text1: str, text2: str) -> float:
        """
        [修正版] 計算語義相似度

        改進點:
        1. 使用 Bi-gram (雙字) 而非單字切分，提高中文匹配精確度。
           (例如: 避免 "安全帽" 與 "安全帶" 被誤判為高度相似)
        2. 加入長度懲罰，避免短字串導致的分數虛高。
        """
        if not text1 or not text2:
            return 0.0

        # 預處理：移除常見停用詞與標點
        stop_chars = "的，、。；：未無有"
        t1 = "".join([c for c in text1 if c not in stop_chars])
        t2 = "".join([c for c in text2 if c not in stop_chars])

        if not t1 or not t2:
            return 0.0

        # 使用 Bi-gram (雙字) 集合
        # "高空作業" -> {"高空", "空作", "作業"}
        s1 = set(t1[i:i+2] for i in range(len(t1)-1))
        s2 = set(t2[i:i+2] for i in range(len(t2)-1))

        # 如果字串太短無法形成 bigram，退回單字比對
        if not s1: s1 = set(t1)
        if not s2: s2 = set(t2)

        intersection = len(s1 & s2)
        union = len(s1 | s2)

        score = intersection / union if union > 0 else 0.0

        # TODO (Future Work): 在這裡接入 sentence-transformers
        # from sentence_transformers import util
        # score = util.pytorch_cos_sim(self.model.encode(text1), self.model.encode(text2))

        return score

    # ========================================
    # 圖構建主流程
    # ========================================

    def build_graph(self, incidents_data: List[Dict[str, Any]]) -> nx.MultiDiGraph:
        """
        主構建流程

        Args:
            incidents_data: 事故資料列表

        Returns:
            構建完成的知識圖譜
        """
        print(f"開始構建知識圖譜，共 {len(incidents_data)} 筆事件...")
        print(f"嚴格模式: {self.strict_mode}, 語義分析: {self.enable_semantic}")

        for idx, incident in enumerate(incidents_data, 1):
            try:
                self._process_single_incident(incident)

                if idx % 10 == 0:
                    print(f"已處理 {idx}/{len(incidents_data)} 筆資料")

            except Exception as e:
                print(f"⚠ 處理第 {idx} 筆資料時發生錯誤: {str(e)}")
                continue

        # 輸出統計
        print(f"\n{'='*60}")
        print(f"圖譜構建完成:")
        print(f"  節點數: {self.graph.number_of_nodes()}")
        print(f"  邊數: {self.graph.number_of_edges()}")

        if self.strict_mode and self.filtered_nodes:
            print(f"  過濾節點數: {len(self.filtered_nodes)}")
            print(f"  (詳見 get_filtered_nodes() 方法)")

        print(f"{'='*60}\n")

        return self.graph

    def _process_single_incident(self, data: Dict):
        """處理單一事件，構建完整的雙路徑結構"""

        # 1. 建立核心事件節點 (Anchor Node)
        inc_text = data.get('description_summary', '') or data.get('description', '')
        if not self._is_valid_node(inc_text, "Incident"):
            return

        inc_id = self._generate_id(inc_text, "INC")

        self.graph.add_node(
            inc_id,
            node_type="Incident",
            label=inc_text[:100] + "..." if len(inc_text) > 100 else inc_text,
            full_text=inc_text,
            industry=data.get('industry', 'Unknown'),
            incident_type=data.get('incident_type', 'Unknown'),
            incident_type_id=data.get('incident_type_id', '')
        )

        self.stats['incident'] += 1

        # 2. 建構「歸責路徑」 (Attribution Path)
        # Incident -> Cause Chain -> Violation -> Regulation
        cause_ids = self._build_causal_chain(inc_id, data)
        self._build_attribution_path(inc_id, cause_ids, data)

        # 3. 建構「關聯路徑」 (Association Path)
        # Incident -> Medium Hierarchy
        self._build_association_path(inc_id, data)

        # 4. 建構情境屬性
        self._build_contextual_attributes(inc_id, data)

    def _build_causal_chain(self, inc_id: str, data: Dict) -> Dict[str, str]:
        """
        [修正版] 建立三層因果鏈 (Swiss Cheese Model)

        改進點:
        1. 加入 Fallback 機制: 如果 Regex 解析失敗，嘗試使用整個欄位內容。
        2. 加入 Description Injection: 如果完全沒有原因欄位，從事故描述中生成推論原因。
        3. 確保回傳的 cause_ids 至少有一個節點，避免圖譜斷鏈。
        """
        raw_cause_text = data.get('cause_analysis', '')
        causes = self._parse_cause_analysis(raw_cause_text)

        # === Fallback 策略 1: 寬鬆提取 ===
        # 如果正規表達式沒抓到東西，但欄位裡其實有字，就將整段文字視為「直接原因」
        if not any(causes.values()) and raw_cause_text:
            cleaned = self._clean_text(raw_cause_text)
            if self._is_valid_node(cleaned, "Cause"):
                causes['direct'] = cleaned

        # === Fallback 策略 2: 從描述推論 (Description Injection) ===
        # 如果完全沒有原因資料，為了維持圖連通性，從 Summary 或 Description 提取
        if not any(causes.values()):
            # 優先使用摘要，若無則使用描述的前段
            source_text = data.get('description_summary') or data.get('description', '')
            source_text = self._clean_text(source_text)

            if source_text:
                # 簡單啟發式：通常原因會接在「因...」或「由於...」之後 (這裡簡化處理)
                # 這裡建立一個特殊節點，標記為 Inferred (推論而得)
                causes['direct'] = source_text[:100]  # 截取前100字作為原因概括

        cause_ids = {}

        # 定義因果鏈層級
        # 注意：這裡允許 Inferred 類型的出現
        chain = [
            ('basic', causes['basic'], 'Cause_Basic'),
            ('indirect', causes['indirect'], 'Cause_Indirect'),
            ('direct', causes['direct'], 'Cause_Direct')
        ]

        prev_id = None

        for level, cause_text, node_type in chain:
            if not cause_text:
                continue

            # 注意：對於 Fallback 產生的原因，我們可能需要放寬一點 valid check
            if not self._is_valid_node(cause_text, "Cause"):
                continue

            cause_id = self._generate_id(cause_text, f"CAUSE_{level.upper()}")
            cause_ids[level] = cause_id

            # 添加原因節點
            self.graph.add_node(
                cause_id,
                node_type=node_type,
                label=cause_text,
                cause_level=level,
                text_length=len(cause_text),
                is_inferred=(not raw_cause_text) # 標記是否為推論資料
            )

            self.stats[f'cause_{level}'] += 1

            # 建立層級間的因果關係 Basic -> Indirect -> Direct
            if prev_id:
                self.graph.add_edge(
                    prev_id, cause_id,
                    relation=EDGE_ENABLED_BY,
                    weight=1.0,
                    causal_strength='strong'
                )

            prev_id = cause_id

        # 連接到事故 (最後一個原因 -> Incident)
        if prev_id:
            self.graph.add_edge(
                prev_id, inc_id,
                relation=EDGE_HAS_CAUSE,
                weight=1.0,
                causal_strength='direct'
            )
        else:
            # === Ultimate Fallback: 防止孤立節點 ===
            # 如果經過上述努力還是沒有原因節點，創建一個 Unknown Cause
            unk_id = f"CAUSE_UNKNOWN_{inc_id}"
            self.graph.add_node(unk_id, node_type="Cause_Unknown", label="未知原因")
            self.graph.add_edge(unk_id, inc_id, relation=EDGE_HAS_CAUSE, weight=0.1)
            cause_ids['unknown'] = unk_id

        return cause_ids

    def _build_attribution_path(self, inc_id: str, cause_ids: Dict[str, str], data: Dict):
        """
        [修正版] 構建歸責路徑: Cause -> Violation -> Regulation

        改進點:
        1. 解決 Semantic Broadcasting: 只有當原因與法規/違規語義相關時才連結。
        2. 嚴格的相似度閾值: 防止「未戴安全帽」連結到「鷹架法規」。
        3. 動態權重: 根據相似度給予邊不同的權重 (Weight)。
        """

        regulations = self._parse_regulations(data.get('preventive_regulations', ''))
        violations_text = self._extract_violations_from_summary(data.get('cause_summary', ''))

        # 收集所有有效的原因節點 ID 與其文本
        valid_causes = [] # list of (id, text, type)
        for level, cid in cause_ids.items():
            if cid in self.graph:
                valid_causes.append((cid, self.graph.nodes[cid]['label'], level))

        # 設定語義匹配閾值 (可調整)
        SIMILARITY_THRESHOLD = 0.15  # Bi-gram 下，0.15 通常代表有共用關鍵詞

        # === Path A: 透過明確的 Violation 節點橋接 (Cause -> Violation -> Regulation) ===
        if violations_text:
            for vio_text in violations_text:
                vio_id = self._generate_id(vio_text, "VIO")

                # 建立 Violation 節點
                actor = 'employer' if '雇主' in vio_text else 'worker' if '勞工' in vio_text else 'unknown'
                self.graph.add_node(
                    vio_id, node_type="Violation", label=vio_text,
                    actor=actor, is_omission='未' in vio_text
                )
                self.stats['violation'] += 1

                # 1. Link Cause -> Violation (with Filtering)
                linked_cause = False
                for cid, c_text, c_level in valid_causes:
                    sim = self._semantic_similarity(vio_text, c_text)

                    # 只有相似度夠高，或是文字包含關係，才建立連結
                    if sim > SIMILARITY_THRESHOLD or vio_text in c_text or c_text in vio_text:
                        self.graph.add_edge(
                            cid, vio_id,
                            relation=EDGE_LEADS_TO,
                            weight=max(0.5, sim),
                            sim_score=sim
                        )
                        linked_cause = True

                # 如果沒有任何原因匹配到這個違規 (可能是推論不足)，強制連結最後一層原因 (Direct)
                if not linked_cause and valid_causes:
                    # 取最後一個原因 (通常是 Direct)
                    last_cid = valid_causes[-1][0]
                    self.graph.add_edge(last_cid, vio_id, relation=EDGE_LEADS_TO, weight=0.3, note="forced_link")

                # 2. Link Violation -> Regulation (with Filtering)
                for law_name, article in regulations:
                    reg_text = f"{law_name} {article}".strip()
                    reg_id = self._generate_id(reg_text, "REG")

                    if reg_id not in self.graph:
                        self.graph.add_node(reg_id, node_type="Regulation", label=reg_text, law_name=law_name)
                        self.stats['regulation'] += 1
                    self.regulation_frequency[reg_id] += 1

                    # 計算違規行為與法規的相似度
                    # 注意：法規文字通常很長，這裡主要比對法規名稱或內容摘要(如有)
                    # 這裡假設 label 包含法規名稱，這對於匹配幫助有限，理想是需要法規全文
                    # 但為了防止完全無關的廣播，我們至少檢查法規名稱中的關鍵字是否出現在違規中
                    # (例如: 違規有"墜落"，法規是"高空作業...")

                    # 暫時策略：全連結，但在邊上標記這是從哪個 Violation 來的，
                    # 真正的過濾應該發生在 Cause -> Violation 階段
                    self.graph.add_edge(
                        vio_id, reg_id,
                        relation=EDGE_VIOLATES,
                        weight=1.0
                    )

        # === Path B: 隱含路徑 (Cause -> Implicit Violation -> Regulation) ===
        # 當沒有明確的 cause_summary 時使用
        else:
            for law_name, article in regulations:
                reg_text = f"{law_name} {article}".strip()
                reg_id = self._generate_id(reg_text, "REG")

                if reg_id not in self.graph:
                    self.graph.add_node(reg_id, node_type="Regulation", label=reg_text, law_name=law_name)
                    self.stats['regulation'] += 1
                self.regulation_frequency[reg_id] += 1

                if valid_causes:
                    for cid, c_text, c_level in valid_causes:
                        # 這裡進行關鍵過濾：原因是否跟這條法規有關？
                        # 如果完全沒關係，就不連，避免 broadcasting
                        sim = self._semantic_similarity(c_text, reg_text)

                        if sim > SIMILARITY_THRESHOLD * 0.8: # 法規文字較短，降低一點閾值
                            # 建立隱含違規節點作為橋梁
                            vio_id = self._generate_id(f"IMP_{cid}_{reg_id}", "VIO")
                            if vio_id not in self.graph:
                                self.graph.add_node(
                                    vio_id, node_type="Violation",
                                    label="推論違規", is_implicit=True
                                )

                            self.graph.add_edge(cid, vio_id, relation=EDGE_LEADS_TO, weight=sim)
                            self.graph.add_edge(vio_id, reg_id, relation=EDGE_VIOLATES, weight=sim)
                else:
                    # 無原因，直接 Incident -> Regulation
                    self.graph.add_edge(inc_id, reg_id, relation="DIRECT_LIABILITY", weight=0.5)

    def _build_association_path(self, inc_id: str, data: Dict):
        """
        構建關聯路徑: Incident -> Medium Hierarchy

        三層分類樹: Specific -> Normal -> General
        """
        levels = [
            ('specific', data.get('medium_type_specific', ''),
             data.get('medium_type_specific_id', '')),
            ('normal', data.get('medium_type_normal', ''),
             data.get('medium_type_normal_id', '')),
            ('general', data.get('medium_type_general', ''),
             data.get('medium_type_general_id', ''))
        ]

        prev_id = inc_id

        for level, label, code in levels:
            if not label or not code:
                continue

            if not self._is_valid_node(label, "Medium"):
                continue

            node_id = self._generate_id(f"{label}_{code}", f"MED_{level.upper()}")

            # 添加節點 (如果已存在則跳過)
            if node_id not in self.graph:
                self.graph.add_node(
                    node_id,
                    node_type=f"Medium_{level.capitalize()}",
                    label=label,
                    code=code,
                    category_level=level
                )
                self.stats[f'medium_{level}'] += 1

            # 建立層級關係
            if prev_id == inc_id:
                # Incident -> Medium_Specific
                self.graph.add_edge(
                    inc_id, node_id,
                    relation=EDGE_INVOLVES,
                    weight=1.0
                )
            else:
                # Medium_Specific -> Medium_Normal -> Medium_General
                self.graph.add_edge(
                    node_id, prev_id,  # 注意方向: 子類 -> 父類
                    relation=EDGE_IS_SUBCLASS,
                    weight=0.8
                )

            prev_id = node_id

    def _build_contextual_attributes(self, inc_id: str, data: Dict):
        """
        建構情境屬性: 產業、事故類型等

        這些屬性用於訓練情境理解能力
        """
        # 產業
        industry = data.get('industry', '')
        if industry and self._is_valid_node(industry, "Industry"):
            ind_id = self._generate_id(industry, "IND")

            if ind_id not in self.graph:
                self.graph.add_node(
                    ind_id,
                    node_type="Industry",
                    label=industry
                )
                self.stats['industry'] += 1

            self.graph.add_edge(
                inc_id, ind_id,
                relation=EDGE_OCCURS_IN,
                weight=0.5
            )

        # 事故類型
        inc_type = data.get('incident_type', '')
        inc_type_id = data.get('incident_type_id', '')

        if inc_type and inc_type_id:
            type_id = self._generate_id(f"{inc_type}_{inc_type_id}", "TYPE")

            if type_id not in self.graph:
                self.graph.add_node(
                    type_id,
                    node_type="IncidentType",
                    label=inc_type,
                    code=inc_type_id
                )
                self.stats['incident_type'] += 1

            self.graph.add_edge(
                inc_id, type_id,
                relation=EDGE_HAS_TYPE,
                weight=0.5
            )

    # ========================================
    # 分析與輸出方法
    # ========================================

    def get_statistics(self) -> Dict:
        """返回圖譜統計資訊"""
        node_types = defaultdict(int)
        edge_relations = defaultdict(int)

        for node, data in self.graph.nodes(data=True):
            node_types[data.get('node_type', 'Unknown')] += 1

        for u, v, data in self.graph.edges(data=True):
            edge_relations[data.get('relation', 'Unknown')] += 1

        return {
            'total_nodes': self.graph.number_of_nodes(),
            'total_edges': self.graph.number_of_edges(),
            'node_type_distribution': dict(node_types),
            'edge_relation_distribution': dict(edge_relations),
            'most_cited_regulations': sorted(
                self.regulation_frequency.items(),
                key=lambda x: x[1],
                reverse=True
            )[:20],
            'graph_density': nx.density(self.graph),
            'is_connected': nx.is_weakly_connected(self.graph),
            'construction_stats': dict(self.stats)
        }

    def get_filtered_nodes(self) -> List[Tuple[str, str, str]]:
        """返回被過濾的節點列表 (用於除錯)"""
        return self.filtered_nodes

    def export_readable_summary(self, filepath: str):
        """
        Write a human-readable quality report for the graph to ``filepath``.

        Sections: overall statistics, node-type distribution, edge-relation
        distribution, most cited regulations, data-quality counters,
        structural health checks, and up to three randomly sampled incidents
        with their cause chain, violations, regulations and media.

        Violations and regulations are looked up through the incident's
        cause nodes. The cause chain points *into* the incident
        (Cause -> Incident) and the attribution path hangs off the causes
        (Cause -> Violation -> Regulation), so these nodes are not
        descendants of the incident itself. Cause nodes are shared between
        incidents with identical cause text, so the lookup can include
        entries contributed by such incidents.

        Args:
            filepath: Destination text file (overwritten, UTF-8).
        """
        import random

        def attribution_reach(inc):
            # Everything reachable from the incident itself or from any cause
            # node upstream of it.
            sources = {inc}
            sources.update(
                n for n in nx.ancestors(self.graph, inc)
                if 'Cause' in self.graph.nodes[n].get('node_type', '')
            )
            reached = set()
            for src in sources:
                reached |= nx.descendants(self.graph, src)
            return reached

        with open(filepath, 'w', encoding='utf-8') as f:
            f.write("=" * 80 + "\n")
            f.write("台灣職業安全法律知識圖譜 - V2 建構品質報告\n")
            f.write("=" * 80 + "\n\n")

            stats = self.get_statistics()

            # 1. Overall statistics
            f.write("【一】總體統計\n")
            f.write(f"總節點數: {stats['total_nodes']}\n")
            f.write(f"總邊數: {stats['total_edges']}\n")
            f.write(f"圖密度: {stats['graph_density']:.6f}\n")
            f.write(f"是否連通: {'是' if stats['is_connected'] else '否'}\n\n")

            # 2. Node-type distribution, most frequent first
            f.write("【二】節點類型分布\n")
            for ntype, count in sorted(stats['node_type_distribution'].items(),
                                      key=lambda x: x[1], reverse=True):
                f.write(f"  {ntype:30s}: {count:5d} 個\n")
            f.write("\n")

            # 3. Edge-relation distribution, grouped by path
            f.write("【三】邊關係類型分布（關鍵設計指標）\n")
            f.write("  [歸責路徑]\n")
            attribution_edges = [EDGE_HAS_CAUSE, EDGE_ENABLED_BY, EDGE_LEADS_TO, EDGE_VIOLATES]
            for rel in attribution_edges:
                count = stats['edge_relation_distribution'].get(rel, 0)
                f.write(f"    {rel:30s}: {count:5d} 條\n")

            f.write("  [關聯路徑]\n")
            association_edges = [EDGE_INVOLVES, EDGE_IS_SUBCLASS, EDGE_REGULATED_BY]
            for rel in association_edges:
                count = stats['edge_relation_distribution'].get(rel, 0)
                f.write(f"    {rel:30s}: {count:5d} 條\n")

            f.write("  [其他關係]\n")
            other_rels = set(stats['edge_relation_distribution'].keys()) - \
                        set(attribution_edges) - set(association_edges)
            for rel in sorted(other_rels):
                count = stats['edge_relation_distribution'][rel]
                f.write(f"    {rel:30s}: {count:5d} 條\n")
            f.write("\n")

            # 4. Most cited regulations (top 20)
            f.write("【四】最常被引用的法規 (TOP 20)\n")
            for idx, (reg_id, freq) in enumerate(stats['most_cited_regulations'], 1):
                if reg_id in self.graph:
                    reg_label = self.graph.nodes[reg_id].get('label', reg_id)
                    f.write(f"  {idx:2d}. {reg_label:60s} (引用 {freq:3d} 次)\n")
            f.write("\n")

            # 5. Data-quality counters
            construction = stats['construction_stats']
            cause_total = sum(construction.get(f'cause_{l}', 0) for l in ['basic', 'indirect', 'direct'])
            f.write("【五】資料品質指標\n")
            f.write(f"  成功建構事件: {construction.get('incident', 0)} 個\n")
            f.write(f"  有效原因節點: {cause_total} 個\n")
            f.write(f"  違規節點: {construction.get('violation', 0)} 個\n")
            f.write(f"  法規節點: {construction.get('regulation', 0)} 個\n")

            if self.strict_mode and self.filtered_nodes:
                f.write(f"  過濾無效節點: {len(self.filtered_nodes)} 個\n")
                f.write("    (主要原因: LLM 拒絕回答、資料缺失、格式錯誤)\n")
            f.write("\n")

            # 6. Structural health checks
            f.write("【六】圖結構健康度檢查\n")

            # Isolated nodes (no edges at all)
            isolated = [n for n in self.graph.nodes() if self.graph.degree(n) == 0]
            f.write(f"  孤立節點數: {len(isolated)} 個")
            if len(isolated) > 0:
                f.write(" ⚠️ 建議檢查資料完整性\n")
            else:
                f.write(" ✓\n")

            # Incidents that have at least one incoming HAS_CAUSE edge
            incident_nodes = [n for n, d in self.graph.nodes(data=True)
                            if d.get('node_type') == 'Incident']
            incidents_with_causes = sum(1 for inc in incident_nodes
                                       if any(d.get('relation') == EDGE_HAS_CAUSE
                                             for _, _, d in self.graph.in_edges(inc, data=True)))

            f.write(f"  有因果鏈的事件: {incidents_with_causes}/{len(incident_nodes)} ")
            coverage = incidents_with_causes / len(incident_nodes) if incident_nodes else 0
            if coverage > 0.8:
                f.write("✓\n")
            elif coverage > 0.5:
                f.write("⚠️ 部分事件缺少原因分析\n")
            else:
                f.write("❌ 多數事件缺少原因分析\n")

            # Incidents whose attribution path reaches at least one regulation
            complete_paths = 0
            for inc in incident_nodes:
                if any(self.graph.nodes[n].get('node_type') == 'Regulation'
                       for n in attribution_reach(inc)):
                    complete_paths += 1

            f.write(f"  有完整推理路徑的事件: {complete_paths}/{len(incident_nodes)} ")
            path_coverage = complete_paths / len(incident_nodes) if incident_nodes else 0
            if path_coverage > 0.9:
                f.write("✓\n")
            elif path_coverage > 0.7:
                f.write("⚠️\n")
            else:
                f.write("❌ 推理路徑構建不完整\n")
            f.write("\n")

            # 7. Randomly sampled incidents (sample differs between runs)
            f.write("【七】事件推理路徑範例（隨機抽樣 3 個）\n")
            sample_incidents = random.sample(incident_nodes, min(3, len(incident_nodes)))

            for idx, inc_id in enumerate(sample_incidents, 1):
                f.write(f"\n--- 範例 {idx} ---\n")
                inc_data = self.graph.nodes[inc_id]
                f.write(f"事件: {inc_data.get('label', 'Unknown')}\n")
                f.write(f"產業: {inc_data.get('industry', 'Unknown')}\n")
                f.write(f"類型: {inc_data.get('incident_type', 'Unknown')}\n\n")

                # Walk back up to two levels of cause predecessors
                f.write("【因果鏈追溯】\n")
                cause_chain = []
                for pred in self.graph.predecessors(inc_id):
                    pred_data = self.graph.nodes[pred]
                    if 'Cause' in pred_data.get('node_type', ''):
                        cause_chain.append((pred, pred_data))

                        for ppred in self.graph.predecessors(pred):
                            ppred_data = self.graph.nodes[ppred]
                            if 'Cause' in ppred_data.get('node_type', ''):
                                cause_chain.append((ppred, ppred_data))

                # Order by level: basic, indirect, direct (other types first)
                cause_order = {'Cause_Basic': 1, 'Cause_Indirect': 2, 'Cause_Direct': 3}
                cause_chain.sort(key=lambda x: cause_order.get(x[1].get('node_type', ''), 0))

                for cid, cdata in cause_chain:
                    level = cdata.get('cause_level', 'unknown')
                    f.write(f"  [{level:8s}] {cdata.get('label', '')[:80]}\n")

                if not cause_chain:
                    f.write("  (無因果鏈資料)\n")

                reach = attribution_reach(inc_id)

                # Violations
                f.write("\n【違規行為】\n")
                violations = []
                for desc in reach:
                    desc_data = self.graph.nodes[desc]
                    if desc_data.get('node_type') == 'Violation':
                        violations.append(desc_data.get('label', ''))

                if violations:
                    for v in violations:
                        f.write(f"  • {v}\n")
                else:
                    f.write("  (無明確違規項目)\n")

                # Regulations
                f.write("\n【相關法規】\n")
                regulations = []
                for desc in reach:
                    desc_data = self.graph.nodes[desc]
                    if desc_data.get('node_type') == 'Regulation':
                        regulations.append(desc_data.get('label', ''))

                if regulations:
                    for r in set(regulations):  # de-duplicate labels
                        f.write(f"  • {r}\n")
                else:
                    f.write("  (無相關法規)\n")

                # Media directly linked from the incident
                f.write("\n【涉及媒介物】\n")
                mediums = []
                for succ in self.graph.successors(inc_id):
                    succ_data = self.graph.nodes[succ]
                    if 'Medium' in succ_data.get('node_type', ''):
                        mediums.append(f"{succ_data.get('node_type')}: {succ_data.get('label')}")

                if mediums:
                    for m in mediums:
                        f.write(f"  • {m}\n")
                else:
                    f.write("  (無媒介物資料)\n")

                f.write("\n")

        print(f"✓ 可讀報告已匯出至: {filepath}")

    def save_graph(self, output_path: str, format: str = 'graphml'):
        """
        儲存圖譜

        Args:
            output_path: 輸出路徑
            format: 格式 (graphml, gexf, json)
        """
        if format == 'graphml':
            nx.write_graphml(self.graph, output_path)
        elif format == 'gexf':
            nx.write_gexf(self.graph, output_path)
        elif format == 'json':
            from networkx.readwrite import json_graph
            data = json_graph.node_link_data(self.graph)
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
        else:
            raise ValueError(f"不支援的格式: {format}")

        print(f"✓ 圖譜已儲存至: {output_path} (格式: {format})")

    def export_for_gnn(self, output_path: str):
        """
        匯出為 PyTorch Geometric 友好格式

        包含:
        - edge_index: 邊列表
        - node_features: 節點特徵 (需後續用 BERT 補充)
        - edge_types: 邊類型 (用於異質圖學習)
        - metadata: 節點與邊的元資料
        """
        import pickle

        # 節點映射
        node_to_idx = {node: idx for idx, node in enumerate(self.graph.nodes())}
        idx_to_node = {v: k for k, v in node_to_idx.items()}

        # 邊列表與邊類型
        edge_index = []
        edge_types = []
        edge_relation_to_id = {}

        for u, v, data in self.graph.edges(data=True):
            relation = data.get('relation', 'Unknown')

            # 為每種關係分配 ID
            if relation not in edge_relation_to_id:
                edge_relation_to_id[relation] = len(edge_relation_to_id)

            edge_index.append([node_to_idx[u], node_to_idx[v]])
            edge_types.append(edge_relation_to_id[relation])

        # 節點特徵 (簡單 one-hot，實際應用中應使用 BERT embeddings)
        node_type_to_id = {}
        node_features = []
        node_type_ids = []

        for node in self.graph.nodes():
            node_data = self.graph.nodes[node]
            node_type = node_data.get('node_type', 'Unknown')

            if node_type not in node_type_to_id:
                node_type_to_id[node_type] = len(node_type_to_id)

            node_type_ids.append(node_type_to_id[node_type])

            # 簡單特徵 (實際應該用 BERT)
            feature = [0] * len(node_type_to_id)
            feature[node_type_to_id[node_type]] = 1
            node_features.append(feature)

        # 元資料
        metadata = {
            'node_labels': [self.graph.nodes[node].get('label', '')
                          for node in self.graph.nodes()],
            'node_types': [self.graph.nodes[node].get('node_type', 'Unknown')
                         for node in self.graph.nodes()],
            'edge_relations': [self.graph.edges[u, v, k].get('relation', 'Unknown')
                             for u, v, k in self.graph.edges(keys=True)]
        }

        graph_data = {
            'edge_index': edge_index,
            'edge_types': edge_types,
            'node_features': node_features,
            'node_type_ids': node_type_ids,
            'node_to_idx': node_to_idx,
            'idx_to_node': idx_to_node,
            'edge_relation_to_id': edge_relation_to_id,
            'node_type_to_id': node_type_to_id,
            'metadata': metadata,
            'num_nodes': len(node_to_idx),
            'num_edges': len(edge_index),
            'num_edge_types': len(edge_relation_to_id),
            'num_node_types': len(node_type_to_id)
        }

        with open(output_path, 'wb') as f:
            pickle.dump(graph_data, f)

        print(f"✓ GNN 格式已匯出至: {output_path}")
        print(f"  節點數: {graph_data['num_nodes']}")
        print(f"  邊數: {graph_data['num_edges']}")
        print(f"  邊類型數: {graph_data['num_edge_types']}")
        print(f"  節點類型數: {graph_data['num_node_types']}")

    def export_for_llm_training(self, output_path: str):
        """
        匯出為 LLM 訓練友好格式 (JSON Lines)

        每一行是一個訓練樣本，包含:
        - incident: 事件描述
        - reasoning_path: 推理路徑 (Cause -> Violation -> Regulation)
        - context: 相關情境 (產業、媒介物等)
        """
        training_samples = []

        incident_nodes = [n for n, d in self.graph.nodes(data=True)
                         if d.get('node_type') == 'Incident']

        for inc_id in incident_nodes:
            inc_data = self.graph.nodes[inc_id]

            # 構建推理路徑
            reasoning_path = {
                'causes': [],
                'violations': [],
                'regulations': []
            }

            # 收集原因
            for pred in self.graph.predecessors(inc_id):
                pred_data = self.graph.nodes[pred]
                if 'Cause' in pred_data.get('node_type', ''):
                    reasoning_path['causes'].append({
                        'level': pred_data.get('cause_level', 'unknown'),
                        'description': pred_data.get('label', '')
                    })

                    # 繼續往回追
                    for ppred in self.graph.predecessors(pred):
                        ppred_data = self.graph.nodes[ppred]
                        if 'Cause' in ppred_data.get('node_type', ''):
                            reasoning_path['causes'].append({
                                'level': ppred_data.get('cause_level', 'unknown'),
                                'description': ppred_data.get('label', '')
                            })

            # 收集違規與法規
            for desc in nx.descendants(self.graph, inc_id):
                desc_data = self.graph.nodes[desc]
                node_type = desc_data.get('node_type', '')

                if node_type == 'Violation':
                    reasoning_path['violations'].append({
                        'description': desc_data.get('label', ''),
                        'actor': desc_data.get('actor', 'unknown')
                    })
                elif node_type == 'Regulation':
                    reasoning_path['regulations'].append({
                        'law_name': desc_data.get('law_name', ''),
                        'article': desc_data.get('article', ''),
                        'full_text': desc_data.get('label', '')
                    })

            # 收集情境
            context = {
                'industry': inc_data.get('industry', 'Unknown'),
                'incident_type': inc_data.get('incident_type', 'Unknown'),
                'mediums': []
            }

            for succ in self.graph.successors(inc_id):
                succ_data = self.graph.nodes[succ]
                if 'Medium' in succ_data.get('node_type', ''):
                    context['mediums'].append({
                        'type': succ_data.get('node_type', ''),
                        'name': succ_data.get('label', '')
                    })

            # 構建訓練樣本
            sample = {
                'id': inc_id,
                'incident': {
                    'description': inc_data.get('full_text', ''),
                    'summary': inc_data.get('label', '')
                },
                'reasoning_path': reasoning_path,
                'context': context
            }

            training_samples.append(sample)

        # 寫入 JSON Lines
        with open(output_path, 'w', encoding='utf-8') as f:
            for sample in training_samples:
                f.write(json.dumps(sample, ensure_ascii=False) + '\n')

        print(f"✓ LLM 訓練格式已匯出至: {output_path}")
        print(f"  訓練樣本數: {len(training_samples)}")

    def export_excel_summary(self, output_path: str):
        """
        匯出 Excel 格式的圖譜摘要 (需要 pandas 和 openpyxl)

        包含多個工作表:
        1. 節點列表
        2. 邊列表
        3. 統計資訊
        4. 法規排名
        """
        try:
            import pandas as pd

            # 工作表 1: 節點列表
            nodes_data = []
            for node, data in self.graph.nodes(data=True):
                nodes_data.append({
                    'Node_ID': node,
                    'Type': data.get('node_type', ''),
                    'Label': data.get('label', ''),
                    'In_Degree': self.graph.in_degree(node),
                    'Out_Degree': self.graph.out_degree(node),
                    'Industry': data.get('industry', ''),
                    'Code': data.get('code', '')
                })

            df_nodes = pd.DataFrame(nodes_data)

            # 工作表 2: 邊列表
            edges_data = []
            for u, v, data in self.graph.edges(data=True):
                u_label = self.graph.nodes[u].get('label', u)[:50]
                v_label = self.graph.nodes[v].get('label', v)[:50]
                edges_data.append({
                    'Source': u,
                    'Source_Label': u_label,
                    'Target': v,
                    'Target_Label': v_label,
                    'Relation': data.get('relation', ''),
                    'Weight': data.get('weight', 1.0)
                })

            df_edges = pd.DataFrame(edges_data)

            # 工作表 3: 統計資訊
            stats = self.get_statistics()
            stats_data = [
                {'指標': '總節點數', '數值': stats['total_nodes']},
                {'指標': '總邊數', '數值': stats['total_edges']},
                {'指標': '圖密度', '數值': f"{stats['graph_density']:.6f}"},
                {'指標': '是否連通', '數值': '是' if stats['is_connected'] else '否'}
            ]

            for ntype, count in stats['node_type_distribution'].items():
                stats_data.append({'指標': f'節點類型: {ntype}', '數值': count})

            df_stats = pd.DataFrame(stats_data)

            # 工作表 4: 法規排名
            regs_data = []
            for reg_id, freq in stats['most_cited_regulations']:
                if reg_id in self.graph:
                    reg_label = self.graph.nodes[reg_id].get('label', reg_id)
                    regs_data.append({
                        'Regulation_ID': reg_id,
                        'Label': reg_label,
                        'Citation_Count': freq
                    })

            df_regs = pd.DataFrame(regs_data)

            # 寫入 Excel
            with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
                df_nodes.to_excel(writer, sheet_name='節點列表', index=False)
                df_edges.to_excel(writer, sheet_name='邊列表', index=False)
                df_stats.to_excel(writer, sheet_name='統計資訊', index=False)
                df_regs.to_excel(writer, sheet_name='法規排名', index=False)

            print(f"✓ Excel 報告已匯出至: {output_path}")

        except ImportError:
            print("❌ 錯誤: 需要安裝 pandas 和 openpyxl 才能匯出 Excel 格式")
            print("   請執行: pip install pandas openpyxl")


def main(argv: Optional[List[str]] = None):
    """
    Build the knowledge graph from the incident JSON and export it in
    several formats (text report, GNN pickle, LLM JSONL, GraphML, JSON,
    Excel summary, statistics JSON).

    Args:
        argv: Optional list of command-line style arguments, e.g.
            ``["--input", "data.json", "--no-strict"]``. When omitted, an
            empty argument list is parsed so that the Jupyter kernel's own
            arguments (``-f ...``) are never picked up; settings then come
            from the defaults and the OSH_INPUT / OSH_OUT_DIR / OSH_STRICT
            environment variables.

    Returns:
        None. Returns early, after printing an error, when the input file is
        missing or is not valid JSON.
    """

    parser = argparse.ArgumentParser(description="Build Occupational Safety Knowledge Graph (V2)")

    ## input directory setting
    parser.add_argument("--input", "-i",
                        default=os.environ.get("OSH_INPUT", "osh_doc_merged.json"),
                        help="輸入事故 JSON 檔案 (預設: osh_doc_merged.json)")
    parser.add_argument("--out-dir", "-o",
                        default=os.environ.get("OSH_OUT_DIR", "./output"),
                        help="輸出資料夾 (預設: ./output)")

    # Strict mode is on unless OSH_STRICT=0. Because the default is already
    # True, a lone store_true flag could never switch it off; --no-strict
    # writes False to the same destination.
    strict_default = os.environ.get("OSH_STRICT", "1") != "0"
    parser.add_argument("--strict", dest="strict", action="store_true",
                        default=strict_default,
                        help="啟用嚴格模式 (預設: 開啟)")
    parser.add_argument("--no-strict", dest="strict", action="store_false",
                        default=strict_default,
                        help="停用嚴格模式")
    parser.add_argument("--no-semantic", action="store_true",
                        help="停用語義比對")

    # Inside a notebook sys.argv holds the kernel's arguments (-f ...), so
    # never fall back to it: parse only what the caller passed explicitly.
    args = parser.parse_args(args=[] if argv is None else list(argv))

    input_path = Path(args.input)
    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    print("=" * 80)
    print("台灣職業安全法律知識圖譜建構系統 V2")
    print(f"輸入檔案: {input_path}")
    print(f"輸出資料夾: {out_dir}")
    print("=" * 80)
    print()

    # === 1. Load the incident data ===
    print("【步驟 1】載入事故資料...")
    try:
        with input_path.open('r', encoding='utf-8') as f:
            incidents_data = json.load(f)
        print(f"✓ 成功載入 {len(incidents_data)} 筆事故資料\n")
    except FileNotFoundError:
        print(f"❌ 錯誤: 找不到 '{input_path}'")
        return
    except json.JSONDecodeError:
        print("❌ 錯誤: JSON 格式錯誤")
        return

    # === 2. Build the knowledge graph ===
    print("【步驟 2】建構知識圖譜...")
    print(f"  嚴格模式: {args.strict}")
    print(f"  語義分析: {not args.no_semantic}\n")

    kg_builder = OccupationalSafetyKnowledgeGraph(
        strict_mode=args.strict,
        enable_semantic=not args.no_semantic
    )

    # The return value is not needed here; every export goes through kg_builder.
    kg_builder.build_graph(incidents_data)

    # === 3. Print statistics ===
    print("\n【步驟 3】分析圖譜統計資訊...")
    stats = kg_builder.get_statistics()

    print("\n圖譜規模:")
    print(f"  節點: {stats['total_nodes']:,}")
    print(f"  邊: {stats['total_edges']:,}")
    print(f"  密度: {stats['graph_density']:.6f}")

    print("\n核心節點類型:")
    for ntype in ['Incident', 'Cause_Direct', 'Cause_Indirect', 'Cause_Basic',
                  'Violation', 'Regulation']:
        count = stats['node_type_distribution'].get(ntype, 0)
        print(f"  {ntype:20s}: {count:5d}")

    print("\n關鍵邊類型 (推理路徑):")
    for rel in [EDGE_HAS_CAUSE, EDGE_ENABLED_BY, EDGE_LEADS_TO, EDGE_VIOLATES]:
        count = stats['edge_relation_distribution'].get(rel, 0)
        print(f"  {rel:20s}: {count:5d}")

    # === 4. Export in several formats ===
    print("\n【步驟 4】匯出多種格式...")

    ## output directory setting
    out_report = out_dir / 'knowledge_graph_report.txt'
    out_gnn = out_dir / 'knowledge_graph_gnn.pkl'
    out_llm = out_dir / 'knowledge_graph_llm_training.jsonl'
    out_graphml = out_dir / 'knowledge_graph.graphml'
    out_json = out_dir / 'knowledge_graph.json'
    out_excel = out_dir / 'knowledge_graph_summary.xlsx'
    out_stats = out_dir / 'knowledge_graph_stats.json'

    # 4.1 Human-readable TXT report
    print(f"\n  [1/6] 匯出可讀報告 -> {out_report}")
    kg_builder.export_readable_summary(str(out_report))

    # 4.2 GNN training format
    print(f"\n  [2/6] 匯出 GNN 格式 -> {out_gnn}")
    kg_builder.export_for_gnn(str(out_gnn))

    # 4.3 LLM training format
    print(f"\n  [3/6] 匯出 LLM 訓練格式 -> {out_llm}")
    kg_builder.export_for_llm_training(str(out_llm))

    # 4.4 Standard graph formats (GraphML, JSON)
    print(f"\n  [4/6] 匯出標準圖格式 -> {out_graphml}, {out_json}")
    kg_builder.save_graph(str(out_graphml), format='graphml')
    kg_builder.save_graph(str(out_json), format='json')

    # 4.5 Excel report. Missing pandas/openpyxl is already reported inside
    # export_excel_summary, so anything caught here is a different failure:
    # stay best-effort, but show the real cause instead of an install hint.
    print(f"\n  [5/6] 匯出 Excel 報告 -> {out_excel}")
    try:
        kg_builder.export_excel_summary(str(out_excel))
    except Exception as exc:
        print(f"     ⚠️ 跳過 Excel 匯出: {type(exc).__name__}: {exc}")

    # 4.6 Full statistics as JSON
    print(f"\n  [6/6] 匯出完整統計 -> {out_stats}")
    with out_stats.open('w', encoding='utf-8') as f:
        json.dump(stats, f, ensure_ascii=False, indent=2)
    print(f"     ✓ 已匯出至: {out_stats}")

    # === 5. Quality-check report ===
    print("\n【步驟 5】品質檢查...")

    if kg_builder.strict_mode and kg_builder.filtered_nodes:
        print(f"  過濾了 {len(kg_builder.filtered_nodes)} 個無效節點")

        # Tally filtered nodes by the reason stored as the third tuple element.
        filter_reasons = defaultdict(int)
        for _, _, reason in kg_builder.filtered_nodes:
            filter_reasons[reason] += 1

        print("  過濾原因分布:")
        for reason, count in filter_reasons.items():
            print(f"    {reason:25s}: {count:4d} 個")

    # === Done ===
    print("\n" + "=" * 80)
    print("✓ 全部完成!")
    print("=" * 80)

    print("\n建議查看順序:")
    print(f"  1️⃣  {out_report}")
    print("       → 完整的品質報告,包含統計、範例、健康度檢查")
    print()
    print(f"  2️⃣  {out_excel} (如有)")
    print("       → Excel 格式,可用於進一步分析")
    print()
    print(f"  3️⃣  {out_llm}")
    print("       → 用於訓練大型語言模型的推理資料")
    print()
    print(f"  4️⃣  {out_gnn}")
    print("       → 用於訓練圖神經網路")
    print()

    print("\n用途說明:")
    print("  • 訓練 LLM 法律推理能力:")
    print(f"    → 使用 {out_llm}")
    print("    → 每個樣本包含完整的推理路徑 (Cause→Violation→Regulation)")
    print()
    print("  • 訓練 GNN 預測模型:")
    print(f"    → 使用 {out_gnn}")
    print("    → 支援異質圖採樣與多跳推理")
    print()
    print("  • 視覺化與分析:")
    print(f"    → 使用 {out_graphml}")
    print("    → 可用 Gephi, Cytoscape 等工具開啟")
    print()


In [7]:
# Run the full build-and-export pipeline when this code is executed as the main module.
if __name__ == "__main__":
    main()

台灣職業安全法律知識圖譜建構系統 V2
輸入檔案: osh_doc_merged.json
輸出資料夾: output

【步驟 1】載入事故資料...
✓ 成功載入 339 筆事故資料

【步驟 2】建構知識圖譜...
  嚴格模式: True
  語義分析: True

開始構建知識圖譜，共 339 筆事件...
嚴格模式: True, 語義分析: True
已處理 10/339 筆資料
已處理 20/339 筆資料
已處理 30/339 筆資料
已處理 40/339 筆資料
已處理 50/339 筆資料
已處理 60/339 筆資料
已處理 70/339 筆資料
已處理 80/339 筆資料
已處理 90/339 筆資料
已處理 100/339 筆資料
已處理 110/339 筆資料
已處理 120/339 筆資料
已處理 130/339 筆資料
已處理 140/339 筆資料
已處理 150/339 筆資料
已處理 160/339 筆資料
已處理 170/339 筆資料
已處理 180/339 筆資料
已處理 190/339 筆資料
已處理 200/339 筆資料
已處理 210/339 筆資料
已處理 220/339 筆資料
已處理 230/339 筆資料
已處理 240/339 筆資料
已處理 250/339 筆資料
已處理 260/339 筆資料
已處理 270/339 筆資料
已處理 280/339 筆資料
已處理 290/339 筆資料
已處理 300/339 筆資料
已處理 310/339 筆資料
已處理 320/339 筆資料
已處理 330/339 筆資料

圖譜構建完成:
  節點數: 1678
  邊數: 15188
  過濾節點數: 102
  (詳見 get_filtered_nodes() 方法)


【步驟 3】分析圖譜統計資訊...

圖譜規模:
  節點: 1,678
  邊: 15,188
  密度: 0.005397

核心節點類型:
  Incident            :   276
  Cause_Direct        :   272
  Cause_Indirect      :     0
  Cause_Basic         :     1
  Violation           :   