In [1]:
file_path = "../18_rapids/result/20251203_053328/condensed_tree_object.pkl"
import pickle
with open(file_path, "rb") as f:
    condensed_tree = pickle.load(f)

In [2]:
import numpy as np
import pandas as pd
import typing
import itertools
import plotly.graph_objects as go

# --- Helpers lifted from backend (d3_data_manager.py) and extended with stability ---

def _get_leaves(raw_tree_df: pd.DataFrame):
    cluster_tree = raw_tree_df[raw_tree_df["child_size"] > 1]
    if cluster_tree.shape[0] == 0:
        return [raw_tree_df["parent"].min()]
    root = cluster_tree["parent"].min()

    def _dfs(current_node: int):
        children = cluster_tree[cluster_tree["parent"] == current_node]["child"]
        if len(children) == 0:
            return [current_node]
        out = []
        for child in children:
            out.extend(_dfs(int(child)))
        return out

    return _dfs(int(root))


def hdbscan_condensed_to_linkage(condensed_tree):
    """
    Convert HDBSCAN condensed tree to linkage matrix and ID map.
    Returns (linkage_mapped, old_new_id_map).
    linkage_mapped has columns: child1, child2, parent, distance, size.
    distance is inverted lambda (larger lambda -> smaller distance).
    """
    raw_tree = pd.DataFrame(condensed_tree._raw_tree)
    condensed_df = condensed_tree.to_pandas()

    cluster_tree = condensed_df[condensed_df["child_size"] > 1]
    sorted_tree = cluster_tree.sort_values(by=["lambda_val", "parent"], ascending=True)

    linkage_rows = []
    for i in range(0, len(sorted_tree), 2):
        if i + 1 >= len(sorted_tree):
            break
        row_a = sorted_tree.iloc[i]
        row_b = sorted_tree.iloc[i + 1]
        if row_a["lambda_val"] != row_b["lambda_val"]:
            continue
        parent_id = int(row_a["parent"])
        lam = float(row_a["lambda_val"])
        total_size_rows = raw_tree[raw_tree["child"] == parent_id]["child_size"]
        total_size = int(total_size_rows.iloc[0]) if len(total_size_rows) > 0 else int(row_a["child_size"] + row_b["child_size"])
        linkage_rows.append([
            int(row_a["child"]),
            int(row_b["child"]),
            parent_id,
            lam,
            total_size,
        ])

    # Map IDs to contiguous range
    old_new_id_map = {}
    current_id = 0
    leaves = _get_leaves(raw_tree)
    for leaf in leaves:
        old_new_id_map[int(leaf)] = current_id
        current_id += 1
    for row in reversed(linkage_rows):
        parent_id = row[2]
        if int(parent_id) not in old_new_id_map:
            old_new_id_map[int(parent_id)] = current_id
            current_id += 1

    max_lambda = max((r[3] for r in linkage_rows), default=1.0)
    linkage_mapped = [
        [
            old_new_id_map[int(r[0])],
            old_new_id_map[int(r[1])],
            old_new_id_map[int(r[2])],
            max_lambda - r[3],  # invert lambda to distance
            r[4],
        ]
        for r in reversed(linkage_rows)
    ]

    return np.array(linkage_mapped, dtype=float), old_new_id_map


def compute_stability(condensed_tree_like):
    """
    Compute HDBSCAN stability scores (Python port of cython logic).
    Accepts HDBSCAN condensed tree object or a pandas DataFrame with columns
    ['parent','child','lambda_val','child_size'].
    Returns dict {cluster_id: stability} using original cluster IDs.
    """
    if hasattr(condensed_tree_like, "to_pandas"):
        df = condensed_tree_like.to_pandas()
    elif isinstance(condensed_tree_like, pd.DataFrame):
        df = condensed_tree_like
    else:
        raise ValueError("Unsupported condensed tree format")

    smallest_cluster = int(df["parent"].min())
    largest_cluster = int(df["parent"].max())
    num_clusters = largest_cluster - smallest_cluster + 1
    largest_child = int(max(df["child"].max(), smallest_cluster))

    sorted_child = df.sort_values(["child", "lambda_val"])[["child", "lambda_val"]].to_numpy()
    births_arr = np.nan * np.ones(largest_child + 1, dtype=np.double)
    current_child = -1
    min_lambda = 0.0
    for child_val, lambda_val in sorted_child:
        child = int(child_val)
        lam = float(lambda_val)
        if child == current_child:
            min_lambda = min(min_lambda, lam)
        elif current_child != -1:
            births_arr[current_child] = min_lambda
            current_child = child
            min_lambda = lam
        else:
            current_child = child
            min_lambda = lam
    if current_child != -1:
        births_arr[current_child] = min_lambda
    births_arr[smallest_cluster] = 0.0

    result_arr = np.zeros(num_clusters, dtype=np.double)
    parents = df["parent"].to_numpy()
    sizes = df["child_size"].to_numpy()
    lambdas = df["lambda_val"].to_numpy()
    for parent_val, size_val, lambda_val in zip(parents, sizes, lambdas):
        parent = int(parent_val)
        child_size = float(size_val)
        lam = float(lambda_val)
        result_index = parent - smallest_cluster
        lambda_birth = births_arr[parent]
        result_arr[result_index] += (lam - lambda_birth) * child_size

    node_ids = np.arange(smallest_cluster, df["parent"].max() + 1)
    result_pre_dict = np.vstack((node_ids, result_arr)).T
    return dict(zip(result_pre_dict[:, 0].astype(int), result_pre_dict[:, 1]))


def compute_dendrogram_coords(Z: np.ndarray, n_points: int):
    """
    Compute dendrogram coordinates with size-aware leaf ordering.
    Returns (icoord, dcoord, leaf_order).
    """
    n_nodes = 2 * n_points - 1
    nodes = [{"x": None, "y": 0.0, "size": 1, "left": None, "right": None} for _ in range(n_points)]

    for i in range(n_points - 1):
        c1, c2, dist, count = Z[i]
        nodes.append({
            "x": None,
            "y": float(dist),
            "size": int(count),
            "left": int(c1),
            "right": int(c2),
        })

    def get_leaf_order_sorted(node_idx):
        node = nodes[node_idx]
        if node_idx < n_points:
            return [node_idx]
        left_idx, right_idx = node["left"], node["right"]
        size_left, size_right = nodes[left_idx]["size"], nodes[right_idx]["size"]
        if size_left < size_right:
            left_idx, right_idx = right_idx, left_idx
        order_left = get_leaf_order_sorted(left_idx)
        order_right = get_leaf_order_sorted(right_idx)
        return order_left + order_right

    def calculate_x_coord(node_idx, leaf_to_x):
        node = nodes[node_idx]
        if node_idx < n_points:
            node["x"] = leaf_to_x[node_idx]
            return node["x"]
        x_left = calculate_x_coord(node["left"], leaf_to_x)
        x_right = calculate_x_coord(node["right"], leaf_to_x)
        node["x"] = (x_left + x_right) / 2.0
        return node["x"]

    root_node_idx = n_points - 1 + (n_points - 1)
    leaf_order = get_leaf_order_sorted(root_node_idx)
    leaf_to_x = {leaf_idx: 2 * i + 1 for i, leaf_idx in enumerate(leaf_order)}
    calculate_x_coord(root_node_idx, leaf_to_x)

    icoord = []
    dcoord = []
    for i in range(n_points - 1):
        parent_idx = n_points + i
        left_idx = nodes[parent_idx]["left"]
        right_idx = nodes[parent_idx]["right"]
        y_parent = nodes[parent_idx]["y"]
        y_left = nodes[left_idx]["y"]
        y_right = nodes[right_idx]["y"]
        x_left = nodes[left_idx]["x"]
        x_right = nodes[right_idx]["x"]
        icoord.append([x_left, x_left, x_right, x_right])
        dcoord.append([y_left, y_parent, y_parent, y_right])

    return icoord, dcoord, leaf_order


def get_dendrogram_segments2(Z: np.ndarray):
    """Build dendrogram segments, node ids, and hover info from a linkage matrix."""
    n_points = Z.shape[0] + 1
    icoord, dcoord, leaf_order = compute_dendrogram_coords(Z, n_points)

    node_meta = {idx: {"distance": 0.0, "size": 1, "child1": None, "child2": None} for idx in range(n_points)}
    for i in range(n_points - 1):
        node_id = n_points + i
        c1, c2, dist, count = Z[i]
        node_meta[node_id] = {
            "distance": float(dist),
            "size": int(count),
            "child1": int(c1),
            "child2": int(c2),
        }

    segments = []
    segment_node_ids = []
    segment_infos = []
    for merge_idx, (icoords, dcoords) in enumerate(zip(icoord, dcoord)):
        node_id = n_points + merge_idx
        x1, x2, x3, x4 = icoords
        y1, y2, y3, y4 = dcoords
        info = node_meta.get(node_id, {"distance": 0.0, "size": 1, "child1": None, "child2": None})
        segments.append([(x1, y1), (x2, y2)])
        segment_node_ids.append(node_id)
        segment_infos.append(info)
        segments.append([(x2, y2), (x3, y3)])
        segment_node_ids.append(node_id)
        segment_infos.append(info)
        segments.append([(x4, y4), (x3, y3)])
        segment_node_ids.append(node_id)
        segment_infos.append(info)
    return segments, segment_node_ids, leaf_order, segment_infos


def plot_dendrogram_plotly(
    segments: typing.List,
    colors: typing.Optional[typing.List] = None,
    infos: typing.Optional[typing.List[typing.Dict]] = None,
    **kwargs,
):
    """Plot dendrogram segments with Plotly and hover text."""
    fig = go.Figure()
    for i, seg in enumerate(segments):
        x_coords = [seg[0][0], seg[1][0], None]
        y_coords = [seg[0][1], seg[1][1], None]
        color = "#888" if colors is None else colors[i]

        hover_lines = []
        if infos is not None and i < len(infos):
            info = infos[i]
            node_id = info.get("node_id")
            orig_id = info.get("original_id")
            size_val = info.get("size")
            dist_val = info.get("distance")
            child1 = info.get("child1")
            child2 = info.get("child2")
            child1_orig = info.get("child1_original")
            child2_orig = info.get("child2_original")
            node_type = info.get("node_type")
            stability_val = info.get("stability")
            if node_type:
                hover_lines.append(f"Type: {node_type}")
            if node_id is not None:
                hover_lines.append(f"Parent (mapped): {node_id}")
            if orig_id is not None:
                hover_lines.append(f"Parent (original): {orig_id}")
            if child1 is not None:
                hover_lines.append(f"Child1 (mapped): {child1}")
            if child1_orig is not None:
                hover_lines.append(f"Child1 (original): {child1_orig}")
            if child2 is not None:
                hover_lines.append(f"Child2 (mapped): {child2}")
            if child2_orig is not None:
                hover_lines.append(f"Child2 (original): {child2_orig}")
            if size_val is not None:
                hover_lines.append(f"Size: {size_val}")
            if dist_val is not None:
                hover_lines.append(f"Distance: {dist_val:.4f}")
            if stability_val is not None:
                hover_lines.append(f"Stability: {stability_val:.4f}")
        hover_text = "<br>".join(hover_lines) if hover_lines else None

        fig.add_trace(
            go.Scatter(
                x=x_coords,
                y=y_coords,
                mode="lines",
                line=dict(color=color, width=1.5),
                showlegend=False,
                hoverinfo="text" if hover_text else "skip",
                text=[hover_text] * len(x_coords) if hover_text else None,
            )
        )
    fig.update_layout(
        title="Simple Dendrogram Visualization",
        xaxis_title="Observation Index",
        yaxis_title="Distance / Height",
        hovermode="closest",
    )
    fig.update_layout(height=800, width=1000)
    return fig


def plot_dendrogram(
    linkage_mapped: np.ndarray,
    highlight_cluster_ids=None,
    old_new_id_map=None,
    stability_scores: typing.Optional[typing.Dict[int, float]] = None,
    sort_by_size: bool = True,
):
    """
    Plot dendrogram using custom coordinates so we can sort leaves by size.
    highlight_cluster_ids: original cluster IDs to highlight in orange.
    stability_scores: dict of original cluster ID -> stability to show on hover.
    sort_by_size: kept for API parity; sorting is handled by compute_dendrogram_coords.
    """
    if highlight_cluster_ids is None:
        highlight_cluster_ids = []

    linkage_for_coords = linkage_mapped[:, [0, 1, 3, 4]].astype(float)
    n_points = linkage_for_coords.shape[0] + 1
    segments, segment_node_ids, leaf_order, segment_infos = get_dendrogram_segments2(linkage_for_coords)

    colors = None
    if old_new_id_map and highlight_cluster_ids:
        highlight_new_ids = {old_new_id_map[cid] for cid in highlight_cluster_ids if cid in old_new_id_map}
        print(f"Highlighting clusters (original IDs): {highlight_cluster_ids}")
        colors = ["orange" if node_id in highlight_new_ids else "#888" for node_id in segment_node_ids]
        
        print(f"Highlighting {len(highlight_new_ids)} clusters in dendrogram.")

    reverse_map = {v: k for k, v in old_new_id_map.items()} if old_new_id_map else {}
    infos = []
    for node_id, info in zip(segment_node_ids, segment_infos):
        enriched = dict(info)
        enriched["node_id"] = node_id
        enriched["node_type"] = "leaf" if node_id < n_points else "internal"
        if node_id in reverse_map:
            enriched["original_id"] = reverse_map[node_id]
        c1 = info.get("child1")
        c2 = info.get("child2")
        if c1 is not None and c1 in reverse_map:
            enriched["child1_original"] = reverse_map[c1]
        if c2 is not None and c2 in reverse_map:
            enriched["child2_original"] = reverse_map[c2]
        orig_id = enriched.get("original_id")
        if stability_scores is not None and orig_id is not None:
            enriched["stability"] = stability_scores.get(orig_id)
        infos.append(enriched)

    fig = plot_dendrogram_plotly(segments, colors=colors, infos=infos)
    fig.update_layout(height=500, width=900, title="Dendrogram (HDBSCAN condensed → linkage)", showlegend=False)
    return fig


In [3]:
# Override plot_dendrogram with leaf-aware highlighting so leaf cluster IDs also color their merge segments.
def plot_dendrogram(
    linkage_mapped: np.ndarray,
    highlight_cluster_ids=None,
    old_new_id_map=None,
    stability_scores: typing.Optional[typing.Dict[int, float]] = None,
    sort_by_size: bool = True,
    highlight_color: str = "orange",
):
    """
    Plot dendrogram using custom coordinates so we can sort leaves by size.

    highlight_cluster_ids: original cluster IDs to highlight in orange. Both internal clusters and
    cluster-tree leaves are supported. If a highlighted ID corresponds to a leaf (mapped id < n_points),
    its immediate merge segment will also be highlighted.
    stability_scores: dict of original cluster ID -> stability to show on hover.
    sort_by_size: kept for API parity; sorting is handled by compute_dendrogram_coords.
    highlight_color: color for highlighted branches.
    """
    if highlight_cluster_ids is None:
        highlight_cluster_ids = []

    linkage_for_coords = linkage_mapped[:, [0, 1, 3, 4]].astype(float)
    n_points = linkage_for_coords.shape[0] + 1
    segments, segment_node_ids, leaf_order, segment_infos = get_dendrogram_segments2(linkage_for_coords)

    colors = None
    if old_new_id_map and highlight_cluster_ids:
        highlight_new_ids = {old_new_id_map[cid] for cid in highlight_cluster_ids if cid in old_new_id_map}
        leaf_highlights = {hid for hid in highlight_new_ids if hid < n_points}

        # If a highlighted id is a leaf, also highlight its immediate merge segment so it becomes visible.
        highlight_nodes = set(highlight_new_ids)
        if leaf_highlights:
            for node_id, info in zip(segment_node_ids, segment_infos):
                if info.get("child1") in leaf_highlights or info.get("child2") in leaf_highlights:
                    highlight_nodes.add(node_id)

        print(f"Highlighting clusters (original IDs): {highlight_cluster_ids}")
        print(f"Highlight mapped ids: {sorted(list(highlight_new_ids))}")
        colors = [highlight_color if node_id in highlight_nodes else "#888" for node_id in segment_node_ids]

    reverse_map = {v: k for k, v in old_new_id_map.items()} if old_new_id_map else {}
    infos = []
    for node_id, info in zip(segment_node_ids, segment_infos):
        enriched = dict(info)
        enriched["node_id"] = node_id
        enriched["node_type"] = "leaf" if node_id < n_points else "internal"
        if node_id in reverse_map:
            enriched["original_id"] = reverse_map[node_id]
        c1 = info.get("child1")
        c2 = info.get("child2")
        if c1 is not None and c1 in reverse_map:
            enriched["child1_original"] = reverse_map[c1]
        if c2 is not None and c2 in reverse_map:
            enriched["child2_original"] = reverse_map[c2]
        orig_id = enriched.get("original_id")
        if stability_scores is not None and orig_id is not None:
            enriched["stability"] = stability_scores.get(orig_id)
        infos.append(enriched)

    fig = plot_dendrogram_plotly(segments, colors=colors, infos=infos)
    fig.update_layout(height=500, width=900, title="Dendrogram (HDBSCAN condensed → linkage)", showlegend=False)
    return fig

# Plot all

In [4]:
linkage_mapped, old_new_id_map = hdbscan_condensed_to_linkage(condensed_tree)
print("linkage shape:", linkage_mapped.shape)
print("sample rows:", linkage_mapped[:3])

# Optional: compute stability (original cluster IDs)
stability_scores = compute_stability(condensed_tree)

# TODO: set the cluster IDs you want to highlight (original IDs)
highlight_cluster_ids = []  # e.g., [115760, 115761]
fig = plot_dendrogram(
    linkage_mapped,
    highlight_cluster_ids,
    old_new_id_map,
    stability_scores=stability_scores,
)
fig.show()

linkage shape: (442, 5)
sample rows: [[1.74000000e+02 1.75000000e+02 4.43000000e+02 0.00000000e+00
  9.00000000e+01]
 [1.76000000e+02 1.77000000e+02 4.44000000e+02 6.14290237e-02
  4.90000000e+01]
 [4.43000000e+02 4.44000000e+02 4.45000000e+02 1.24295950e-01
  1.67000000e+02]]


# plot selected

In [None]:
selected_cluster_id = 115760  # Example cluster ID to inspect

# nearby clusters
neighbor_ids = []


In [5]:
import pandas as pd

class ClusterInspector:
    def __init__(self, condensed_tree, stability_map):
        self.tree_df = condensed_tree.to_pandas()
        # クラスタとしての分岐のみを抽出
        self.cluster_tree = self.tree_df[self.tree_df["child_size"] > 1].copy()
        
        # 親への逆引きマップ (Cluster ID -> Parent Cluster ID)
        self.child_to_parent = dict(zip(self.cluster_tree["child"].astype(int), 
                                        self.cluster_tree["parent"].astype(int)))
        
        # ノードサイズマップ (親ノードID -> そのノードの合計サイズ)
        self.node_size_map = self.cluster_tree.groupby("parent")["child_size"].max().to_dict()
        
        # Stabilityマップの取得
        self.stability_map = stability_map

    def _get_cluster_leaves_under(self, node_id: int):
        """配下にある『末端のクラスタ(Cluster Leaf)』を再帰的に取得"""
        children = self.cluster_tree[self.cluster_tree["parent"] == node_id]["child"]
        if len(children) == 0:
            return [node_id]
        
        out = []
        for child in children:
            out.extend(self._get_cluster_leaves_under(int(child)))
        return out

    def get_parent_cluster_by_stability(self, cluster_leaf_id: int, min_stability: float):
        """1. Stabilityが一定値以上の親を探し、(リーフ群, 遡上回数)を返す"""
        current = int(cluster_leaf_id)
        steps = 0
        
        while True:
            parent = self.child_to_parent.get(current)
            # print(f"Current: {current}, Parent: {parent}, stability: {self.stability_map.get(parent, 0.0)}")
            if parent is None: 
                break # ルート到達
            
            steps += 1
            # 親の安定性をチェック
            if self.stability_map.get(parent, 0.0) >= min_stability:
                current = parent
                print(f"Found stable parent: {parent} with stability {self.stability_map.get(parent, 0.0)}")
                break
            current = parent
            print(f"Moving up to parent: {parent} with stability {self.stability_map.get(parent, 0.0)}")
            
        return self._get_cluster_leaves_under(current), steps

    def get_parent_cluster_by_size_ratio(self, cluster_leaf_id: int, ratio_threshold: float):
        """2. サイズ比率がいびつな親を探し、(リーフ群, 遡上回数)を返す"""
        current = int(cluster_leaf_id)
        steps = 0
        
        while True:
            parent = self.child_to_parent.get(current)
            if parent is None: 
                break # ルート到達
                
            steps += 1
            current_size = self.node_size_map.get(current, 0)
            parent_size = self.node_size_map.get(parent, 0)
            
            # 比率が閾値を超えた ＝ 「小さな断片ではなく、意味のある合流」と判断
            if parent_size > 0 and (current_size / parent_size) >= ratio_threshold:
                current = parent
                print(f"Found size-ratio parent: {parent} with ratio {current_size / parent_size:.4f}, sizes: {current_size}/{parent_size}")
                break
            current = parent
            print(f"Moving up to parent: {parent} with ratio {current_size / parent_size:.4f}, sizes: {current_size}/{parent_size}")
            
        return self._get_cluster_leaves_under(current), steps

In [6]:
# クラスの初期化
inspector = ClusterInspector(condensed_tree, stability_scores)



In [119]:
target_leaf_cluster = 116543

In [79]:


# そのクラスタが含まれる「安定した親」を探し、その親からぶら下がる他の末端クラスタもまとめて取得
related_cluster_leaves, steps = inspector.get_parent_cluster_by_stability(target_leaf_cluster, min_stability=30.0)
print(f"length of related clusters by stability: {len(related_cluster_leaves)}")
print(f"step: {steps}")
print(f"起点クラスタ {target_leaf_cluster} と共に、一つの安定した塊を構成する末端クラスタ群: {related_cluster_leaves}")

Moving up to parent: 116471 with stability 5.334946990013123
Moving up to parent: 116462 with stability 0.18368196487426758
Moving up to parent: 116421 with stability 2.4670742750167847
Moving up to parent: 116416 with stability 6.26696503162384
Moving up to parent: 116415 with stability 6.055968761444092
Moving up to parent: 116414 with stability 1.1982548236846924
Moving up to parent: 116413 with stability 4.8161386251449585
Moving up to parent: 116410 with stability 11.740297198295593
Moving up to parent: 116407 with stability 5.220691561698914
Moving up to parent: 116406 with stability 1.9214601516723633
Moving up to parent: 116403 with stability 3.9511200189590454
Moving up to parent: 116402 with stability 0.9275788068771362
Moving up to parent: 116400 with stability 11.435438632965088
Moving up to parent: 116398 with stability 3.9020520448684692
Moving up to parent: 116397 with stability 4.2570754289627075
Found stable parent: 116392 with stability 44.74266004562378
length of rel

In [123]:
related_cluster_leaves, steps = inspector.get_parent_cluster_by_size_ratio(target_leaf_cluster, ratio_threshold=0.80)
print(f"length of related clusters by size ratio: {len(related_cluster_leaves)}")
print(f"step: {steps}") 
print(f"起点クラスタ {target_leaf_cluster} と共に、一つのまとまりを構成する末端クラスタ群: {related_cluster_leaves}")


Moving up to parent: 116488 with ratio 0.0000, sizes: 0/39
Moving up to parent: 116485 with ratio 0.6500, sizes: 39/60
Moving up to parent: 116451 with ratio 0.5769, sizes: 60/104
Moving up to parent: 116449 with ratio 0.7536, sizes: 104/138
Found size-ratio parent: 116408 with ratio 0.8846, sizes: 138/156
length of related clusters by size ratio: 10
step: 5
起点クラスタ 116543 と共に、一つのまとまりを構成する末端クラスタ群: [116448, 116484, 116492, 116525, 116526, 116514, 116518, 116543, 116468, 116472]


In [124]:
highlight_cluster_ids = related_cluster_leaves
fig = plot_dendrogram(
    linkage_mapped,
    highlight_cluster_ids,
    old_new_id_map,
    stability_scores=stability_scores,
).show()

Highlighting clusters (original IDs): [116448, 116484, 116492, 116525, 116526, 116514, 116518, 116543, 116468, 116472]
Highlight mapped ids: [225, 226, 227, 228, 229, 230, 231, 232, 233, 234]


# セグメンテーション

In [10]:
import plotly.colors as pc

def plot_segmented_dendrogram(
    linkage_mapped, 
    segment_results, # 先ほどの result リスト
    old_new_id_map, 
    stability_scores=None
):
    """
    サイズ比率で分けたセグメントごとにデンドログラムを着色する
    """
    # 1. セグメントのルートIDと色のマップを作成
    # Plotlyのクオリティの高いカラーパレットを使用
    palette = pc.qualitative.Plotly  
    segment_colors = {}
    
    # leaf_to_color_map を作成 (どのオリジナルIDがどの色になるか)
    # また、どの segment_root(original_id) がどの色になるかも保持
    node_to_segment_color = {}
    
    for i, seg in enumerate(segment_results):
        color = palette[i % len(palette)]
        root_id = seg["segment_root"]
        segment_colors[root_id] = color
        
        # そのセグメントに属する全リーフをその色で登録
        for leaf_id in seg["leaves"]:
            node_to_segment_color[leaf_id] = color

    # 2. Linkageデータとセグメント情報の準備
    linkage_for_coords = linkage_mapped[:, [0, 1, 3, 4]].astype(float)
    n_points = linkage_for_coords.shape[0] + 1
    segments, segment_node_ids, leaf_order, segment_infos = get_dendrogram_segments2(linkage_for_coords)
    
    reverse_map = {v: k for k, v in old_new_id_map.items()}

    # 3. 各枝（セグメント）の色を決定する
    colors = []
    for node_id, info in zip(segment_node_ids, segment_infos):
        # このノード（枝）に対応するオリジナルIDを取得
        orig_id = reverse_map.get(node_id)
        
        # 色の判定ロジック:
        # このノード自体がセグメントに登録されているか、あるいは
        # 子ノードのいずれかが特定のセグメントに属しているかを確認
        branch_color = "#888" # デフォルトはグレー
        
        if orig_id in node_to_segment_color:
            branch_color = node_to_segment_color[orig_id]
        else:
            # 子ノード(original_id)をチェック
            c1_orig = reverse_map.get(info.get("child1"))
            c2_orig = reverse_map.get(info.get("child2"))
            
            if c1_orig in node_to_segment_color:
                branch_color = node_to_segment_color[c1_orig]
            elif c2_orig in node_to_segment_color:
                branch_color = node_to_segment_color[c2_orig]
        
        colors.append(branch_color)

    # 4. 情報の付与と描画 (ご提示のロジックを継承)
    infos = []
    for node_id, info in zip(segment_node_ids, segment_infos):
        enriched = dict(info)
        orig_id = reverse_map.get(node_id)
        if orig_id is not None:
            enriched["original_id"] = orig_id
            if stability_scores:
                enriched["stability"] = stability_scores.get(orig_id)
        infos.append(enriched)

    fig = plot_dendrogram_plotly(segments, colors=colors, infos=infos)
    fig.update_layout(
        height=600, width=1000, 
        title=f"Segmented Dendrogram (Ratio Threshold Segmentation)",
        showlegend=False
    )
    return fig

AttributeError: 'ClusterInspector' object has no attribute 'get_segments_by_size'

In [9]:
import pandas as pd

class ClusterInspector:
    def __init__(self, condensed_tree, stability_map):
        self.tree_df = condensed_tree.to_pandas()
        # クラスタとしての分岐のみを抽出
        self.cluster_tree = self.tree_df[self.tree_df["child_size"] > 1].copy()
        
        # 親への逆引きマップ (Cluster ID -> Parent Cluster ID)
        self.child_to_parent = dict(zip(self.cluster_tree["child"].astype(int), 
                                        self.cluster_tree["parent"].astype(int)))
        
        # ノードサイズマップ (親ノードID -> そのノードの合計サイズ)
        self.node_size_map = self.cluster_tree.groupby("parent")["child_size"].max().to_dict()
        
        # Stabilityマップの取得
        self.stability_map = stability_map

    def _get_cluster_leaves_under(self, node_id: int):
        """配下にある『末端のクラスタ(Cluster Leaf)』を再帰的に取得"""
        children = self.cluster_tree[self.cluster_tree["parent"] == node_id]["child"]
        if len(children) == 0:
            return [node_id]
        
        out = []
        for child in children:
            out.extend(self._get_cluster_leaves_under(int(child)))
        return out

    def get_parent_cluster_by_stability(self, cluster_leaf_id: int, min_stability: float):
        """1. Stabilityが一定値以上の親を探し、(リーフ群, 遡上回数)を返す"""
        current = int(cluster_leaf_id)
        steps = 0
        
        while True:
            parent = self.child_to_parent.get(current)
            # print(f"Current: {current}, Parent: {parent}, stability: {self.stability_map.get(parent, 0.0)}")
            if parent is None: 
                break # ルート到達
            
            steps += 1
            # 親の安定性をチェック
            if self.stability_map.get(parent, 0.0) >= min_stability:
                current = parent
                print(f"Found stable parent: {parent} with stability {self.stability_map.get(parent, 0.0)}")
                break
            current = parent
            print(f"Moving up to parent: {parent} with stability {self.stability_map.get(parent, 0.0)}")
            
        return self._get_cluster_leaves_under(current), steps

    def get_parent_cluster_by_size_ratio(self, cluster_leaf_id: int, ratio_threshold: float):
        """2. サイズ比率がいびつな親を探し、(リーフ群, 遡上回数)を返す"""
        current = int(cluster_leaf_id)
        steps = 0
        
        while True:
            parent = self.child_to_parent.get(current)
            if parent is None: 
                break # ルート到達
                
            steps += 1
            current_size = self.node_size_map.get(current, 0)
            parent_size = self.node_size_map.get(parent, 0)
            
            # 比率が閾値を超えた ＝ 「小さな断片ではなく、意味のある合流」と判断
            if parent_size > 0 and (current_size / parent_size) >= ratio_threshold:
                current = parent
                print(f"Found size-ratio parent: {parent} with ratio {current_size / parent_size:.4f}, sizes: {current_size}/{parent_size}")
                break
            current = parent
            print(f"Moving up to parent: {parent} with ratio {current_size / parent_size:.4f}, sizes: {current_size}/{parent_size}")
            
        return self._get_cluster_leaves_under(current), steps
    
    def segment_by_leaf_climbing(self, ratio_threshold: float):
        """
        全てのリーフから遡上を行い、到達した親ノードごとにセグメントを形成する
        """
        # 1. 全ての末端クラスタ（リーフ）を取得
        # cluster_treeのchildのうち、parentとして一度も登場しないものが末端
        all_children = set(self.cluster_tree["child"].astype(int))
        all_parents = set(self.cluster_tree["parent"].astype(int))
        leaf_cluster_ids = list(all_children - all_parents)
        
        # 2. リーフごとに遡上して「所属する親」を特定
        # root_map: parent_node_id -> list of leaf_ids
        root_map = {}
        
        for leaf_id in leaf_cluster_ids:
            # あなたが定義したロジックで親を探す
            # (簡単のためクラス内メソッドを呼び出す想定)
            leaves_under, _ = self.get_parent_cluster_by_size_ratio(leaf_id, ratio_threshold)
            
            # 遡上して到達した範囲の「根（親）」を特定
            # _get_cluster_leaves_under(current) の current 自体をキーにする必要があるため、
            # get_parent_cluster_by_size_ratio を current を返すように少し調整するか、
            # ここでは「そのリーフが辿り着いた共通の親」を保持します。
            
            # --- 再現ロジック ---
            current = leaf_id
            while True:
                parent = self.child_to_parent.get(current)
                if parent is None: break
                
                current_size = self.node_size_map.get(current, 0)
                parent_size = self.node_size_map.get(parent, 0)
                
                if parent_size > 0 and (current_size / parent_size) >= ratio_threshold:
                    current = parent
                    break
                current = parent
            
            # current がこのリーフが所属すると決まったセグメントの代表ID
            if current not in root_map:
                root_map[current] = set()
            root_map[current].add(leaf_id)

        # 3. 結果の整形
        final_segments = []
        for root_id, leaf_set in root_map.items():
            final_segments.append({
                "segment_root": root_id,
                "size": self.node_size_map.get(root_id, 0),
                "leaves": list(leaf_set)
            })
        
        return final_segments

In [6]:
import plotly.colors as pc
import plotly.graph_objects as go

def plot_leaf_climbing_dendrogram(linkage_mapped, segments, old_new_id_map, stability_scores=None):
    """
    ボトムアップ遡上で決定したセグメントに基づきデンドログラムをプロットする
    """
    # 1. カラーマップの作成 (Leaf Original ID -> Color)
    # どのリーフがどの色に属するかを完全にマッピングする
    palette = pc.qualitative.Prism + pc.qualitative.Safe
    leaf_to_color = {}
    segment_root_to_color = {}

    # サイズの大きいセグメントから順に色を割り当て
    sorted_segs = sorted(segments, key=lambda x: x["size"], reverse=True)
    for i, seg in enumerate(sorted_segs):
        color = palette[i % len(palette)]
        root_id = seg["segment_root"]
        segment_root_to_color[root_id] = color
        for leaf_id in seg["leaves"]:
            leaf_to_color[leaf_id] = color

    # 2. デンドログラムの座標と構造を取得
    # linkage_mapped[:, [0, 1, 3, 4]] -> [child1, child2, distance, size]
    linkage_for_coords = linkage_mapped[:, [0, 1, 3, 4]].astype(float)
    coords, segment_node_ids, leaf_order, segment_infos = get_dendrogram_segments2(linkage_for_coords)
    
    # 内部IDからオリジナルIDへの逆引き
    reverse_map = {v: k for k, v in old_new_id_map.items()}

    # 3. 枝（セグメント）ごとの色決定
    colors = []
    for node_id, info in zip(segment_node_ids, segment_infos):
        # この枝が表す「オリジナルノードID」を取得
        orig_id = reverse_map.get(node_id)
        
        # 色付けの優先順位:
        # 1. このノード自体がセグメントのルート、あるいはリーフとして登録されているか
        # 2. 子要素(child1, child2)が共通のセグメント色を持っているか
        color = "#888" # デフォルト（未所属）
        
        if orig_id in leaf_to_color:
            color = leaf_to_color[orig_id]
        elif orig_id in segment_root_to_color:
            color = segment_root_to_color[orig_id]
        else:
            # 子ノードのオリジナルIDを確認
            c1_orig = reverse_map.get(info.get("child1"))
            c2_orig = reverse_map.get(info.get("child2"))
            
            # 子が両方同じ色、または片方がその色なら、その枝はそのセグメントの一部とみなす
            color1 = leaf_to_color.get(c1_orig) or segment_root_to_color.get(c1_orig)
            color2 = leaf_to_color.get(c2_orig) or segment_root_to_color.get(c2_orig)
            
            if color1 and color2 and color1 == color2:
                color = color1
            elif color1: # 片方の色を継承（マージ過程の可視化）
                color = color1
            elif color2:
                color = color2
            
        colors.append(color)

    # 4. ホバー情報の整理
    infos = []
    for node_id, info in zip(segment_node_ids, segment_infos):
        enriched = dict(info)
        orig_id = reverse_map.get(node_id)
        if orig_id is not None:
            enriched["original_id"] = orig_id
            if stability_scores and orig_id in stability_scores:
                enriched["stability"] = stability_scores[orig_id]
        infos.append(enriched)

    # 5. Plotlyで描画
    fig = plot_dendrogram_plotly(coords, colors=colors, infos=infos)
    
    fig.update_layout(
        title=f"Leaf-Climbing Segmentation (Found {len(segments)} segments)",
        width=1000,
        height=600,
        showlegend=False,
        hovermode="closest"
    )
    
    return fig

In [14]:
# 1. あなたのロジックを全リーフに適用してセグメントを作る
inspector = ClusterInspector(condensed_tree, stability_scores)
segments = inspector.segment_by_leaf_climbing(ratio_threshold=0.8)

# 2. プロット
fig = plot_leaf_climbing_dendrogram(
    linkage_mapped, 
    segments, 
    old_new_id_map, 
    stability_scores
)
fig.show()

Moving up to parent: 115754 with ratio 0.0000, sizes: 0/97497
Moving up to parent: 115756 with ratio 0.0000, sizes: 0/79229
Found size-ratio parent: 115755 with ratio 0.8618, sizes: 79229/91932
Moving up to parent: 115758 with ratio 0.0000, sizes: 0/72223
Found size-ratio parent: 115756 with ratio 0.9116, sizes: 72223/79229
Moving up to parent: 115755 with ratio 0.0000, sizes: 0/91932
Found size-ratio parent: 115754 with ratio 0.9429, sizes: 91932/97497
Moving up to parent: 115761 with ratio 0.0000, sizes: 0/68304
Found size-ratio parent: 115758 with ratio 0.9457, sizes: 68304/72223
Moving up to parent: 115764 with ratio 0.0000, sizes: 0/56476
Found size-ratio parent: 115761 with ratio 0.8268, sizes: 56476/68304
Moving up to parent: 115765 with ratio 0.0000, sizes: 0/55184
Found size-ratio parent: 115764 with ratio 0.9771, sizes: 55184/56476
Moving up to parent: 115767 with ratio 0.0000, sizes: 0/54558
Found size-ratio parent: 115765 with ratio 0.9887, sizes: 54558/55184
Moving up to p

In [29]:
def plot_segmented_dendrogram(linkage_mapped, segments, old_new_id_map):
    import plotly.colors as pc
    
    # 1. 色の準備 (Segment Root ID -> Color)
    palette = pc.qualitative.Prism + pc.qualitative.Safe # 多めの色を用意
    node_to_color = {}
    
    # サイズの大きいセグメントから順に色を割り当て
    sorted_segs = sorted(segments, key=lambda x: x["size"], reverse=True)
    for i, seg in enumerate(sorted_segs):
        color = palette[i % len(palette)]
        # そのセグメント配下の全リーフに色を波及させる
        for leaf_id in seg["leaves"]:
            node_to_color[leaf_id] = color
        # ルート自身にも色を付ける
        node_to_color[seg["segment_root"]] = color

    # 2. 座標計算 (ユーティリティ関数)
    linkage_for_coords = linkage_mapped[:, [0, 1, 3, 4]].astype(float)
    coords, segment_node_ids, leaf_order, segment_infos = get_dendrogram_segments2(linkage_for_coords)
    reverse_map = {v: k for k, v in old_new_id_map.items()}

    # 3. 枝の色決定
    colors = []
    for node_id, info in zip(segment_node_ids, segment_infos):
        orig_id = reverse_map.get(node_id)
        
        # その枝（またはその下流）がセグメントに属するかチェック
        # child1 か child2 のどちらかがセグメント色を持っていれば継承する
        c1_orig = reverse_map.get(info.get("child1"))
        c2_orig = reverse_map.get(info.get("child2"))
        
        color = "#888" # デフォルト
        if orig_id in node_to_color:
            color = node_to_color[orig_id]
        elif c1_orig in node_to_color:
            color = node_to_color[c1_orig]
        elif c2_orig in node_to_color:
            color = node_to_color[c2_orig]
            
        colors.append(color)

    # 4. Plotly描画
    fig = plot_dendrogram_plotly(coords, colors=colors, infos=segment_infos)
    fig.update_layout(title="Size-Ratio Based Segmentation", showlegend=False)
    return fig

# 実行例
inspector = ClusterInspector(condensed_tree, stability_scores)
segments = inspector.get_segments_by_size(ratio_threshold=0.15)
print(f"Number of segments identified: {len(segments)}")
fig = plot_segmented_dendrogram(linkage_mapped, segments, old_new_id_map)
fig.show()

Number of segments identified: 6


TypeError: '<' not supported between instances of 'NoneType' and 'NoneType'

In [11]:
def get_final_merged_segments(inspector, ratio_threshold: float):
    """
    1. 全リーフから遡上
    2. 到達した親ノード配下のリーフ群をグループ化
    3. 共通の葉を持つグループを統合
    """
    # 全ての末端リーフIDを取得
    all_children = set(inspector.cluster_tree["child"].astype(int))
    all_parents = set(inspector.cluster_tree["parent"].astype(int))
    leaf_cluster_ids = list(all_children - all_parents)

    # 遡上結果の収集 (親ID -> その親に属すると判定されたリーフの集合)
    root_to_leaves = {}
    for leaf_id in leaf_cluster_ids:
        # 遡上ロジック
        current = leaf_id
        while True:
            parent = inspector.child_to_parent.get(current)
            if parent is None: break
            
            current_size = inspector.node_size_map.get(current, 0)
            parent_size = inspector.node_size_map.get(parent, 0)
            
            if parent_size > 0 and (current_size / parent_size) >= ratio_threshold:
                current = parent
                break
            current = parent
        
        if current not in root_to_leaves:
            root_to_leaves[current] = set()
        root_to_leaves[current].add(leaf_id)

    # --- 統合処理 (共通の葉があるなら統合) ---
    merged_groups = list(root_to_leaves.values())
    changed = True
    while changed:
        changed = False
        final_groups = []
        while merged_groups:
            current = merged_groups.pop(0)
            found_overlap = False
            for i, existing in enumerate(final_groups):
                if not current.isdisjoint(existing): # 重なりあり
                    final_groups[i] = existing.union(current)
                    found_overlap = True
                    changed = True
                    break
            if not found_overlap:
                final_groups.append(current)
        merged_groups = final_groups

    # 最終的なセグメントリストの形式に整形
    final_segments = []
    for i, leaf_set in enumerate(merged_groups):
        leaf_list = list(leaf_set)
        # 代表となるルートIDを再特定（もっとも共通の親に近いもの）
        # ここでは便宜上、各セグメントにユニークなIDとサイズを付与
        final_segments.append({
            "segment_id": i,
            "leaves": leaf_list,
            "size": sum([inspector.node_size_map.get(l, 1) for l in leaf_list])
        })
    
    return final_segments

In [17]:
def plot_final_segments(linkage_mapped, segments, old_new_id_map, inspector):
    # 1. カラーマップ作成 (各リーフID -> 統合セグメントの色)
    palette = pc.qualitative.Prism + pc.qualitative.Safe
    leaf_to_color = {}
    
    # サイズ順に並び替えて色を割り当て（大きい塊をハッキリさせる）
    sorted_segments = sorted(segments, key=lambda x: x["size"], reverse=True)
    
    for i, seg in enumerate(sorted_segments):
        color = palette[i % len(palette)]
        for leaf_id in seg["leaves"]:
            leaf_to_color[leaf_id] = color

    # 2. 座標とノード情報の取得
    linkage_for_coords = linkage_mapped[:, [0, 1, 3, 4]].astype(float)
    coords, segment_node_ids, leaf_order, segment_infos = get_dendrogram_segments2(linkage_for_coords)
    reverse_map = {v: k for k, v in old_new_id_map.items()}

    # 3. 枝（エッジ）の着色
    colors = []
    for node_id, info in zip(segment_node_ids, segment_infos):
        orig_id = reverse_map.get(node_id)
        
        # 自身または子が持つ色を取得
        c1_orig = reverse_map.get(info.get("child1"))
        c2_orig = reverse_map.get(info.get("child2"))
        
        # 色の判定（ボトムアップ）
        color = leaf_to_color.get(orig_id)
        if not color:
            # 子が共通の色を持っていればその色にする
            col1 = leaf_to_color.get(c1_orig)
            col2 = leaf_to_color.get(c2_orig)
            if col1 and col2 and col1 == col2:
                color = col1
            elif col1: color = col1
            elif col2: color = col2
            else: color = "#888" # どこにも属さない場合はグレー
            
        colors.append(color)

    # 4. 可視化
    fig = plot_dendrogram_plotly(coords, colors=colors, infos=segment_infos)
    fig.update_layout(title="Fully Merged Leaf-Climbing Segmentation", width=2000, height=600)
    return fig

# --- 実行コード ---
final_segments = get_final_merged_segments(inspector, ratio_threshold=0.80)
fig = plot_final_segments(linkage_mapped, final_segments, old_new_id_map, inspector)
fig.show()

In [18]:
def get_segment_statistics(final_segments):
    """
    セグメント数と各セグメントのサイズ分布を返す
    """
    # セグメント総数
    num_segments = len(final_segments)
    
    # 各セグメントのサイズ（含まれる全データ数）を抽出
    # size属性がデータ総数であることを前提としています
    segment_sizes = [seg["size"] for seg in final_segments]
    
    # 統計情報の計算
    stats = {
        "total_segments": num_segments,
        "mean_size": np.mean(segment_sizes),
        "median_size": np.median(segment_sizes),
        "max_size": np.max(segment_sizes),
        "min_size": np.min(segment_sizes),
        "total_data_points": sum(segment_sizes)
    }
    
    return stats, segment_sizes

# 実行
stats, sizes = get_segment_statistics(final_segments)

print(f"--- セグメンテーション結果 ---")
print(f"合計セグメント数: {stats['total_segments']}")
print(f"最大セ4グメントサイズ: {stats['max_size']}")
print(f"中央値サイズ: {stats['median_size']}")
print(f"総データ数: {stats['total_data_points']}")

--- セグメンテーション結果 ---
合計セグメント数: 367
最大セ4グメントサイズ: 9
中央値サイズ: 1.0
総データ数: 443


In [30]:
import pandas as pd
import plotly.express as px

# 1. 各セグメントのサイズをリスト化
all_sizes = [seg["size"] for seg in final_segments]

# 2. サイズごとの出現回数をカウント
size_counts = pd.Series(all_sizes).value_counts().reset_index()
size_counts.columns = ["Segment Size", "Frequency"]
size_counts = size_counts.sort_values("Segment Size")

# 3. プロット (棒グラフ)
fig_dist = px.bar(
    size_counts, 
    x="Segment Size", 
    y="Frequency",
    labels={"Segment Size": "セグメント内のデータ件数", "Frequency": "セグメント数"},
    title=f"セグメントサイズの分布 (全 {len(final_segments)} セグメント)",
    text_auto=True # 棒の上に数値を表示
)

fig_dist.update_layout(width=900, height=500)
fig_dist.show()

# 統計値の表示
print(f"セグメント総数: {len(final_segments)}")
print(f"ユニークなサイズの種類数: {len(size_counts)}")

# print
for idx, row in size_counts.iterrows():
    print(f"サイズ {row['Segment Size']} のセグメント数: {row['Frequency']}")

セグメント総数: 367
ユニークなサイズの種類数: 7
サイズ 1 のセグメント数: 319
サイズ 2 のセグメント数: 34
サイズ 3 のセグメント数: 8
サイズ 4 のセグメント数: 3
サイズ 5 のセグメント数: 1
サイズ 6 のセグメント数: 1
サイズ 9 のセグメント数: 1


In [26]:
def plot_dendrogram_highlight_major(linkage_mapped, segments, old_new_id_map, inspector):
    # 1. カラーマップ作成
    palette = pc.qualitative.Prism + pc.qualitative.Safe
    leaf_to_color = {}
    
    # サイズ順にソート
    sorted_segments = sorted(segments, key=lambda x: x["size"], reverse=True)
    
    color_idx = 0
    bg_subtle_color = "#EEEEEE" # ここでお好みの薄い色を指定

    for seg in sorted_segments:
        if seg["size"] <= 1:
            # サイズ1以下は背景に近い薄いグレー
            color = bg_subtle_color
        else:
            # 2以上の主要セグメントにだけ鮮やかな色を割り当て
            color = palette[color_idx % len(palette)]
            color_idx += 1
            
        for leaf_id in seg["leaves"]:
            leaf_to_color[leaf_id] = color

    # 2. 座標とノード情報の取得
    linkage_for_coords = linkage_mapped[:, [0, 1, 3, 4]].astype(float)
    coords, segment_node_ids, leaf_order, segment_infos = get_dendrogram_segments2(linkage_for_coords)
    reverse_map = {v: k for k, v in old_new_id_map.items()}

    # 3. 枝（エッジ）の着色ロジック
    colors = []
    for node_id, info in zip(segment_node_ids, segment_infos):
        orig_id = reverse_map.get(node_id)
        c1_orig = reverse_map.get(info.get("child1"))
        c2_orig = reverse_map.get(info.get("child2"))
        
        # ボトムアップで色を判定
        color = leaf_to_color.get(orig_id)
        if not color:
            col1 = leaf_to_color.get(c1_orig)
            col2 = leaf_to_color.get(c2_orig)
            
            # 両方の子が白（または未定義）なら、親の枝も白にする
            # どちらかが色を持っていれば、その色を継承する
            if col1 and col1 != "rgba(255,255,255,0)":
                color = col1
            elif col2 and col2 != "rgba(255,255,255,0)":
                color = col2
            else:
                color = "rgba(255,255,255,0)" # 背景へ
            
        colors.append(color)

    # 4. 可視化
    fig = plot_dendrogram_plotly(coords, colors=colors, infos=segment_infos)
    
    # 背景が白だと「白」が見えなくなるため、背景を少しグレーにするか、
    # 枝の線を細いグレーの枠線で囲むなどの調整が可能です
    fig.update_layout(
        title="Major Segments Highlights (Size > 1)",
        paper_bgcolor='white',
        plot_bgcolor='#F9F9F9', # わずかにグレーにして白の枝を「消す」
        width=1000, 
        height=600
    )
    return fig

In [27]:
plot_dendrogram_highlight_major(linkage_mapped, final_segments, old_new_id_map, inspector).show()

# 2d上の近傍と比較する

In [32]:
with open("../21_cluster_similarity/cluster_similarity_distances_lowdim.pkl", "rb") as f:
    cluster_similarities_low = pickle.load(f)


print(cluster_similarities_low.keys())


dict_keys(['medoid_distances', 'centroid_distances', 'single_linkage_distances'])


In [None]:
# clusterid, clusterid -> distance
cluster_similarities_low["centroid_distances"]




In [36]:
import pandas as pd

def evaluate_segment_consistency(final_segments, centroid_distances_dict):
    results = []

    for seg in final_segments:
        seg_id = seg["segment_id"]
        leaves = set(seg["leaves"])
        k = len(leaves)
        
        if k <= 1:
            continue  # サイズ1は自分自身しかいないため評価対象外
            
        # 1. 代表リーフを1つ選択
        rep_leaf = list(leaves)[0]
        
        # 2. 距離辞書から rep_leaf を起点とした距離をリスト化
        dist_from_rep = []
        
        # 自分自身を距離0として追加（これを忘れるとmissがnになる）
        dist_from_rep.append((rep_leaf, 0.0))
        
        for (id1, id2), dist in centroid_distances_dict.items():
            if id1 == rep_leaf:
                dist_from_rep.append((id2, dist))
            elif id2 == rep_leaf:
                dist_from_rep.append((id1, dist))
        
        # 3. 距離が近い順に上位 k 個を取得
        # 自分自身(dist=0)が必ず先頭に来ます
        sorted_neighbors = sorted(dist_from_rep, key=lambda x: x[1])
        spatial_neighbors = set([n[0] for n in sorted_neighbors[:k]])
        
        # 4. 一致数と取りこぼし数の計算
        matches = leaves.intersection(spatial_neighbors)
        match_count = len(matches)
        
        # 最大取りこぼし数は n - 1 (自分自身は必ず一致するため)
        miss_count = k - match_count
        recall = match_count / k
        
        results.append({
            "segment_id": seg_id,
            "segment_size": k,
            "matches": match_count,
            "misses": miss_count,
            "recall": recall
        })

    return pd.DataFrame(results)

# 実行
eval_df = evaluate_segment_consistency(final_segments, cluster_similarities_low["centroid_distances"])

# 結果のサマリー
print(f"平均適合率: {eval_df['recall'].mean():.4f}")

# all print
for idx, row in eval_df.iterrows():
    print(f"セグメントID: {row['segment_id']}, サイズ: {row['segment_size']}, 一致数: {row['matches']}, 取りこぼし数: {row['misses']}, 適合率: {row['recall']:.4f}")
        

平均適合率: 0.5846
セグメントID: 0.0, サイズ: 2.0, 一致数: 1.0, 取りこぼし数: 1.0, 適合率: 0.5000
セグメントID: 104.0, サイズ: 2.0, 一致数: 1.0, 取りこぼし数: 1.0, 適合率: 0.5000
セグメントID: 117.0, サイズ: 2.0, 一致数: 2.0, 取りこぼし数: 0.0, 適合率: 1.0000
セグメントID: 128.0, サイズ: 2.0, 一致数: 1.0, 取りこぼし数: 1.0, 適合率: 0.5000
セグメントID: 159.0, サイズ: 2.0, 一致数: 1.0, 取りこぼし数: 1.0, 適合率: 0.5000
セグメントID: 160.0, サイズ: 2.0, 一致数: 1.0, 取りこぼし数: 1.0, 適合率: 0.5000
セグメントID: 182.0, サイズ: 2.0, 一致数: 2.0, 取りこぼし数: 0.0, 適合率: 1.0000
セグメントID: 185.0, サイズ: 2.0, 一致数: 1.0, 取りこぼし数: 1.0, 適合率: 0.5000
セグメントID: 190.0, サイズ: 3.0, 一致数: 1.0, 取りこぼし数: 2.0, 適合率: 0.3333
セグメントID: 210.0, サイズ: 3.0, 一致数: 1.0, 取りこぼし数: 2.0, 適合率: 0.3333
セグメントID: 224.0, サイズ: 2.0, 一致数: 1.0, 取りこぼし数: 1.0, 適合率: 0.5000
セグメントID: 227.0, サイズ: 2.0, 一致数: 1.0, 取りこぼし数: 1.0, 適合率: 0.5000
セグメントID: 230.0, サイズ: 2.0, 一致数: 1.0, 取りこぼし数: 1.0, 適合率: 0.5000
セグメントID: 233.0, サイズ: 2.0, 一致数: 1.0, 取りこぼし数: 1.0, 適合率: 0.5000
セグメントID: 235.0, サイズ: 5.0, 一致数: 1.0, 取りこぼし数: 4.0, 適合率: 0.2000
セグメントID: 236.0, サイズ: 2.0, 一致数: 2.0, 取りこぼし数: 0.0, 適合率: 1.0000
セグメントID: 240

In [37]:
import pandas as pd

# 評価の実行
eval_df = evaluate_segment_consistency(final_segments, cluster_similarities_low["centroid_distances"])

# --- データ整形ロジック ---

# A. サイズ2のセグメント：統計要約（平均）
size2_df = eval_df[eval_df["segment_size"] == 2]
if not size2_df.empty:
    size2_summary = {
        "Segment Category": "Size 2 (Summary)",
        "Count": len(size2_df),
        "Avg. Matches": f"{size2_df['matches'].mean():.2f}",
        "Avg. Misses": f"{size2_df['misses'].mean():.2f}",
        "Avg. Recall (%)": f"{size2_df['recall'].mean() * 100:.1f}%"
    }
else:
    size2_summary = None

# B. サイズ3以上のセグメント：全件リスト
major_df = eval_df[eval_df["segment_size"] >= 3].copy()
major_df = major_df.sort_values("segment_size", ascending=False)
major_df["recall (%)"] = (major_df["recall"] * 100).map("{:.1f}%".format)

# --- 出力 ---

print("【論文掲載用：セグメント整合性評価表】")
print("-" * 60)

# サイズ2の要約表示
if size2_summary:
    print(f"■ 小規模セグメント要約")
    summary_print = pd.DataFrame([size2_summary])
    print(summary_print.to_string(index=False))
    print("\n" + "-" * 60)

# サイズ3以上の詳細表示
print(f"■ 主要セグメント詳細 (Size >= 3)")
# 論文用にカラム名をきれいに整形
output_major = major_df.rename(columns={
    "segment_id": "ID",
    "segment_size": "Size (n)",
    "matches": "Matches",
    "misses": "Misses",
    "recall (%)": "Consistency (%)"
})
print(output_major[["ID", "Size (n)", "Matches", "Misses", "Consistency (%)"]].to_string(index=False))
print("-" * 60)

【論文掲載用：セグメント整合性評価表】
------------------------------------------------------------
■ 小規模セグメント要約
Segment Category  Count Avg. Matches Avg. Misses Avg. Recall (%)
Size 2 (Summary)     34         1.32        0.68           66.2%

------------------------------------------------------------
■ 主要セグメント詳細 (Size >= 3)
 ID  Size (n)  Matches  Misses Consistency (%)
306         9        4       5           44.4%
365         6        2       4           33.3%
235         5        1       4           20.0%
244         4        2       2           50.0%
296         4        1       3           25.0%
316         4        2       2           50.0%
190         3        1       2           33.3%
210         3        1       2           33.3%
248         3        1       2           33.3%
278         3        2       1           66.7%
304         3        2       1           66.7%
309         3        1       2           33.3%
326         3        1       2           33.3%
329         3        1       2  