In [1]:
file_path = "../18_rapids/result/20251203_053328/condensed_tree_object.pkl"
import pickle
with open(file_path, "rb") as f:
    condensed_tree = pickle.load(f)

In [None]:
import numpy as np
import pandas as pd
import typing
import itertools
import plotly.graph_objects as go

# --- Helpers lifted from backend (d3_data_manager.py) and extended with stability ---

def _get_leaves(raw_tree_df: pd.DataFrame):
    cluster_tree = raw_tree_df[raw_tree_df["child_size"] > 1]
    if cluster_tree.shape[0] == 0:
        return [raw_tree_df["parent"].min()]
    root = cluster_tree["parent"].min()

    def _dfs(current_node: int):
        children = cluster_tree[cluster_tree["parent"] == current_node]["child"]
        if len(children) == 0:
            return [current_node]
        out = []
        for child in children:
            out.extend(_dfs(int(child)))
        return out

    return _dfs(int(root))


def hdbscan_condensed_to_linkage(condensed_tree):
    """
    Convert HDBSCAN condensed tree to linkage matrix and ID map.
    Returns (linkage_mapped, old_new_id_map).
    linkage_mapped has columns: child1, child2, parent, distance, size.
    distance is inverted lambda (larger lambda -> smaller distance).
    """
    raw_tree = pd.DataFrame(condensed_tree._raw_tree)
    condensed_df = condensed_tree.to_pandas()

    cluster_tree = condensed_df[condensed_df["child_size"] > 1]
    sorted_tree = cluster_tree.sort_values(by=["lambda_val", "parent"], ascending=True)

    linkage_rows = []
    for i in range(0, len(sorted_tree), 2):
        if i + 1 >= len(sorted_tree):
            break
        row_a = sorted_tree.iloc[i]
        row_b = sorted_tree.iloc[i + 1]
        if row_a["lambda_val"] != row_b["lambda_val"]:
            continue
        parent_id = int(row_a["parent"])
        lam = float(row_a["lambda_val"])
        total_size_rows = raw_tree[raw_tree["child"] == parent_id]["child_size"]
        total_size = int(total_size_rows.iloc[0]) if len(total_size_rows) > 0 else int(row_a["child_size"] + row_b["child_size"])
        linkage_rows.append([
            int(row_a["child"]),
            int(row_b["child"]),
            parent_id,
            lam,
            total_size,
        ])

    # Map IDs to contiguous range
    old_new_id_map = {}
    current_id = 0
    leaves = _get_leaves(raw_tree)
    for leaf in leaves:
        old_new_id_map[int(leaf)] = current_id
        current_id += 1
    for row in reversed(linkage_rows):
        parent_id = row[2]
        if int(parent_id) not in old_new_id_map:
            old_new_id_map[int(parent_id)] = current_id
            current_id += 1

    max_lambda = max((r[3] for r in linkage_rows), default=1.0)
    linkage_mapped = [
        [
            old_new_id_map[int(r[0])],
            old_new_id_map[int(r[1])],
            old_new_id_map[int(r[2])],
            max_lambda - r[3],  # invert lambda to distance
            r[4],
        ]
        for r in reversed(linkage_rows)
    ]

    return np.array(linkage_mapped, dtype=float), old_new_id_map


def compute_stability(condensed_tree_like):
    """
    Compute HDBSCAN stability scores (Python port of cython logic).
    Accepts HDBSCAN condensed tree object or a pandas DataFrame with columns
    ['parent','child','lambda_val','child_size'].
    Returns dict {cluster_id: stability} using original cluster IDs.
    """
    if hasattr(condensed_tree_like, "to_pandas"):
        df = condensed_tree_like.to_pandas()
    elif isinstance(condensed_tree_like, pd.DataFrame):
        df = condensed_tree_like
    else:
        raise ValueError("Unsupported condensed tree format")

    smallest_cluster = int(df["parent"].min())
    largest_cluster = int(df["parent"].max())
    num_clusters = largest_cluster - smallest_cluster + 1
    largest_child = int(max(df["child"].max(), smallest_cluster))

    sorted_child = df.sort_values(["child", "lambda_val"])[["child", "lambda_val"]].to_numpy()
    births_arr = np.nan * np.ones(largest_child + 1, dtype=np.double)
    current_child = -1
    min_lambda = 0.0
    for child_val, lambda_val in sorted_child:
        child = int(child_val)
        lam = float(lambda_val)
        if child == current_child:
            min_lambda = min(min_lambda, lam)
        elif current_child != -1:
            births_arr[current_child] = min_lambda
            current_child = child
            min_lambda = lam
        else:
            current_child = child
            min_lambda = lam
    if current_child != -1:
        births_arr[current_child] = min_lambda
    births_arr[smallest_cluster] = 0.0

    result_arr = np.zeros(num_clusters, dtype=np.double)
    parents = df["parent"].to_numpy()
    sizes = df["child_size"].to_numpy()
    lambdas = df["lambda_val"].to_numpy()
    for parent_val, size_val, lambda_val in zip(parents, sizes, lambdas):
        parent = int(parent_val)
        child_size = float(size_val)
        lam = float(lambda_val)
        result_index = parent - smallest_cluster
        lambda_birth = births_arr[parent]
        result_arr[result_index] += (lam - lambda_birth) * child_size

    node_ids = np.arange(smallest_cluster, df["parent"].max() + 1)
    result_pre_dict = np.vstack((node_ids, result_arr)).T
    return dict(zip(result_pre_dict[:, 0].astype(int), result_pre_dict[:, 1]))


def compute_dendrogram_coords(Z: np.ndarray, n_points: int):
    """
    Compute dendrogram coordinates with size-aware leaf ordering.
    Returns (icoord, dcoord, leaf_order).
    """
    n_nodes = 2 * n_points - 1
    nodes = [{"x": None, "y": 0.0, "size": 1, "left": None, "right": None} for _ in range(n_points)]

    for i in range(n_points - 1):
        c1, c2, dist, count = Z[i]
        nodes.append({
            "x": None,
            "y": float(dist),
            "size": int(count),
            "left": int(c1),
            "right": int(c2),
        })

    def get_leaf_order_sorted(node_idx):
        node = nodes[node_idx]
        if node_idx < n_points:
            return [node_idx]
        left_idx, right_idx = node["left"], node["right"]
        size_left, size_right = nodes[left_idx]["size"], nodes[right_idx]["size"]
        if size_left < size_right:
            left_idx, right_idx = right_idx, left_idx
        order_left = get_leaf_order_sorted(left_idx)
        order_right = get_leaf_order_sorted(right_idx)
        return order_left + order_right

    def calculate_x_coord(node_idx, leaf_to_x):
        node = nodes[node_idx]
        if node_idx < n_points:
            node["x"] = leaf_to_x[node_idx]
            return node["x"]
        x_left = calculate_x_coord(node["left"], leaf_to_x)
        x_right = calculate_x_coord(node["right"], leaf_to_x)
        node["x"] = (x_left + x_right) / 2.0
        return node["x"]

    root_node_idx = n_points - 1 + (n_points - 1)
    leaf_order = get_leaf_order_sorted(root_node_idx)
    leaf_to_x = {leaf_idx: 2 * i + 1 for i, leaf_idx in enumerate(leaf_order)}
    calculate_x_coord(root_node_idx, leaf_to_x)

    icoord = []
    dcoord = []
    for i in range(n_points - 1):
        parent_idx = n_points + i
        left_idx = nodes[parent_idx]["left"]
        right_idx = nodes[parent_idx]["right"]
        y_parent = nodes[parent_idx]["y"]
        y_left = nodes[left_idx]["y"]
        y_right = nodes[right_idx]["y"]
        x_left = nodes[left_idx]["x"]
        x_right = nodes[right_idx]["x"]
        icoord.append([x_left, x_left, x_right, x_right])
        dcoord.append([y_left, y_parent, y_parent, y_right])

    return icoord, dcoord, leaf_order


def get_dendrogram_segments2(Z: np.ndarray):
    """Build dendrogram segments, node ids, and hover info from a linkage matrix."""
    n_points = Z.shape[0] + 1
    icoord, dcoord, leaf_order = compute_dendrogram_coords(Z, n_points)

    node_meta = {idx: {"distance": 0.0, "size": 1, "child1": None, "child2": None} for idx in range(n_points)}
    for i in range(n_points - 1):
        node_id = n_points + i
        c1, c2, dist, count = Z[i]
        node_meta[node_id] = {
            "distance": float(dist),
            "size": int(count),
            "child1": int(c1),
            "child2": int(c2),
        }

    segments = []
    segment_node_ids = []
    segment_infos = []
    for merge_idx, (icoords, dcoords) in enumerate(zip(icoord, dcoord)):
        node_id = n_points + merge_idx
        x1, x2, x3, x4 = icoords
        y1, y2, y3, y4 = dcoords
        info = node_meta.get(node_id, {"distance": 0.0, "size": 1, "child1": None, "child2": None})
        segments.append([(x1, y1), (x2, y2)])
        segment_node_ids.append(node_id)
        segment_infos.append(info)
        segments.append([(x2, y2), (x3, y3)])
        segment_node_ids.append(node_id)
        segment_infos.append(info)
        segments.append([(x4, y4), (x3, y3)])
        segment_node_ids.append(node_id)
        segment_infos.append(info)
    return segments, segment_node_ids, leaf_order, segment_infos


def plot_dendrogram_plotly(
    segments: typing.List,
    colors: typing.Optional[typing.List] = None,
    infos: typing.Optional[typing.List[typing.Dict]] = None,
    **kwargs,
):
    """Plot dendrogram segments with Plotly and hover text."""
    fig = go.Figure()
    for i, seg in enumerate(segments):
        x_coords = [seg[0][0], seg[1][0], None]
        y_coords = [seg[0][1], seg[1][1], None]
        color = "#888" if colors is None else colors[i]

        hover_lines = []
        if infos is not None and i < len(infos):
            info = infos[i]
            node_id = info.get("node_id")
            orig_id = info.get("original_id")
            size_val = info.get("size")
            dist_val = info.get("distance")
            child1 = info.get("child1")
            child2 = info.get("child2")
            child1_orig = info.get("child1_original")
            child2_orig = info.get("child2_original")
            node_type = info.get("node_type")
            stability_val = info.get("stability")
            if node_type:
                hover_lines.append(f"Type: {node_type}")
            if node_id is not None:
                hover_lines.append(f"Parent (mapped): {node_id}")
            if orig_id is not None:
                hover_lines.append(f"Parent (original): {orig_id}")
            if child1 is not None:
                hover_lines.append(f"Child1 (mapped): {child1}")
            if child1_orig is not None:
                hover_lines.append(f"Child1 (original): {child1_orig}")
            if child2 is not None:
                hover_lines.append(f"Child2 (mapped): {child2}")
            if child2_orig is not None:
                hover_lines.append(f"Child2 (original): {child2_orig}")
            if size_val is not None:
                hover_lines.append(f"Size: {size_val}")
            if dist_val is not None:
                hover_lines.append(f"Distance: {dist_val:.4f}")
            if stability_val is not None:
                hover_lines.append(f"Stability: {stability_val:.4f}")
        hover_text = "<br>".join(hover_lines) if hover_lines else None

        fig.add_trace(
            go.Scatter(
                x=x_coords,
                y=y_coords,
                mode="lines",
                line=dict(color=color, width=1.5),
                showlegend=False,
                hoverinfo="text" if hover_text else "skip",
                text=[hover_text] * len(x_coords) if hover_text else None,
            )
        )
    fig.update_layout(
        title="Simple Dendrogram Visualization",
        xaxis_title="Observation Index",
        yaxis_title="Distance / Height",
        hovermode="closest",
    )
    fig.update_layout(height=800, width=1000)
    return fig


def plot_dendrogram(
    linkage_mapped: np.ndarray,
    highlight_cluster_ids=None,
    old_new_id_map=None,
    stability_scores: typing.Optional[typing.Dict[int, float]] = None,
    sort_by_size: bool = True,
):
    """
    Plot dendrogram using custom coordinates so we can sort leaves by size.
    highlight_cluster_ids: original cluster IDs to highlight in orange.
    stability_scores: dict of original cluster ID -> stability to show on hover.
    sort_by_size: kept for API parity; sorting is handled by compute_dendrogram_coords.
    """
    if highlight_cluster_ids is None:
        highlight_cluster_ids = []

    linkage_for_coords = linkage_mapped[:, [0, 1, 3, 4]].astype(float)
    n_points = linkage_for_coords.shape[0] + 1
    segments, segment_node_ids, leaf_order, segment_infos = get_dendrogram_segments2(linkage_for_coords)

    colors = None
    if old_new_id_map and highlight_cluster_ids:
        highlight_new_ids = {old_new_id_map[cid] for cid in highlight_cluster_ids if cid in old_new_id_map}
        print(f"Highlighting clusters (original IDs): {highlight_cluster_ids}")
        colors = ["orange" if node_id in highlight_new_ids else "#888" for node_id in segment_node_ids]
        
        print(f"Highlighting {len(highlight_new_ids)} clusters in dendrogram.")

    reverse_map = {v: k for k, v in old_new_id_map.items()} if old_new_id_map else {}
    infos = []
    for node_id, info in zip(segment_node_ids, segment_infos):
        enriched = dict(info)
        enriched["node_id"] = node_id
        enriched["node_type"] = "leaf" if node_id < n_points else "internal"
        if node_id in reverse_map:
            enriched["original_id"] = reverse_map[node_id]
        c1 = info.get("child1")
        c2 = info.get("child2")
        if c1 is not None and c1 in reverse_map:
            enriched["child1_original"] = reverse_map[c1]
        if c2 is not None and c2 in reverse_map:
            enriched["child2_original"] = reverse_map[c2]
        orig_id = enriched.get("original_id")
        if stability_scores is not None and orig_id is not None:
            enriched["stability"] = stability_scores.get(orig_id)
        infos.append(enriched)

    fig = plot_dendrogram_plotly(segments, colors=colors, infos=infos)
    fig.update_layout(height=500, width=900, title="Dendrogram (HDBSCAN condensed → linkage)", showlegend=False)
    return fig


In [65]:
# Override plot_dendrogram with leaf-aware highlighting so leaf cluster IDs also color their merge segments.
def plot_dendrogram(
    linkage_mapped: np.ndarray,
    highlight_cluster_ids=None,
    old_new_id_map=None,
    stability_scores: typing.Optional[typing.Dict[int, float]] = None,
    sort_by_size: bool = True,
    highlight_color: str = "orange",
):
    """
    Plot dendrogram using custom coordinates so we can sort leaves by size.

    highlight_cluster_ids: original cluster IDs to highlight in orange. Both internal clusters and
    cluster-tree leaves are supported. If a highlighted ID corresponds to a leaf (mapped id < n_points),
    its immediate merge segment will also be highlighted.
    stability_scores: dict of original cluster ID -> stability to show on hover.
    sort_by_size: kept for API parity; sorting is handled by compute_dendrogram_coords.
    highlight_color: color for highlighted branches.
    """
    if highlight_cluster_ids is None:
        highlight_cluster_ids = []

    linkage_for_coords = linkage_mapped[:, [0, 1, 3, 4]].astype(float)
    n_points = linkage_for_coords.shape[0] + 1
    segments, segment_node_ids, leaf_order, segment_infos = get_dendrogram_segments2(linkage_for_coords)

    colors = None
    if old_new_id_map and highlight_cluster_ids:
        highlight_new_ids = {old_new_id_map[cid] for cid in highlight_cluster_ids if cid in old_new_id_map}
        leaf_highlights = {hid for hid in highlight_new_ids if hid < n_points}

        # If a highlighted id is a leaf, also highlight its immediate merge segment so it becomes visible.
        highlight_nodes = set(highlight_new_ids)
        if leaf_highlights:
            for node_id, info in zip(segment_node_ids, segment_infos):
                if info.get("child1") in leaf_highlights or info.get("child2") in leaf_highlights:
                    highlight_nodes.add(node_id)

        print(f"Highlighting clusters (original IDs): {highlight_cluster_ids}")
        print(f"Highlight mapped ids: {sorted(list(highlight_new_ids))}")
        colors = [highlight_color if node_id in highlight_nodes else "#888" for node_id in segment_node_ids]

    reverse_map = {v: k for k, v in old_new_id_map.items()} if old_new_id_map else {}
    infos = []
    for node_id, info in zip(segment_node_ids, segment_infos):
        enriched = dict(info)
        enriched["node_id"] = node_id
        enriched["node_type"] = "leaf" if node_id < n_points else "internal"
        if node_id in reverse_map:
            enriched["original_id"] = reverse_map[node_id]
        c1 = info.get("child1")
        c2 = info.get("child2")
        if c1 is not None and c1 in reverse_map:
            enriched["child1_original"] = reverse_map[c1]
        if c2 is not None and c2 in reverse_map:
            enriched["child2_original"] = reverse_map[c2]
        orig_id = enriched.get("original_id")
        if stability_scores is not None and orig_id is not None:
            enriched["stability"] = stability_scores.get(orig_id)
        infos.append(enriched)

    fig = plot_dendrogram_plotly(segments, colors=colors, infos=infos)
    fig.update_layout(height=500, width=900, title="Dendrogram (HDBSCAN condensed → linkage)", showlegend=False)
    return fig

# Plot all

In [66]:
linkage_mapped, old_new_id_map = hdbscan_condensed_to_linkage(condensed_tree)
print("linkage shape:", linkage_mapped.shape)
print("sample rows:", linkage_mapped[:3])

# Optional: compute stability (original cluster IDs)
stability_scores = compute_stability(condensed_tree)

# TODO: set the cluster IDs you want to highlight (original IDs)
highlight_cluster_ids = []  # e.g., [115760, 115761]
fig = plot_dendrogram(
    linkage_mapped,
    highlight_cluster_ids,
    old_new_id_map,
    stability_scores=stability_scores,
)
fig.show()

linkage shape: (442, 5)
sample rows: [[1.74000000e+02 1.75000000e+02 4.43000000e+02 0.00000000e+00
  9.00000000e+01]
 [1.76000000e+02 1.77000000e+02 4.44000000e+02 6.14290237e-02
  4.90000000e+01]
 [4.43000000e+02 4.44000000e+02 4.45000000e+02 1.24295950e-01
  1.67000000e+02]]


# plot selected

In [None]:
selected_cluster_id = 115760  # Example cluster ID to inspect

# nearby clusters
neighbor_ids = []


In [113]:
import pandas as pd

class ClusterInspector:
    def __init__(self, condensed_tree, stability_map):
        self.tree_df = condensed_tree.to_pandas()
        # クラスタとしての分岐のみを抽出
        self.cluster_tree = self.tree_df[self.tree_df["child_size"] > 1].copy()
        
        # 親への逆引きマップ (Cluster ID -> Parent Cluster ID)
        self.child_to_parent = dict(zip(self.cluster_tree["child"].astype(int), 
                                        self.cluster_tree["parent"].astype(int)))
        
        # ノードサイズマップ (親ノードID -> そのノードの合計サイズ)
        self.node_size_map = self.cluster_tree.groupby("parent")["child_size"].max().to_dict()
        
        # Stabilityマップの取得
        self.stability_map = stability_map

    def _get_cluster_leaves_under(self, node_id: int):
        """配下にある『末端のクラスタ(Cluster Leaf)』を再帰的に取得"""
        children = self.cluster_tree[self.cluster_tree["parent"] == node_id]["child"]
        if len(children) == 0:
            return [node_id]
        
        out = []
        for child in children:
            out.extend(self._get_cluster_leaves_under(int(child)))
        return out

    def get_parent_cluster_by_stability(self, cluster_leaf_id: int, min_stability: float):
        """1. Stabilityが一定値以上の親を探し、(リーフ群, 遡上回数)を返す"""
        current = int(cluster_leaf_id)
        steps = 0
        
        while True:
            parent = self.child_to_parent.get(current)
            # print(f"Current: {current}, Parent: {parent}, stability: {self.stability_map.get(parent, 0.0)}")
            if parent is None: 
                break # ルート到達
            
            steps += 1
            # 親の安定性をチェック
            if self.stability_map.get(parent, 0.0) >= min_stability:
                current = parent
                print(f"Found stable parent: {parent} with stability {self.stability_map.get(parent, 0.0)}")
                break
            current = parent
            print(f"Moving up to parent: {parent} with stability {self.stability_map.get(parent, 0.0)}")
            
        return self._get_cluster_leaves_under(current), steps

    def get_parent_cluster_by_size_ratio(self, cluster_leaf_id: int, ratio_threshold: float):
        """2. サイズ比率がいびつな親を探し、(リーフ群, 遡上回数)を返す"""
        current = int(cluster_leaf_id)
        steps = 0
        
        while True:
            parent = self.child_to_parent.get(current)
            if parent is None: 
                break # ルート到達
                
            steps += 1
            current_size = self.node_size_map.get(current, 0)
            parent_size = self.node_size_map.get(parent, 0)
            
            # 比率が閾値を超えた ＝ 「小さな断片ではなく、意味のある合流」と判断
            if parent_size > 0 and (current_size / parent_size) >= ratio_threshold:
                current = parent
                print(f"Found size-ratio parent: {parent} with ratio {current_size / parent_size:.4f}, sizes: {current_size}/{parent_size}")
                break
            current = parent
            print(f"Moving up to parent: {parent} with ratio {current_size / parent_size:.4f}, sizes: {current_size}/{parent_size}")
            
        return self._get_cluster_leaves_under(current), steps

In [115]:
# クラスの初期化
inspector = ClusterInspector(condensed_tree, stability_scores)



In [119]:
target_leaf_cluster = 116543

In [79]:


# そのクラスタが含まれる「安定した親」を探し、その親からぶら下がる他の末端クラスタもまとめて取得
related_cluster_leaves, steps = inspector.get_parent_cluster_by_stability(target_leaf_cluster, min_stability=30.0)
print(f"length of related clusters by stability: {len(related_cluster_leaves)}")
print(f"step: {steps}")
print(f"起点クラスタ {target_leaf_cluster} と共に、一つの安定した塊を構成する末端クラスタ群: {related_cluster_leaves}")

Moving up to parent: 116471 with stability 5.334946990013123
Moving up to parent: 116462 with stability 0.18368196487426758
Moving up to parent: 116421 with stability 2.4670742750167847
Moving up to parent: 116416 with stability 6.26696503162384
Moving up to parent: 116415 with stability 6.055968761444092
Moving up to parent: 116414 with stability 1.1982548236846924
Moving up to parent: 116413 with stability 4.8161386251449585
Moving up to parent: 116410 with stability 11.740297198295593
Moving up to parent: 116407 with stability 5.220691561698914
Moving up to parent: 116406 with stability 1.9214601516723633
Moving up to parent: 116403 with stability 3.9511200189590454
Moving up to parent: 116402 with stability 0.9275788068771362
Moving up to parent: 116400 with stability 11.435438632965088
Moving up to parent: 116398 with stability 3.9020520448684692
Moving up to parent: 116397 with stability 4.2570754289627075
Found stable parent: 116392 with stability 44.74266004562378
length of rel

In [123]:
related_cluster_leaves, steps = inspector.get_parent_cluster_by_size_ratio(target_leaf_cluster, ratio_threshold=0.80)
print(f"length of related clusters by size ratio: {len(related_cluster_leaves)}")
print(f"step: {steps}") 
print(f"起点クラスタ {target_leaf_cluster} と共に、一つのまとまりを構成する末端クラスタ群: {related_cluster_leaves}")


Moving up to parent: 116488 with ratio 0.0000, sizes: 0/39
Moving up to parent: 116485 with ratio 0.6500, sizes: 39/60
Moving up to parent: 116451 with ratio 0.5769, sizes: 60/104
Moving up to parent: 116449 with ratio 0.7536, sizes: 104/138
Found size-ratio parent: 116408 with ratio 0.8846, sizes: 138/156
length of related clusters by size ratio: 10
step: 5
起点クラスタ 116543 と共に、一つのまとまりを構成する末端クラスタ群: [116448, 116484, 116492, 116525, 116526, 116514, 116518, 116543, 116468, 116472]


In [124]:
highlight_cluster_ids = related_cluster_leaves
fig = plot_dendrogram(
    linkage_mapped,
    highlight_cluster_ids,
    old_new_id_map,
    stability_scores=stability_scores,
).show()

Highlighting clusters (original IDs): [116448, 116484, 116492, 116525, 116526, 116514, 116518, 116543, 116468, 116472]
Highlight mapped ids: [225, 226, 227, 228, 229, 230, 231, 232, 233, 234]
