In [1]:
import numpy as np
from scipy.linalg import inv, det
import warnings
from sklearn.covariance import LedoitWolf
from typing import Dict
# --- 1. 分布のパラメータ推定 ---

def estimate_single_gaussian_params(X_data: np.ndarray) -> Dict[str, np.ndarray]:
    """
    Estimate the mean vector and covariance matrix of a multivariate normal
    distribution from a single data set.

    The covariance estimator depends on the sample count N and dimension D:

    * N == 1     : zero matrix (a warning is emitted).
    * N < D + 1  : Ledoit-Wolf shrinkage estimate (a warning is emitted),
                   because the plain sample covariance would be singular.
    * otherwise  : sample covariance from ``np.cov`` (default N - 1 normalisation).

    If the resulting matrix has a condition number above 1e15, ``1e-6 * I`` is
    added to it and a warning is emitted.

    Args:
        X_data (np.ndarray): Samples of one cluster (or data set), shape (N, D).

    Returns:
        Dict: {'mu': mean vector of shape (D,), 'Sigma': covariance matrix of shape (D, D)}

    Raises:
        ValueError: If X_data is not two-dimensional or contains no samples.
    """
    X_data = np.asarray(X_data)
    if X_data.ndim != 2:
        raise ValueError("Input data array must be 2-dimensional with shape (N, D).")

    N, D = X_data.shape  # number of samples N, number of dimensions D

    if N == 0:
        raise ValueError("Input data array must not be empty.")

    # 1. Mean vector (mu)
    mu = np.mean(X_data, axis=0)

    # 2. Covariance matrix (Sigma)
    if N == 1:
        # A single sample carries no spread information: covariance is zero
        warnings.warn("N=1. Covariance matrix is set to zero (plus regularization).")
        Sigma = np.zeros((D, D))

    elif N < D + 1:
        # Fewer samples than D + 1: the sample covariance is rank deficient,
        # so fall back to the Ledoit-Wolf shrinkage estimator
        warnings.warn(f"N={N} < D+1={D+1}. Using Ledoit-Wolf shrinkage for robust covariance estimation.")

        lw = LedoitWolf()
        lw.fit(X_data)
        Sigma = lw.covariance_

    else:
        # Enough samples: plain sample covariance.
        # np.cov collapses the result to a 0-d array when D == 1, which would
        # break np.linalg.cond below, so force a (D, D) matrix.
        Sigma = np.atleast_2d(np.cov(X_data, rowvar=False))

    # 3. Final regularisation check
    # Add a small ridge when the estimate is still extremely ill-conditioned.
    if np.linalg.cond(Sigma) > 1e15:
        warnings.warn("Covariance matrix highly ill-conditioned. Applying final small regularization.")
        # Build a new array instead of "+=" so the estimator's own
        # covariance_ attribute is never modified in place.
        Sigma = Sigma + np.eye(D) * 1e-6

    return {'mu': mu, 'Sigma': Sigma}

# --- 2. 類似度測定 (非類似度) ---

def kl_divergence_gaussian(mu1, Sigma1, mu2, Sigma2) -> float:
    """
    Compute the KL divergence D_KL(N1 || N2) between two multivariate
    Gaussians N1 = N(mu1, Sigma1) and N2 = N(mu2, Sigma2).

    The log-determinant ratio is evaluated with ``np.linalg.slogdet`` so that
    it stays finite in high dimensions, where the raw determinants under- or
    overflow.

    Args:
        mu1, mu2: Mean vectors, shape (D,).
        Sigma1, Sigma2: Covariance matrices, shape (D, D).

    Returns:
        float: The divergence. ``np.nan`` (with a warning) when Sigma2 cannot be
        inverted or when a determinant has a non-positive sign that makes the
        log-ratio undefined. ``inf`` when Sigma1 is exactly singular.
    """
    D = mu1.shape[0]

    try:
        Sigma2_inv = inv(Sigma2)
    except np.linalg.LinAlgError:
        warnings.warn("Sigma2 is singular. KL divergence is undefined.")
        return np.nan

    # 1. Log-determinant term: log(det(Sigma2) / det(Sigma1)), computed in log space
    sign1, logdet1 = np.linalg.slogdet(Sigma1)
    sign2, logdet2 = np.linalg.slogdet(Sigma2)
    if sign1 < 0 or sign2 <= 0:
        warnings.warn("Covariance determinant is not positive. KL divergence is undefined.")
        return np.nan
    # sign1 == 0 gives logdet1 == -inf, so the term (and the result) becomes +inf
    log_det_term = logdet2 - logdet1

    # 2. Trace term
    trace_term = np.trace(Sigma2_inv @ Sigma1)

    # 3. Mahalanobis term (difference of the means)
    diff_mu = mu2 - mu1
    mahalanobis_term = diff_mu.T @ Sigma2_inv @ diff_mu

    kl_div = 0.5 * (log_det_term + trace_term + mahalanobis_term - D)

    return kl_div

# --- 3. 類似度測定 (重なり) ---

def bhattacharyya_coefficient_gaussian(mu1, Sigma1, mu2, Sigma2) -> float:
    """
    Compute the Bhattacharyya coefficient BC between two multivariate
    Gaussians N(mu1, Sigma1) and N(mu2, Sigma2).

    BC = exp(-DB), where the Bhattacharyya distance DB is the sum of a mean
    term and a covariance term built on Sigma = 0.5 * (Sigma1 + Sigma2).
    The covariance term is evaluated with ``np.linalg.slogdet`` so it stays
    finite in high dimensions, where the raw determinants under- or overflow.

    Args:
        mu1, mu2: Mean vectors, shape (D,).
        Sigma1, Sigma2: Covariance matrices, shape (D, D).

    Returns:
        float: The coefficient. ``np.nan`` (with a warning) when the averaged
        covariance cannot be inverted or a determinant sign makes the
        log-ratio undefined.
    """
    # Averaged covariance Sigma = 0.5 * (Sigma1 + Sigma2)
    Sigma = 0.5 * (Sigma1 + Sigma2)

    try:
        Sigma_inv = inv(Sigma)
    except np.linalg.LinAlgError:
        warnings.warn("Sigma (mean covariance) is singular. Bhattacharyya Coefficient is undefined.")
        return np.nan

    # 1. Mean term of the Bhattacharyya distance (a scaled Mahalanobis form)
    diff_mu = mu1 - mu2
    db_mu_term = 0.125 * diff_mu.T @ Sigma_inv @ diff_mu

    # 2. Covariance term: 0.5 * log(det(Sigma) / sqrt(det(Sigma1) * det(Sigma2))),
    #    computed in log space
    sign_avg, logdet_avg = np.linalg.slogdet(Sigma)
    sign1, logdet1 = np.linalg.slogdet(Sigma1)
    sign2, logdet2 = np.linalg.slogdet(Sigma2)
    if sign_avg <= 0 or sign1 < 0 or sign2 < 0:
        warnings.warn("Covariance determinant is not positive. Bhattacharyya Coefficient is undefined.")
        return np.nan
    # A singular Sigma1 or Sigma2 gives logdet == -inf, hence an infinite distance and BC == 0
    db_cov_term = 0.5 * (logdet_avg - 0.5 * (logdet1 + logdet2))

    # Bhattacharyya distance
    db_distance = db_mu_term + db_cov_term

    # Bhattacharyya coefficient = exp(-distance)
    bc = np.exp(-db_distance)

    return bc

# --- 4. マハラノビス距離 (参考) ---

def mahalanobis_distance(mu1, mu2, Sigma_pooled) -> float:
    """
    Mahalanobis distance between two mean vectors under the pooled
    covariance matrix ``Sigma_pooled``.

    Returns ``np.nan`` (after emitting a warning) when ``Sigma_pooled``
    cannot be inverted.
    """
    try:
        precision = inv(Sigma_pooled)
    except np.linalg.LinAlgError:
        warnings.warn("Sigma_pooled is singular. Mahalanobis distance is undefined.")
        return np.nan

    delta = mu1 - mu2
    # Quadratic form gives the squared distance; take its root
    return np.sqrt(delta.T @ precision @ delta)





# medoid, centroid

In [2]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# medoid or centroid
def calculate_cluster_similarity(vecs_a, vecs_b, mode='medoid'):
    """
    Compare two sets of vectors through one representative point per set.

    Parameters:
    vecs_a, vecs_b: np.array (n_samples, n_features)
    mode: 'medoid' uses, for each set, the member with the highest cosine
          similarity to the set's mean vector (a cheap stand-in for the true
          medoid); any other value uses the mean vector (centroid) itself.

    Returns:
    dict: "similarity" (cosine similarity of the representatives),
          "distance" (1 - similarity) and
          "euclidean_distance" (norm of their difference)
    """
    def _centroid_row(points):
        # Mean vector as a (1, n_features) row
        return np.mean(points, axis=0).reshape(1, -1)

    def _closest_member_row(points):
        # Member most cosine-similar to the centroid, as a (1, n_features) row
        closeness = cosine_similarity(points, _centroid_row(points))
        return points[np.argmax(closeness)].reshape(1, -1)

    pick_representative = _closest_member_row if mode == 'medoid' else _centroid_row
    rep_a = pick_representative(vecs_a)
    rep_b = pick_representative(vecs_b)

    similarity = cosine_similarity(rep_a, rep_b)[0][0]

    return {
        "similarity": similarity,
        # cosine distance: 0.0 for identical direction, 2.0 for opposite direction
        "distance": 1 - similarity,
        "euclidean_distance": np.linalg.norm(rep_a - rep_b),
    }

In [8]:
from sklearn.metrics.pairwise import cosine_similarity

def calculate_cluster_centroid(vecs):
    """
    Return the centroid of a set of vectors, i.e. the mean taken over axis 0.
    """
    centroid = np.mean(vecs, axis=0)
    return centroid
def calculate_cluster_medoid(vecs):
    """
    Return an approximate medoid of a set of vectors: the member with the
    highest cosine similarity to the set's mean vector, shaped (1, n_features).
    """
    center_row = np.mean(vecs, axis=0).reshape(1, -1)
    best_idx = np.argmax(cosine_similarity(vecs, center_row))
    return vecs[best_idx].reshape(1, -1)


In [9]:
# Distance between the medoids of two vector sets
def calculate_medoid_distance(vecs_a, vecs_b):
    """Euclidean norm of the difference between the two sets' medoids."""
    gap = calculate_cluster_medoid(vecs_a) - calculate_cluster_medoid(vecs_b)
    return np.linalg.norm(gap)

# Distance between the centroids of two vector sets
def calculate_centroid_distance(vecs_a, vecs_b):
    """Euclidean norm of the difference between the two sets' centroids."""
    gap = calculate_cluster_centroid(vecs_a) - calculate_cluster_centroid(vecs_b)
    return np.linalg.norm(gap)

In [6]:
# Smallest point-to-point distance between two vector sets
def calculate_single_linkage_distance(vecs_a, vecs_b):
    """
    Single-linkage distance between two sets of vectors: the minimum
    Euclidean distance over all (a, b) pairs with a in vecs_a and b in vecs_b.

    Parameters:
    vecs_a, vecs_b: np.array (n_samples, n_features)

    Returns:
    float: the smallest pairwise distance
    """
    from sklearn.metrics import pairwise_distances

    # Full (len(vecs_a), len(vecs_b)) table of Euclidean distances
    pair_table = pairwise_distances(vecs_a, vecs_b, metric='euclidean')
    return np.min(pair_table)

In [None]:
# クラスタの

# 実データ


In [3]:
import numpy as np
# Vectors (indexed row-wise by the cluster map below)
vectors = np.load("../../d3-app/data/vector.npy")
# point_to_cluster_map: one cluster ID per point
point_to_cluster_map = np.load("../../d3-app/data/point_cluster_map.npy")

# Distinct cluster IDs that occur in the map
unique_clusters = np.unique(point_to_cluster_map)

# Low-dimensional projection of the vectors
projected_vectors = np.load("../../d3-app/data/projection.npy")

# Leaf cluster IDs
leave_ids = np.load("../../d3-app/data/leaves.npy")

In [4]:
# Number of distinct clusters, plus a sample of their IDs
print(f"Number of unique clusters: {len(unique_clusters)}")
print(f"sample unique clusters: {unique_clusters[:10]}")

# Point-to-cluster map: number of points and a sample of the assigned IDs
print(f"Number of points: {len(point_to_cluster_map)}")
print(f"Sample point to cluster map: {point_to_cluster_map[:10]}")
# Leaf IDs: how many there are and a sample
print(f"Number of leave IDs: {len(leave_ids)}")
print(f"Sample leave IDs: {leave_ids[:10]}")

Number of unique clusters: 871
sample unique clusters: [115754 115755 115756 115757 115758 115759 115760 115761 115762 115763]
Number of points: 115754
Sample point to cluster map: [115756 115754 115754 115928 115754 115924 115756 115754 116203 115754]
Number of leave IDs: 443
Sample leave IDs: [115760 115763 115769 115771 115774 115779 115781 115783 115786 115788]


In [16]:
# Sanity check: 115760 (the first leaf ID printed above) should be one of the unique cluster IDs
print(115760 in unique_clusters)

True


In [23]:
# Distribution (histogram) of the number of points per leaf cluster.
# point_to_cluster_map holds one cluster ID per point.
import plotly.express as px

# Row indices of the points that belong to each leaf cluster
leave_points = {
    leaf_id: np.where(point_to_cluster_map == leaf_id)[0]
    for leaf_id in leave_ids
}

# Point count of each leaf cluster
leave_sizes = {leaf_id: len(members) for leaf_id, members in leave_points.items()}

fig = px.histogram(x=list(leave_sizes.values()), nbins=30, title="Distribution of Leave Sizes")
fig.show()

# How many leaves are empty
num_zero_size = list(leave_sizes.values()).count(0)
print(f"Number of leaves with size 0: {num_zero_size}")

# Smallest leaf
min_size = min(leave_sizes.values())
print(f"Minimum leave size: {min_size}")
# Largest leaf
max_size = max(leave_sizes.values())
print(f"Maximum leave size: {max_size}")



Number of leaves with size 0: 0
Minimum leave size: 5
Maximum leave size: 176


# continuity, trustworthiness (medoid, centroid)

In [None]:
import numpy as np
from tqdm import tqdm
from sklearn.manifold import trustworthiness
from sklearn.metrics.pairwise import cosine_similarity

def calculate_cluster_representative_metrics(X_high, X_low, point_to_cluster_map, leaves_ids, mode='medoid', k=5):
    """
    Collapse every cluster listed in ``leaves_ids`` to one representative point
    and measure how well the arrangement of those representatives is kept
    between the high-dimensional data and its low-dimensional embedding.

    With ``mode='medoid'`` the representative is the member with the highest
    cosine similarity to the cluster's high-dimensional mean; the same member
    is used in both spaces. Any other ``mode`` uses the per-space mean
    (centroid). Clusters without any point are skipped.

    Returns a dict with
      "overall_trustworthiness": trustworthiness(rep_high, rep_low, n_neighbors=k),
      "overall_continuity":      trustworthiness(rep_low, rep_high, n_neighbors=k),
      "cluster_count":           number of clusters that contributed a representative.

    Raises ValueError when the number of representatives is not larger than k.
    """
    use_medoid = (mode == 'medoid')
    high_reps = []
    low_reps = []
    kept_ids = []

    print(f"Calculating {mode}s for each cluster...")
    for cid in leaves_ids:
        member_mask = (point_to_cluster_map == cid)
        members_high = X_high[member_mask]
        members_low = X_low[member_mask]

        # Nothing to summarise for an empty cluster
        if len(members_high) == 0:
            continue

        if use_medoid:
            # Pick the member closest (by cosine similarity) to the high-dimensional mean
            center_high = np.mean(members_high, axis=0).reshape(1, -1)
            pick = np.argmax(cosine_similarity(members_high, center_high))
            high_reps.append(members_high[pick])
            low_reps.append(members_low[pick])
        else:
            # Centroid in each space
            high_reps.append(np.mean(members_high, axis=0))
            low_reps.append(np.mean(members_low, axis=0))

        kept_ids.append(cid)

    # Stack the representatives into matrices
    rep_high = np.array(high_reps)
    rep_low = np.array(low_reps)

    if len(rep_high) <= k:
        raise ValueError(f"クラスタ数が近傍数 k={k} 以下のため計算できません。")

    # Trustworthiness: are low-dimensional neighbours also neighbours in high dimension?
    overall_t = trustworthiness(rep_high, rep_low, n_neighbors=k)
    # Continuity: the same measure with the roles of the two spaces swapped
    overall_c = trustworthiness(rep_low, rep_high, n_neighbors=k)

    return {
        "overall_trustworthiness": overall_t,
        "overall_continuity": overall_c,
        "cluster_count": len(kept_ids),
    }



In [38]:
# Sweep the neighbourhood size k, first with medoid representatives and then
# with mode='average' (which the function treats as its non-medoid, centroid branch)
for mode in ['medoid', 'average']:
    for k in [2, 3, 4, 5, 10, 15, 40, 100]:
        results = calculate_cluster_representative_metrics(
            vectors,
            projected_vectors,
            point_to_cluster_map,
            leaves_ids=leave_ids,
            mode=mode,
            k=k,
        )
        print(f"param: k={k}")
        print(results)

Calculating medoids for each cluster...
param: k=2
{'overall_trustworthiness': 0.7955762371050624, 'overall_continuity': 0.8835686972421462, 'cluster_count': 443}
Calculating medoids for each cluster...
param: k=3
{'overall_trustworthiness': 0.7842869462740207, 'overall_continuity': 0.872333371127397, 'cluster_count': 443}
Calculating medoids for each cluster...
param: k=4
{'overall_trustworthiness': 0.7827501234682822, 'overall_continuity': 0.8576649885323177, 'cluster_count': 443}
Calculating medoids for each cluster...
param: k=5
{'overall_trustworthiness': 0.778995874523235, 'overall_continuity': 0.8445281648114995, 'cluster_count': 443}
Calculating medoids for each cluster...
param: k=10
{'overall_trustworthiness': 0.7678333531345294, 'overall_continuity': 0.8067376869562921, 'cluster_count': 443}
Calculating medoids for each cluster...
param: k=15
{'overall_trustworthiness': 0.754054247733706, 'overall_continuity': 0.7778415564871547, 'cluster_count': 443}
Calculating medoids for

# 距離・類似度を計算、保存

In [None]:
# Compute the distances for every pair of leaf clusters in the original
# vector space and keep them in dictionaries keyed by (cluster_a, cluster_b)

from itertools import combinations
import pickle

# Slice each leaf cluster's vectors once up front. Building the boolean mask
# inside the pair loop would rescan point_to_cluster_map twice for every pair.
high_dim_cluster_vecs = {
    cluster_id: vectors[point_to_cluster_map == cluster_id] for cluster_id in leave_ids
}

medoid_distances = {}
centroid_distances = {}
single_linkage_distances = {}
for cluster_a, cluster_b in combinations(leave_ids, 2):
    # Vectors of the two clusters
    vecs_a = high_dim_cluster_vecs[cluster_a]
    vecs_b = high_dim_cluster_vecs[cluster_b]

    # Distance between medoids
    medoid_distance = calculate_medoid_distance(vecs_a, vecs_b)
    medoid_distances[(cluster_a, cluster_b)] = medoid_distance

    # Distance between centroids
    centroid_distance = calculate_centroid_distance(vecs_a, vecs_b)
    centroid_distances[(cluster_a, cluster_b)] = centroid_distance

    # Single-linkage distance
    single_linkage_distance = calculate_single_linkage_distance(vecs_a, vecs_b)
    single_linkage_distances[(cluster_a, cluster_b)] = single_linkage_distance


# Persist all three distance dictionaries in one pickle file
cluster_similarities = {
    "medoid_distances": medoid_distances,
    "centroid_distances": centroid_distances,
    "single_linkage_distances": single_linkage_distances
}
with open("./cluster_similarity_distances.pkl", "wb") as f:
    pickle.dump(cluster_similarities, f)




# save
# np.save("src/experiments/21_cluster_similarity/cluster_similarity_distances.npy", similarity_results)
    

In [19]:
# (Low-dimensional embedding) Compute the distances for every pair of leaf
# clusters and keep them in dictionaries keyed by (cluster_a, cluster_b)

from itertools import combinations
import pickle  # imported here so the cell does not depend on an earlier cell having run

# Slice each leaf cluster's projected points once up front. Building the boolean
# mask inside the pair loop would rescan point_to_cluster_map twice for every pair.
low_dim_cluster_vecs = {
    cluster_id: projected_vectors[point_to_cluster_map == cluster_id] for cluster_id in leave_ids
}

medoid_distances = {}
centroid_distances = {}
single_linkage_distances = {}
for cluster_a, cluster_b in combinations(leave_ids, 2):
    # Projected points of the two clusters
    vecs_a = low_dim_cluster_vecs[cluster_a]
    vecs_b = low_dim_cluster_vecs[cluster_b]

    # Distance between medoids
    medoid_distance = calculate_medoid_distance(vecs_a, vecs_b)
    medoid_distances[(cluster_a, cluster_b)] = medoid_distance

    # Distance between centroids
    centroid_distance = calculate_centroid_distance(vecs_a, vecs_b)
    centroid_distances[(cluster_a, cluster_b)] = centroid_distance

    # Single-linkage distance
    single_linkage_distance = calculate_single_linkage_distance(vecs_a, vecs_b)
    single_linkage_distances[(cluster_a, cluster_b)] = single_linkage_distance

# Persist all three distance dictionaries in one pickle file
with open("./cluster_similarity_distances_lowdim.pkl", "wb") as f:
    pickle.dump({
        "medoid_distances": medoid_distances,
        "centroid_distances": centroid_distances,
        "single_linkage_distances": single_linkage_distances
    }, f)


# trustworthiness, continuity


In [21]:
# Reload the pairwise-distance dictionaries from the two pickle files that the
# distance cells in this notebook write (low-dimensional and original space).
# NOTE(review): relies on `pickle` having been imported by an earlier cell.
with open("./cluster_similarity_distances_lowdim.pkl", "rb") as f:
    cluster_similarities_low = pickle.load(f)
with open("./cluster_similarity_distances.pkl", "rb") as f:
    cluster_similarities_high = pickle.load(f)

# Show which distance definitions each file contains
print(cluster_similarities_low.keys())
print(cluster_similarities_high.keys())

dict_keys(['medoid_distances', 'centroid_distances', 'single_linkage_distances'])
dict_keys(['medoid_distances', 'centroid_distances', 'single_linkage_distances'])


In [23]:
import numpy as np
import plotly.graph_objects as go

def dict_to_matrix(dist_dict, cluster_ids):
    """
    Turn a {(id1, id2): distance} dictionary into a symmetric distance matrix
    whose rows and columns follow the order of ``cluster_ids``.

    Pairs that mention an ID outside ``cluster_ids`` are ignored; entries for
    which the dictionary has no pair (including the diagonal) stay 0.
    """
    size = len(cluster_ids)
    position = dict(zip(cluster_ids, range(size)))
    matrix = np.zeros((size, size))

    for pair, value in dist_dict.items():
        first, second = pair
        if first not in position or second not in position:
            continue
        row, col = position[first], position[second]
        # Fill both triangles so the matrix is symmetric
        matrix[row, col] = value
        matrix[col, row] = value
    return matrix

def calculate_metrics_for_k_range(D_high, D_low, k_list):
    """
    Compute trustworthiness and continuity for several neighbourhood sizes k
    from two precomputed distance matrices.

    Args:
        D_high: (n, n) distance matrix in the original space.
        D_low:  (n, n) distance matrix in the embedding.
        k_list: iterable of neighbourhood sizes.

    Returns:
        (trust_scores, cont_scores): two lists with one score per k, each
        ``1 - 2 / (n * k * (2n - 3k - 1)) * penalty``.

    Raises:
        ValueError: If the normaliser ``n * k * (2n - 3k - 1)`` is not positive
        for some k (k < 1 or k too large for n), where the score is undefined.
    """
    def _neighbour_ranks(dist):
        # Work on a float copy so the caller's matrix is left untouched
        dist = np.array(dist, dtype=float)
        # Force every point to be its own rank-0 entry, even when another
        # point lies at distance 0 from it (ties with the diagonal)
        np.fill_diagonal(dist, -np.inf)
        order = np.argsort(dist, axis=1, kind="stable")
        # argsort of the ordering turns "who is at position r" into "rank of j"
        return np.argsort(order, axis=1)

    def _rank_penalty(R_query, R_reference, k):
        # Neighbours in the query space (ranks 1..k; rank 0 is the point itself)
        is_neighbour = (R_query >= 1) & (R_query <= k)
        # How far beyond k each of them is ranked in the reference space
        overshoot = np.clip(R_reference - k, 0, None)
        return overshoot[is_neighbour].sum()

    R_high = _neighbour_ranks(D_high)
    R_low = _neighbour_ranks(D_low)
    n = R_high.shape[0]

    trust_scores = []
    cont_scores = []

    for k in k_list:
        normaliser = n * k * (2 * n - 3 * k - 1)
        if normaliser <= 0:
            raise ValueError(
                f"k={k} is not valid for n={n}: normaliser n*k*(2n-3k-1) must be positive."
            )
        scale = 2 / normaliser

        # Trustworthiness: low-dimensional neighbours that are far away in high dimension
        t = 1 - scale * _rank_penalty(R_low, R_high, k)
        # Continuity: high-dimensional neighbours that are far away in low dimension
        c = 1 - scale * _rank_penalty(R_high, R_low, k)

        trust_scores.append(t)
        cont_scores.append(c)

    return trust_scores, cont_scores

In [42]:
# Fix the list of cluster IDs under evaluation (sorted so the matrix order is deterministic)
leave_ids_sorted = sorted(list(leave_ids))
k_list = [1, 5, 10, 20, 50, 80, 100] # neighbourhood sizes to evaluate (original note assumed about 500 clusters)
methods = ["medoid_distances", "centroid_distances", "single_linkage_distances"]



from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Create the subplots (1 row, 2 columns)
fig = make_subplots(
    rows=1, cols=2, 
    subplot_titles=("Trustworthiness (Is the map lying?)", "Continuity (Is the map tearing?)"),
    shared_xaxes=True,
    horizontal_spacing=0.1
)

# Line colour for each distance definition
method_colors = {
    "medoid_distances": "#FF4B00",        # red
    "centroid_distances": "#005AFF",      # blue
    "single_linkage_distances": "#03AF7A" # green
}

for m in methods:
    # 1. Build the distance matrices (dict_to_matrix is defined in an earlier cell)
    mat_high = dict_to_matrix(cluster_similarities_high[m], leave_ids_sorted)
    mat_low = dict_to_matrix(cluster_similarities_low[m], leave_ids_sorted)
    
    # 2. Compute the scores for every k in k_list
    trusts, conts = calculate_metrics_for_k_range(mat_high, mat_low, k_list)
    
    # Left panel: trustworthiness
    fig.add_trace(
        go.Scatter(x=k_list, y=trusts, name=f'Trust ({m})',
                   mode='lines+markers', line=dict(color=method_colors[m])),
        row=1, col=1
    )
    
    # Right panel: continuity
    fig.add_trace(
        go.Scatter(x=k_list, y=conts, name=f'Cont ({m})',
                   mode='lines+markers', line=dict(color=method_colors[m])),
        row=1, col=2
    )
    print(f"Plotted metrics for method: {m}")
    print(f"Trustworthiness scores: {trusts}")
    print(f"Continuity scores: {conts}")

# Layout settings applied to the whole figure
fig.update_layout(
    title_text=f"Cluster-level Structural Evaluation (N_clusters={len(leave_ids_sorted)})",
    height=500,
    width=1100,
    template="plotly_white",
    hovermode="x unified",
    showlegend=True
)

# Fix the y-axis range (0 to 1.05)
fig.update_yaxes(range=[0, 1.05], tickformat=".2f")
fig.update_xaxes(title_text="Neighborhood size (k)")

fig.show()
# Also save the figure as a PNG next to the notebook
fig.write_image("./cluster_level_trust_cont.png")


Plotted metrics for method: medoid_distances
Trustworthiness scores: [0.7694599284409023, 0.7724127552476583, 0.7615096431824482, 0.7373418154456529, 0.7034645045377067, 0.6854749155686213, 0.6742736972082344]
Continuity scores: [0.9020080567968346, 0.8404213694507148, 0.8040742940873629, 0.7529177098296738, 0.6907701048816818, 0.665713860745096, 0.6546370318921109]
Plotted metrics for method: centroid_distances
Trustworthiness scores: [0.8093139437866945, 0.8124210580939779, 0.7997634417118794, 0.7737962924960667, 0.7434105741619447, 0.7310420494514147, 0.7235633501186549]
Continuity scores: [0.9236856518378608, 0.8777654964842635, 0.8515887159584439, 0.8176077707093509, 0.7706187558544862, 0.7492752025478153, 0.7382095657039224]
Plotted metrics for method: single_linkage_distances
Trustworthiness scores: [0.837927345505546, 0.8296110635427207, 0.8216889100101646, 0.7970063615842397, 0.7613212327820519, 0.7423808773863895, 0.7328295421658853]
Continuity scores: [0.9320137385277663, 0.

## スコアの低い要因となっているクラスタ

In [41]:
def identify_outlier_clusters(D_high, D_low, cluster_ids, k=10, top_n=10):
    """
    Find the clusters that contribute the largest rank penalties to
    trustworthiness and continuity at neighbourhood size k.

    Args:
        D_high: (n, n) distance matrix in the original space.
        D_low:  (n, n) distance matrix in the embedding.
        cluster_ids: sequence of n cluster IDs, in the row/column order of the matrices.
        k: neighbourhood size.
        top_n: number of worst clusters to report per measure.

    Returns:
        dict with "worst_trustworthiness" and "worst_continuity", each a list
        of (cluster_id, penalty) tuples sorted by decreasing penalty.

    Raises:
        ValueError: If either matrix is not (n, n) with n == len(cluster_ids).
    """
    n = len(cluster_ids)

    def _neighbour_ranks(dist):
        # Work on a float copy so the caller's matrix is left untouched
        dist = np.array(dist, dtype=float)
        if dist.shape != (n, n):
            raise ValueError(
                f"Distance matrix has shape {dist.shape}, expected ({n}, {n}) to match cluster_ids."
            )
        # Force every cluster to be its own rank-0 entry, even when another
        # cluster lies at distance 0 from it (ties with the diagonal)
        np.fill_diagonal(dist, -np.inf)
        order = np.argsort(dist, axis=1, kind="stable")
        return np.argsort(order, axis=1)

    def _per_cluster_penalty(R_query, R_reference):
        # Neighbours in the query space (ranks 1..k; rank 0 is the cluster itself)
        is_neighbour = (R_query >= 1) & (R_query <= k)
        # How far beyond k each of them is ranked in the reference space
        overshoot = np.clip(R_reference - k, 0, None)
        return np.where(is_neighbour, overshoot, 0).sum(axis=1).astype(float)

    R_high = _neighbour_ranks(D_high)
    R_low = _neighbour_ranks(D_low)

    # Trustworthiness penalty: close in low dimension but far in high dimension
    trust_penalties = _per_cluster_penalty(R_low, R_high)
    # Continuity penalty: close in high dimension but far in low dimension
    cont_penalties = _per_cluster_penalty(R_high, R_low)

    # Largest penalty (lowest score) first; a stable sort keeps ties in input order
    worst_trust_idx = np.argsort(-trust_penalties, kind="stable")[:top_n]
    worst_cont_idx = np.argsort(-cont_penalties, kind="stable")[:top_n]

    results = {
        "worst_trustworthiness": [(cluster_ids[i], trust_penalties[i]) for i in worst_trust_idx],
        "worst_continuity": [(cluster_ids[i], cont_penalties[i]) for i in worst_cont_idx]
    }
    return results

for m in methods:
    mat_high = dict_to_matrix(cluster_similarities_high[m], leave_ids_sorted)
    mat_low = dict_to_matrix(cluster_similarities_low[m], leave_ids_sorted)
    # Per-cluster penalty ranking for this distance definition at k=5
    outliers = identify_outlier_clusters(mat_high, mat_low, leave_ids_sorted, k=5)

    # Label each report: without it the blocks for the different methods
    # are indistinguishable in the output
    print(f"\n=== method: {m} ===")

    print("--- Trustworthinessを下げている（偽の近傍を作っている）TOP10 ---")
    for cid, p in outliers["worst_trustworthiness"]:
        print(f"Cluster ID: {cid}, Penalty: {p:.1f}")

    print("\n--- Continuityを下げている（引き裂かれている）TOP10 ---")
    for cid, p in outliers["worst_continuity"]:
        print(f"Cluster ID: {cid}, Penalty: {p:.1f}")

--- Trustworthinessを下げている（偽の近傍を作っている）TOP10 ---
Cluster ID: 116056, Penalty: 1869.0
Cluster ID: 116167, Penalty: 1690.0
Cluster ID: 116275, Penalty: 1651.0
Cluster ID: 115987, Penalty: 1599.0
Cluster ID: 116260, Penalty: 1524.0
Cluster ID: 116586, Penalty: 1507.0
Cluster ID: 115908, Penalty: 1491.0
Cluster ID: 115964, Penalty: 1435.0
Cluster ID: 115806, Penalty: 1418.0
Cluster ID: 116593, Penalty: 1396.0

--- Continuityを下げている（引き裂かれている）TOP10 ---
Cluster ID: 116030, Penalty: 1996.0
Cluster ID: 116585, Penalty: 1760.0
Cluster ID: 116335, Penalty: 1637.0
Cluster ID: 115862, Penalty: 1594.0
Cluster ID: 115908, Penalty: 1592.0
Cluster ID: 116489, Penalty: 1586.0
Cluster ID: 115759, Penalty: 1506.0
Cluster ID: 116275, Penalty: 1488.0
Cluster ID: 115941, Penalty: 1434.0
Cluster ID: 116288, Penalty: 1408.0
--- Trustworthinessを下げている（偽の近傍を作っている）TOP10 ---
Cluster ID: 115759, Penalty: 1623.0
Cluster ID: 116548, Penalty: 1538.0
Cluster ID: 116167, Penalty: 1427.0
Cluster ID: 115933, Penalty: 1348.0
C

# other

In [11]:
# Check: load the similarity results stored with the d3 app data to inspect them.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
# NOTE(review): relies on `pickle` having been imported by an earlier cell.
with open("./../../d3-app/data/cluster_similarities.pkl", "rb") as f:
    loaded_similarities = pickle.load(f)

In [15]:
# List the measures stored in the file and count the Mahalanobis-distance entries
print(loaded_similarities.keys())
ma = loaded_similarities["mahalanobis_distance"]
print(f"length of mahalanobis distances: {len(ma)}  ")

dict_keys(['kl_divergence', 'bhattacharyya_coefficient', 'mahalanobis_distance'])
length of mahalanobis distances: 391170  


In [16]:
# Square root of the number of entries (would be an integer if all n x n ordered pairs were stored)
# NOTE(review): 391170 == 885 * 884 / 2, so the entries are presumably the unordered
# pairs of 885 clusters rather than a full square — confirm against the producer.
NUM = len(ma)
print(f"平方根 of number of distances: {np.sqrt(NUM)}")

平方根 of number of distances: 625.4358480291963
