In [None]:
# 

In [15]:
import numpy as np
import pickle
import os
import plotly.express as px
import plotly.graph_objects as go
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from umap import UMAP

In [16]:
# GoogleNewsデータセットの読み込み

from gensim.models import KeyedVectors

file_path = 'GoogleNews-vectors-negative300.bin'
# model = KeyedVectors.load_word2vec_format(file_path, binary=True)
model = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin", binary=True)

words = model.index_to_key
print(f"語彙数: {len(words)}")



語彙数: 3000000


In [17]:
# 上位1000語のベクトルを取得
top_n = 10000
top_words = words[:top_n]
top_vectors = np.array([model[word] for word in top_words])
print(f"ベクトルの形状: {top_vectors.shape}")


ベクトルの形状: (10000, 300)


In [17]:
# pcaの実行とプロット
def plot_pca_2d(data, labels = None):
    pca = PCA(n_components=2)
    pca_result = pca.fit_transform(data)
    fig = px.scatter(x=pca_result[:, 0], y=pca_result[:, 1], text=labels)
    fig.update_layout(height=800, width=800)
    fig.show()



In [18]:
plot_pca_2d(top_vectors, top_words)

# 埋め込みベクトルに対する解釈
- 代表語彙の選択
- 特定の単語による意味付け
- ICA
- NMF
- 外部知識

In [12]:
# ICA
from sklearn.decomposition import FastICA

def ica(data, labels, n_components=10):
    ica = FastICA(n_components=n_components)
    ica_result = ica.fit_transform(data)
    components = ica.components_

    # 代表単語
    for i in range(n_components):

        component = components[i]
        top_indices = np.argsort(np.abs(component))[-5:]
        top_words = [words[idx] for idx in top_indices]
        print(f"成分 {i+1}: {top_words}")


In [19]:
ica(top_vectors, top_words)

成分 1: ['3', 'system', 'many', 'our', 'top']
成分 2: ['was', 'hit', '##,###', 'officials', 'can']
成分 3: ['be', 'into', 'first', 'points', 'report']
成分 4: ['another', 'being', 'one', '#.#', 'or']
成分 5: ['did', 'great', 'You', 'They', 'In']
成分 6: ['here', 'week', 'year', 'old', 'which']
成分 7: ['big', '###', 'years', 'As', 'ago']
成分 8: ['think', 'hit', 'week', 'him', 'me']
成分 9: ['days', 'people', 'have', 'another', 'like']
成分 10: ['until', 'season', 'good', 'really', 'State']


In [14]:
# PCAと代表単語
def explain_pca(data, labels):
    pca = PCA(n_components=2)
    pca_result = pca.fit_transform(data)
    components = pca.components_

    # 各主成分に対する代表単語(正負それぞれ5つずつ)
    for i in range(2):
        component = components[i]
        top_pos_indices = np.argsort(component)[-5:]
        top_neg_indices = np.argsort(component)[:5]
        top_pos_words = [labels[idx] for idx in top_pos_indices]
        top_neg_words = [labels[idx] for idx in top_neg_indices]
        print(f"主成分 {i+1} (正): {top_pos_words}")
        print(f"主成分 {i+1} (負): {top_neg_words}")

    
    # 可視化して確認してみる
    plot_pca_2d(data, labels)

In [20]:
explain_pca(top_vectors, top_words)

主成分 1 (正): ['##.#', 'He', 'And', 'big', 'support']
主成分 1 (負): ['does', 'U.S.', 'did', 'called', 'years']
主成分 2 (正): ['months', 'use', 'told', 'which', 'then']
主成分 2 (負): ['report', 'including', '#.#', 'up', 'what']


# metric

In [32]:

import numpy as np
from sklearn.metrics import pairwise_distances
from sklearn.neighbors import NearestNeighbors

# 信頼性
def trustworthiness(X, X_proj, n_neighbors=7):
    N = X.shape[0]
    # Original and projected distances
    dist_X = pairwise_distances(X)
    dist_proj = pairwise_distances(X_proj)

    # Rank of distances
    rank_X = np.argsort(np.argsort(dist_X, axis=1), axis=1)
    
    # K-nearest neighbors in projected space
    nn_proj = NearestNeighbors(n_neighbors=n_neighbors+1).fit(X_proj)
    neighbors_proj = nn_proj.kneighbors(return_distance=False)[:, 1:]

    # For each i, for each neighbor in projection, get the rank in original space
    t_sum = 0
    for i in range(N):
        for j in neighbors_proj[i]:
            r = rank_X[i, j]
            if r >= n_neighbors:
                t_sum += r - n_neighbors + 1
    
    norm = 2 / (N * n_neighbors * (2 * N - 3 * n_neighbors - 1))
    T = 1 - norm * t_sum
    return T



# 連続性
def continuity(X, X_proj, n_neighbors=7):
    N = X.shape[0]
    # Original and projected distances
    dist_X = pairwise_distances(X)
    dist_proj = pairwise_distances(X_proj)

    # Rank of distances in projected space
    rank_proj = np.argsort(np.argsort(dist_proj, axis=1), axis=1)

    # K-nearest neighbors in original space
    nn_orig = NearestNeighbors(n_neighbors=n_neighbors+1).fit(X)
    neighbors_orig = nn_orig.kneighbors(return_distance=False)[:, 1:]

    c_sum = 0
    for i in range(N):
        for j in neighbors_orig[i]:
            r = rank_proj[i, j]
            if r >= n_neighbors:
                c_sum += r - n_neighbors + 1
    
    norm = 2 / (N * n_neighbors * (2 * N - 3 * n_neighbors - 1))
    C = 1 - norm * c_sum
    return C

# ストレス係数: 元の距離行列と射影後の距離行列の差を測る指標
def normalized_stress(X, X_proj):
    D_n = pairwise_distances(X)
    D_q = pairwise_distances(X_proj)
    # 正規化
    D_n = D_n / np.max(D_n)
    D_q = D_q / np.max(D_q)
    numerator = np.sum((D_n - D_q) ** 2)
    denominator = np.sum(D_n ** 2)
    return numerator / denominator



# Shepard Diagram helper (returns distances in original and projected space)
def shepard_diagram_data(X, X_proj):
    D_n = pairwise_distances(X)
    D_q = pairwise_distances(X_proj)
    # Use upper triangle without diagonal to avoid redundancy
    i_upper = np.triu_indices_from(D_n, k=1)
    return D_n[i_upper], D_q[i_upper]

# Average Local Error (optional, related to trustworthiness-like structure)
def average_local_error(X, X_proj, n_neighbors=5):
    N = X.shape[0]
    dist_X = pairwise_distances(X)
    dist_proj = pairwise_distances(X_proj)

    nn_orig = NearestNeighbors(n_neighbors=n_neighbors+1).fit(X)
    neighbors_orig = nn_orig.kneighbors(return_distance=False)[:, 1:]

    error_sum = 0
    for i in range(N):
        for j in neighbors_orig[i]:
            error_sum += abs(dist_X[i, j] - dist_proj[i, j])
    
    return error_sum / (N * n_neighbors)

### ラベルあり ###

# シルエットスコア
from sklearn.metrics import silhouette_score
def silhouette(X_proj, labels):
    return silhouette_score(X_proj, labels)

# Neiborhood hit :次元削減した空間で、近傍点が同じラベルかどうか
def neighborhood_hit(X_proj, labels, n_neighbors=7):
    N = X_proj.shape[0]
    nn = NearestNeighbors(n_neighbors=n_neighbors+1).fit(X_proj)
    neighbors = nn.kneighbors(return_distance=False)[:, 1:]

    hit_sum = 0
    for i in range(N):
        hit_sum += np.sum(labels[neighbors[i]] == labels[i])

    return hit_sum / (N * n_neighbors)


from scipy.stats import spearmanr, pearsonr

import numpy as np
from sklearn.metrics import pairwise_distances
from scipy.stats import pearsonr, spearmanr

def cluster_center_distance_correlation(X_high, X_low, y):
    
    def compute_cluster_centers(X, labels):
        centers = []
        unique_labels = np.unique(labels)
        for lab in unique_labels:
            centers.append(X[labels == lab].mean(axis=0))
        return np.array(centers), unique_labels
    
    # 高次元/低次元でのクラスタ中心
    centers_high, labels_unique = compute_cluster_centers(X_high, y)
    centers_low, _ = compute_cluster_centers(X_low, y)
    
    # クラスタ間距離行列
    dist_high = pairwise_distances(centers_high)
    dist_low = pairwise_distances(centers_low)
    
    # 上三角部分をベクトル化
    dist_high_flat = dist_high[np.triu_indices(len(labels_unique), k=1)]
    dist_low_flat = dist_low[np.triu_indices(len(labels_unique), k=1)]
    
    # 相関計算
    pearson_corr, _ = pearsonr(dist_high_flat, dist_low_flat)
    spearman_corr, _ = spearmanr(dist_high_flat, dist_low_flat)
    
    return pearson_corr, spearman_corr





In [5]:
pca = PCA(n_components=2)
tsne = TSNE(n_components=2, random_state=42)
proj_pca = pca.fit_transform(top_vectors)
proj_tsne = tsne.fit_transform(top_vectors)
import umap
proj_umap = umap.UMAP(n_components=2, random_state=42).fit_transform(top_vectors)


print(f"PCA normalized_stress: {normalized_stress(top_vectors, proj_pca):.4f}")
for k in [5, 10, 15, 50, 100]:
    print(f"n_neighbors={k}")
    print(f" trustworthiness (PCA): {trustworthiness(top_vectors, proj_pca, n_neighbors=k):.4f}")
    print(f" continuity (PCA): {continuity(top_vectors, proj_pca, n_neighbors=k):.4f}")
    print()

print(f"t-SNE normalized_stress: {normalized_stress(top_vectors, proj_tsne):.4f}")
for k in [5, 10, 15, 50, 100]:
    print(f"n_neighbors={k}")
    print(f" trustworthiness (t-SNE): {trustworthiness(top_vectors, proj_tsne, n_neighbors=k):.4f}")
    print(f" continuity (t-SNE): {continuity(top_vectors, proj_tsne, n_neighbors=k):.4f}")
    print()

print(f"UMAP normalized_stress: {normalized_stress(top_vectors, proj_umap):.4f}")
for k in [5, 10, 15, 50, 100]:
    print(f"n_neighbors={k}")
    print(f" trustworthiness (UMAP): {trustworthiness(top_vectors, proj_umap, n_neighbors=k):.4f}")
    print(f" continuity (UMAP): {continuity(top_vectors, proj_umap, n_neighbors=k):.4f}")
    print()


  warn(


PCA normalized_stress: 0.6830
n_neighbors=5
 trustworthiness (PCA): 0.5954
 continuity (PCA): 0.8451

n_neighbors=10
 trustworthiness (PCA): 0.5943
 continuity (PCA): 0.8304

n_neighbors=15
 trustworthiness (PCA): 0.5938
 continuity (PCA): 0.8192

n_neighbors=50
 trustworthiness (PCA): 0.5941
 continuity (PCA): 0.7755

n_neighbors=100
 trustworthiness (PCA): 0.5949
 continuity (PCA): 0.7505

t-SNE normalized_stress: 300.0641
n_neighbors=5
 trustworthiness (t-SNE): 0.8918
 continuity (t-SNE): 0.8959

n_neighbors=10
 trustworthiness (t-SNE): 0.8488
 continuity (t-SNE): 0.8719

n_neighbors=15
 trustworthiness (t-SNE): 0.8217
 continuity (t-SNE): 0.8563

n_neighbors=50
 trustworthiness (t-SNE): 0.7318
 continuity (t-SNE): 0.8081

n_neighbors=100
 trustworthiness (t-SNE): 0.6866
 continuity (t-SNE): 0.7789

UMAP normalized_stress: 0.3032
n_neighbors=5
 trustworthiness (UMAP): 0.7713
 continuity (UMAP): 0.9158

n_neighbors=10
 trustworthiness (UMAP): 0.7613
 continuity (UMAP): 0.8900

n_neig

In [12]:
print(f"UMAP normalized_stress: {normalized_stress(top_vectors, proj_umap):.4f}")
print(f"t-SNE normalized_stress: {normalized_stress(top_vectors, proj_tsne):.4f}")
print(f"PCA normalized_stress: {normalized_stress(top_vectors, proj_pca):.4f}")

UMAP normalized_stress: 0.3815
t-SNE normalized_stress: 0.4748
PCA normalized_stress: 0.2233


In [14]:
# pca plot
fig_pca = px.scatter(x=proj_pca[:, 0], y=proj_pca[:, 1], hover_name=top_words, title='PCA Projection of Word Vectors')
fig_pca.update_traces(marker=dict(size=2)) # size 1
fig_pca.update_layout(height=800, width=800) # size 2
fig_pca.show()

# tsne plot
fig_tsne = px.scatter(x=proj_tsne[:, 0], y=proj_tsne[:, 1], hover_name=top_words, title='t-SNE Projection of Word Vectors')
fig_tsne.update_layout(height=800, width=800)
fig_tsne.show() 

# umap plot
fig_umap = px.scatter(x=proj_umap[:, 0], y=proj_umap[:, 1], hover_name=top_words, title='UMAP Projection of Word Vectors')
fig_umap.update_layout(height=800, width=800)
fig_umap.show()


In [19]:
# DBSCAN
pca = PCA(n_components=2)
tsne = TSNE(n_components=2, random_state=42)
proj_pca = pca.fit_transform(top_vectors)
proj_tsne = tsne.fit_transform(top_vectors)
import umap
proj_umap = umap.UMAP(n_components=2, random_state=42).fit_transform(top_vectors)

from sklearn.cluster import DBSCAN


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [25]:
for eps in [0.04, 0.05, 0.06, 0.07, 0.1, 0.3, 0.5, 0.7, 1.0]:
    for min_samples in [3, 5, 10]:
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        labels = dbscan.fit_predict(proj_pca)
        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
        n_noise = list(labels).count(-1) # クラスタに含まれない点の数
        if n_clusters > 1:  # クラスタが1つ以上の場合のみ評価
            silhouette_avg = silhouette(proj_pca, labels)
            print(f"eps={eps}, min_samples={min_samples} => clusters: {n_clusters}, noise: {n_noise}, silhouette: {silhouette_avg:.4f}")
            # fig = px.scatter(x=proj_umap[:, 0], y=proj_umap[:, 1], color=labels.astype(str), hover_name=top_words, title=f'UMAP + DBSCAN (eps={eps}, min_samples={min_samples})')
            # fig.update_traces(marker=dict(size=2))
            # fig.update_layout(height=800, width=800)
            # fig.show()

eps=0.04, min_samples=3 => clusters: 73, noise: 395, silhouette: -0.2387
eps=0.04, min_samples=5 => clusters: 21, noise: 672, silhouette: -0.1218
eps=0.04, min_samples=10 => clusters: 18, noise: 1342, silhouette: -0.3235
eps=0.05, min_samples=3 => clusters: 53, noise: 263, silhouette: -0.1343
eps=0.05, min_samples=5 => clusters: 13, noise: 503, silhouette: 0.0772
eps=0.05, min_samples=10 => clusters: 8, noise: 850, silhouette: -0.0337
eps=0.06, min_samples=3 => clusters: 38, noise: 149, silhouette: -0.0622
eps=0.06, min_samples=5 => clusters: 17, noise: 360, silhouette: 0.1657
eps=0.06, min_samples=10 => clusters: 4, noise: 663, silhouette: 0.1435
eps=0.07, min_samples=3 => clusters: 23, noise: 103, silhouette: 0.0123
eps=0.07, min_samples=5 => clusters: 20, noise: 223, silhouette: 0.2502
eps=0.07, min_samples=10 => clusters: 3, noise: 530, silhouette: 0.2306
eps=0.1, min_samples=3 => clusters: 6, noise: 40, silhouette: 0.0997
eps=0.1, min_samples=5 => clusters: 3, noise: 73, silhouett

In [92]:
from sklearn.cluster import DBSCAN
import numpy as np

def recursive_dbscan(X, eps=0.15, min_samples=5, max_cluster_size=300, depth=0, max_depth=6, label_offset=0):
    """
    再帰 DBSCAN + 最後に残ったノイズを再クラスタリング
    """
    if depth > max_depth:
        return np.full(X.shape[0], -1)  # 最大深度で全てノイズ

    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    labels = dbscan.fit_predict(X)
    unique_labels = set(labels)

    final_labels = np.full(X.shape[0], -1)
    current_label = label_offset

    for lab in unique_labels:
        if lab == -1:
            continue
        cluster_indices = np.where(labels == lab)[0]
        cluster_size = len(cluster_indices)

        if cluster_size > max_cluster_size:
            # 大きすぎるクラスタを再帰分割
            sub_labels = recursive_dbscan(
                X[cluster_indices],
                eps=max(eps*0.9, 0.05),
                min_samples=min_samples,
                max_cluster_size=max_cluster_size,
                depth=depth+1,
                max_depth=max_depth,
                label_offset=current_label
            )
            # sub_labels の -1 は無視して、残りを final_labels に割り当て
            for sub_lab in set(sub_labels):
                if sub_lab == -1:
                    continue
                idx_sub = cluster_indices[np.where(sub_labels == sub_lab)[0]]
                final_labels[idx_sub] = sub_lab
            current_label = final_labels.max() + 1
        else:
            final_labels[cluster_indices] = current_label
            current_label += 1

    # --- ノイズまとめ再クラスタリング ---
    noise_idx = np.where(final_labels == -1)[0]
    if len(noise_idx) > 0:
        noise_X = X[noise_idx]
        # eps は少し大きめにして再クラスタリング
        noise_db = DBSCAN(eps=eps*1.1, min_samples=min_samples)
        noise_labels = noise_db.fit_predict(noise_X)
        # ノイズラベルにオフセットを付けて衝突防止
        new_labels = np.where(noise_labels != -1, noise_labels + current_label, -1)
        final_labels[noise_idx] = new_labels

    print(f"cluster num { len(set(final_labels)) - (1 if -1 in final_labels else 0)} at depth {depth}")

    return final_labels


recursive_labels = recursive_dbscan(proj_umap, eps=0.1, min_samples=10, max_cluster_size=100, max_depth=4)
fig = px.scatter(x=proj_umap[:, 0], y=proj_umap[:, 1], color=recursive_labels.astype(str), hover_name=top_words, title=f'UMAP + Recursive DBSCAN')
fig.update_traces(marker=dict(size=2))
fig.update_layout(height=800, width=800)
fig.show()

cluster num 6 at depth 4
cluster num 7 at depth 4
cluster num 31 at depth 3
cluster num 44 at depth 2
cluster num 1 at depth 3
cluster num 3 at depth 2
cluster num 4 at depth 2
cluster num 3 at depth 2
cluster num 2 at depth 2
cluster num 88 at depth 1
cluster num 4 at depth 1
cluster num 2 at depth 1
cluster num 2 at depth 1
cluster num 2 at depth 1
cluster num 10 at depth 1
cluster num 1 at depth 4
cluster num 1 at depth 3
cluster num 1 at depth 2
cluster num 2 at depth 1
cluster num 1 at depth 4
cluster num 1 at depth 3
cluster num 1 at depth 2
cluster num 2 at depth 1
cluster num 246 at depth 0


In [29]:
dbscan_pca = DBSCAN(eps=0.05, min_samples=5)
def plot_scatter_2d(proj, labels, color, title='2D Projection of Word Vectors'):
    fig = px.scatter(x=proj[:, 0], y=proj[:, 1], color=color, hover_name=labels, title=title)
    fig.update_traces(marker=dict(size=2)) # size 1
    fig.update_layout(height=800, width=800) # size 2
    fig.show()
def plot_dbscan_results(proj, true_labels, eps, min_samples):
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    labels = dbscan.fit_predict(proj)
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise = list(labels).count(-1) # クラスタに含まれない点の数
    if n_clusters > 1:  # クラスタが1つ以上の場合のみ評価
        silhouette_avg = silhouette(proj, labels)
        print(f"eps={eps}, min_samples={min_samples} => clusters: {n_clusters}, noise: {n_noise}, silhouette: {silhouette_avg:.4f}")
        plot_scatter_2d(proj, true_labels, color=labels.astype(str), title=f'DBSCAN (eps={eps}, min_samples={min_samples})')

plot_dbscan_results(proj_pca, top_words, eps=0.01, min_samples=3)




eps=0.01, min_samples=3 => clusters: 858, noise: 4415, silhouette: -0.1839


In [31]:
plot_dbscan_results(proj_umap, top_words, eps=0.1, min_samples=10)

eps=0.1, min_samples=10 => clusters: 128, noise: 1701, silhouette: -0.0724


In [36]:
 # dbscanでつけたラベルを使って、クラスタごとのシルエットスコア、クラスタ間距離の相関をとる 0は大きすぎるので除く

dbscan = DBSCAN(eps=0.1, min_samples=10)
labels = dbscan.fit_predict(proj_umap)
# ラベルが0または-1は除く
mask = (labels != 0) & (labels != -1)
_labels = labels[mask]
_proj_umap = proj_umap[mask]
_top_vectors = top_vectors[mask]

# クラスタごとのシルエットスコア
if len(set(_labels)) > 1:
    silhouette_avg = silhouette(_proj_umap, _labels)
    print(f"Silhouette score (excluding label 0 and -1): {silhouette_avg:.4f}")
else:
    print("クラスタ数が1つしかありません")

# クラスタ間距離の相関
pearson_corr, spearman_corr = cluster_center_distance_correlation(_top_vectors, _proj_umap, _labels)
print(f"Cluster center distance correlation (Pearson): {pearson_corr:.4f}")
print(f"Cluster center distance correlation (Spearman): {spearman_corr:.4f}")





# # クラスタ間距離の正しさをヒートマップで可視化
# def plot_cluster_distance_heatmap(X_high, X_low, labels):
#     def compute_cluster_centers(X, labels):
#         centers = []
#         unique_labels = np.unique(labels)
#         for lab in unique_labels:
#             centers.append(X[labels == lab].mean(axis=0))
#         return np.array(centers), unique_labels
    
#     centers_high, labels_unique = compute_cluster_centers(X_high, labels)
#     centers_low, _ = compute_cluster_centers(X_low, labels)
    
#     dist_high = pairwise_distances(centers_high)
#     dist_low = pairwise_distances(centers_low)
    
#     fig = go.Figure(data=go.Heatmap(
#         z=dist_high,
#         x=labels_unique,
#         y=labels_unique,
#         colorscale='Viridis',
#         colorbar=dict(title='High-Dim Distances')
#     ))
#     fig.update_layout(title='Cluster Center Distances in High-Dimensional Space')
#     fig.show()

#     fig = go.Figure(data=go.Heatmap(
#         z=dist_low,
#         x=labels_unique,
#         y=labels_unique,
#         colorscale='Viridis',
#         colorbar=dict(title='Low-Dim Distances')
#     ))
#     fig.update_layout(title='Cluster Center Distances in Low-Dimensional Space')
#     fig.show()

# plot_cluster_distance_heatmap(top_vectors, proj_umap, labels)



Silhouette score (excluding label 0 and -1): 0.5179
Cluster center distance correlation (Pearson): 0.2805
Cluster center distance correlation (Spearman): 0.2647


In [40]:
import numpy as np
import plotly.graph_objects as go
from scipy.spatial.distance import cdist
from plotly.subplots import make_subplots
def plot_cluster_distance_heatmap_plotly(high_dim, low_dim, labels):
    # クラスタごとの重心を計算
    unique_labels = np.unique(labels)
    centers_high = []
    centers_low = []
    for lab in unique_labels:
        idx = labels == lab
        centers_high.append(high_dim[idx].mean(axis=0))
        centers_low.append(low_dim[idx].mean(axis=0))
    centers_high = np.vstack(centers_high)
    centers_low = np.vstack(centers_low)

    # 距離行列を計算
    dist_high = cdist(centers_high, centers_high)
    dist_low = cdist(centers_low, centers_low)

    # Figure 作成
    fig = make_subplots(rows=3, cols=1, subplot_titles=("High-dimensional", "Low-dimensional"))

    fig.add_trace(
        go.Heatmap(
            z=dist_high,
            x=unique_labels,
            y=unique_labels,
            colorscale="Blues",
            text=np.round(dist_high, 2),
            texttemplate="%{text}",
            colorbar=dict(title="Distance", x=0.45)
        ),
        row=1, col=1
    )

    fig.add_trace(
        go.Heatmap(
            z=dist_low,
            x=unique_labels,
            y=unique_labels,
            colorscale="Reds",
            text=np.round(dist_low, 2),
            texttemplate="%{text}",
            colorbar=dict(title="Distance", x=1.05)
        ),
        row=2, col=1
    )
    # 差分
    fig.add_trace(
        go.Heatmap(
            z=dist_high - dist_low,
            x=unique_labels,
            y=unique_labels,
            colorscale="RdBu",
            text=np.round(dist_high - dist_low, 2),
            texttemplate="%{text}",
            colorbar=dict(title="Distance Difference", x=1.65)
        ),
        row=3, col=1
    )

    fig.update_layout(
        title="Cluster center distance comparison",
        width=900,
        height=400,
        showlegend=False
    )

    fig.show()

    return dist_high, dist_low

plot_cluster_distance_heatmap_plotly(top_vectors, proj_umap, labels)


(array([[0.        , 0.39516412, 1.5427491 , ..., 1.45847591, 2.31184074,
         1.59096748],
        [0.39516412, 0.        , 1.52275694, ..., 1.54451295, 2.37090897,
         1.57396431],
        [1.5427491 , 1.52275694, 0.        , ..., 2.18784692, 2.84433209,
         2.25880898],
        ...,
        [1.45847591, 1.54451295, 2.18784692, ..., 0.        , 2.74291746,
         2.16921367],
        [2.31184074, 2.37090897, 2.84433209, ..., 2.74291746, 0.        ,
         2.84221812],
        [1.59096748, 1.57396431, 2.25880898, ..., 2.16921367, 2.84221812,
         0.        ]]),
 array([[0.        , 1.07242818, 4.10368841, ..., 1.28637014, 2.4437567 ,
         2.93116916],
        [1.07242818, 0.        , 4.16784512, ..., 2.28597482, 2.74077629,
         2.59599369],
        [4.10368841, 4.16784512, 0.        , ..., 3.75425202, 6.54743283,
         1.88142544],
        ...,
        [1.28637014, 2.28597482, 3.75425202, ..., 0.        , 3.19755461,
         3.19813635],
        [2.4

In [62]:
import numpy as np
from sklearn.metrics import pairwise_distances

def cluster_trustworthiness_continuity(X_high, X_low, labels, k=5):
    """
    クラスタ代表点（重心）単位で Trustworthiness / Continuity を計算
    
    Parameters
    ----------
    X_high : ndarray, shape (n_samples, n_features)
        高次元データ
    X_low : ndarray, shape (n_samples, low_dim)
        低次元データ（可視化後）
    labels : ndarray, shape (n_samples,)
        クラスタラベル
    k : int
        近傍数
    
    Returns
    -------
    trustworthiness : float
    continuity : float
    """
    
    # クラスタ中心
    unique_labels = np.unique(labels)
    centers_high = np.array([X_high[labels==lab].mean(axis=0) for lab in unique_labels])
    centers_low  = np.array([X_low[labels==lab].mean(axis=0)  for lab in unique_labels])
    
    n_clusters = len(unique_labels)
    
    # 高次元・低次元距離行列
    dist_high = pairwise_distances(centers_high)
    dist_low  = pairwise_distances(centers_low)
    
    # 近傍のインデックス
    neighbors_high = np.argsort(dist_high, axis=1)[:, 1:k+1]  # 自分除く
    neighbors_low  = np.argsort(dist_low, axis=1)[:, 1:k+1]
    
    # --- Trustworthiness ---
    trust_sum = 0
    for i in range(n_clusters):
        for rank, j in enumerate(neighbors_low[i], start=1):
            if j not in neighbors_high[i]:
                # 高次元での順位を計算
                rank_in_high = np.where(np.argsort(dist_high[i])[1:] == j)[0][0] + 1
                trust_sum += rank_in_high - k
    
    trustworthiness = 1 - (2 / (n_clusters * k * (2*n_clusters - 3*k -1))) * trust_sum
    
    # --- Continuity ---
    cont_sum = 0
    for i in range(n_clusters):
        for rank, j in enumerate(neighbors_high[i], start=1):
            if j not in neighbors_low[i]:
                rank_in_low = np.where(np.argsort(dist_low[i])[1:] == j)[0][0] + 1
                cont_sum += rank_in_low - k
    
    continuity = 1 - (2 / (n_clusters * k * (2*n_clusters - 3*k -1))) * cont_sum
    
    return trustworthiness, continuity


trust, cont = cluster_trustworthiness_continuity(_top_vectors, _proj_umap, _labels, k=85)
print(f"Cluster-level Trustworthiness: {trust:.4f}")
print(f"Cluster-level Continuity: {cont:.4f}")

Cluster-level Trustworthiness: 6.4522
Cluster-level Continuity: 6.5151


In [None]:
import numpy as np
import pandas as pd
from dash import Dash, dcc, html, Input, Output
import plotly.express as px
from scipy.spatial.distance import cdist


# --- データ準備 ---
df = pd.DataFrame({
    'x': proj_umap[:,0],
    'y': proj_umap[:,1],
    'cluster': recursive_labels,
})

# -1（ノイズ）を除いたクラスタラベル
unique_labels = np.unique(recursive_labels)
valid_labels = unique_labels[unique_labels != -1]

# 高次元クラスタ中心
centers_high = np.array([top_vectors[recursive_labels==lab].mean(axis=0) for lab in valid_labels])
centers_low  = np.array([proj_umap[recursive_labels==lab].mean(axis=0) for lab in valid_labels])

# Dash アプリ
app = Dash(__name__)

app.layout = html.Div([
    html.H3("Cluster distance coloring by clicked cluster"),
    dcc.Graph(id='scatter-graph')
])

@app.callback(
    Output('scatter-graph', 'figure'),
    Input('scatter-graph', 'clickData')
)
def update_scatter(clickData):
    # -1クラスタは除外
    df_plot = df[df['cluster'] != -1].copy()
    
    if clickData is None:
        fig = px.scatter(df_plot, x='x', y='y', color='cluster', 
                         title="Click a point to color by high-dimensional cluster distance")
        return fig
    
    # クリックした点のインデックスを取得
    point_index = clickData['points'][0]['pointIndex']
    clicked_cluster = recursive_labels[point_index]
    if clicked_cluster == -1:
        fig = px.scatter(df_plot, x='x', y='y', color='cluster', 
                         title="Noise point clicked. Showing cluster coloring.")
        return fig

    # クリッククラスタ中心（高次元）
    idx_clicked = np.where(valid_labels == clicked_cluster)[0][0]
    center_clicked_high = centers_high[idx_clicked]

    # 高次元クラスタ中心間距離
    dist_to_clicked = cdist(center_clicked_high.reshape(1,-1), centers_high).flatten()
    # 順位に変換
    dist_to_clicked = np.argsort(np.argsort(dist_to_clicked))
    # scaling(小さい値付近を強調)
    dist_to_clicked = np.sqrt(dist_to_clicked)
    # 各点に対応するクラスタ距離を割り当て
    cluster_distance_map = dict(zip(valid_labels, dist_to_clicked))
    df_plot['color'] = df_plot['cluster'].map(cluster_distance_map)
    # Plotlyで散布図
    fig = px.scatter(df_plot, x='x', y='y', color='color',
                     color_continuous_scale='Plasma',
                     hover_data=['cluster'],
                     title=f"Distance from clicked cluster {clicked_cluster} (high-dim)")
    fig.update_layout(coloraxis_colorbar=dict(title='Distance'), height=800, width=800)
    fig.update_traces(marker=dict(size=3))
    fig.update_traces(marker=dict(size=3))
    return fig

if __name__ == '__main__':
    app.run(debug=True)


In [None]:
import numpy as np
import pandas as pd
from dash import Dash, dcc, html, Input, Output
import plotly.express as px
from scipy.spatial.distance import cdist

# --- データ準備 ---
df = pd.DataFrame({
    'x': proj_umap[:,0],
    'y': proj_umap[:,1],
    'cluster': labels
})

# Dash アプリ
app = Dash(__name__)

app.layout = html.Div([
    html.H3("Distance to clicked cluster (high-dimensional)"),
    dcc.Graph(id='scatter-graph')
])

@app.callback(
    Output('scatter-graph', 'figure'),
    Input('scatter-graph', 'clickData')
)
def update_scatter(clickData):
    df_plot = df.copy()
    
    if clickData is None:
        # 初期表示: クラスタごとの色
        fig = px.scatter(df_plot, x='x', y='y', color='cluster', 
                         title="Click a point to color by high-dimensional distance to clicked cluster")
        return fig
    
    # クリックした点のインデックス
    point_index = clickData['points'][0]['pointIndex']
    clicked_cluster = labels[point_index]

    # クリッククラスタの全点の高次元ベクトル
    clicked_indices = np.where(labels == clicked_cluster)[0]
    clicked_vectors = top_vectors[clicked_indices]

    # 各点との距離（最小距離でも平均距離でも可）
    # ここでは各点に対して「クリッククラスタ内の点との最小距離」を使う例
    dist = np.min(cdist(top_vectors, clicked_vectors), axis=1)
    
    # scaling(小さい値付近を強調)
    # dist = (dist - np.min(dist)) / (np.max(dist) - np.min(dist))
    dist = np.sqrt(dist)  # 非線形変換で小さい値を強調

    
    # データフレームに追加
    df_plot['color'] = dist

    # Plotly で散布図
    fig = px.scatter(df_plot, x='x', y='y', color='color',
                     color_continuous_scale='Plasma', # 他には 'Plasma', 'Cividis', 'Inferno', 'Magma'
                     hover_data=['cluster'],
                     title=f"Distance to clicked cluster {clicked_cluster} (high-dim)")
    fig.update_layout(coloraxis_colorbar=dict(title='Distance'), height=800, width=800)
    fig.update_traces(marker=dict(size=3))

    return fig

if __name__ == '__main__':
    app.run(debug=True)


In [30]:
# dbscanのパラメータ調整
for eps in [0.04, 0.05, 0.06, 0.07, 0.1, 0.3, 0.5, 0.7, 1.0]:
    for min_samples in [3, 5, 10]:
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        labels = dbscan.fit_predict(proj_umap)
        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
        n_noise = list(labels).count(-1) # クラスタに含まれない点の数
        if n_clusters > 1:  # クラスタが1つ以上の場合のみ評価
            silhouette_avg = silhouette(proj_umap, labels)
            print(f"eps={eps}, min_samples={min_samples} => clusters: {n_clusters}, noise: {n_noise}, silhouette: {silhouette_avg:.4f}")
            

eps=0.04, min_samples=3 => clusters: 857, noise: 3296, silhouette: -0.0169
eps=0.04, min_samples=5 => clusters: 388, noise: 5982, silhouette: -0.3351
eps=0.04, min_samples=10 => clusters: 56, noise: 9010, silhouette: -0.5875
eps=0.05, min_samples=3 => clusters: 711, noise: 1896, silhouette: 0.0600
eps=0.05, min_samples=5 => clusters: 422, noise: 4084, silhouette: -0.1306
eps=0.05, min_samples=10 => clusters: 97, noise: 8103, silhouette: -0.5574
eps=0.06, min_samples=3 => clusters: 489, noise: 1076, silhouette: -0.0153
eps=0.06, min_samples=5 => clusters: 390, noise: 2510, silhouette: 0.0040
eps=0.06, min_samples=10 => clusters: 148, noise: 6715, silhouette: -0.4153
eps=0.07, min_samples=3 => clusters: 316, noise: 652, silhouette: -0.1417
eps=0.07, min_samples=5 => clusters: 285, noise: 1546, silhouette: -0.0602
eps=0.07, min_samples=10 => clusters: 170, noise: 5290, silhouette: -0.2566
eps=0.1, min_samples=3 => clusters: 104, noise: 149, silhouette: -0.2932
eps=0.1, min_samples=5 => cl

In [20]:
# DBSCANで可視化結果をクラスタリングして色付け
dbscan_pca = DBSCAN(eps=3, min_samples=5).fit(proj_pca)
dbscan_tsne = DBSCAN(eps=3, min_samples=5).fit(proj_tsne)
dbscan_umap = DBSCAN(eps=3, min_samples=5).fit(proj_umap)

# plot
fig_pca_dbscan = px.scatter(x=proj_pca[:, 0], y=proj_pca[:, 1], color=dbscan_pca.labels_.astype(str), hover_name=top_words, title='PCA Projection with DBSCAN Clusters')
fig_pca_dbscan.update_traces(marker=dict(size=2)) # size 1
fig_pca_dbscan.update_layout(height=800, width=800) # size 2
fig_pca_dbscan.show()

fig_tsne_dbscan = px.scatter(x=proj_tsne[:, 0], y=proj_tsne[:, 1], color=dbscan_tsne.labels_.astype(str), hover_name=top_words, title='t-SNE Projection with DBSCAN Clusters')
fig_tsne_dbscan.update_layout(height=800, width=800)
fig_tsne_dbscan.show()
fig_umap_dbscan = px.scatter(x=proj_umap[:, 0], y=proj_umap[:, 1], color=dbscan_umap.labels_.astype(str), hover_name=top_words, title='UMAP Projection with DBSCAN Clusters')
fig_umap_dbscan.update_layout(height=800, width=800)
fig_umap_dbscan.show()

