In [11]:
# import 
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
import plotly.express as px
from sklearn.datasets import fetch_openml
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from umap import UMAP

## mnist

In [12]:
# Load MNIST from OpenML and keep only the first `num_samples` rows.

mnist = fetch_openml('mnist_784', version=1, as_frame=False)
num_samples = 2000
# Divide by 255.0 (presumably rescales 0-255 pixel intensities to [0, 1] — confirm).
X = mnist.data[:num_samples] / 255.0
# Targets are cast to int so they can be used as numeric digit labels.
labels = mnist.target.astype(int)[:num_samples]
print(X.shape, labels.shape)


(2000, 784) (2000,)


In [3]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import pairwise_distances

k = 500  # number of landmarks (tunable)

# Use the k-means cluster centres as landmarks (random selection would also work)
kmeans = KMeans(n_clusters=k, random_state=0, n_init=10).fit(X)
landmarks = kmeans.cluster_centers_  # (k, D)

# Fit PCA on the landmarks only
n_components = 2
pca_land = PCA(n_components=n_components)
land_emb = pca_land.fit_transform(landmarks)  # (k, n_components)

# Project the full data set onto the PCA basis learned from the landmarks.
# PCA is a linear map, so calling transform on the fitted model is sufficient.
X_proj_via_land = pca_land.transform(X)  # (N, n_components)

# Comparison: PCA fitted on landmarks vs. PCA fitted on all data (if affordable)
# Full-data PCA (feasible when the data set is small)
# pca_all = PCA(n_components=n_components).fit(X)
# X_pca_all = pca_all.transform(X)

# Scatter plot of the landmark-PCA projection, coloured by digit label.
fig = px.scatter(x=X_proj_via_land[:, 0], y=X_proj_via_land[:, 1], color=labels.astype(str),
                 title=f"PCA via {k} landmarks",
                 labels={'x': 'PC1', 'y': 'PC2', 'color': 'Digit'})
fig.update_traces(marker=dict(size=3))
fig.show()


In [4]:
# Baseline: 2-component PCA fitted on all of X, plotted coloured by digit label.

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
fig = px.scatter(x=X_pca[:, 0], y=X_pca[:, 1], color=labels.astype(str),
                 title="PCA (all data)",
                 labels={'x': 'PC1', 'y': 'PC2', 'color': 'Digit'})
fig.update_traces(marker=dict(size=3))
fig.show()

In [7]:
import numpy as np
from sklearn.metrics import pairwise_distances
from sklearn.manifold import MDS

# Example input (kept for reference):
# X = np.random.rand(1000, 50)  # 1000 points, 50 dimensions
k = 500  # number of landmarks

# 1. Landmark selection: sample k distinct rows uniformly at random.
#    A dedicated seeded generator makes the selection reproducible, matching
#    the seeded MDS below (the original draw used the unseeded global RNG).
landmark_idx = np.random.RandomState(0).choice(X.shape[0], k, replace=False)
landmarks = X[landmark_idx]

# 2. Pairwise distances between landmarks
D_landmark = pairwise_distances(landmarks)

# 3. Embed the landmarks in 2D with MDS.
#    n_init is pinned to the current default (4) so results do not change when
#    scikit-learn switches the default to 1, and the FutureWarning goes away.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=0, n_init=4)
landmark_emb = mds.fit_transform(D_landmark)

# 4. Approximate the embedding of every point from the landmark embedding
D_all = pairwise_distances(X, landmarks)  # distances from every point to every landmark
weights = 1 / (D_all + 1e-8)  # inverse-distance weights (epsilon avoids division by zero)
weights /= weights.sum(axis=1, keepdims=True)  # each row sums to 1
X_emb = weights @ landmark_emb  # interpolate as a weighted mean of landmark positions

fig = px.scatter(x=X_emb[:, 0], y=X_emb[:, 1], color=labels.astype(str),
                 title=f"MDS via {k} landmarks",
                 labels={'x': 'Dim1', 'y': 'Dim2', 'color': 'Digit'})
fig.update_traces(marker=dict(size=3))
fig.show()



The default value of `n_init` will change from 4 to 1 in 1.9.



In [8]:
# Plain MDS fitted on all data (baseline for the landmark variant).
# n_init is pinned to the current default (4) so the result stays the same when
# scikit-learn changes the default to 1, and the FutureWarning is not emitted.
# NOTE(review): the saved output shows this cell ended in a KeyboardInterrupt,
# so a full run on X is presumably very slow — confirm before running.
mds = MDS(n_components=2, dissimilarity="euclidean", random_state=0, n_init=4)
X_mds = mds.fit_transform(X)
fig = px.scatter(x=X_mds[:, 0], y=X_mds[:, 1], color=labels.astype(str),
                 title="MDS (all data)",
                 labels={'x': 'Dim1', 'y': 'Dim2', 'color': 'Digit'})
fig.update_traces(marker=dict(size=3))
fig.show()


The default value of `n_init` will change from 4 to 1 in 1.9.



KeyboardInterrupt: 

In [15]:
import numpy as np
import umap


# --- Subsample the landmarks ---
landmark_size = 500
# Seeded generator so the landmark subset is reproducible, consistent with the
# seeded UMAP model below (the original draw used the unseeded global RNG).
idx_landmarks = np.random.RandomState(42).choice(X.shape[0], landmark_size, replace=False)
X_landmarks = X[idx_landmarks]

# --- Fit UMAP on the landmarks only ---
umap_model = umap.UMAP(n_components=2, random_state=42)
umap_model.fit(X_landmarks)

# --- Project the full data set with the landmark-trained model ---
X_umap_all = umap_model.transform(X)

# --- Visualise ---
fig = px.scatter(x=X_umap_all[:, 0], y=X_umap_all[:, 1], color=labels.astype(str),
                 title=f"UMAP via {landmark_size} landmarks",
                 labels={'x': 'UMAP1', 'y': 'UMAP2', 'color': 'Digit'})
fig.update_traces(marker=dict(size=3))
fig.show()



n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [13]:
# UMAP fitted on all data (baseline for the landmark variants).
# The fitted model is bound to `umap_full_model` rather than `umap`: rebinding
# `umap` would shadow the imported `umap` module and make every later
# `umap.UMAP(...)` call fail until the module is imported again.
umap_full_model = UMAP(n_components=2, random_state=42)
X_umap = umap_full_model.fit_transform(X)
fig = px.scatter(x=X_umap[:, 0], y=X_umap[:, 1], color=labels.astype(str),
                 title="UMAP (all data)",
                 labels={'x': 'UMAP1', 'y': 'UMAP2', 'color': 'Digit'})
fig.update_traces(marker=dict(size=3))
fig.show()


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [None]:
import numpy as np
import umap
from sklearn.cluster import KMeans

def landmark_umap(X, n_clusters=10, n_components=2, random_state=0):
    """Embed X with a UMAP model trained only on k-means centroids.

    Parameters
    ----------
    X : array passed to KMeans.fit and UMAP.transform.
    n_clusters : number of k-means clusters, i.e. number of landmarks.
    n_components : dimensionality of the UMAP embedding.
    random_state : seed given to both KMeans and UMAP.

    Returns
    -------
    The result of transforming every row of X with the landmark-trained UMAP.
    """
    # Step 1: the landmarks are the k-means cluster centres.
    centroids = KMeans(
        n_clusters=n_clusters, random_state=random_state, n_init=10
    ).fit(X).cluster_centers_

    # Step 2: train UMAP on the landmarks alone.
    reducer = umap.UMAP(n_components=n_components, random_state=random_state)
    reducer.fit(centroids)

    # Step 3: push the full data set through the fitted model.
    return reducer.transform(X)

# Embed all points via UMAP trained on 100 k-means landmarks and plot the result.
X_emb = landmark_umap(X, n_clusters=100)
# The title names the landmark method; the previous "UMAP (all data)" title was
# copied from the full-data baseline and mislabelled this figure.
fig = px.scatter(x=X_emb[:, 0], y=X_emb[:, 1], color=labels.astype(str),
                 title="UMAP via 100 k-means landmarks",
                 labels={'x': 'UMAP1', 'y': 'UMAP2', 'color': 'Digit'})
fig.update_traces(marker=dict(size=3))
fig.show()



n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [22]:
from sklearn.manifold import MDS
from sklearn.metrics import pairwise_distances

def landmark_mds(X, n_clusters=10, n_components=2, random_state=0):
    """Approximate an MDS embedding of X through k-means landmarks.

    The landmarks (k-means centres) are embedded with metric MDS on their
    pairwise distance matrix; every row of X is then placed at the
    inverse-distance-weighted mean of the landmark positions.

    Parameters
    ----------
    X : array passed to KMeans.fit and pairwise_distances.
    n_clusters : number of k-means clusters, i.e. number of landmarks.
    n_components : dimensionality of the MDS embedding.
    random_state : seed given to both KMeans and MDS.

    Returns
    -------
    Array with one row per row of X and `n_components` columns.
    """
    # 1. Landmark selection by clustering
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=10)
    kmeans.fit(X)
    landmarks = kmeans.cluster_centers_

    # 2. Distance matrix between landmarks
    D_landmarks = pairwise_distances(landmarks)

    # 3. Embed the landmarks with MDS. n_init is pinned to the current default
    #    (4) so results stay stable when scikit-learn changes the default to 1,
    #    and the corresponding FutureWarning is not raised.
    mds = MDS(n_components=n_components, dissimilarity='precomputed',
              random_state=random_state, n_init=4)
    landmark_emb = mds.fit_transform(D_landmarks)

    # 4. Interpolate all points as an inverse-distance-weighted mean of the
    #    landmark positions (epsilon avoids division by zero; rows sum to 1).
    D_all_landmarks = pairwise_distances(X, landmarks)
    weights = 1 / (D_all_landmarks + 1e-8)
    weights /= weights.sum(axis=1, keepdims=True)
    X_emb = weights @ landmark_emb
    return X_emb


# Embed all points via landmark MDS (100 k-means landmarks) and plot the result.
X_emb = landmark_mds(X, n_clusters=100)
# Title and axis labels describe MDS; the previous UMAP wording was a
# copy-paste leftover that mislabelled this figure.
fig = px.scatter(x=X_emb[:, 0], y=X_emb[:, 1], color=labels.astype(str),
                 title="MDS via 100 k-means landmarks",
                 labels={'x': 'Dim1', 'y': 'Dim2', 'color': 'Digit'})
fig.update_traces(marker=dict(size=3))
fig.show()


The default value of `n_init` will change from 4 to 1 in 1.9.



In [26]:
from sklearn.manifold import Isomap

def landmark_isomap(X, n_clusters=10, n_neighbors=5, n_components=2, random_state=0):
    """Embed X with an Isomap model trained only on k-means centroids.

    Parameters
    ----------
    X : array passed to KMeans.fit and Isomap.transform.
    n_clusters : number of k-means clusters, i.e. number of landmarks.
    n_neighbors : neighbourhood size given to Isomap.
    n_components : dimensionality of the Isomap embedding.
    random_state : seed given to KMeans (Isomap itself receives no seed here).

    Returns
    -------
    The result of transforming every row of X with the landmark-trained Isomap.
    """
    # Landmarks are the k-means cluster centres.
    centroids = KMeans(
        n_clusters=n_clusters, random_state=random_state, n_init=10
    ).fit(X).cluster_centers_

    # Train Isomap on the landmarks alone.
    reducer = Isomap(n_neighbors=n_neighbors, n_components=n_components)
    reducer.fit(centroids)

    # Map the full data set through the fitted model.
    return reducer.transform(X)

# Embed all points via landmark Isomap (100 k-means landmarks) and plot the result.
X_emb = landmark_isomap(X, n_clusters=100)
# Title and axis labels describe Isomap; the previous UMAP wording was a
# copy-paste leftover that mislabelled this figure.
fig = px.scatter(x=X_emb[:, 0], y=X_emb[:, 1], color=labels.astype(str),
                 title="Isomap via 100 k-means landmarks",
                 labels={'x': 'Dim1', 'y': 'Dim2', 'color': 'Digit'})
fig.update_traces(marker=dict(size=3))
fig.show()


In [None]:
# Draws `n_landmarks` distinct row indices of X from a generator seeded with `random_state`.
# NOTE(review): neither `random_state` nor `n_landmarks` is assigned in this cell, so it
# only runs if both already exist in the kernel namespace — confirm where they come from.
idx_landmarks = np.random.RandomState(seed=random_state).choice(X.shape[0], n_landmarks, replace=False)

# landmark umap

In [30]:
def normalized_stress(X, X_proj):
    """Return the normalized stress between X and its projection X_proj.

    Computed as sum((D_high - D_low)**2) / sum(D_high**2), where D_high and
    D_low are the full pairwise distance matrices of X and X_proj.
    """
    high_d = pairwise_distances(X)
    low_d = pairwise_distances(X_proj)
    residual = high_d - low_d
    return np.sum(residual ** 2) / np.sum(high_d ** 2)

In [31]:
random_state = 42

# Fit UMAP on all of X and print the normalized stress of the resulting embedding.
# NOTE(review): `n_components` is not assigned in this cell; it is read from the
# kernel namespace (the landmark-PCA cell sets it to 2) — confirm that cell has run.
umap_model = umap.UMAP(n_components=n_components, random_state=random_state)
X_proj = umap_model.fit_transform(X)
print(normalized_stress(X, X_proj))



n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



0.27264942906238004


In [41]:
def landmark_stress(X, n_landmarks=10, n_components=2, random_state=42):
    """Normalized stress of a UMAP embedding fitted on a random landmark subset.

    Parameters
    ----------
    X : array whose rows are sampled as landmarks.
    n_landmarks : number of distinct rows drawn (without replacement).
    n_components : dimensionality of the UMAP embedding.
    random_state : seed for both the landmark draw and UMAP. It used to be read
        from a module-level variable; it is now an explicit parameter whose
        default (42) matches the value that variable is assigned in the notebook,
        so the function no longer depends on hidden kernel state.

    Returns
    -------
    normalized_stress(landmarks, embedding_of_landmarks).
    """
    idx_landmarks = np.random.RandomState(seed=random_state).choice(X.shape[0], n_landmarks, replace=False)
    landmarks = X[idx_landmarks]

    model = umap.UMAP(n_components=n_components, random_state=random_state)
    X_proj_landmarks = model.fit_transform(landmarks)
    return normalized_stress(landmarks, X_proj_landmarks)

from tqdm import tqdm

# Sweep the number of landmarks and plot stress against the actual landmark
# count (previously the x-axis was just the list index 0..9).
landmark_counts = list(range(10, 500, 50))
stress_list = [landmark_stress(X, i) for i in tqdm(landmark_counts)]
px.line(x=landmark_counts, y=stress_list,
        labels={'x': 'Number of landmarks', 'y': 'Normalized stress'},
        title="UMAP normalized stress vs. number of landmarks").show()




n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_neighbors is larger than the dataset size; truncating to X.shape[0] - 1


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.

100%|██████████| 10/10 [00:03<00:00,  2.55it/s]


In [None]:
def landmark_umap(X, n_landmarks=10, n_components=2, random_state=0):
    """Embed X with a UMAP model trained on randomly sampled landmark rows.

    Parameters
    ----------
    X : array whose rows are sampled as landmarks and then transformed.
    n_landmarks : number of distinct rows drawn (without replacement).
    n_components : dimensionality of the UMAP embedding.
    random_state : seed for both the landmark draw and UMAP.

    Returns
    -------
    The result of transforming every row of X with the landmark-trained UMAP.

    Notes
    -----
    An earlier version also ran a K-means fit here, but its centres were
    overwritten by the random sample before being used; that dead (and
    expensive) fit has been removed. The printed shape is unchanged.
    """
    # 1. Pick the landmarks uniformly at random
    idx_landmarks = np.random.RandomState(seed=random_state).choice(X.shape[0], n_landmarks, replace=False)
    landmarks = X[idx_landmarks]
    print(landmarks.shape)

    # 2. Fit UMAP on the landmarks only
    umap_model = umap.UMAP(n_components=n_components, random_state=random_state)
    umap_model.fit(landmarks)

    # 3. Project the full data set
    X_emb = umap_model.transform(X)
    return X_emb

# Embed all points via UMAP trained on 100 randomly sampled landmarks and plot.
X_emb = landmark_umap(X, n_landmarks=100)
# The title names the landmark method; the previous "UMAP (all data)" title was
# copied from the full-data baseline and mislabelled this figure.
fig = px.scatter(x=X_emb[:, 0], y=X_emb[:, 1], color=labels.astype(str),
                 title="UMAP via 100 random landmarks",
                 labels={'x': 'UMAP1', 'y': 'UMAP2', 'color': 'Digit'})
fig.update_traces(marker=dict(size=3))
fig.show()

(100, 784)



n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



# 評価関数、視覚化の準備

In [3]:

import numpy as np
from sklearn.metrics import pairwise_distances
from sklearn.neighbors import NearestNeighbors

# Trustworthiness
def eval_trustworthiness(X, X_proj, n_neighbors=7):
    """Trustworthiness of the projection X_proj with respect to X.

    Penalises points that are among the `n_neighbors` nearest neighbours of i
    in the projection but not in the original space, by how far outside the
    original neighbourhood they lie:

        T = 1 - 2 / (N k (2N - 3k - 1)) * sum_i sum_{j in kNN_proj(i)} max(0, r(i, j) - k)

    where r(i, j) is the 1-based rank of j among the original-space neighbours
    of i. The normalisation is only meaningful for n_neighbors < N / 2.

    Parameters
    ----------
    X : original data, one row per point.
    X_proj : projected data, rows aligned with X.
    n_neighbors : neighbourhood size k.

    Returns
    -------
    Trustworthiness score (1 means no intruders).
    """
    N = X.shape[0]

    # Rank all points by original-space distance from each i. The diagonal is
    # forced to -inf so that i itself always takes rank 0 (even with duplicate
    # points) and genuine neighbours get ranks 1..N-1.
    dist_X = pairwise_distances(X)
    np.fill_diagonal(dist_X, -np.inf)
    rank_X = np.argsort(np.argsort(dist_X, axis=1), axis=1)

    # k nearest neighbours in the projection. kneighbors() called without a
    # query already excludes each point from its own neighbour list, so exactly
    # n_neighbors are requested and none are sliced off (slicing would drop the
    # true nearest neighbour and admit the (k+1)-th).
    nn_proj = NearestNeighbors(n_neighbors=n_neighbors).fit(X_proj)
    neighbors_proj = nn_proj.kneighbors(return_distance=False)

    # Original-space rank of every projected neighbour; only ranks beyond k
    # (points outside the original k-neighbourhood) contribute r - k.
    ranks = rank_X[np.arange(N)[:, None], neighbors_proj]
    excess = ranks - n_neighbors
    t_sum = excess[excess > 0].sum()

    norm = 2 / (N * n_neighbors * (2 * N - 3 * n_neighbors - 1))
    T = 1 - norm * t_sum
    return T



# Continuity
def eval_continuity(X, X_proj, n_neighbors=7):
    """Continuity of the projection X_proj with respect to X.

    Mirror image of trustworthiness: penalises points that are among the
    `n_neighbors` nearest neighbours of i in the original space but are pushed
    out of the k-neighbourhood in the projection:

        C = 1 - 2 / (N k (2N - 3k - 1)) * sum_i sum_{j in kNN_orig(i)} max(0, r_proj(i, j) - k)

    where r_proj(i, j) is the 1-based rank of j among the projected neighbours
    of i. The normalisation is only meaningful for n_neighbors < N / 2.

    Parameters
    ----------
    X : original data, one row per point.
    X_proj : projected data, rows aligned with X.
    n_neighbors : neighbourhood size k.

    Returns
    -------
    Continuity score (1 means no missing neighbours).
    """
    N = X.shape[0]

    # Rank all points by projected distance from each i; -inf on the diagonal
    # pins i itself at rank 0 so genuine neighbours get ranks 1..N-1.
    dist_proj = pairwise_distances(X_proj)
    np.fill_diagonal(dist_proj, -np.inf)
    rank_proj = np.argsort(np.argsort(dist_proj, axis=1), axis=1)

    # k nearest neighbours in the original space. kneighbors() without a query
    # already excludes each point from its own list, so no slicing is needed.
    nn_orig = NearestNeighbors(n_neighbors=n_neighbors).fit(X)
    neighbors_orig = nn_orig.kneighbors(return_distance=False)

    # Projected rank of every original neighbour; only ranks beyond k count.
    ranks = rank_proj[np.arange(N)[:, None], neighbors_orig]
    excess = ranks - n_neighbors
    c_sum = excess[excess > 0].sum()

    norm = 2 / (N * n_neighbors * (2 * N - 3 * n_neighbors - 1))
    C = 1 - norm * c_sum
    return C

# Stress coefficient: measures the gap between the original and projected distance matrices
def eval_normalized_stress(X, X_proj):
    """Return sum((D_high - D_low)**2) / sum(D_high**2) over all point pairs.

    D_high and D_low are the full pairwise distance matrices of X and X_proj.
    """
    high_d = pairwise_distances(X)
    low_d = pairwise_distances(X_proj)
    residual = high_d - low_d
    return np.sum(residual ** 2) / np.sum(high_d ** 2)



# Shepard diagram helper: paired distances in the original and projected space
def eval_shepard_diagram_data(X, X_proj):
    """Return two 1-D arrays of pairwise distances (original, projected).

    Only the strict upper triangle of each distance matrix is kept, so every
    unordered pair appears once and the zero diagonal is left out.
    """
    high_d = pairwise_distances(X)
    low_d = pairwise_distances(X_proj)
    rows, cols = np.triu_indices_from(high_d, k=1)
    return high_d[rows, cols], low_d[rows, cols]

# Average Local Error (optional, related to trustworthiness-like structure)
def eval_average_local_error(X, X_proj, n_neighbors=5):
    """Mean absolute distance error over each point's original-space neighbours.

    For every point i and each of its `n_neighbors` nearest neighbours j in the
    original space, takes |d_orig(i, j) - d_proj(i, j)| and averages over all
    N * n_neighbors pairs.

    Parameters
    ----------
    X : original data, one row per point.
    X_proj : projected data, rows aligned with X.
    n_neighbors : neighbourhood size.

    Returns
    -------
    The average absolute distance error.
    """
    N = X.shape[0]
    dist_X = pairwise_distances(X)
    dist_proj = pairwise_distances(X_proj)

    # kneighbors() without a query already excludes each point from its own
    # neighbour list, so exactly n_neighbors are requested and none are sliced
    # off (slicing would drop the true nearest neighbour).
    nn_orig = NearestNeighbors(n_neighbors=n_neighbors).fit(X)
    neighbors_orig = nn_orig.kneighbors(return_distance=False)

    rows = np.arange(N)[:, None]
    errors = np.abs(dist_X[rows, neighbors_orig] - dist_proj[rows, neighbors_orig])
    return errors.sum() / (N * n_neighbors)

### ラベルあり ###

# シルエットスコア
from sklearn.metrics import silhouette_score
def eval_silhouette(X_proj, labels):
    """Silhouette score of the projected points grouped by `labels`."""
    score = silhouette_score(X_proj, labels)
    return score

# Neighborhood hit: fraction of nearest neighbours in the projected space that share the point's label
def eval_neighborhood_hit(X_proj, labels, n_neighbors=7):
    """Average share of each point's projected neighbours with the same label.

    Parameters
    ----------
    X_proj : projected data, one row per point.
    labels : class label per row of X_proj.
    n_neighbors : neighbourhood size.

    Returns
    -------
    Value in [0, 1]; 1 means every neighbour carries the point's own label.
    """
    N = X_proj.shape[0]
    labels = np.asarray(labels)

    # kneighbors() without a query already excludes each point from its own
    # neighbour list, so exactly n_neighbors are requested and none are sliced
    # off (slicing would drop the true nearest neighbour).
    nn = NearestNeighbors(n_neighbors=n_neighbors).fit(X_proj)
    neighbors = nn.kneighbors(return_distance=False)

    # Compare every neighbour's label with the label of the point itself.
    hit_sum = np.sum(labels[neighbors] == labels[:, None])
    return hit_sum / (N * n_neighbors)

from scipy.stats import spearmanr, pearsonr

def cluster_center(X, labels):
    """Return (per-label mean rows of X, sorted unique labels).

    Row i of the returned centre array is the mean of the rows of X whose
    label equals the i-th entry of the returned unique-label array.
    """
    unique_labels = np.unique(labels)
    means = []
    for lbl in unique_labels:
        members = X[labels == lbl]
        means.append(members.mean(axis=0))
    return np.array(means), unique_labels

# Correlation of distances between cluster centres
def cluster_center_distance_correlation(X, X_proj, labels):
    """Correlate centre-to-centre distances before and after projection.

    Cluster centres are the per-label means in X and in X_proj. The distances
    between every unordered pair of centres are compared across the two spaces.

    Returns
    -------
    (spearman_corr, pearson_corr) of the two distance vectors.
    """
    # Per-label centres in both spaces
    centers_high = cluster_center(X, labels)[0]
    centers_low = cluster_center(X_proj, labels)[0]

    # Centre-to-centre distance matrices
    high_d = pairwise_distances(centers_high)
    low_d = pairwise_distances(centers_low)

    # Keep each unordered pair once (strict upper triangle)
    rows, cols = np.triu_indices_from(high_d, k=1)
    high_pairs = high_d[rows, cols]
    low_pairs = low_d[rows, cols]

    # Rank and linear correlation; the p-values are discarded
    spearman_corr = spearmanr(high_pairs, low_pairs)[0]
    pearson_corr = pearsonr(high_pairs, low_pairs)[0]
    return spearman_corr, pearson_corr

def eval_pairwise_distances_error(X, X_proj):
    """Return the (N, N) matrix of absolute pairwise-distance differences.

    Entry (i, j) is |d_orig(i, j) - d_proj(i, j)|.
    """
    high_d = pairwise_distances(X)
    low_d = pairwise_distances(X_proj)
    return np.abs(high_d - low_d)

def eval_pairwise_distances(X, X_proj):
    """Return the full pairwise distance matrices of X and of X_proj, in that order."""
    return pairwise_distances(X), pairwise_distances(X_proj)
    
    


In [4]:
# 視覚化用の関数

import plotly.express as px
import plotly.graph_objects as go

# Single value per entry -> bar plot
def plot_bar(metrics_dict, title="Metric comparison"):
    """Show a bar chart with one bar per (name, value) item of `metrics_dict`."""
    bars = [go.Bar(name=key, x=[key], y=[value]) for key, value in metrics_dict.items()]
    fig = go.Figure(data=bars)
    fig.update_layout(title=title, showlegend=False)
    fig.show()

# One value per point -> scatter plot
def plot_scatter(x, values, title="Point-wise metric"):
    """Scatter the first two columns of `x`, coloured by `values` (Viridis scale)."""
    horizontal = x[:, 0]
    vertical = x[:, 1]
    fig = px.scatter(
        x=horizontal,
        y=vertical,
        color=values,
        color_continuous_scale='Viridis',
    )
    fig.update_layout(title=title)
    fig.show()

# Value as a function of k -> line plot
def plot_line(k_list, metric_list, labels_list, title="Metric vs k"):
    """Draw one line (with markers) per series in `metric_list` against `k_list`.

    `labels_list` supplies the legend name of each series; series and names are
    paired positionally.
    """
    traces = [
        go.Scatter(x=k_list, y=series, mode='lines+markers', name=series_name)
        for series, series_name in zip(metric_list, labels_list)
    ]
    fig = go.Figure(data=traces)
    fig.update_layout(title=title, xaxis_title="k", yaxis_title="Metric")
    fig.show()

# Pairwise values / distance matrix -> heatmap
def plot_heat(matrix, title="Distance/Error heatmap"):
    """Show `matrix` as a Viridis heatmap."""
    heat = go.Heatmap(z=matrix, colorscale='Viridis')
    fig = go.Figure(data=heat)
    fig.update_layout(title=title)
    fig.show()
def plot_shepard(D_n, D_q, title="Shepard Diagram"):
    """Scatter original distances against projected distances.

    Inputs exposing `.flatten()` are flattened to 1-D; anything else is used as
    given. A dashed red line from the origin to (max original, max original)
    marks where the two distances would be equal.
    """
    def _as_flat(values):
        # Flatten array-likes; leave objects without .flatten() untouched
        if hasattr(values, 'flatten'):
            return values.flatten()
        return values

    original = _as_flat(D_n)
    projected = _as_flat(D_q)

    axis_names = {'x': 'Original Distance', 'y': 'Projected Distance'}
    fig = px.scatter(x=original, y=projected, labels=axis_names, title=title)

    upper = max(original)
    fig.add_shape(type='line', x0=0, y0=0, x1=upper, y1=upper,
                  line=dict(color='Red', dash='dash'))
    fig.show()

# 実験用データの生成

In [5]:
import numpy as np
from sklearn.utils import check_random_state
from sklearn.decomposition import PCA
from scipy.spatial.distance import cdist
from scipy.cluster.hierarchy import linkage, cophenet
from scipy.spatial.distance import pdist
import plotly.express as px

# -------------------------
# 1) Data generation: build local patches around cluster centres that are laid
#    out on a low-dimensional (2D) skeleton
# -------------------------
def generate_clustered_manifold_hierarchical(
    n_samples=2000,
    n_clusters=8,
    cluster_sizes=None,
    cluster_layout="circle",   # "circle", "grid", "tree"
    local_kind="gaussian",     # "gaussian", "local_swiss", "local_line"
    local_scale=0.5,
    intrinsic_patch_dim=2,     # each cluster latent dim
    embed_dim=50,              # final ambient dim
    noise_scale=0.01,
    random_state=None
):
    """Generate labelled synthetic clusters placed on a 2D skeleton.

    For each cluster a 3-column local patch is sampled around its 2D centre,
    multiplied by a random (3, embed_dim) matrix when embed_dim > 3, and
    perturbed with Gaussian noise.

    Parameters
    ----------
    n_samples : total number of points; only used to derive `cluster_sizes`
        when that argument is None.
    n_clusters : number of clusters.
    cluster_sizes : optional per-cluster point counts, indexed by cluster.
        When None, n_samples is split as evenly as possible.
    cluster_layout : "circle", "grid" or "tree"; how the 2D centres are placed.
    local_kind : "gaussian", "local_swiss" or "local_line"; shape of each patch.
    local_scale : standard deviation of the local sampling.
    intrinsic_patch_dim : latent dimension of a patch; only read by the
        "gaussian" branch.
    embed_dim : ambient dimension. When embed_dim <= 3 the 3-column patch is
        returned unprojected, so the output then has 3 columns.
    noise_scale : standard deviation of the additive Gaussian noise.
    random_state : passed to sklearn's check_random_state.

    Returns
    -------
    X : stacked data matrix, one row per generated point.
    labels : integer cluster index per row of X.
    cluster_centroids_hd : per-cluster mean of the generated rows.
    centers2 : (n_clusters, 2) latent 2D cluster centres.

    Raises
    ------
    ValueError
        For an unknown `cluster_layout` or `local_kind`.
    """
    rng = check_random_state(random_state)

    # Decide the cluster sizes: spread the remainder over the first clusters
    if cluster_sizes is None:
        base = n_samples // n_clusters
        cluster_sizes = [base + (1 if i < (n_samples % n_clusters) else 0) for i in range(n_clusters)]

    # 1) create cluster centers in latent 2D skeleton
    if cluster_layout == "circle":
        # Evenly spaced angles on a circle of radius 5
        angles = np.linspace(0, 2*np.pi, n_clusters, endpoint=False)
        centers2 = np.column_stack([np.cos(angles), np.sin(angles)]) * 5.0
    elif cluster_layout == "grid":
        # Square grid centred on the origin, spacing 3; first n_clusters cells are used
        side = int(np.ceil(np.sqrt(n_clusters)))
        xs = np.arange(side) - (side-1)/2
        ys = xs.copy()
        grid = np.array([(x,y) for x in xs for y in ys])[:n_clusters]
        centers2 = grid * 3.0
    elif cluster_layout == "tree":
        # simple binary-tree laid out in 2D (levels)
        levels = int(np.ceil(np.log2(n_clusters+1)))
        centers = []
        i = 0
        for lvl in range(levels):
            # Up to 2**lvl nodes on this level, limited by the clusters still unplaced
            m = min(2**lvl, n_clusters - i)
            # NOTE(review): with m == 1, np.linspace returns only the left endpoint,
            # so a single node on a level sits at -2**(levels-lvl-1), not at 0 — confirm intent.
            xs = np.linspace(- (2**(levels-lvl-1)), (2**(levels-lvl-1)), m)
            y = -lvl * 3.0
            for x in xs:
                if i >= n_clusters: break
                centers.append((x, y))
                i += 1
        centers2 = np.array(centers[:n_clusters])
    else:
        raise ValueError("unknown layout")

    X_list = []
    labels = []
    cluster_centroids_hd = []

    for k in range(n_clusters):
        m_k = cluster_sizes[k]
        center = centers2[k]

        # === sample local latent coordinates (intrinsic_patch_dim)
        if local_kind == "gaussian":
            # centered gaussian around the cluster center (in latent2->expand to intrinsic dim)
            # create small local coordinates and then map to 2D offset
            Z = rng.normal(scale=local_scale, size=(m_k, intrinsic_patch_dim))
            # map Z to local displacement in 2D latent (simple linear map)
            A = rng.normal(scale=0.5, size=(intrinsic_patch_dim, 2))
            disp2 = Z @ A
            coords2 = center + disp2
            # map coords2 to 3D swiss-like patch or simple 2D->3D lift
            X3 = np.column_stack([coords2[:,0], coords2[:,1], np.sin(coords2[:,0]) * 0.5])
        elif local_kind == "local_swiss":
            # t and h are sampled around the centre's two coordinates; t also drives the spiral
            t = rng.normal(loc=center[0], scale=local_scale, size=m_k)
            h = rng.normal(loc=center[1], scale=local_scale, size=m_k)
            X3 = np.column_stack([t*np.cos(t), h, t*np.sin(t)])
        elif local_kind == "local_line":
            # Only the centre's first coordinate is used; the other two columns are zero
            t = rng.normal(loc=center[0], scale=local_scale, size=m_k)
            X3 = np.column_stack([t, np.zeros_like(t), np.zeros_like(t)])
        else:
            raise ValueError("unknown local_kind")

        # project local 3D patch to higher ambient dimension
        if embed_dim > 3:
            # create random orthonormal-ish projection
            # NOTE(review): W is drawn inside the per-cluster loop, so every cluster gets
            # its own random linear map; if the 2D layout of the centres is meant to
            # survive in the ambient space, a single shared W may be intended — confirm.
            W = rng.normal(size=(3, embed_dim))
            X_hd = X3 @ W
        else:
            X_hd = X3

        # add small isotropic noise
        X_hd += rng.normal(scale=noise_scale, size=X_hd.shape)

        X_list.append(X_hd)
        labels += [k] * m_k
        cluster_centroids_hd.append(X_hd.mean(axis=0))

    X = np.vstack(X_list)
    labels = np.array(labels)
    cluster_centroids_hd = np.vstack(cluster_centroids_hd)

    return X, labels, cluster_centroids_hd, centers2  # return also latent 2D centers for reference
    # Return values: X: data matrix of shape (n_samples, embed_dim)
    #                labels: cluster labels of shape (n_samples,)
    #                cluster_centroids_hd: (n_clusters, embed_dim) centre of each cluster
    #                centers2: (n_clusters, 2) latent 2D cluster centres


# ===============================
# Run the experiment
# ===============================
# Generate the data (10 ambient dimensions); only X and labels are kept from the
# four returned values.
X, labels = generate_clustered_manifold_hierarchical(
    n_samples=2000,
    n_clusters=40,
    cluster_layout="circle", # "circle", "grid", "tree"
    local_kind="local_swiss", # "gaussian", "local_swiss", "local_line"
    local_scale=0.3,
    intrinsic_patch_dim=3,
    embed_dim=10,
    noise_scale=0.01,
    random_state=42
)[:2]
print(X.shape, labels.shape)
# Preview the generated data through its first three principal components.
X_pca = PCA(n_components=3).fit_transform(X)
# A constant `size` column combined with size_max=3 gives every marker the same size.
fig = px.scatter_3d(x=X_pca[:,0], y=X_pca[:,1], z=X_pca[:,2], color=labels.astype(str),
                    size_max=3,
                    opacity=0.7,
                    size=list(np.ones(X_pca.shape[0])),
                    title="Original Data PCA",
                    labels={'x': 'PC1', 'y': 'PC2', 'z': 'PC3', 'color': 'Cluster'})
fig.update_traces(marker=dict(symbol="circle", line=dict(width=0)))
fig.show()



(2000, 10) (2000,)


In [6]:
# 手法ごとにプロットする例
from sklearn.decomposition import PCA
import umap
from sklearn.manifold import TSNE

# Cluster centres
def cluster_centers_highdim(X, labels):
    """Return the mean row of X for each unique label, in sorted label order."""
    return np.array([X[labels == lbl].mean(axis=0) for lbl in np.unique(labels)])


def representative_points(X, labels):
    """Return, for each label, the member point closest to the label's mean.

    Returns
    -------
    (reps, indices): `reps` holds one row of X per unique label (sorted label
    order); `indices` holds the row index of each representative within X.
    """
    chosen_rows = []
    chosen_idx = []
    for lbl in np.unique(labels):
        member_idx = np.where(labels == lbl)[0]
        members = X[labels == lbl]
        # Euclidean distance of every member to the label's mean
        offsets = members - members.mean(axis=0)
        nearest = np.argmin(np.linalg.norm(offsets, axis=1))
        chosen_rows.append(members[nearest])
        chosen_idx.append(member_idx[nearest])
    return np.array(chosen_rows), np.array(chosen_idx)

# Collect the embedding of every method in one place
def embed_methods(X, random_state=None):
    """Compute 2D embeddings of X with PCA, UMAP and t-SNE.

    Parameters
    ----------
    X : data to embed, one row per point.
    random_state : optional seed forwarded to all three estimators so the
        stochastic embeddings (UMAP, t-SNE) can be reproduced. The default
        None keeps the previous unseeded behaviour.

    Returns
    -------
    dict mapping 'PCA', 'UMAP' and 't-SNE' to the corresponding embedding.
    """
    embeddings = dict()
    embeddings['PCA'] = PCA(n_components=2, random_state=random_state).fit_transform(X)
    embeddings['UMAP'] = umap.UMAP(n_components=2, random_state=random_state).fit_transform(X)
    embeddings['t-SNE'] = TSNE(n_components=2, perplexity=10, init='random',
                               random_state=random_state).fit_transform(X)
    return embeddings

# Metric computation
def evaluate_embeddings(X_high, embeddings, labels):
    """Compute quality metrics for every embedding in `embeddings`.

    Parameters
    ----------
    X_high : original high-dimensional data, one row per point.
    embeddings : dict mapping a method name to its low-dimensional embedding
        (rows aligned with X_high).
    labels : class label per row.

    Returns
    -------
    dict mapping each method name to a dict of metrics:
    scalar metrics on all points, `rep_*` metrics on one representative point
    per label, `k-*` lists evaluated for k in range(3, 21, 3), and the
    representative-point distance matrices / error matrix.
    """
    k_values = range(3, 21, 3)
    reps_high, rep_idx = representative_points(X_high, labels)

    # Trustworthiness/continuity are only well defined for k < N / 2. With few
    # representatives (e.g. 10 digit classes) the former fixed k=7 made the
    # normalisation term negative, so k is capped by the number of
    # representatives; with 15 or more representatives it stays at 7.
    n_reps = reps_high.shape[0]
    rep_k = min(7, (n_reps - 1) // 2)

    results = dict()
    for name, X_low in embeddings.items():
        reps_low = X_low[rep_idx]
        metrics = dict()
        metrics['trustworthiness'] = eval_trustworthiness(X_high, X_low)
        metrics['continuity'] = eval_continuity(X_high, X_low)
        metrics['neighborhood_hit'] = eval_neighborhood_hit(X_low, labels)
        metrics['silhouette'] = eval_silhouette(X_low, labels)
        metrics['normalized_stress'] = eval_normalized_stress(X_high, X_low)

        # Metrics restricted to the per-label representative points
        metrics['rep_stress'] = eval_normalized_stress(reps_high, reps_low)
        if rep_k >= 1:
            metrics['rep_trustworthiness'] = eval_trustworthiness(reps_high, reps_low, n_neighbors=rep_k)
            metrics['rep_continuity'] = eval_continuity(reps_high, reps_low, n_neighbors=rep_k)
        else:
            # Fewer than three representatives: neighbourhood metrics are undefined
            metrics['rep_trustworthiness'] = float('nan')
            metrics['rep_continuity'] = float('nan')

        # Metrics as a function of the neighbourhood size
        metrics['k-trustworthiness'] = [eval_trustworthiness(X_high, X_low, n) for n in k_values]
        metrics['k-continuity'] = [eval_continuity(X_high, X_low, n) for n in k_values]
        metrics['k-neighborhood_hit'] = [eval_neighborhood_hit(X_low, labels, n) for n in k_values]

        # Matrix-valued metrics (representatives only; the all-points matrices
        # are not stored, so they are no longer computed here either)
        metrics['rep_pairwise_distance_error'] = eval_pairwise_distances_error(reps_high, reps_low)

        # Pair of distance matrices (high-dimensional, low-dimensional)
        metrics['rep_pairwise_distances'] = eval_pairwise_distances(reps_high, reps_low)

        results[name] = metrics
    return results

def plot_2d(X, labels, title):
    """Scatter the first two columns of X, coloured by label (as strings)."""
    axis_names = {'x': 'Dim1', 'y': 'Dim2', 'color': 'Cluster'}
    fig = px.scatter(
        x=X[:, 0],
        y=X[:, 1],
        color=labels.astype(str),
        title=title,
        labels=axis_names,
    )
    fig.update_traces(marker=dict(size=3))
    fig.show()

# Choose the chart type per kind of metric
def plot_all_metrics(results, embeddings, labels):
    """Plot every metric in `results`, then the 2D embeddings themselves.

    Parameters
    ----------
    results : dict mapping method name -> dict of metrics, as produced by
        evaluate_embeddings.
    embeddings : dict mapping method name -> 2D embedding to scatter-plot.
    labels : class label per embedded point.

    Scalar metrics are drawn as bar charts, k-dependent metrics as line charts,
    matrix metrics as heatmaps and distance-matrix pairs as Shepard diagrams.
    """
    scalar_metrics = ['trustworthiness', 'continuity', 'neighborhood_hit', 'silhouette',
                      'rep_trustworthiness', 'rep_continuity', 'rep_stress', 'normalized_stress']
    # The key must match the one written by evaluate_embeddings
    # ('k-neighborhood_hit'); the former 'k-neighborhood-hit' spelling never
    # matched, so that line chart was silently skipped.
    parametric_metrics = ['k-trustworthiness', 'k-continuity', 'k-neighborhood_hit']
    matrix_metrics = ['rep_pairwise_distance_error']   # e.g. distance-matrix error
    pairwise_metrics = ['rep_pairwise_distances']      # (high-dim, low-dim) distance matrices

    # k values used for the parametric metrics (same range as evaluate_embeddings)
    k_values = list(range(3, 21, 3))

    # Metrics are considered available when the first method reports them;
    # an empty `results` simply yields no metric charts instead of an IndexError.
    available = set(next(iter(results.values()))) if results else set()

    # Bar chart for scalar quality metrics
    for metric_name in scalar_metrics:
        if metric_name in available:
            metrics_dict = {name: vals[metric_name] for name, vals in results.items()}
            plot_bar(metrics_dict, title=f"{metric_name} comparison")

    # Line chart for k-dependent metrics: one series per method
    for metric_name in parametric_metrics:
        if metric_name in available:
            method_series = []
            method_names = []
            for method_name, vals in results.items():
                if metric_name in vals:
                    method_series.append(vals[metric_name])
                    method_names.append(method_name)
            plot_line(k_values, method_series, method_names, title=f"{metric_name} vs k")

    # Heatmap for matrix metrics
    for metric_name in matrix_metrics:
        if metric_name in available:
            for name, vals in results.items():
                if metric_name in vals:
                    plot_heat(vals[metric_name], title=f"{name} - {metric_name}")

    # Scatter for pairwise metrics (Shepard diagram)
    for metric_name in pairwise_metrics:
        if metric_name in available:
            for name, vals in results.items():
                if metric_name in vals:
                    D_n, D_q = vals[metric_name]
                    print(D_n.shape, D_q.shape)
                    plot_shepard(D_n, D_q, title=f"{name} - Shepard Diagram")

    # 2D scatter of each embedding
    for name, X_low in embeddings.items():
        plot_2d(X_low, labels, title=f"{name} embedding")

In [13]:
# Compute the PCA / UMAP / t-SNE embeddings of X once and keep them for the cells below.
embbed_X = embed_methods(X)

In [14]:
# Score each stored embedding against the original data X and the labels.
results = evaluate_embeddings(X, embbed_X, labels)


In [15]:
# Plot the embeddings that were actually evaluated. Calling embed_methods(X)
# again would recompute the unseeded UMAP / t-SNE embeddings, so the scatter
# plots would no longer correspond to the metrics stored in `results`.
plot_all_metrics(results, embbed_X, labels)

(10, 10) (10, 10)


(10, 10) (10, 10)


(10, 10) (10, 10)


# PCA, UMAP, t-SNEの比較
MNIST: continuity, trustworthiness, neighborhood hit など既存のメトリクスは UMAP > t-SNE >>> PCA という結果。
クラスタごと（ラベルの中心を代表点としたとき）のメトリクスは UMAP > PCA, t-SNE。

生成したデータ: クラスタごとだと pca > umap > tsne. 特に