In [3]:
import numpy as np
import pandas as pd
import plotly.express as px


In [4]:
import numpy as np
from sklearn.metrics import pairwise_distances

def lmds(X, n_landmarks=100, n_components=2, random_state=0):
    rng = np.random.default_rng(random_state)
    N = X.shape[0]

    # 1. ランドマーク選択
    landmark_idx = rng.choice(N, size=n_landmarks, replace=False)
    L = X[landmark_idx]

    # 2. ランドマーク間距離
    D_LL = pairwise_distances(L, L, squared=True)

    # 3. 古典的 MDS (ランドマーク埋め込み)
    m = n_landmarks
    H = np.eye(m) - np.ones((m, m)) / m
    B = -0.5 * H @ D_LL @ H
    eigvals, eigvecs = np.linalg.eigh(B)
    idx = np.argsort(eigvals)[::-1]
    eigvals, eigvecs = eigvals[idx], eigvecs[:, idx]
    Y_L = eigvecs[:, :n_components] * np.sqrt(eigvals[:n_components])

    # 4. 非ランドマーク点の埋め込み
    D_XL = pairwise_distances(X, L, squared=True)  # N x m
    ones_m = np.ones((m, 1))
    mean_D_LL = D_LL.mean(axis=0)
    mean_D_XL = D_XL.mean(axis=1, keepdims=True)
    mean_all = D_LL.mean()

    B_X = -0.5 * (D_XL - mean_D_LL - mean_D_XL + mean_all)
    Y_X = B_X @ eigvecs[:, :n_components] @ np.diag(1 / np.sqrt(eigvals[:n_components]))

    return Y_X, Y_L, landmark_idx


In [1]:
# dataset
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1, as_frame=False
)
n_samples = 2000
X = mnist.data[:n_samples] / 255.0
labels = mnist.target.astype(int)[:n_samples]




In [14]:
# pca
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
y_x = pca.fit_transform(X)
fig = px.scatter(x=y_x[:, 0], y=y_x[:, 1], color=labels.astype(str),
                 title="PCA (all data)",
                 labels={'x': 'PC1', 'y': 'PC2', 'color': 'Digit'})
fig.show()



In [None]:
y_x, y_l, landmark_idx = lmds(X, n_landmarks=100, random_state=42)
fig = px.scatter(x=y_x[:, 0], y=y_x[:, 1], color=labels.astype(str),
                 title="Landmark MDS (all data)",
                 labels={'x': 'LMDS1', 'y': 'LMDS2', 'color': 'Digit'})
fig.show()

In [None]:
# ただのMDS
from sklearn.manifold import MDS
mds = MDS(n_components=2, random_state=42)
X_mds = mds.fit_transform(X)
fig = px.scatter(x=X_mds[:, 0], y=X_mds[:, 1], color=labels.astype(str),
                 title="MDS (all data)",
                 labels={'x': 'MDS1', 'y': 'MDS2', 'color': 'Digit'})
fig.show()




The default value of `n_init` will change from 4 to 1 in 1.9.



In [6]:
import numpy as np
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.neighbors import kneighbors_graph
from scipy.linalg import eigh
from scipy.sparse.linalg import eigsh

# ----------------------------
# 1. LMDS
# ----------------------------
def lmds(X_landmarks, X_nonlandmarks, n_components=2):
    N = X_landmarks.shape[0]
    D_LL = pairwise_distances(X_landmarks)
    H = np.eye(N) - np.ones((N,N))/N
    B_LL = -0.5 * H @ (D_LL**2) @ H
    eigvals, eigvecs = eigh(B_LL)
    idx = np.argsort(eigvals)[::-1][:n_components]
    Y_L = eigvecs[:, idx] * np.sqrt(eigvals[idx])
    
    # 非ランドマーク点の座標（簡易版）
    D_NL = pairwise_distances(X_nonlandmarks, X_landmarks)
    # 中心化
    D_NL_mean_row = D_NL.mean(axis=1, keepdims=True)
    D_NL_mean_col = D_NL.mean(axis=0, keepdims=True)
    D_NL_mean_all = D_NL.mean()
    B_NL = -0.5 * (D_NL**2 - D_NL_mean_row - D_NL_mean_col + D_NL_mean_all)
    Y_N = B_NL @ eigvecs[:, idx] * np.sqrt(1)
    
    return Y_L, Y_N

# ----------------------------
# 2. Nyström PCA / Kernel
# ----------------------------
def nystrom_kernel(X_landmarks, X_nonlandmarks, n_components=2, gamma=0.1):
    K_LL = rbf_kernel(X_landmarks, X_landmarks, gamma=gamma)
    eigvals, eigvecs = eigh(K_LL)
    idx = np.argsort(eigvals)[::-1][:n_components]
    Y_L = eigvecs[:, idx] * np.sqrt(eigvals[idx])
    
    K_NL = rbf_kernel(X_nonlandmarks, X_landmarks, gamma=gamma)
    Y_N = K_NL @ eigvecs[:, idx] * np.sqrt(1/eigvals[idx])
    
    return Y_L, Y_N

# ----------------------------
# 3. Laplacian Eigenmaps (ランドマーク版)
# ----------------------------
def laplacian_eigenmaps(X_landmarks, X_nonlandmarks, n_neighbors=10, n_components=2):
    A = kneighbors_graph(X_landmarks, n_neighbors=n_neighbors, mode='connectivity', include_self=True)
    D = np.array(A.sum(axis=1)).flatten()
    L = np.diag(D) - A.toarray()
    
    eigvals, eigvecs = eigsh(L, k=n_components+1, which='SM')  # 0に近い固有値を取得
    # 0に対応する固有ベクトルは無視（定数ベクトル）
    Y_L = eigvecs[:, 1:n_components+1]
    
    # 非ランドマーク点は近傍ランドマークの加重平均で簡易射影
    from sklearn.neighbors import NearestNeighbors
    nbrs = NearestNeighbors(n_neighbors=n_neighbors).fit(X_landmarks)
    distances, indices = nbrs.kneighbors(X_nonlandmarks)
    weights = 1 / (distances + 1e-12)
    weights /= weights.sum(axis=1, keepdims=True)
    Y_N = np.array([np.sum(Y_L[indices[i]] * weights[i][:, None], axis=0) 
                    for i in range(X_nonlandmarks.shape[0])])
    
    return Y_L, Y_N

# # ----------------------------
# # 使い方サンプル
# # ----------------------------
# if __name__ == "__main__":
#     from sklearn.datasets import make_swiss_roll
#     X, _ = make_swiss_roll(n_samples=500, noise=0.05)
    
#     # ランドマーク選択
#     n_landmarks = 100
#     indices = np.random.choice(X.shape[0], n_landmarks, replace=False)
#     X_L = X[indices]
#     X_N = np.delete(X, indices, axis=0)
    
#     # LMDS
#     Y_L_LMDS, Y_N_LMDS = lmds(X_L, X_N, n_components=3)
    
#     # Nyström
#     Y_L_Nys, Y_N_Nys = nystrom_kernel(X_L, X_N, n_components=3, gamma=0.05)
    
#     # Laplacian Eigenmaps
#     Y_L_Lap, Y_N_Lap = laplacian_eigenmaps(X_L, X_N, n_neighbors=10, n_components=3)
    
#     print("LMDS landmark shape:", Y_L_LMDS.shape)
#     print("Nyström landmark shape:", Y_L_Nys.shape)
#     print("Laplacian landmark shape:", Y_L_Lap.shape)


In [12]:
# exec
# random
landmark_idx = np.random.choice(X.shape[0], 100, replace=False)
X_nonlandmarks = np.delete(X, landmark_idx, axis=0)
y_x, y_l = lmds(X, X_nonlandmarks)
fig = px.scatter(x=y_x[:, 0], y=y_x[:, 1], color
                    =labels.astype(str),
                    title="Landmark MDS (all data)",
                    labels={'x': 'LMDS1', 'y': 'LMDS2', 'color': 'Digit'})
fig.show()

# nystrom
y_l, y_x = nystrom_kernel(X[landmark_idx], np.delete(X
, landmark_idx, axis=0), n_components=2, gamma=0.1)
fig = px.scatter(x=y_x[:, 0], y=y_x[:, 1], color
                    =np.delete(labels, landmark_idx, axis=0).astype(str),
                    title="Nyström Kernel (all data)",
                    labels={'x': 'Nys1', 'y': 'Nys2', 'color': 'Digit'})
fig.show()
# laplacian
y_l, y_x = laplacian_eigenmaps(X[landmark_idx], np.delete(X
, landmark_idx, axis=0), n_neighbors=10, n_components=2)
fig = px.scatter(x=y_x[:, 0], y=y_x[:, 1], color
                    =np.delete(labels, landmark_idx, axis=0).astype(str),
                    title="Laplacian Eigenmaps (all data)",
                    labels={'x': 'Lap1', 'y': 'Lap2', 'color': 'Digit'})
fig.show()


In [None]:
# plotly 3d scatter

fig = px.scatter_3d(x=y_x[:, 0], y=y_x[:, 1], z=y_x[:, 2], color
                    =np.delete(labels, landmark_idx, axis=0).astype(str),
                    title="Laplacian Eigenmaps 3D (all data)",
                    labels={'x': 'Lap1', 'y': 'Lap2', 'z': 'Lap3', 'color': 'Digit'})
fig.show()

# クラスタ間の位置関係の維持: 実験用データの生成

In [None]:
import numpy as np
from sklearn.utils import check_random_state
from sklearn.decomposition import PCA
from scipy.spatial.distance import cdist
from scipy.cluster.hierarchy import linkage, cophenet
from scipy.spatial.distance import pdist

# -------------------------
# 1) データ生成：クラスタ中心が配置された低次元スケルトン上で局所パッチを生成
# -------------------------
def generate_clustered_manifold_hierarchical(
    n_samples=2000,
    n_clusters=8,
    cluster_sizes=None,
    cluster_layout="circle",   # "circle", "grid", "tree"
    local_kind="gaussian",     # "gaussian", "local_swiss", "local_line"
    local_scale=0.5,
    intrinsic_patch_dim=2,     # each cluster latent dim
    embed_dim=50,              # final ambient dim
    noise_scale=0.01,
    random_state=None
):
    rng = check_random_state(random_state)

    # クラスタサイズの決定
    if cluster_sizes is None:
        base = n_samples // n_clusters
        cluster_sizes = [base + (1 if i < (n_samples % n_clusters) else 0) for i in range(n_clusters)]

    # 1) create cluster centers in latent 2D skeleton
    if cluster_layout == "circle":
        angles = np.linspace(0, 2*np.pi, n_clusters, endpoint=False)
        centers2 = np.column_stack([np.cos(angles), np.sin(angles)]) * 5.0
    elif cluster_layout == "grid":
        side = int(np.ceil(np.sqrt(n_clusters)))
        xs = np.arange(side) - (side-1)/2
        ys = xs.copy()
        grid = np.array([(x,y) for x in xs for y in ys])[:n_clusters]
        centers2 = grid * 3.0
    elif cluster_layout == "tree":
        # simple binary-tree laid out in 2D (levels)
        levels = int(np.ceil(np.log2(n_clusters+1)))
        centers = []
        i = 0
        for lvl in range(levels):
            m = min(2**lvl, n_clusters - i)
            xs = np.linspace(- (2**(levels-lvl-1)), (2**(levels-lvl-1)), m)
            y = -lvl * 3.0
            for x in xs:
                if i >= n_clusters: break
                centers.append((x, y))
                i += 1
        centers2 = np.array(centers[:n_clusters])
    else:
        raise ValueError("unknown layout")

    X_list = []
    labels = []
    cluster_centroids_hd = []

    for k in range(n_clusters):
        m_k = cluster_sizes[k]
        center = centers2[k]

        # === sample local latent coordinates (intrinsic_patch_dim)
        if local_kind == "gaussian":
            # centered gaussian around the cluster center (in latent2->expand to intrinsic dim)
            # create small local coordinates and then map to 2D offset
            Z = rng.normal(scale=local_scale, size=(m_k, intrinsic_patch_dim))
            # map Z to local displacement in 2D latent (simple linear map)
            A = rng.normal(scale=0.5, size=(intrinsic_patch_dim, 2))
            disp2 = Z @ A
            coords2 = center + disp2
            # map coords2 to 3D swiss-like patch or simple 2D->3D lift
            X3 = np.column_stack([coords2[:,0], coords2[:,1], np.sin(coords2[:,0]) * 0.5])
        elif local_kind == "local_swiss":
            t = rng.normal(loc=center[0], scale=local_scale, size=m_k)
            h = rng.normal(loc=center[1], scale=local_scale, size=m_k)
            X3 = np.column_stack([t*np.cos(t), h, t*np.sin(t)])
        elif local_kind == "local_line":
            t = rng.normal(loc=center[0], scale=local_scale, size=m_k)
            X3 = np.column_stack([t, np.zeros_like(t), np.zeros_like(t)])
        else:
            raise ValueError("unknown local_kind")

        # project local 3D patch to higher ambient dimension
        if embed_dim > 3:
            # create random orthonormal-ish projection
            W = rng.normal(size=(3, embed_dim))
            X_hd = X3 @ W
        else:
            X_hd = X3

        # add small isotropic noise
        X_hd += rng.normal(scale=noise_scale, size=X_hd.shape)

        X_list.append(X_hd)
        labels += [k] * m_k
        cluster_centroids_hd.append(X_hd.mean(axis=0))

    X = np.vstack(X_list)
    labels = np.array(labels)
    cluster_centroids_hd = np.vstack(cluster_centroids_hd)

    return X, labels, cluster_centroids_hd, centers2  # return also latent 2D centers for reference
    # 返り値の説明: X: (n_samples, embed_dim)のデータ行列
    #            labels: (n_samples,)のクラスタラベル
    #            cluster_centroids_hd: (n_clusters, embed_dim)の各クラスタ中心
    #            centers2: (n_clusters, 2)の潜在2Dクラスタ中心

# -------------------------
# 2) 評価関数：クラスタレベルの保存指標
# -------------------------
def centroid_distance_metrics(centroids_orig, centroids_emb):
    """
    centroids_orig, centroids_emb: (n_clusters, D)
    returns: dict with Pearson/Spearman correlation, RMSE, relative error matrix
    """
    from scipy.stats import spearmanr, pearsonr
    D_orig = cdist(centroids_orig, centroids_orig)
    D_emb = cdist(centroids_emb, centroids_emb)
    # take upper triangular entries
    iu = np.triu_indices(D_orig.shape[0], k=1)
    v_orig = D_orig[iu]
    v_emb = D_emb[iu]
    pearson = pearsonr(v_orig, v_emb)[0]
    spearman = spearmanr(v_orig, v_emb)[0]
    rmse = np.sqrt(np.mean((v_orig - v_emb)**2))
    rel_rmse = rmse / (np.mean(v_orig) + 1e-12)
    return {"pearson": pearson, "spearman": spearman, "rmse": rmse, "rel_rmse": rel_rmse}

def cluster_nn_preservation(centroids_orig, centroids_emb, k=3):
    """
    For each cluster centroid, compute its k nearest cluster neighbors in orig and emb.
    Return average precision@k.
    """
    D_orig = cdist(centroids_orig, centroids_orig)
    D_emb = cdist(centroids_emb, centroids_emb)
    n = D_orig.shape[0]
    precs = []
    for i in range(n):
        orig_nn = np.argsort(D_orig[i])[1:k+1]
        emb_nn = np.argsort(D_emb[i])[1:k+1]
        inter = len(set(orig_nn).intersection(set(emb_nn)))
        precs.append(inter / k)
    return np.mean(precs)

def cophenetic_corr_of_centroids(centroids):
    """
    Build hierarchical clustering (average linkage) and return condensed distance pdist and cophenetic distances.
    For comparing two sets, use cophenet return from scipy linkage.
    """
    Z = linkage(centroids, method='average')
    coph_dists, _ = cophenet(Z, pdist(centroids))
    return Z, coph_dists

# -------------------------
# Example usage
# -------------------------
if __name__ == "__main__":
    X, labels, centroids_hd, latent_centers2 = generate_clustered_manifold_hierarchical(
        n_samples=3000, n_clusters=10, cluster_layout="circle",
        local_kind="local_swiss", intrinsic_patch_dim=2, embed_dim=50,
        local_scale=0.3, noise_scale=0.01, random_state=0
    )

    # do a quick PCA to 2D for visualization / embedding
    X_pca = PCA(n_components=3, random_state=0).fit_transform(X)
    centroids_emb_pca = np.vstack([X_pca[labels==k].mean(axis=0) for k in range(10)])
    metrics = centroid_distance_metrics(centroids_hd, centroids_emb_pca)
    print("centroid metrics (orig vs PCA-2D):", metrics)
    print("cluster-NN preservation@3 (PCA):", cluster_nn_preservation(centroids_hd, centroids_emb_pca, k=3))
    fig = px.scatter_3d(x=X_pca[:,0], y=X_pca[:,1], z=X_pca[:,2], color=labels.astype(str),
                         title="PCA 3D embedding of clustered manifold data",
                         labels={'x': 'PC1', 'y': 'PC2', 'z': 'PC3', 'color': 'Cluster'})
    fig.update_traces(marker=dict(size=2))
    fig.show()


centroid metrics (orig vs PCA-2D): {'pearson': 0.8477730110328396, 'spearman': 0.8301712779973649, 'rmse': 16.28863987909336, 'rel_rmse': 0.3537850005179882}
cluster-NN preservation@3 (PCA): 0.6666666666666667


In [None]:
#

# generate data(sub)

In [13]:
# ===============================
# 1. データ生成
# ===============================
import numpy as np
import plotly.express as px
from sklearn.decomposition import PCA
import umap

# --- データ生成関数（例: clustered swiss roll） ---
def generate_clustered_swiss(n_samples=2000, n_clusters=5, n_dim=10, rng=None):
    if rng is None:
        rng = np.random.RandomState(0)

    # latent cluster centers (t,h)
    t_centers = np.linspace(1.5*np.pi, 4.5*np.pi, n_clusters)
    h_centers = np.linspace(0, 21, n_clusters)
    centers = np.column_stack([t_centers, h_centers])

    # 各クラスタからサンプル
    cluster_choices = rng.choice(n_clusters, size=n_samples)
    Z = np.zeros((n_samples, 2))
    for k in range(n_clusters):
        idx = np.where(cluster_choices == k)[0]
        if idx.size > 0:
            Z[idx] = centers[k] + rng.normal(scale=[0.5, 2.0], size=(idx.size, 2))

    # swiss roll mapping
    t, h = Z[:, 0], Z[:, 1]
    X3 = np.vstack([t * np.cos(t), h, t * np.sin(t)]).T

    # ランダム線形射影で n_dim に拡張
    if n_dim > 3:
        W = rng.randn(3, n_dim)
        X_hd = X3 @ W
    else:
        X_hd = X3
    return X_hd, cluster_choices


# ===============================
# 2. 次元削減 (PCA, UMAP)
# ===============================
def apply_pca(X, n_components=2):
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X)
    return X_pca

def apply_umap(X, n_components=2, n_neighbors=15, min_dist=0.1, random_state=42):
    reducer = umap.UMAP(n_components=n_components,
                        n_neighbors=n_neighbors,
                        min_dist=min_dist,
                        random_state=random_state)
    X_umap = reducer.fit_transform(X)
    return X_umap


# ===============================
# 3. 可視化（Plotly）
# ===============================
def plot_embedding(X_emb, labels, title="embedding"):
    fig = px.scatter(
        x=X_emb[:,0], y=X_emb[:,1],
        color=labels.astype(str),
        title=title,
        opacity=0.7
    )
    fig.update_traces(marker=dict(size=4))
    fig.show()


# ===============================
# 実験実行
# ===============================
# データ生成 (10次元)
X, labels = generate_clustered_swiss(n_samples=2000, n_clusters=6, n_dim=10)

# PCA
X_pca = apply_pca(X, n_components=2)
plot_embedding(X_pca, labels, title="PCA projection")

# UMAP
X_umap = apply_umap(X, n_components=2, n_neighbors=20, min_dist=0.1)
plot_embedding(X_umap, labels, title="UMAP projection")



IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html




n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



# ランドマーク型次元削減

In [3]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_digits
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram
import plotly.figure_factory as ff
import plotly.express as px

# 1. データ読み込み（ここでは digits データセットを使用: 8x8 MNIST風）
digits = load_digits()
X = digits.data
y = digits.target

# 2. 前処理（標準化 + 次元削減して高速化）
X_scaled = StandardScaler().fit_transform(X)
pca = PCA(n_components=20)
X_pca = pca.fit_transform(X_scaled)

# 3. 階層的クラスタリング
Z = linkage(X_pca, method="ward")

# 4. デンドログラム（全体表示）
fig_dendro = ff.create_dendrogram(Z, orientation="left")
fig_dendro.update_layout(width=800, height=600, title="Hierarchical Clustering Dendrogram")
fig_dendro.show()

# 5. クラスタ分割（例: 10クラスタに分割）
clusters = fcluster(Z, 10, criterion="maxclust")

# 6. 各クラスタに含まれるラベル分布を集計
df = pd.DataFrame({"cluster": clusters, "label": y})
counts = df.groupby(["cluster", "label"]).size().reset_index(name="count")

# 7. 可視化（クラスタごとのラベル割合）
fig_bar = px.bar(counts, x="cluster", y="count", color="label", 
                 barmode="stack", title="Label Distribution per Cluster")
fig_bar.show()

In [None]:
# umap
import umap
umap_model = umap.UMAP(random_state=0)
X_umap = umap_model.fit_transform(X_scaled)
fig_umap = px.scatter(x=X_umap[:,0], y=X_umap[:,1
], color=y.astype(str),
                      title="UMAP projection of Digits dataset",
                      labels={'x': 'UMAP1', 'y': 'UMAP2', 'color': 'Digit'})
fig_umap.update_traces(marker=dict(size=3))
fig_umap.show()


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [2]:
import numpy as np
import plotly.express as px

from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import umap

# ========== 1. データ用意 ==========
digits = load_digits()
X = digits.data  # (n_samples, 64) 8x8画像
y = digits.target

# ========== 2. ランドマーク選出 ==========
# ここでは k-means のクラスタ中心をランドマークとする
n_landmarks = 10
kmeans = KMeans(n_clusters=n_landmarks, random_state=0)
labels = kmeans.fit_predict(X)
landmarks = kmeans.cluster_centers_

# ========== 3. 一段階目の次元削減 ==========
# ランドマークを2次元に配置（大域構造）
reducer_global = umap.UMAP(random_state=0)
landmarks_2d = reducer_global.fit_transform(landmarks)

fig = px.scatter(x=landmarks_2d[:, 0], y=landmarks_2d[:, 1], text=[str(i) for i in range(n_landmarks)],
                 title="1段階目: ランドマークの2D配置",
                 labels={'x': 'UMAP1', 'y': 'UMAP2'})
fig.update_traces(marker=dict(size=12, symbol='x'))
fig.show()

# ========== 4. 特定クラスタの2段階目次元削減 ==========
target_cluster = 3  # ランドマーク 3 を選択
X_sub = X[labels == target_cluster]
y_sub = y[labels == target_cluster]

# サブクラスタ内を次元削減
reducer_local = PCA(n_components=2)
X_sub_2d = reducer_local.fit_transform(X_sub)

fig = px.scatter(x=X_sub_2d[:, 0], y=X_sub_2d[:, 1], color=y_sub.astype(str),
                    title=f"2段階目: クラスタ {target_cluster} 内の次元削減",
                    labels={'x': 'PC1', 'y': 'PC2', 'color': 'Digit'})
fig.update_traces(marker=dict(size=6))
fig.show()



n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_neighbors is larger than the dataset size; truncating to X.shape[0] - 1



In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import umap


# ========== ランドマーク選出 ==========
n_landmarks = 10
kmeans = KMeans(n_clusters=n_landmarks, random_state=0)
labels = kmeans.fit_predict(X)
landmarks = kmeans.cluster_centers_

# ========== ランドマーク次元削減 ==========
reducer_global = umap.UMAP(random_state=0)
landmarks_2d = reducer_global.fit_transform(landmarks)

# ========== 各クラスタの局所次元削減 ==========
cluster_embeds = {}
for cluster_id in range(n_landmarks):
    X_sub = X[labels == cluster_id]
    y_sub = y[labels == cluster_id]
    
    if len(X_sub) > 2:  # クラスタ内に十分な点があれば
        reducer_local = PCA(n_components=2)
        X_sub_2d = reducer_local.fit_transform(X_sub)
        
        # 正規化してスケールを合わせる
        X_sub_2d = (X_sub_2d - X_sub_2d.mean(axis=0)) / (X_sub_2d.std(axis=0) + 1e-9)
        cluster_embeds[cluster_id] = (X_sub_2d, y_sub)

# ========== 可視化 ==========
fig = px.

for cluster_id, (X_sub_2d, y_sub) in cluster_embeds.items():
    # ランドマーク座標を基準にシフト
    center = landmarks_2d[cluster_id]
    scale = 0.5  # 各クラスタの大きさを調整
    X_shifted = X_sub_2d * scale + center
    
    plt.scatter(X_shifted[:, 0], X_shifted[:, 1], c=y_sub, cmap="tab10", s=15, alpha=0.6)

# ランドマークを大きめに表示
plt.scatter(landmarks_2d[:, 0], landmarks_2d[:, 1], 
            c=range(n_landmarks), cmap="tab10", s=250, marker="X", edgecolor="black")

plt.title("ランドマーク周囲にクラスタ次元削減結果を展開")
plt.show()


In [None]:
data = load_files('dataSet\2020news-bydate\20news-bydate-train')

In [55]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

import ssl
ssl._create_default_https_context = ssl._create_unverified_context

# データの読み込み
newsgroups = fetch_20newsgroups(
    subset='train',
    # data_home=local_folder,
    remove=('headers', 'footers', 'quotes')
)

# TF-IDF ベクトル化（最大 5000 特徴量）
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(newsgroups.data)

# DataFrame に変換
df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
df['target'] = newsgroups.target
df['category'] = df['target'].map(dict(enumerate(newsgroups.target_names)))

# 確認
print(df.head())
print("Shape:", df.shape)



Retry downloading from url: https://ndownloader.figshare.com/files/5975967



URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1000)>

In [None]:
# 階層的なガウス
import numpy as np

def generate_hierarchical_gaussian(n_samples=1000, dim=50, n_super=3, n_sub=3, seed=0):
    rng = np.random.RandomState(seed)
    X, y_super, y_sub = [], [], []
    
    for i in range(n_super):
        super_center = rng.randn(dim) * 5
        for j in range(n_sub):
            sub_center = super_center + rng.randn(dim) * 2
            points = rng.randn(n_samples//(n_super*n_sub), dim) + sub_center
            X.append(points)
            y_super += [i] * len(points)
            y_sub += [j + i*n_sub] * len(points)
    
    return np.vstack(X), np.array(y_super), np.array(y_sub)

X, y_super, y_sub = generate_hierarchical_gaussian()
print("Shape:", X.shape)


In [36]:
import scanpy as sc

adata = sc.datasets.pbmc3k()  # 2700細胞 × 1838遺伝子
print(adata.X.shape)  # (2700, 1838)
print("細胞型アノテーション:", adata.obs.head())





100%|██████████| 5.58M/5.58M [00:03<00:00, 1.88MB/s]


(2700, 32738)
細胞型アノテーション: Empty DataFrame
Columns: []
Index: [AAACATACAACCAC-1, AAACATTGAGCTAC-1, AAACATTGATCAGC-1, AAACCGTGCTTCCG-1, AAACCGTGTATGCG-1]


In [39]:
X = adata.X.toarray()  # numpy array に変換
# 前処理
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, n_top_genes=1000, subset=True)
sc.pp.scale(adata)
sc.tl.pca(adata, svd_solver='arpack')

# 近傍グラフを作る
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)

# Louvain クラスタリング
sc.tl.louvain(adata)

# 確認
print(adata.obs['louvain'].value_counts())
y = adata.obs['louvain'].astype(int).values



zero-centering a sparse array/matrix densifies it.



ModuleNotFoundError: No module named 'igraph'

In [None]:
# pca
X_pca = PCA(n_components=2).fit_transform(X)


In [30]:
import numpy as np
import plotly.graph_objects as go
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import umap



# ========== ランドマーク選出 ==========
n_landmarks = 50
kmeans = KMeans(n_clusters=n_landmarks, random_state=0)
labels = kmeans.fit_predict(X)
landmarks = kmeans.cluster_centers_

# ========== ランドマーク次元削減 ==========
reducer_global = umap.UMAP(random_state=0)
landmarks_2d = reducer_global.fit_transform(landmarks)

# ========== 各クラスタの局所次元削減 ==========
all_points = []
all_colors = []

for cluster_id in range(n_landmarks):
    X_sub = X[labels == cluster_id]
    y_sub = y[labels == cluster_id]  # digit ラベル
    
    if len(X_sub) > 2:
        reducer_local = PCA(n_components=2)
        X_sub_2d = reducer_local.fit_transform(X_sub)
        
        # 正規化してスケール調整
        X_sub_2d = (X_sub_2d - X_sub_2d.mean(axis=0)) / (X_sub_2d.std(axis=0) + 1e-9)
        center = landmarks_2d[cluster_id]
        scale = 0.1
        X_shifted = X_sub_2d * scale + center
        
        all_points.append(X_shifted)
        all_colors.append(y_sub)

# まとめる
all_points = np.vstack(all_points)
all_colors = np.concatenate(all_colors)

# ========== Plotly で可視化 ==========
fig = go.Figure()

# 全クラスタの点をまとめて 1 trace
fig.add_trace(go.Scatter(
    x=all_points[:, 0],
    y=all_points[:, 1],
    mode='markers',
    marker=dict(size=5, color=all_colors, colorscale='Rainbow', opacity=0.6, colorbar=dict(title="Digit label")),
    name= "Data Points",
    text=[f"Digit: {lbl}" for lbl in all_colors],
))

# ランドマークを大きく描画
fig.add_trace(go.Scatter(
    x=landmarks_2d[:, 0],
    y=landmarks_2d[:, 1],
    mode='markers+text',
    marker=dict(size=10, color=list(range(n_landmarks)), colorscale='Viridis', symbol="x"),
    text=[f"L{idx}" for idx in range(n_landmarks)],
    textposition="top center",
    name="Landmarks"
))

fig.update_layout(
    title="ランドマーク周囲にクラスタ次元削減結果を展開 (Colored by Digit Label)",
    width=800,
    height=800,
    showlegend=True
)

fig.show()



n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [27]:
# 小数点2まで で
print(landmark_norm.astype(float).round(2))

[[ 0.    0.91  1.07 ...  1.25  2.65  1.57]
 [ 0.    0.11  0.71 ... -0.63 -0.59 -0.26]
 [ 0.   -0.63 -0.51 ...  0.55 -0.35 -0.26]
 ...
 [ 0.   -0.24  0.12 ...  1.49  0.9  -0.19]
 [ 0.   -0.63 -1.34 ... -0.51 -0.63 -0.26]
 [ 0.    0.64  0.94 ... -1.31 -0.63 -0.26]]


In [None]:
# ランドマークの高次元での相対距離とumap上での相対距離を比較(heatmap, shapard)

from sklearn.metrics import pairwise_distances
from scipy.stats import spearmanr


# 正規化
landmark_norm = (landmarks - landmarks.mean(axis=0)) / (landmarks.std(axis=0) + 1e-9)
landmarks_2d_norm = (landmarks_2d - landmarks_2d.mean(axis=0)) / (landmarks_2d.std(axis=0) + 1e-9)

# 距離行列
D_hd = pairwise_distances(landmark_norm)
D_2d = pairwise_distances(landmarks_2d_norm)
# heatmap
fig = px.imshow(D_hd, title="Landmark Distances in High-Dim Space")
fig.show()
fig = px.imshow(D_2d, title="Landmark Distances in 2D UMAP Space")
fig.show()# spearman
iu = np.triu_indices(n_landmarks, k=1)
v_hd = D_hd[iu]
v_2d = D_2d[iu]
corr, _ = spearmanr(v_hd, v_2d)
print("Spearman correlation of landmark distances (high-dim vs 2D):", corr)



# 誤差行列
rel_error = np.abs(D_hd - D_2d) / (D_hd + 1e-9)
fig = px.imshow(rel_error, title="Relative Error of Landmark Distances (|D_hd - D_2d| / D_hd)")
fig.show()  


# shapard diagram
fig = px.scatter(x=v_hd, y=v_2d,
                 title="Shepard Diagram of Landmark Distances",
                 labels={'x': 'High-Dim Distances', 'y': '2D UMAP Distances'})
fig.add_shape(type="line", x0=0, y0=0, x1=max(v_hd), y1=max(v_hd),
              line=dict(dash="dash", color="red"))
fig.show()

Spearman correlation of landmark distances (high-dim vs 2D): 0.4171997412341772


In [None]:
# 