# データセット評価

In [18]:
import numpy as np
from sklearn.decomposition import PCA
import plotly.express as px

# スパース性
def sparsity_ratio(matrix):
    non_zero_count = np.count_nonzero(matrix)
    total_elements = matrix.size
    sparsity = 1 - (non_zero_count / total_elements)
    return sparsity

# 内在次元
def intrisic_dimensionality_ratio(matrix, variance_threshold=0.95):
    n_samples, n_features = matrix.shape
    pca = PCA()
    pca.fit(matrix)
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
    num_components = np.searchsorted(cumulative_variance, variance_threshold) + 1
    intrinsic_dim_ratio = num_components / n_features
    return intrinsic_dim_ratio

# 累積寄与率のプロット
def cumulative_variance_plot(matrix):
    import matplotlib.pyplot as plt
    from sklearn.decomposition import PCA

    pca = PCA()
    pca.fit(matrix)
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

    fig = px.line(
        x=np.arange(1, len(cumulative_variance) + 1),
        y=cumulative_variance,
        labels={'x': 'Number of Components', 'y': 'Cumulative Variance'},
        title='Cumulative Variance Explained by PCA Components'
    ).show()

    

In [19]:
# test
# mnist 70000, 784
from sklearn.datasets import fetch_openml
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
mnist = fetch_openml('mnist_784', version=1)
matrix = mnist.data.values


In [20]:
print(f"sparsity_ratio: {sparsity_ratio(matrix):.4f}")
print(f"intrisic_dimensionality_ratio: {intrisic_dimensionality_ratio(matrix):.4f}")
cumulative_variance_plot(matrix)

sparsity_ratio: 0.8086
intrisic_dimensionality_ratio: 0.1964


Matplotlib is building the font cache; this may take a moment.


MNISTデータセットはスパース性が高く、表現に必要な次元も20%は必要. 表データよりも可視化するのが難しいデータのタイプ
2Dの可視化だと全体の分散の16%しか表現できない

# 投影の品質評価関数

In [None]:

import numpy as np
from sklearn.metrics import pairwise_distances
from sklearn.neighbors import NearestNeighbors

# 信頼性
def trustworthiness(X, X_proj, n_neighbors=7):
    N = X.shape[0]
    # Original and projected distances
    dist_X = pairwise_distances(X)
    dist_proj = pairwise_distances(X_proj)

    # Rank of distances
    rank_X = np.argsort(np.argsort(dist_X, axis=1), axis=1)
    
    # K-nearest neighbors in projected space
    nn_proj = NearestNeighbors(n_neighbors=n_neighbors+1).fit(X_proj)
    neighbors_proj = nn_proj.kneighbors(return_distance=False)[:, 1:]

    # For each i, for each neighbor in projection, get the rank in original space
    t_sum = 0
    for i in range(N):
        for j in neighbors_proj[i]:
            r = rank_X[i, j]
            if r >= n_neighbors:
                t_sum += r - n_neighbors + 1
    
    norm = 2 / (N * n_neighbors * (2 * N - 3 * n_neighbors - 1))
    T = 1 - norm * t_sum
    return T



# 連続性
def continuity(X, X_proj, n_neighbors=7):
    N = X.shape[0]
    # Original and projected distances
    dist_X = pairwise_distances(X)
    dist_proj = pairwise_distances(X_proj)

    # Rank of distances in projected space
    rank_proj = np.argsort(np.argsort(dist_proj, axis=1), axis=1)

    # K-nearest neighbors in original space
    nn_orig = NearestNeighbors(n_neighbors=n_neighbors+1).fit(X)
    neighbors_orig = nn_orig.kneighbors(return_distance=False)[:, 1:]

    c_sum = 0
    for i in range(N):
        for j in neighbors_orig[i]:
            r = rank_proj[i, j]
            if r >= n_neighbors:
                c_sum += r - n_neighbors + 1
    
    norm = 2 / (N * n_neighbors * (2 * N - 3 * n_neighbors - 1))
    C = 1 - norm * c_sum
    return C

# ストレス係数: 元の距離行列と射影後の距離行列の差を測る指標
def normalized_stress(X, X_proj):
    D_n = pairwise_distances(X)
    D_q = pairwise_distances(X_proj)
    numerator = np.sum((D_n - D_q) ** 2)
    denominator = np.sum(D_n ** 2)
    return numerator / denominator



# Shepard Diagram helper (returns distances in original and projected space)
def shepard_diagram_data(X, X_proj):
    D_n = pairwise_distances(X)
    D_q = pairwise_distances(X_proj)
    # Use upper triangle without diagonal to avoid redundancy
    i_upper = np.triu_indices_from(D_n, k=1)
    return D_n[i_upper], D_q[i_upper]

# Average Local Error (optional, related to trustworthiness-like structure)
def average_local_error(X, X_proj, n_neighbors=5):
    N = X.shape[0]
    dist_X = pairwise_distances(X)
    dist_proj = pairwise_distances(X_proj)

    nn_orig = NearestNeighbors(n_neighbors=n_neighbors+1).fit(X)
    neighbors_orig = nn_orig.kneighbors(return_distance=False)[:, 1:]

    error_sum = 0
    for i in range(N):
        for j in neighbors_orig[i]:
            error_sum += abs(dist_X[i, j] - dist_proj[i, j])
    
    return error_sum / (N * n_neighbors)

### ラベルあり ###

# シルエットスコア
from sklearn.metrics import silhouette_score
def silhouette(X_proj, labels):
    return silhouette_score(X_proj, labels)

# Neiborhood hit :次元削減した空間で、近傍点が同じラベルかどうか
def neighborhood_hit(X_proj, labels, n_neighbors=7):
    N = X_proj.shape[0]
    nn = NearestNeighbors(n_neighbors=n_neighbors+1).fit(X_proj)
    neighbors = nn.kneighbors(return_distance=False)[:, 1:]

    hit_sum = 0
    for i in range(N):
        hit_sum += np.sum(labels[neighbors[i]] == labels[i])

    return hit_sum / (N * n_neighbors)

from scipy.stats import spearmanr, pearsonr

def cluster_center(X, labels):
    unique_labels = np.unique(labels)
    centers = np.array([X[labels == lbl].mean(axis=0) for lbl in unique_labels])
    return centers, unique_labels

# クラスタ中心間距離の相関
def cluster_center_distance_correlation(X, X_proj, labels):
    unique_labels = np.unique(labels)
    n_clusters = len(unique_labels)

    # クラスタ中心の計算
    centers_X, unique_labels = cluster_center(X, labels)
    centers_X_proj = cluster_center(X_proj, labels)[0]

    # クラスタ中心間の距離行列
    dist_centers_X = pairwise_distances(centers_X)
    dist_centers_X_proj = pairwise_distances(centers_X_proj)

    # 上三角行列の要素を取得
    i_upper = np.triu_indices_from(dist_centers_X, k=1)
    dists_X = dist_centers_X[i_upper]
    dists_X_proj = dist_centers_X_proj[i_upper]

    # 相関計算
    spearman_corr, _ = spearmanr(dists_X, dists_X_proj)
    pearson_corr, _ = pearsonr(dists_X, dists_X_proj)

    return spearman_corr, pearson_corr





In [25]:


import numpy as np
from sklearn.metrics import pairwise_distances
from sklearn.manifold import trustworthiness as _sk_trustworthiness
from scipy.stats import spearmanr, pearsonr
from scipy.sparse.csgraph import shortest_path

# ---- 共通前処理 ----
def _pairwise(X, metric="euclidean"):
    D = pairwise_distances(X, metric=metric)
    np.fill_diagonal(D, 0.0)
    return D

def _rank_matrix(D):
    # 小さい距離ほど小さい順位（自身除外）
    n = D.shape[0]
    order = np.argsort(D, axis=1)
    ranks = np.empty_like(order)
    ranks[np.arange(n)[:, None], order] = np.arange(n)[None, :]
    return ranks

def _flatten_upper(M):
    idx = np.triu_indices_from(M, k=1)
    return M[idx]

# ---- 距離ベース ----
def stress(X, Y, metric="euclidean"):
    DX, DY = _pairwise(X, metric), _pairwise(Y, metric)
    num = np.sum((DX - DY) ** 2) / 2
    den = np.sum(DX ** 2) / 2 + 1e-12
    return np.sqrt(num / den)  # Kruskal normalized stress-1 に相当

def normalized_stress(X, Y, metric="euclidean"):
    # Sammon ではなくKruskal系の正規化（上と同じだが名前を分けておく）
    return stress(X, Y, metric=metric)

def sammon_stress(X, Y, metric="euclidean", eps=1e-12):
    DX, DY = _pairwise(X, metric), _pairwise(Y, metric)
    w = 1.0 / (DX + eps)
    num = np.sum(w * (DX - DY) ** 2) / 2
    den = np.sum(DX) + eps
    return num / den

def pearson_corr_of_distances(X, Y, metric="euclidean"):
    a = _flatten_upper(_pairwise(X, metric))
    b = _flatten_upper(_pairwise(Y, metric))
    r, _ = pearsonr(a, b)
    return r

def spearman_corr_of_distances(X, Y, metric="euclidean"):
    a = _flatten_upper(_pairwise(X, metric))
    b = _flatten_upper(_pairwise(Y, metric))
    r, _ = spearmanr(a, b)
    return r

def shepard_diagram_data(X, Y, metric="euclidean"):
    # 散布図用に (d_high, d_low) のペアを返す
    a = _flatten_upper(_pairwise(X, metric))
    b = _flatten_upper(_pairwise(Y, metric))
    return a, b

# ---- 近傍保存ファミリ ----
def trustworthiness(X, Y, n_neighbors=15, metric="euclidean"):
    return _sk_trustworthiness(X, Y, n_neighbors=n_neighbors, metric=metric)

def continuity(X, Y, n_neighbors=15, metric="euclidean"):
    # Lee & Verleysen (2009) 定義
    DX, DY = _pairwise(X, metric), _pairwise(Y, metric)
    RX, RY = _rank_matrix(DX), _rank_matrix(DY)
    n = X.shape[0]; k = n_neighbors
    # 高次元で近傍だが低次元で外れたものの順位超過 (intrusions)
    C = 0.0
    for i in range(n):
        N_hi = set(np.argsort(DX[i])[1:k+1])  # 自身除外
        ranks_low = RY[i]
        # 低次元で近傍外のうち高次元近傍にいた点
        intrusions = [j for j in N_hi if ranks_low[j] > k]
        C += np.sum(ranks_low[intrusions] - k)
    norm = n * k * (2*n - 3*k - 1)
    return 1.0 - (2.0 / norm) * C

def neighborhood_hit(Y, labels, n_neighbors=15, metric="euclidean"):
    DY = _pairwise(Y, metric)
    n = Y.shape[0]; k = n_neighbors
    hit = 0
    for i in range(n):
        nbrs = np.argsort(DY[i])[1:k+1]
        hit += np.mean(labels[nbrs] == labels[i])
    return hit / n

def average_local_error(X, Y, n_neighbors=15, metric="euclidean", eps=1e-12):
    DX, DY = _pairwise(X, metric), _pairwise(Y, metric)
    n = X.shape[0]; k = n_neighbors
    errs = []
    for i in range(n):
        nbrs = np.argsort(DX[i])[1:k+1]
        rel = np.abs(DX[i, nbrs] - DY[i, nbrs]) / (DX[i, nbrs] + eps)
        errs.append(np.mean(rel))
    return float(np.mean(errs))

# ---- co-ranking / LCMC / R_NX 曲線 ----
def coranking_matrix(X, Y, metric="euclidean"):
    RX, RY = _rank_matrix(_pairwise(X, metric)), _rank_matrix(_pairwise(Y, metric))
    n = X.shape[0]
    Q = np.zeros((n-1, n-1), dtype=int)
    for i in range(n):
        for j in range(n):
            if i == j: 
                continue
            k = RX[i, j]  # 0..n-1
            l = RY[i, j]
            if k > 0 and l > 0:
                Q[k-1, l-1] += 1
    return Q

def R_NX_curve(X, Y, Ks=None, metric="euclidean"):
    # Chen & Buja の Q_NX（=R_NX）: 近傍オーバーラップ率
    n = X.shape[0]
    if Ks is None:
        Ks = np.arange(1, min(n-1, 100)+1)
    DX, DY = _pairwise(X, metric), _pairwise(Y, metric)
    R = []
    for K in Ks:
        overlap = 0
        for i in range(n):
            NX = set(np.argsort(DX[i])[1:K+1])
            NY = set(np.argsort(DY[i])[1:K+1])
            overlap += len(NX & NY) / K
        R.append(overlap / n)
    return np.array(Ks), np.array(R)

def LCMC(X, Y, Ks=None, metric="euclidean"):
    Ks, R = R_NX_curve(X, Y, Ks, metric)
    n = X.shape[0]
    baseline = Ks / (n - 1)
    return Ks, (R - baseline)

def AUC_R_NX(X, Y, K_max=None, metric="euclidean"):
    n = X.shape[0]
    if K_max is None:
        K_max = min(n-2, 100)
    Ks, L = LCMC(X, Y, np.arange(1, K_max+1), metric)
    # 0..K_max にわたる平均（台形則）
    # auc = np.trapz(L, Ks) / K_max
    auc = np.trapezoid(L, Ks) / K_max
    return float(auc)

# ---- KL（t-SNE 風，確率化して KL(P || Q)）----
def tsne_kl_divergence(X, Y, perplexity=30.0, metric="euclidean", eps=1e-12):
    def _Hs(P):
        P = P + eps
        return -np.sum(P * np.log2(P))
    # 高次元：ガウスカーネルで目標perplexity
    DX = _pairwise(X, metric)
    n = DX.shape[0]
    sigmas = np.full(n, 1.0)
    PX = np.zeros((n, n))
    targetH = np.log2(perplexity)

    for i in range(n):
        beta_min, beta_max = -np.inf, np.inf
        beta = 1.0  # = 1/(2 sigma^2)
        Di = np.delete(DX[i], i)
        for _ in range(50):
            Pi = np.exp(-Di * beta)
            Pi /= np.sum(Pi)
            H = _Hs(Pi)
            if np.abs(H - targetH) < 1e-3:
                break
            if H > targetH:
                beta_min = beta
                beta = beta * 2 if beta_max == np.inf else (beta + beta_max) / 2
            else:
                beta_max = beta
                beta = beta / 2 if beta_min == -np.inf else (beta + beta_min) / 2
        row = np.zeros(n); row[np.arange(n)!=i] = Pi
        PX[i] = row
    PX = (PX + PX.T) / (2*np.sum(PX))

    # 低次元：t 分布（1 自由度，学生のt）
    DY = _pairwise(Y, metric)
    Q = 1 / (1 + DY**2); np.fill_diagonal(Q, 0.0)
    Q /= np.sum(Q)

    # KL(P||Q)
    mask = PX > 0
    return float(np.sum(PX[mask] * (np.log(PX[mask] + eps) - np.log(Q[mask] + eps))))

# ---- グラフ系（kNN 精度・再現率・重なり）----
def knn_graph_metrics(X, Y, k=15, metric="euclidean"):
    DX, DY = _pairwise(X, metric), _pairwise(Y, metric)
    n = X.shape[0]
    precs, recs, jaccs = [], [], []
    for i in range(n):
        NX = set(np.argsort(DX[i])[1:k+1])
        NY = set(np.argsort(DY[i])[1:k+1])
        inter = len(NX & NY)
        precs.append(inter / k)
        recs.append(inter / k)
        jaccs.append(inter / (len(NX | NY)))
    return dict(precision=float(np.mean(precs)),
                recall=float(np.mean(recs)),
                jaccard=float(np.mean(jaccs)))

def shortest_path_correlation(X, Y, k=10, metric="euclidean"):
    # kNN グラフ上の最短路長の相関（グローバル位相の保全を見る）
    def _sp(D):
        n = D.shape[0]
        A = np.full_like(D, np.inf, dtype=float)
        idx = np.argsort(D, axis=1)[:, 1:k+1]
        rows = np.repeat(np.arange(n), k)
        cols = idx.ravel()
        A[rows, cols] = D[rows, cols]
        A[cols, rows] = D[rows, cols]
        np.fill_diagonal(A, 0.0)
        return shortest_path(A, directed=False, unweighted=False)
    SX, SY = _sp(_pairwise(X, metric)), _sp(_pairwise(Y, metric))
    a, b = _flatten_upper(SX), _flatten_upper(SY)
    r, _ = spearmanr(a, b)
    return r

# ---- ランク誤差（MRRE の簡易版）----
def mean_rank_error(X, Y, n_neighbors=15, metric="euclidean"):
    RX, RY = _rank_matrix(_pairwise(X, metric)), _rank_matrix(_pairwise(Y, metric))
    n = X.shape[0]; k = n_neighbors
    errs = []
    for i in range(n):
        nbrs = np.argsort(RX[i])[1:k+1]
        errs.append(np.mean(np.abs(RX[i, nbrs] - RY[i, nbrs])))
    return float(np.mean(errs))


In [29]:
X = np.random.rand(1000, 50)
# Y = np.random.rand(1000, 2)
Y = PCA(n_components=2).fit_transform(X)
labels = np.random.randint(0, 5, size=1000)

# X: (n,D), Y: (n,d), labels: (n,)
print("Stress:", stress(X, Y))
print("Pearson r(dist):", pearson_corr_of_distances(X, Y))
print("Spearman r(dist):", spearman_corr_of_distances(X, Y))
print("Trustworthiness@15:", trustworthiness(X, Y, 15))
print("Continuity@15:", continuity(X, Y, 15))
print("Neighborhood hit@15:", neighborhood_hit(Y, labels, 15))
print("AUC-R_NX:", AUC_R_NX(X, Y))
print("t-SNE KL:", tsne_kl_divergence(X, Y, perplexity=30))


Stress: 0.7939911487400045
Pearson r(dist): 0.22125852573322433
Spearman r(dist): 0.19838878508234725
Trustworthiness@15: 0.6024328215626066
Continuity@15: 0.6716337086318662
Neighborhood hit@15: 0.19613333333333333
AUC-R_NX: 0.041445017424339
t-SNE KL: 3.2056031001582896


In [32]:
# iris
from sklearn.datasets import load_iris
data = load_iris()
X = data.data
labels = data.target
Y = PCA(n_components=2).fit_transform(X)
from sklearn.manifold import TSNE
# Y = TSNE(n_components=2, random_state=0).fit_transform(X)

print("Stress:", stress(X, Y))
print("Pearson r(dist):", pearson_corr_of_distances(X, Y))
print("Spearman r(dist):", spearman_corr_of_distances(X, Y))
print("Trustworthiness@15:", trustworthiness(X, Y, 15))
print("Continuity@15:", continuity(X, Y, 15))
print("Neighborhood hit@15:", neighborhood_hit(Y, labels, 15))
print("AUC-R_NX:", AUC_R_NX(X, Y))
print("t-SNE KL:", tsne_kl_divergence(X, Y, perplexity=30))


Stress: 0.04179644853519415
Pearson r(dist): 0.9983836039716812
Spearman r(dist): 0.9957991157922617
Trustworthiness@15: 0.9859597550306212
Continuity@15: 0.9928853893263342
Neighborhood hit@15: 0.9239999999999999
AUC-R_NX: 0.5700286564597843
t-SNE KL: 0.5977758962047521


In [None]:
# test
# mnist 70000, 784
data = mnist.data.values[:5000]  # Use a subset for faster computation
data_proj = PCA(n_components=2).fit_transform(data)
fig_proj = px.scatter(data_proj, x=0, y=1, color=mnist.target[:5000].astype(int), title='PCA Projection of MNIST')
fig_proj.show()

print(f"trustworthiness: {trustworthiness(data, data_proj):.4f}")
print(f"continuity: {continuity(data, data_proj):.4f}")
print(f"normalized_stress: {normalized_stress(data, data_proj):.4f}")
print(f"neighborhood_hit: {neighborhood_hit(data_proj, mnist.target[:5000].astype(int)):.4f}")


trustworthiness: 0.7453
continuity: 0.9274
normalized_stress: 0.4128
neighborhood_hit: 0.3958


In [10]:
# 特定のラベルのデータのストレス係数
for i in range(10):
    label = i
    data_label = data[mnist.target[:5000].astype(int) == label]
    data_proj_label = data_proj[mnist.target[:5000].astype(int) == label]
    print(f"normalized_stress for label {label}: {normalized_stress(data_label, data_proj_label):.4f}")

normalized_stress for label 0: 0.5821
normalized_stress for label 1: 0.6483
normalized_stress for label 2: 0.5544
normalized_stress for label 3: 0.5651
normalized_stress for label 4: 0.5804
normalized_stress for label 5: 0.5249
normalized_stress for label 6: 0.5439
normalized_stress for label 7: 0.5749
normalized_stress for label 8: 0.5844
normalized_stress for label 9: 0.5564


In [17]:
# 特定のラベルだけで次元削減したときのメトリクス
for i in range(10):
    label = i
    data_label = data[mnist.target[:5000].astype(int) == label]
    # 標準化　NaN対策
    data_label = (data_label - np.mean(data_label, axis=0)) / (np.std(data_label, axis=0) + 1e-10)
    pca = PCA(n_components=2)
    data_proj_label = pca.fit_transform(data_label)
    print(i)
    print(f"normalized_stress for label {label} (PCA): {normalized_stress(data_label, data_proj_label):.4f}")
    # trustworthiness
    for k in [5, 10, 15, 50, 100]:
        print(f" n_neighbors={k}")
        print(f"  trustworthiness: {trustworthiness(data_label, data_proj_label, n_neighbors=k):.4f}")
        print(f"  continuity: {continuity(data_label, data_proj_label, n_neighbors=k):.4f}")
        
    print()


0
normalized_stress for label 0 (PCA): 0.3641
 n_neighbors=5
  trustworthiness: 0.7850
  continuity: 0.9265
 n_neighbors=10
  trustworthiness: 0.7940
  continuity: 0.9241
 n_neighbors=15
  trustworthiness: 0.7953
  continuity: 0.9222
 n_neighbors=50
  trustworthiness: 0.8096
  continuity: 0.9211
 n_neighbors=100
  trustworthiness: 0.8186
  continuity: 0.9175

1
normalized_stress for label 1 (PCA): 0.3638
 n_neighbors=5
  trustworthiness: 0.8164
  continuity: 0.9266
 n_neighbors=10
  trustworthiness: 0.8320
  continuity: 0.9259
 n_neighbors=15
  trustworthiness: 0.8401
  continuity: 0.9276
 n_neighbors=50
  trustworthiness: 0.8634
  continuity: 0.9370
 n_neighbors=100
  trustworthiness: 0.8811
  continuity: 0.9499

2
normalized_stress for label 2 (PCA): 0.4549
 n_neighbors=5
  trustworthiness: 0.7264
  continuity: 0.8941
 n_neighbors=10
  trustworthiness: 0.7302
  continuity: 0.8910
 n_neighbors=15
  trustworthiness: 0.7338
  continuity: 0.8863
 n_neighbors=50
  trustworthiness: 0.7447


In [6]:
# パラメータを変えて
for k in [5, 10, 15, 50, 100]:
    print(f"n_neighbors={k}")
    print(f" trustworthiness: {trustworthiness(data, data_proj, n_neighbors=k):.4f}")
    print(f" continuity: {continuity(data, data_proj, n_neighbors=k):.4f}")
    print(f" neighborhood_hit: {neighborhood_hit(data_proj, mnist.target[:5000].astype(int), n_neighbors=k):.4f}")
    print("")

n_neighbors=5
 trustworthiness: 0.7463
 continuity: 0.9303
 neighborhood_hit: 0.3983

n_neighbors=10
 trustworthiness: 0.7458
 continuity: 0.9233
 neighborhood_hit: 0.3934

n_neighbors=15
 trustworthiness: 0.7456
 continuity: 0.9185
 neighborhood_hit: 0.3925

n_neighbors=50
 trustworthiness: 0.7475
 continuity: 0.8987
 neighborhood_hit: 0.3855

n_neighbors=100
 trustworthiness: 0.7498
 continuity: 0.8851
 neighborhood_hit: 0.3792



In [7]:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=42)
proj_tsne = tsne.fit_transform(data)
fig_tsne = px.scatter(proj_tsne, x=0, y=1, color=mnist.target[:5000].astype(int), title='t-SNE Projection of MNIST')
fig_tsne.show()
print(f"trustworthiness (t-SNE): {trustworthiness(data, proj_tsne):.4f}")
print(f"continuity (t-SNE): {continuity(data, proj_tsne):.4f}")
print(f"normalized_stress (t-SNE): {normalized_stress(data, proj_tsne):.4f}")
print(f"neighborhood_hit (t-SNE): {neighborhood_hit(proj_tsne, mnist.target[:5000].astype(int)):.4f}")

trustworthiness (t-SNE): 0.9836
continuity (t-SNE): 0.9688
normalized_stress (t-SNE): 0.9521
neighborhood_hit (t-SNE): 0.8965


In [11]:
for i in range(10):
    label = i
    data_label = data[mnist.target[:5000].astype(int) == label]
    data_proj_label = proj_tsne[mnist.target[:5000].astype(int) == label]
    print(f"normalized_stress for label {label} (t-SNE): {normalized_stress(data_label, data_proj_label):.4f}")

normalized_stress for label 0 (t-SNE): 0.9866
normalized_stress for label 1 (t-SNE): 0.9723
normalized_stress for label 2 (t-SNE): 0.9767
normalized_stress for label 3 (t-SNE): 0.9813
normalized_stress for label 4 (t-SNE): 0.9758
normalized_stress for label 5 (t-SNE): 0.9824
normalized_stress for label 6 (t-SNE): 0.9848
normalized_stress for label 7 (t-SNE): 0.9776
normalized_stress for label 8 (t-SNE): 0.9814
normalized_stress for label 9 (t-SNE): 0.9755


In [8]:
# k
for k in [5, 10, 15, 50, 100]:
    print(f"n_neighbors={k}")
    print(f" trustworthiness (t-SNE): {trustworthiness(data, proj_tsne, n_neighbors=k):.4f}")
    print(f" continuity (t-SNE): {continuity(data, proj_tsne, n_neighbors=k):.4f}")
    print(f" neighborhood_hit (t-SNE): {neighborhood_hit(proj_tsne, mnist.target[:5000].astype(int), n_neighbors=k):.4f}")
    print("")

n_neighbors=5
 trustworthiness (t-SNE): 0.9868
 continuity (t-SNE): 0.9719
 neighborhood_hit (t-SNE): 0.9012

n_neighbors=10
 trustworthiness (t-SNE): 0.9794
 continuity (t-SNE): 0.9655
 neighborhood_hit (t-SNE): 0.8910

n_neighbors=15
 trustworthiness (t-SNE): 0.9740
 continuity (t-SNE): 0.9610
 neighborhood_hit (t-SNE): 0.8825

n_neighbors=50
 trustworthiness (t-SNE): 0.9531
 continuity (t-SNE): 0.9389
 neighborhood_hit (t-SNE): 0.8528

n_neighbors=100
 trustworthiness (t-SNE): 0.9348
 continuity (t-SNE): 0.9178
 neighborhood_hit (t-SNE): 0.8266



In [9]:
from umap import UMAP
umap = UMAP(n_components=2, random_state=42)
proj_umap = umap.fit_transform(data)
fig_umap = px.scatter(proj_umap, x=0, y=1, color=mnist.target[:5000].astype(int), title='UMAP Projection of MNIST')
fig_umap.show() 
print(f"trustworthiness (UMAP): {trustworthiness(data, proj_umap):.4f}")
print(f"continuity (UMAP): {continuity(data, proj_umap):.4f}")
print(f"normalized_stress (UMAP): {normalized_stress(data, proj_umap):.4f}")


ModuleNotFoundError: No module named 'umap'

In [35]:
average_local_error_value = average_local_error(data, proj_tsne)
print(f"average_local_error (t-SNE): {average_local_error_value:.4f}")
average_local_error_value = average_local_error(data, data_proj)
print(f"average_local_error (pca): {average_local_error_value:.4f}")
average_local_error_value = average_local_error(data, proj_umap)
print(f"average_local_error (UMAP): {average_local_error_value:.4f}")

average_local_error (t-SNE): 1389.8790
average_local_error (pca): 1146.6299
average_local_error (UMAP): 1395.6449


# 実験結果のまとめ
- PCA, t-SNE, UMAPの3手法でMNISTデータセットを2次元に射影し、各種評価指標で比較した。
- **PCA**は分散の16%程度しか表現できず、スパースな高次元データの構造を十分に捉えきれない。
- **t-SNE**は局所構造の保持（trustworthiness, neighborhood hit）が高く、クラスタ分離も良好だが、グローバルな構造は歪む傾向がある。
- **UMAP**は局所・グローバル構造のバランスが良く、trustworthinessやcontinuityも高い値を示した。
- normalized_stressやaverage_local_errorでも、UMAP/t-SNEがPCAよりも低い値となり、より元の距離関係を保っていることが分かる。
- 以上より、MNISTのような高次元・非線形なデータでは、UMAPやt-SNEがPCAよりも可視化・構造保持の面で優れている。

In [None]:
# これまで書いた指標を用いて可視化する