# データセット評価

In [15]:
import numpy as np
from sklearn.decomposition import PCA
import plotly.express as px
def sparsity_ratio(matrix):
    non_zero_count = np.count_nonzero(matrix)
    total_elements = matrix.size
    sparsity = 1 - (non_zero_count / total_elements)
    return sparsity

def intrisic_dimensionality_ratio(matrix, variance_threshold=0.95):
    n_samples, n_features = matrix.shape
    pca = PCA()
    pca.fit(matrix)
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
    num_components = np.searchsorted(cumulative_variance, variance_threshold) + 1
    intrinsic_dim_ratio = num_components / n_features
    return intrinsic_dim_ratio

def cumulative_variance_plot(matrix):
    import matplotlib.pyplot as plt
    from sklearn.decomposition import PCA

    pca = PCA()
    pca.fit(matrix)
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

    fig = px.line(
        x=np.arange(1, len(cumulative_variance) + 1),
        y=cumulative_variance,
        labels={'x': 'Number of Components', 'y': 'Cumulative Variance'},
        title='Cumulative Variance Explained by PCA Components'
    ).show()

    

In [12]:
# test
# mnist 70000, 784
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1)
matrix = mnist.data.values


  warn(


Sparsity ratio: 0.81
Intrinsic dimensionality: 4
Intrinsic dimensionality ratio: 0.01


In [17]:
print(f"sparsity_ratio: {sparsity_ratio(matrix):.4f}")
print(f"intrisic_dimensionality_ratio: {intrisic_dimensionality_ratio(matrix):.4f}")
cumulative_variance_plot(matrix)

sparsity_ratio: 0.8086
intrisic_dimensionality_ratio: 0.1964


# 投影の品質評価

In [None]:

import numpy as np
from sklearn.metrics import pairwise_distances
from sklearn.neighbors import NearestNeighbors

# 信頼性
def trustworthiness(X, X_proj, n_neighbors=7):
    N = X.shape[0]
    # Original and projected distances
    dist_X = pairwise_distances(X)
    dist_proj = pairwise_distances(X_proj)

    # Rank of distances
    rank_X = np.argsort(np.argsort(dist_X, axis=1), axis=1)
    
    # K-nearest neighbors in projected space
    nn_proj = NearestNeighbors(n_neighbors=n_neighbors+1).fit(X_proj)
    neighbors_proj = nn_proj.kneighbors(return_distance=False)[:, 1:]

    # For each i, for each neighbor in projection, get the rank in original space
    t_sum = 0
    for i in range(N):
        for j in neighbors_proj[i]:
            r = rank_X[i, j]
            if r >= n_neighbors:
                t_sum += r - n_neighbors + 1
    
    norm = 2 / (N * n_neighbors * (2 * N - 3 * n_neighbors - 1))
    T = 1 - norm * t_sum
    return T

# 連続性
def continuity(X, X_proj, n_neighbors=7):
    N = X.shape[0]
    # Original and projected distances
    dist_X = pairwise_distances(X)
    dist_proj = pairwise_distances(X_proj)

    # Rank of distances in projected space
    rank_proj = np.argsort(np.argsort(dist_proj, axis=1), axis=1)

    # K-nearest neighbors in original space
    nn_orig = NearestNeighbors(n_neighbors=n_neighbors+1).fit(X)
    neighbors_orig = nn_orig.kneighbors(return_distance=False)[:, 1:]

    c_sum = 0
    for i in range(N):
        for j in neighbors_orig[i]:
            r = rank_proj[i, j]
            if r >= n_neighbors:
                c_sum += r - n_neighbors + 1
    
    norm = 2 / (N * n_neighbors * (2 * N - 3 * n_neighbors - 1))
    C = 1 - norm * c_sum
    return C

def normalized_stress(X, X_proj):
    D_n = pairwise_distances(X)
    D_q = pairwise_distances(X_proj)
    numerator = np.sum((D_n - D_q) ** 2)
    denominator = np.sum(D_n ** 2)
    return numerator / denominator

def neighborhood_hit(X_proj, labels, n_neighbors=7):
    N = X_proj.shape[0]
    nn = NearestNeighbors(n_neighbors=n_neighbors+1).fit(X_proj)
    neighbors = nn.kneighbors(return_distance=False)[:, 1:]

    hit_sum = 0
    for i in range(N):
        hit_sum += np.sum(labels[neighbors[i]] == labels[i])

    return hit_sum / (N * n_neighbors)

# Shepard Diagram helper (returns distances in original and projected space)
def shepard_diagram_data(X, X_proj):
    D_n = pairwise_distances(X)
    D_q = pairwise_distances(X_proj)
    # Use upper triangle without diagonal to avoid redundancy
    i_upper = np.triu_indices_from(D_n, k=1)
    return D_n[i_upper], D_q[i_upper]

# Average Local Error (optional, related to trustworthiness-like structure)
def average_local_error(X, X_proj, n_neighbors=5):
    N = X.shape[0]
    dist_X = pairwise_distances(X)
    dist_proj = pairwise_distances(X_proj)

    nn_orig = NearestNeighbors(n_neighbors=n_neighbors+1).fit(X)
    neighbors_orig = nn_orig.kneighbors(return_distance=False)[:, 1:]

    error_sum = 0
    for i in range(N):
        for j in neighbors_orig[i]:
            error_sum += abs(dist_X[i, j] - dist_proj[i, j])
    
    return error_sum / (N * n_neighbors)


In [28]:
# test
# mnist 70000, 784
data = mnist.data.values[:5000]  # Use a subset for faster computation
data_proj = PCA(n_components=2).fit_transform(data)
fig_proj = px.scatter(data_proj, x=0, y=1, color=mnist.target[:5000].astype(int), title='PCA Projection of MNIST')
fig_proj.show()

print(f"trustworthiness: {trustworthiness(data, data_proj):.4f}")
print(f"continuity: {continuity(data, data_proj):.4f}")
print(f"normalized_stress: {normalized_stress(data, data_proj):.4f}")
print(f"neighborhood_hit: {neighborhood_hit(data_proj, mnist.target[:5000].astype(int)):.4f}")




trustworthiness: 0.7454
continuity: 0.9274
normalized_stress: 0.4128
neighborhood_hit: 0.3958


In [29]:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=42)
proj_tsne = tsne.fit_transform(data)
fig_tsne = px.scatter(proj_tsne, x=0, y=1, color=mnist.target[:5000].astype(int), title='t-SNE Projection of MNIST')
fig_tsne.show()
print(f"trustworthiness (t-SNE): {trustworthiness(data, proj_tsne):.4f}")
print(f"continuity (t-SNE): {continuity(data, proj_tsne):.4f}")
print(f"normalized_stress (t-SNE): {normalized_stress(data, proj_tsne):.4f}")
print(f"neighborhood_hit (t-SNE): {neighborhood_hit(proj_tsne, mnist.target[:5000].astype(int)):.4f}")

trustworthiness (t-SNE): 0.9836
continuity (t-SNE): 0.9688
normalized_stress (t-SNE): 0.9522
neighborhood_hit (t-SNE): 0.8960


In [33]:
from umap import UMAP
umap = UMAP(n_components=2, random_state=42)
proj_umap = umap.fit_transform(data)
fig_umap = px.scatter(proj_umap, x=0, y=1, color=mnist.target[:5000].astype(int), title='UMAP Projection of MNIST')
fig_umap.show() 
print(f"trustworthiness (UMAP): {trustworthiness(data, proj_umap):.4f}")
print(f"continuity (UMAP): {continuity(data, proj_umap):.4f}")
print(f"normalized_stress (UMAP): {normalized_stress(data, proj_umap):.4f}")



n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



trustworthiness (UMAP): 0.9631
continuity (UMAP): 0.9709
normalized_stress (UMAP): 0.9947


In [35]:
average_local_error_value = average_local_error(data, proj_tsne)
print(f"average_local_error (t-SNE): {average_local_error_value:.4f}")
average_local_error_value = average_local_error(data, data_proj)
print(f"average_local_error (pca): {average_local_error_value:.4f}")
average_local_error_value = average_local_error(data, proj_umap)
print(f"average_local_error (UMAP): {average_local_error_value:.4f}")

average_local_error (t-SNE): 1389.8790
average_local_error (pca): 1146.6299
average_local_error (UMAP): 1395.6449
