# Util

In [1]:
import numpy as np
from umap import UMAP
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd


# データ準備

In [None]:
# mnist
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler

# Fetch the MNIST dataset ('mnist_784', version 1) from OpenML.
# as_frame=False requests array output instead of a pandas DataFrame.
mnist = fetch_openml(
    name='mnist_784',
    version=1,
    as_frame=False,
)

In [None]:


# Subsample: keep only the first `sample_num` rows so later experiments stay fast.
sample_num = 10000

# After slicing, X has `sample_num` rows and y has `sample_num` labels
# (not the full dataset size).
X, y = mnist.data[:sample_num], mnist.target[:sample_num]

# High-dimensional input: every feature column standardized
# (scaling up front is convenient for PCA and similar methods).
high_dim_data = StandardScaler().fit_transform(X)

# Sanity check of the working array's shape -> (sample_num, n_features)
print("Shape of high_dim_data:", high_dim_data.shape)





Shape of high_dim_data: (10000, 784)


# UMAPの時間計測

In [5]:
import random
import time
dimensions = [2, 10, 50, 100]

# Measure how long a 2-D UMAP embedding takes for inputs of increasing width.
# NOTE(review): the slice below keeps the *first* i feature columns; for MNIST
# these are presumably top-left border pixels that are (near-)constant, so the
# narrow inputs may be degenerate — confirm this is the intended way to vary
# the input dimensionality.
# NOTE(review): the first iteration may include one-off warm-up cost inside
# UMAP (e.g. JIT compilation), which would inflate its timing — TODO confirm.
for i in dimensions:

    data = high_dim_data[:, :i]  # keep only the first i feature columns
    print("Shape of data:", data.shape)  # -> (n_samples, i)
    umap = UMAP(n_components=2, random_state=42, n_neighbors=15, min_dist=0.1)
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring durations; time.time() is wall-clock time and can jump if the
    # system clock is adjusted mid-run.
    start_time = time.perf_counter()
    umap_result = umap.fit_transform(data)  # embed into 2 dimensions
    print("UMAP time:", time.perf_counter() - start_time)  # seconds spent in fit_transform only

    

Shape of data: (1000, 2)


  warn(


UMAP time: 16.876598119735718
Shape of data: (1000, 10)


  warn(


UMAP time: 7.3289618492126465
Shape of data: (1000, 50)


  warn(


UMAP time: 7.322424650192261
Shape of data: (1000, 100)


  warn(


UMAP time: 6.9037721157073975


# PCAを用いた次元削減とUMAP

In [7]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [13]:
dimensions = [4, 8, 16, 32, 64, 128, 256, 512]

# For each target width: project the standardized data onto the first i
# principal components, time a 2-D UMAP embedding of that projection, and
# plot the embedding coloured by label.
for i in dimensions:
    pca = PCA(n_components=i, random_state=42)
    data = pca.fit_transform(high_dim_data)  # PCA is re-fitted for every width

    umap = UMAP(n_components=2, random_state=42, n_neighbors=15, min_dist=0.1)
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring durations; time.time() is wall-clock time and can jump if the
    # system clock is adjusted mid-run. Only the UMAP fit is timed, not PCA.
    start_time = time.perf_counter()
    result = umap.fit_transform(data)  # embed into 2 dimensions
    print("UMAP time:", time.perf_counter() - start_time)
    fig = px.scatter(x=result[:, 0], y=result[:, 1], color=y, title=f'PCA: {i} dimensions')
    fig.show()



n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



UMAP time: 15.620704412460327



n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



UMAP time: 15.829270601272583



n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



UMAP time: 16.439725399017334



n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



UMAP time: 16.326475143432617



n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



UMAP time: 16.29503297805786



n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



UMAP time: 16.121237993240356



n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



UMAP time: 16.400346994400024



n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



UMAP time: 16.956671953201294
