In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.datasets import load_digits
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ParameterGrid
import numpy as np
from scipy.spatial.distance import pdist, squareform
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the Digits dataset
digits = load_digits()
X = digits.data
y = digits.target
target_names = digits.target_names

In [3]:
# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [4]:
# Function to compute trustworthiness (measures neighborhood preservation)
def trustworthiness(X_high, X_low, n_neighbors=5):
    dist_high = squareform(pdist(X_high))
    dist_low = squareform(pdist(X_low))
    n_samples = X_high.shape[0]
    rank_high = np.argsort(dist_high, axis=1)
    rank_low = np.argsort(dist_low, axis=1)
    violations = 0
    for i in range(n_samples):
        for j in rank_low[i, 1:n_neighbors+1]:
            if j not in rank_high[i, 1:n_neighbors+1]:
                violations += 1
    return 1 - (2 * violations / (n_samples * n_neighbors * (2 * n_samples - 3 * n_neighbors - 1)))

In [5]:
# PCA with explained variance analysis
pca = PCA(n_components=3)
pca_results = pca.fit_transform(X_scaled)
pca_df = pd.DataFrame(data=pca_results, columns=['PC1', 'PC2', 'PC3'])
pca_df['Target'] = y.astype(str)
pca_variance = pca.explained_variance_ratio_

In [6]:
# Grid search for t-SNE
tsne_params = {
    'perplexity': [5, 30, 50],
    'learning_rate': [10, 200, 'auto'],
    'n_iter': [1000]
}
best_tsne = None
best_tsne_score = -1
tsne_results = None
for params in ParameterGrid(tsne_params):
    tsne = TSNE(n_components=3, random_state=42, **params)
    tsne_result = tsne.fit_transform(X_scaled)
    score = trustworthiness(X_scaled, tsne_result)
    if score > best_tsne_score:
        best_tsne_score = score
        best_tsne = params
        tsne_results = tsne_result

tsne_df = pd.DataFrame(data=tsne_results, columns=['TSNE1', 'TSNE2', 'TSNE3'])
tsne_df['Target'] = y.astype(str)

In [7]:
# Grid search for UMAP
umap_params = {
    'n_neighbors': [5, 15, 30],
    'min_dist': [0.1, 0.5],
    'n_components': [3]
}
best_umap = None
best_umap_score = -1
umap_results = None
for params in ParameterGrid(umap_params):
    umap_model = umap.UMAP(random_state=42, **params)
    umap_result = umap_model.fit_transform(X_scaled)
    score = trustworthiness(X_scaled, umap_result)
    if score > best_umap_score:
        best_umap_score = score
        best_umap = params
        umap_results = umap_result

umap_df = pd.DataFrame(data=umap_results, columns=['UMAP1', 'UMAP2', 'UMAP3'])
umap_df['Target'] = y.astype(str)

In [9]:
# Reconstruction error for PCA
X_reconstructed = pca.inverse_transform(pca_results)
pca_reconstruction_error = mean_squared_error(X_scaled.flatten(), X_reconstructed.flatten())

In [10]:
# Create a dashboard with 3D scatter plots
fig = make_subplots(
    rows=1, cols=3,
    specs=[[{'is_3d': True}, {'is_3d': True}, {'is_3d': True}]],
    subplot_titles=['PCA (3D)', 't-SNE (3D)', 'UMAP (3D)']
)

In [11]:
# PCA 3D Scatter
fig.add_trace(
    go.Scatter3d(
        x=pca_df['PC1'], y=pca_df['PC2'], z=pca_df['PC3'],
        mode='markers',
        marker=dict(size=4, color=pca_df['Target'].astype(int), colorscale='Viridis', opacity=0.8),
        name='PCA'
    ),
    row=1, col=1
)

In [12]:
# t-SNE 3D Scatter
fig.add_trace(
    go.Scatter3d(
        x=tsne_df['TSNE1'], y=tsne_df['TSNE2'], z=tsne_df['TSNE3'],
        mode='markers',
        marker=dict(size=4, color=tsne_df['Target'].astype(int), colorscale='Viridis', opacity=0.8),
        name='t-SNE'
    ),
    row=1, col=2
)

In [13]:
# UMAP 3D Scatter
fig.add_trace(
    go.Scatter3d(
        x=umap_df['UMAP1'], y=umap_df['UMAP2'], z=umap_df['UMAP3'],
        mode='markers',
        marker=dict(size=4, color=umap_df['Target'].astype(int), colorscale='Viridis', opacity=0.8),
        name='UMAP'
    ),
    row=1, col=3
)

In [14]:
fig.update_layout(
    title_text="Advanced Dimensionality Reduction: Digits Dataset",
    height=600, width=1200,
    scene=dict(xaxis_title="X", yaxis_title="Y", zaxis_title="Z"),
    scene2=dict(xaxis_title="X", yaxis_title="Y", zaxis_title="Z"),
    scene3=dict(xaxis_title="X", yaxis_title="Y", zaxis_title="Z")
)
fig.show()

In [15]:
# Print evaluation metrics
print(f"PCA Explained Variance Ratio: {pca_variance}")
print(f"PCA Reconstruction Error: {pca_reconstruction_error:.4f}")
print(f"Best t-SNE Parameters: {best_tsne}, Trustworthiness: {best_tsne_score:.4f}")
print(f"Best UMAP Parameters: {best_umap}, Trustworthiness: {best_umap_score:.4f}")

PCA Explained Variance Ratio: [0.12033916 0.09561054 0.08444415]
PCA Reconstruction Error: 0.6668
Best t-SNE Parameters: {'learning_rate': 200, 'n_iter': 1000, 'perplexity': 30}, Trustworthiness: 0.9998
Best UMAP Parameters: {'min_dist': 0.5, 'n_components': 3, 'n_neighbors': 5}, Trustworthiness: 0.9997
