Install the requirements

In [None]:

! pip install numpy
! pip install scikit-learn
# ! pip install torch==1.10.1+cu102
! pip install umap-learn
! pip install plotly
! pip install scikit-dimension
! pip install torchvision

Load the dataset from open-source packages

In [None]:
import torchvision
import numpy as np
from sklearn import datasets

# Name of the dataset to benchmark on: 'MNIST' or 'CIFAR10'.
Dataset = 'CIFAR10'

if Dataset == 'MNIST':
    # Loaded through scikit-learn's `load_digits`; `digits.data` is used as-is.
    digit_num = 10
    digits = datasets.load_digits(n_class=digit_num)
    X, y = digits.data, digits.target
elif Dataset == 'CIFAR10':
    cifar10 = torchvision.datasets.CIFAR10(root='./data', train=True, download=True)
    n_samples = 2000
    # Keep the channel-first ordering, then flatten every image into ONE feature
    # vector: the distance functions, the intrinsic-dimension estimators and the
    # reducers below all index X as a 2-D (n_samples, n_features) matrix.
    images = cifar10.data[:n_samples].transpose(0, 3, 1, 2)
    # Cast to float so distance arithmetic is not carried out in the images'
    # integer dtype (presumably uint8 -- confirm), where differences wrap around.
    X = images.reshape(images.shape[0], -1).astype(np.float32)
    y = np.array(cifar10.targets)[:n_samples]
else:
    # Fail loudly instead of leaving X / y undefined for the cells below.
    raise ValueError(f'Unsupported dataset: {Dataset!r}')

X.shape, y.shape

Filter and keep the same number of samples for each class

In [None]:
# Balance the dataset: truncate every class to the size of the rarest class.
labels, class_counts = np.unique(y, return_counts=True)
sample_num = class_counts.min()

# Build the per-class masks once (from the untruncated y) and reuse them for X and y.
class_masks = [y == label for label in labels]
X = np.concatenate([X[mask][:sample_num] for mask in class_masks])
y = np.concatenate([y[mask][:sample_num] for mask in class_masks])
X.shape, y.shape

Metrics: **score** and **time cost** of vector matching by k nearest neighbors

In [None]:
import time


def _as_float_vector(a):
    """Return `a` as a flat array whose dtype is floating point (or complex)."""
    a = np.asarray(a)
    if not np.issubdtype(a.dtype, np.inexact):
        # Integer / bool inputs (e.g. unsigned pixel values) would wrap around in
        # `x - y` and overflow in `np.dot`, so promote them before any arithmetic.
        a = a.astype(np.float64)
    return a.ravel()


def calculate_dist(x, y, method='Euclidean', p=3):
    """
    Calculate the distance between x and y

    Both inputs are flattened and treated as plain vectors, so image-shaped
    arrays are compared element by element. Inputs that are already floating
    point keep their dtype; every other dtype is promoted to float64 first.

    Args:
        x, y: array-likes holding the same number of elements.
        method: one of 'Euclidean', 'Manhattan', 'Cosine', 'Chebyshev',
            'Minkowski'.
        p: order of the Minkowski distance; ignored by the other methods.

    Returns:
        The scalar distance. For 'Cosine' this is `1 - cosine similarity`, which
        is NaN when either vector has zero norm.

    Raises:
        ValueError: if `method` is not one of the supported names.
    """

    x = _as_float_vector(x)
    y = _as_float_vector(y)

    if method == 'Euclidean':
        dist = np.sqrt(np.sum(np.square(x - y)))
    elif method == 'Manhattan':
        dist = np.sum(np.abs(x - y))
    elif method == 'Cosine':
        dist = np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))
        dist = 1 - dist  # ensure larger distance means less similarity
    elif method == 'Chebyshev':
        dist = np.max(np.abs(x - y))
    elif method == 'Minkowski':
        dist = np.power(np.sum(np.power(np.abs(x - y), p)), 1 / p)
    else:
        raise ValueError('Invalid distance method')

    return dist


def nearest_k_neighbors(X, Y, k=10, method='Euclidean', p=3):
    """
    Find the k rows of Y that are closest to the query X.

    The distance from X to each row of Y is computed with `calculate_dist`, and
    the row indices are returned in order of increasing distance.

    Args:
        X: the query vector.
        Y: the candidates, indexed along the first axis.
        k: how many indices to return.
        method, p: forwarded unchanged to `calculate_dist`.

    Returns:
        An index array holding at most `k` positions into Y.
    """

    n_candidates = Y.shape[0]
    distances = np.fromiter(
        (calculate_dist(X, Y[row], method=method, p=p) for row in range(n_candidates)),
        dtype=float,
        count=n_candidates,
    )

    return np.argsort(distances)[:k]


def evaluate_similarity_matching(vectors, labels, N, m=20, method='Euclidean', p=3, seed=None):
    """
    Evaluate the similarity of the vectors by nearest k neighbors

    `m` distinct query vectors are sampled from `vectors`. For each query, its
    `N` nearest neighbors are looked up in the full set of vectors (the query
    itself is part of that set) and the neighbors sharing the query's label are
    counted.

    Args:
        vectors: array of vectors, indexed along the first axis.
        labels: array of labels aligned with `vectors`.
        N: number of neighbors retrieved per query.
        m: number of query vectors to sample (without replacement).
        method, p: distance settings forwarded to `nearest_k_neighbors`.
        seed: optional seed for the query sampling. With the default `None` the
            global NumPy random state is used, as before; passing a value makes
            the sampled queries reproducible so different embeddings can be
            scored on the same queries.

    Returns:
        (score, time_cost): the fraction of retrieved neighbors whose label
        matches the query, and the mean lookup time per query in seconds.
    """

    n_vectors = vectors.shape[0]
    if seed is None:
        random_idx = np.random.choice(n_vectors, m, replace=False)
    else:
        random_idx = np.random.default_rng(seed).choice(n_vectors, m, replace=False)

    score = 0
    time_cost = 0
    for i in random_idx:
        # Only the neighbor lookup is timed, not the label comparison.
        start_t = time.perf_counter()
        idx = nearest_k_neighbors(vectors[i], vectors, k=N, method=method, p=p)
        end_t = time.perf_counter()
        score += np.sum(labels[idx] == labels[i])
        time_cost += end_t - start_t
    score /= (m * N)
    time_cost /= m

    return score, time_cost


In [None]:
# Evaluation settings shared by every experiment below.
m = 100          # number of randomly sampled query vectors
dist = 'Cosine'  # distance method handed to calculate_dist

# Baseline: matching quality and lookup time on the unreduced vectors.
score_original, t_original = evaluate_similarity_matching(
    X, y, N=sample_num, m=m, method=dist, p=3
)
score_original, t_original

Dimension Estimator: identify the intrinsic dimension of embedding space (Ref: scikit-dimension)

In [None]:
import skdim


class Estimater(object):
    """Intrinsic dimension estimation class for embedding vectors.

    Thin wrapper that selects one of the `skdim.id` estimators by name.

    Args:
        method: one of the keys of `_SKDIM_CLASS_NAMES`.
    """

    # Supported method name -> name of the estimator class inside `skdim.id`.
    # This single table drives both the constructor check and `fit`, so the two
    # can no longer disagree about which methods are available.
    _SKDIM_CLASS_NAMES = {
        'MLE': 'MLE',
        'MADA': 'MADA',
        'MiND': 'MiND_ML',
        'DANCo': 'DANCo',
        'CorrInt': 'CorrInt',
        'TwoNN': 'TwoNN',
        'TLE': 'TLE',
        'MOM': 'MOM',
        'FisherS': 'FisherS',
        'ESS': 'ESS',
    }

    def __init__(self, method):

        # Kept as an assert so callers still see an AssertionError for bad names.
        assert method in self._SKDIM_CLASS_NAMES, \
            f'Unknown method {method!r}; expected one of {list(self._SKDIM_CLASS_NAMES)}'
        self.method = method

    def fit(self, X, **kwargs):
        """Fit the estimator from the given data.

        Args:
            X: data passed to the underlying estimator's `fit`.
            **kwargs: forwarded to the constructor of the selected `skdim.id` class.

        Returns:
            self, so calls can be chained.

        Raises:
            ValueError: if `self.method` was changed to an unsupported name.
        """
        try:
            class_name = self._SKDIM_CLASS_NAMES[self.method]
        except KeyError:
            raise ValueError(f'Unknown method {self.method!r}') from None
        # Resolved lazily so `skdim` is only needed when an estimator is fitted.
        self.estimator_ = getattr(skdim.id, class_name)(**kwargs)
        self.estimator_.fit(X)
        return self

    def transform(self, X):
        """Transform the given data.

        Delegates to the fitted estimator's `transform`; requires `fit` first.
        """
        return self.estimator_.transform(X)

    def fit_transform(self, X, **kwargs):
        """Fit the estimator from the given data and transform it.
        """
        return self.fit(X, **kwargs).transform(X)

In [None]:
%%time

dim = []
for algo in ['MLE', 'MADA', 'MiND', 'CorrInt','TwoNN', 'TLE', 'MOM', 'FisherS']:
    estimater = Estimater(method=algo)
    dim.append(estimater.fit(X).transform(X))
    print(algo, f'{dim[-1]:.1f}')
print('Mean', f'{np.mean(dim):.1f}')

n_component = round(np.mean(dim))

In [None]:
%%time

import towhee
ae = towhee.ops.dimension_reduction.ae(
    n_latent=n_component,
    n_encoders=[X.shape[-1], 32],
    n_decoders=[32, X.shape[-1]],
    activation="relu",
    max_epoch=1000,
    patience=100,
    device="cpu"
).get_op()

X_reduced_ae = ae.fit_transform(X)
score_reduced_ae, t_reduced_ae = evaluate_similarity_matching(X_reduced_ae, y, N=sample_num, m=m, method=dist, p=3)

In [None]:
%%time

pca = towhee.ops.dimension_reduction.pca(
    n_components=n_component,
).get_op()
X_reduced_pca = pca.fit_transform(X)
score_reduced_pca, t_reduced_pca = evaluate_similarity_matching(X_reduced_pca, y, N=sample_num, m=m, method=dist, p=3)

In [None]:
%%time

isomap = towhee.ops.dimension_reduction.isomap(
    n_components=n_component,
).get_op()
X_reduced_isomap = isomap.fit_transform(X)
score_reduced_isomap, t_reduced_isomap = evaluate_similarity_matching(X_reduced_isomap, y, N=sample_num, m=m, method=dist, p=3)

In [None]:
%%time

lle = towhee.ops.dimension_reduction.lle(
    n_components=n_component,
).get_op()
X_reduced_lle = lle.fit_transform(X)
score_reduced_lle, t_reduced_lle = evaluate_similarity_matching(X_reduced_lle, y, N=sample_num, m=m, method=dist, p=3)

In [None]:
%%time

mds = towhee.ops.dimension_reduction.mds(
    n_components=n_component,
).get_op()
X_reduced_mds = mds.fit_transform(X)
score_reduced_mds, t_reduced_mds = evaluate_similarity_matching(X_reduced_mds, y, N=sample_num, m=m, method=dist, p=3)

In [None]:
%%time

se = towhee.ops.dimension_reduction.se(
    n_components=n_component,
).get_op()
X_reduced_se = se.fit_transform(X)
score_reduced_se, t_reduced_se = evaluate_similarity_matching(X_reduced_se, y, N=sample_num, m=m, method=dist, p=3)

In [None]:
%%time

umap = towhee.ops.dimension_reduction.umap(
    n_components=n_component,
).get_op()
X_reduced_umap = umap.fit_transform(X)
score_reduced_umap, t_reduced_umap = evaluate_similarity_matching(X_reduced_umap, y, N=sample_num, m=m, method=dist, p=3)

In [None]:
%%time

tsne = towhee.ops.dimension_reduction.tsne(
    n_components=n_component,
).get_op()
X_reduced_tsne = tsne.fit_transform(X)
score_reduced_tsne, t_reduced_tsne = evaluate_similarity_matching(X_reduced_tsne, y, N=sample_num, m=m, method=dist, p=3)

Visualization:

1. plot_mixed()
2. plot_dim_reduct_acc_comparison()

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots


def _get_color(color, color_type):

    if color_type == 'continuous':
        color = color.astype(np.float32)
    elif color_type == 'discrete':
        color = color.astype(str)
    else:
        raise ValueError('Invalid type of the color.')

    return color


def plot_mixed(data, color, marker_size, title, zoom, score1, score2, t1, t2, dim1, dim2, **kwargs):
    """
    Plot mixed figure of '2D scatter matrix', '3D scatter' and 'Bar of comparison'

    The subplot grid has `dim2` rows. Its left `dim2` x `dim2` cells hold the
    pairwise scatter matrix of the first `dim2` columns of `data`; the extra
    columns on the right hold a 3D scatter of columns 0-2 followed by three
    'Before' / 'After' bar charts (dimension, matching score, matching time).

    Args:
        data: 2-D array of reduced vectors; needs at least 5 columns and at least
            `dim2` columns.
        color: array of per-sample values used to color the markers.
        marker_size: marker size for the scatter plots.
        title: figure title.
        zoom: scale factor applied to the figure width and height.
        score1, score2: matching score before / after the reduction.
        t1, t2: matching time in seconds before / after the reduction.
        dim1, dim2: vector dimension before / after the reduction; `dim2` also
            sets the size of the scatter matrix.
        **kwargs: accepted for call-site compatibility and ignored.

    Returns:
        The assembled plotly figure.
    """

    assert data.shape[-1] >= 5, 'The number of components must be greater than or equal to 5.'

    # n_components = data.shape[1]
    n_components = dim2
    bar_width = round(np.sqrt(n_components))
    bar_height = round(np.floor((n_components-bar_width)/3))

    specs = [[] for _ in range(n_components)]
    subfig_title = ['' for _ in range(n_components*(n_components+bar_width))]
    subfig_title[(n_components-3*bar_height+1) *
                 n_components + 2-1] = 'Dimension of vectors'
    subfig_title[(n_components-2*bar_height+1)*n_components +
                 3-1] = 'Score of vector matching'
    subfig_title[(n_components-bar_height+1)*n_components +
                 4-1] = 'Time cost of vector matching'
    subfig_title[n_components] = '3D scatter'
    subfig_title[round(np.floor(n_components/2))] = '2D scatter matrix'

    # 0-based grid rows on which the three bar charts start.
    bar_rows = range(n_components-3*bar_height, n_components, bar_height)

    for i in range(n_components):
        if i == 0:
            specs[i] = [{'type': 'xy'}]*n_components + [{'type': 'scene',
                                                         'rowspan': bar_width, 'colspan': bar_width}] + [None]*(bar_width-1)
        elif i in bar_rows:
            specs[i] = [{'type': 'xy'}]*n_components + [{'type': 'bar',
                                                         'rowspan': bar_height, 'colspan': bar_width}] + [None]*(bar_width-1)
        else:
            specs[i] = [{'type': 'xy'}]*n_components+[None]*bar_width

    fig = make_subplots(
        rows=n_components, cols=n_components+bar_width,
        specs=specs,
        horizontal_spacing=0.045,
        vertical_spacing=0.06,
        subplot_titles=subfig_title,
    )

    color = _get_color(color, 'continuous')
    # NOTE: the hover text is derived from the float-cast colors, so an integer
    # label such as 3 is shown as '3.0'.
    text = _get_color(color, 'discrete')

    # 2d scatter matrix
    for i in range(n_components):
        for j in range(n_components):
            fig.add_trace(
                go.Scatter(
                    x=data[:, i],
                    y=data[:, j],
                    mode='markers',
                    marker=dict(
                        color=color,
                        size=marker_size,
                        colorscale='Viridis',
                        showscale=True
                    ),
                    name='2D scatter matrix',
                    showlegend=False,
                    text=text,
                ),
                row=i+1, col=j+1,
            )

    # 3d scatter
    fig.add_trace(
        go.Scatter3d(
            x=data[:, 0],
            y=data[:, 1],
            z=data[:, 2],
            mode='markers',
            marker=dict(
                color=color,
                size=marker_size,
                colorscale='Viridis',
                showscale=True
            ),
            showlegend=False,
            name='3D scatter',
            text=text
        ),
        row=1, col=n_components+1,
    )

    # bar
    fig.add_trace(
        go.Bar(
            x=['Before', 'After'],
            y=[dim1, dim2],
            name='Dimension of vectors',
            text=[str(dim1), str(dim2)],
            textposition='auto',
            showlegend=False,
            marker=dict(color=['#a8cee3', '#f3be78']),
        ),
        row=n_components-3*bar_height+1, col=n_components+1,
    )
    fig.add_trace(
        go.Bar(
            x=['Before', 'After'],
            y=[score1, score2],
            name='Score of vector matching',
            text=[f'{score1*100:.2f}%', f'{score2*100:.2f}%'],
            textposition='auto',
            showlegend=False,
            marker=dict(color=['#a8cee3', '#f3be78']),
        ),
        row=n_components-2*bar_height+1, col=n_components+1,
    )
    fig.add_trace(
        go.Bar(
            x=['Before', 'After'],
            y=[t1, t2],
            name='Time cost of vector matching',
            text=[f'{t1*1e3:.2f}ms', f'{t2*1e3:.2f}ms'],
            textposition='auto',
            showlegend=False,
            marker=dict(color=['#a8cee3', '#f3be78']),
        ),
        row=n_components-bar_height+1, col=n_components+1,
    )

    # layout
    ratio = (n_components-5)/n_components  # based on 5*5
    fig.update_layout(
        title=title,
        width=int(zoom*800*(1+ratio)*(1+bar_width/n_components)),
        height=int(zoom*800*(1+ratio)),
    )

    # adjust axes
    for i in range(n_components):
        for j in range(n_components):
            fig.update_xaxes(title_text=f'dim_{i}', row=i+1, col=j+1,
                             title_standoff=0, title_font=dict(size=9.5), tickfont=dict(size=9.5))
            fig.update_yaxes(title_text=f'dim_{j}', row=i+1, col=j+1,
                             title_standoff=0, title_font=dict(size=9.5), tickfont=dict(size=9.5))
    fig.update_scenes(
        xaxis_title_text=f'dim_{0}',
        yaxis_title_text=f'dim_{1}',
        zaxis_title_text=f'dim_{2}',
        xaxis_title_font=dict(size=9.5),
        yaxis_title_font=dict(size=9.5),
        zaxis_title_font=dict(size=9.5),
        xaxis_tickfont=dict(size=9.5),
        yaxis_tickfont=dict(size=9.5),
        zaxis_tickfont=dict(size=9.5),
        row=1, col=n_components+1,
    )
    for i in bar_rows:
        # `i` is a 0-based grid row while plotly addresses subplots with 1-based
        # rows, so the bar chart that starts on grid row `i` lives at row=i+1.
        fig.update_xaxes(title_font=dict(size=9.5), tickfont=dict(size=9.5),
                         row=i+1, col=n_components+1, title_standoff=0)
        fig.update_yaxes(title_font=dict(size=9.5), tickfont=dict(size=9.5),
                         row=i+1, col=n_components+1, title_standoff=0)

    fig.update_traces(marker=dict(line=dict(width=0.1)))

    # Hand the figure back so a notebook cell ending in this call displays it.
    return fig
    
    
def plot_dim_reduct_acc_comparison(names, scores, dist, score_original):
    """
    Plot bar figure of dimension reduction accuracy

    Args:
        names: bar labels, one per method.
        scores: matching scores aligned with `names`; shown as percentages.
        dist: name of the distance method, used in the figure title.
        score_original: baseline score, drawn as a horizontal reference line
            across all bars.

    Returns:
        The plotly figure.
    """

    fig = go.Figure(go.Bar(
        x=names,
        y=scores,
        text=[f'{score*100:.2f}%' for score in scores],
        textposition="auto",
    )
    )
    fig.update_layout(
        title=f"Accuracy of similarity matching ({dist})",
        xaxis_title="Method",
        yaxis_title="Accuracy",
        width=800,
        height=500,
    )
    fig.add_shape(
        type="line",
        # Bars sit at positions 0 .. len(names)-1, so the line runs from half a
        # bar before the first one to half a bar after the last one, however
        # many methods are plotted.
        x0=-0.5,
        y0=score_original,
        x1=len(names) - 0.5,
        y1=score_original,
        line=dict(
            color="Red",
            width=2,
            dash="dashdot",
        ),
    )

    return fig

In [None]:
# Summary figure for the UMAP embedding, compared against the original vectors.
plot_mixed(
    data=X_reduced_umap,
    color=y,
    marker_size=2,
    title="UMAP",
    zoom=0.65,
    score1=score_original,
    score2=score_reduced_umap,
    t1=t_original,
    t2=t_reduced_umap,
    # Number of features per sample: the product of all non-sample axes, which is
    # correct whether X is stored flat or image-shaped.
    dim1=int(np.prod(X.shape[1:])),
    dim2=n_component,
)

In [None]:
# (display name, score) pairs, in the order the bars are drawn.
results = [
    ("Original", score_original),
    ("AE", score_reduced_ae),
    ("PCA", score_reduced_pca),
    ("LLE", score_reduced_lle),
    ("Isomap", score_reduced_isomap),
    ("MDS", score_reduced_mds),
    ("SE", score_reduced_se),
    ("UMAP", score_reduced_umap),
    ("TSNE", score_reduced_tsne),
]
names = [name for name, _score in results]
scores = [score for _name, score in results]

plot_dim_reduct_acc_comparison(names, scores, dist=dist, score_original=score_original)