# Overview

I have a notebook for comparing the accuracy of a simple multi-category regression after the data dimension has been reduced using the following methods:

- Nothing (use all dimensions)
- Principal Components Analysis (PCA)
- Multidimensional Scaling (MDS)
- Kernel PCA with radial basis functions (Laplacian and Gaussian)
- Isomap
- Locally Linear Embedding (LLE)
- Laplacian Eigenmaps (Spectral Embedding)
- Hessian Eigenmaps
- Local Tangent Space Alignment (LTSA)
- Diffusion maps
- Autoencoder
- t-SNE (maybe)

I used a sample of 1000 images from the MNIST dataset with all 10 classes. To compare the dimension reduction methods, I projected down to dimensions that are powers of two, arranged by order of magnitude: 2, 16, and 256. 

My new plan is to see what happens when I reduce the dimensions of completely random data. I'll reduce the dimensions of 1000 28x28 pixel images (with values sampled from a uniform distribution).

In [1]:
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

from sklearn.decomposition import PCA, KernelPCA
from sklearn.manifold import MDS, LocallyLinearEmbedding, Isomap, SpectralEmbedding, TSNE
from sklearn.metrics.pairwise import euclidean_distances

from scipy.linalg import eigh

from kymatio.keras import Scattering2D
import pyshearlab as ps

from keras.layers import Input, Dense, Flatten
from keras.models import Model, Sequential
from keras.datasets import mnist
from keras.optimizers import SGD
from keras.utils import to_categorical

from tabulate import tabulate

In [2]:
def get_model(name = None, input_shape = None):
    """ Returns a Sequential model """
    
    return Sequential(
        [
            Input(shape = (input_shape,)),
            Dense(units = 10, activation = 'softmax', use_bias = False)
        ],
        name = name
    )

class DiffusionMap:
    def __init__(self, alpha = 0.15, n_components = None):
        self.alpha = alpha,
        self.n_components = n_components
        
    def __str__(self):
        return 'DiffusionMap(alpha = {}, n_components = {})'.format(self.alpha, self.n_components)

    def fit_transform(self, X):
        """ Function to find the diffusion matrix P

            args:
            -----
            alpha - to be used for gaussian kernel function
            X - feature matrix as numpy array
            n_components - number of lower dimensions

            returns:
            --------
            Diffusion_map as np.array object
        """

        dists = euclidean_distances(X, X)
        K = np.exp(-dists**2 / self.alpha)

        r = np.sum(K, axis = 0)
        Di = np.diag(1/r)
        P = np.matmul(Di, K)

        D_right = np.diag((r)**0.5)
        D_left = np.diag((r)**-0.5)
        P_prime = np.matmul(D_right, np.matmul(P, D_left))

        self.eigenValues, self.eigenVectors = eigh(P_prime)
        idx = self.eigenValues.argsort()[::-1]
        self.eigenValues = self.eigenValues[idx]
        self.eigenVectors = self.eigenVectors[:, idx]

        diffusion_coordinates = np.matmul(D_left, self.eigenVectors)
        
        self.diffusion_coordinates = diffusion_coordinates

        return diffusion_coordinates[:, :self.n_components]

In [3]:
class DimensionReduction:
    """ General class to run several dimension reduction methods on a dataset 
        and then train and evaluate linear classifier on the transformed data.
    """
    
    def __init__(self, dim_reduction_names, X_train, y_train, X_test, y_test, n_components, n_neighbors = 7, gamma = 0.15, alpha = 0.15):
        self.names = dim_reduction_names
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.n_components = n_components
        self.n_neighbors = n_neighbors
        self.gamma = gamma
        self.alpha = alpha
        
        self.n_neighbors_hessian = int((self.n_components * (self.n_components + 3))/2 + 1)
        
        # initialize with all dim reduction methods
        all_methods = {
            'pca': PCA(n_components = self.n_components),
            'mds': MDS(n_components = self.n_components, max_iter = 20, n_init = 1),
            'kpca_gaussian': KernelPCA(n_components = self.n_components, kernel = 'rbf', gamma = self.gamma),
#             'kpca_laplacian': KernelPCA(n_components = self.n_components, kind = 'laplacian'), # 16SEPT - doesn't work yet
            'lle': LocallyLinearEmbedding(n_components = self.n_components, method = 'standard'),
            'isomap': Isomap(n_components = self.n_components, n_neighbors = self.n_neighbors),
            'lap': SpectralEmbedding(n_components = self.n_components, n_neighbors = self.n_neighbors, gamma = self.gamma),
            'hes': LocallyLinearEmbedding(n_components = self.n_components, n_neighbors = self.n_neighbors_hessian, method = 'hessian', eigen_solver = 'dense'),
            'ltsa': LocallyLinearEmbedding(n_components = self.n_components, n_neighbors = self.n_components, method = 'ltsa'),
            'diff': DiffusionMap(n_components = self.n_components, alpha = self.alpha)
        }
        
        # select only the desired methods
        self.methods = [all_methods[name] for name in self.names]
        
        print('Trial - Training linear classifier on data reduced to {} dimensions using the following methods:\n{}'.format(self.n_components, self.names))
        
    def fit_dim_reduction(self):
        """ Uses selected dim reduction methods to fit the dim reducing transformations """
        
        self.X_train_reduced = [method.fit_transform(self.X_train) for method in self.methods]
        
        return None
    
    def reduce_test_dimension(self):
        """ Performs dim reduction on test data using fitted dim reduction methods """
        
        self.X_test_reduced = [method.transform(self.X_test) for method in self.methods]
        
        return None
    
    def build_classifiers(self):
        """ Build a simple Sequential model for each dim reduction method """
        
        self.models = [get_model(name, input_shape = self.n_components) for name, method in zip(self.names, self.methods)]
        
        for model in self.models:
            model.compile(
                loss = 'categorical_crossentropy',
                optimizer = SGD(learning_rate = 0.01),
                metrics = ['accuracy']
            )
        
        print('All models built and compiled\n')
        
        return None
    
    def train_classifiers(self):
        """ Train all classifiers on the reduced dim training data """
        
        self.training_accuracy = [
            model.fit(
                x,
                self.y_train,
                batch_size = 25,
                epochs = 100,
                verbose = False
            ) for model, x in zip(self.models, self.X_train_reduced)
        ]
        
        print('All models trained on reduced dim training data\n')
        
        return None
    
    def test_classifiers(self):
        """ Evaluate all classifiers on the reduced dim testing data """
        
        self.test_accuracy = [
            model.evaluate(
                x = x,
                y = self.y_test
            ) for model, x in zip(self.models, self.X_test_reduced)
        ]
        
        print('All models evaluated on reduced dim test data\n')
        
        return None
        
    def get_model_accuracy(self):
        """ Show the training and testing accuracy for all models """
        
        headers = ['Method ({} dimensions)'.format(self.n_components), 'Training accuracy', 'Test accuracy']
        
        self.table = [
            [name, train_result.history['accuracy'][-1]*100, test_result[-1]*100] for name, train_result, test_result in zip(self.names, self.training_accuracy, self.test_accuracy)
        ]
        
        print(tabulate(self.table, headers = headers))
        print('\n{}\n'.format('*'*100))
        
        return None
        
    def run(self):
        """ Run through the set of experiments """
        
        print('Initializing dimension reduction methods - reducing to d={}\n'.format(self.n_components))
        self.fit_dim_reduction()
        self.reduce_test_dimension()
        
        print('Building classifier models for all dimension reduction methods')
        self.build_classifiers()
        
        print('Training classifiers on reduced dimension training set')
        self.train_classifiers()
        
        print('Testing classifiers on reduced dimension testing set')
        self.test_classifiers()
        
        self.get_model_accuracy()
        
        return None

In [4]:
# used in Shearlet transform
useGPU = True

X = np.random.rand(1000, 784)
X_test = np.random.rand(10000, 784)

# randomly assign labels
y = np.random.randint(low = 0, high = 10, size = 1000)
y_test = np.random.randint(low = 0, high = 10, size = 10000)

# encode the class labels as one-hot vectors
y = to_categorical(y)
y_test = to_categorical(y_test)

# Classifier performance on the full dataset

In [5]:
full_model = get_model(name = 'full_model', input_shape = 784)
full_model.compile(
    loss = 'categorical_crossentropy',
    optimizer = SGD(learning_rate = 0.01),
    metrics = ['accuracy']
)

In [6]:
history_full = full_model.fit(
    X,
    y,
    batch_size = 25,
    epochs = 50,
    verbose = False
)

In [7]:
print('{}: {:.4f}%'.format(full_model.name, history_full.history['accuracy'][-1]*100))

full_model: 72.3000%


# Run trials

In [8]:
names = ['pca', 'kpca_gaussian', 'lle', 'isomap', 'ltsa']

trials = [
    DimensionReduction(
        names,
        X_train = X,
        y_train = y,
        X_test = X_test,
        y_test = y_test,
        n_components = k
    ) for k in [2, 16, 64, 256]
]

Trial - Training linear classifier on data reduced to 2 dimensions using the following methods:
['pca', 'kpca_gaussian', 'lle', 'isomap', 'ltsa']
Trial - Training linear classifier on data reduced to 16 dimensions using the following methods:
['pca', 'kpca_gaussian', 'lle', 'isomap', 'ltsa']
Trial - Training linear classifier on data reduced to 64 dimensions using the following methods:
['pca', 'kpca_gaussian', 'lle', 'isomap', 'ltsa']
Trial - Training linear classifier on data reduced to 256 dimensions using the following methods:
['pca', 'kpca_gaussian', 'lle', 'isomap', 'ltsa']


In [9]:
for trial in trials:
    trial.run()

Initializing dimension reduction methods - reducing to d=2



  self.M_lu = lu_factor(M)


Building classifier models for all dimension reduction methods
All models built and compiled

Training classifiers on reduced dimension training set
All models trained on reduced dim training data

Testing classifiers on reduced dimension testing set
All models evaluated on reduced dim test data

Method (2 dimensions)      Training accuracy    Test accuracy
-----------------------  -------------------  ---------------
pca                                     12.5            10.61
kpca_gaussian                            8.5            10.11
lle                                     11.2             9.84
isomap                                  12.1            10.51
ltsa                                    10              10.11

****************************************************************************************************

Initializing dimension reduction methods - reducing to d=16

Building classifier models for all dimension reduction methods
All models built and compiled

Training c

# Wavelet decomposition

In [10]:
inputs = Input(shape = (28, 28))
x = Scattering2D(J = 4, L = 8)(inputs)
x = Flatten()(x)
x_out = Dense(10, activation = 'softmax')(x)
model_scatter = Model(inputs, x_out)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Unable to identify source code of lambda function <function <lambda> at 0x7fa6204dc170>. It was defined in this code:
backend.fft = FFT(lambda x: tf.signal.fft2d(x, name='fft2d'),
                  lambda x: tf.signal.ifft2d(x, name='ifft2d'),
                  lambda x: tf.math.real(tf.signal.ifft2d(x, name='irfft2d')),
                  lambda x: None)

This code must contain a single distinguishable lambda. To avoid this problem, define each lambda in a separate expression.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Unable to identify source code of lambda function <function <lambda> at 0x7fa6204dc170>. It was defined in this code:
backend.fft = FFT(lambda x: tf.signal.fft2d(x, name='fft2d'),
    

In [11]:
model_scatter.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_22 (InputLayer)        [(None, 28, 28)]          0         
_________________________________________________________________
scattering2d (Scattering2D)  (None, 417, 1, 1)         0         
_________________________________________________________________
flatten (Flatten)            (None, 417)               0         
_________________________________________________________________
dense_21 (Dense)             (None, 10)                4180      
Total params: 4,180
Trainable params: 4,180
Non-trainable params: 0
_________________________________________________________________


In [12]:
model_scatter.compile(
        loss = 'categorical_crossentropy',
        optimizer = SGD(learning_rate = 0.01),
        metrics = ['accuracy']
)

In [13]:
X_scatter = np.array([img.reshape((28, 28)) for img in X])
X_scatter.shape

(1000, 28, 28)

In [14]:
history_scatter = model_scatter.fit(
    X_scatter,
    y,
    batch_size = 25,
    epochs = 50,
    verbose = False
)

In [15]:
print('model_scatter: {:.4f}%'.format(history_scatter.history['accuracy'][-1]*100))

model_scatter: 11.6000%


# Autoencoders

In [16]:
class Autoencoder:
    
    def __init__(self, X, y, n_components):
        self.X = X
        self.y = y
        self.n_components = n_components
        
        # create the encoder and full autoencoder models
        self.input_img = Input(shape = (784,))
        
        self.encoded = Dense(
            self.n_components,
            activation = 'relu'
        )(self.input_img)
        
        self.decoded = Dense(
            784,
            activation = 'sigmoid'
        )(self.encoded)
        
        self.autoencoder = Model(
            self.input_img,
            self.decoded
        )
        
        self.encoder = Model(
            self.input_img,
            self.encoded
        )
        
        # placeholder for encoded input
        self.encoded_input = Input(
            shape = (self.n_components,)
        )
        
        # create decoder
        self.decoder_layer = self.autoencoder.layers[-1]
        self.decoder = Model(
            self.encoded_input,
            self.decoder_layer(self.encoded_input)
        )
        
        # compile the autoencoder model
        self.autoencoder.compile(
            loss = 'categorical_crossentropy',
            optimizer = SGD(learning_rate = 0.01),
            metrics = ['accuracy']
        )
        
    def fit(self):
        self.autoencoder.fit(
            self.X,
            self.X,
            epochs = 50,
            batch_size = 25,
            shuffle = True,
            verbose = False
        )
        
    def fit_transform(self):
        return self.encoder.predict(self.X)
    
    def build_model(self):
        self.model = Sequential(
            [
                Input(shape = (self.n_components, )),
                Dense(units = 10, activation = 'softmax', use_bias = False)
            ]
        )
        
        self.model.compile(
            loss = 'categorical_crossentropy',
            optimizer = SGD(learning_rate = 0.01),
            metrics = ['accuracy']
        )
        
    def train_model(self):
        reduced_dim = self.fit_transform()
        self.history = self.model.fit(
            reduced_dim,
            self.y,
            batch_size = 25,
            epochs = 50,
            verbose = False
        )
        
        self.accuracy = self.history.history['accuracy'][-1]*100

## Autoencoder reducing to 2 dimensions

Make an ensemble of 50 autoencoders, then take the average accuracy as the reported accuracy.

In [17]:
N = 50
k = 2

auto2 = [Autoencoder(X, y, k) for _ in range(N)]
[a.fit() for a in auto2]

[a.build_model() for a in auto2]
[a.train_model() for a in auto2];

In [18]:
for a in auto2:
    print('model_autoencoder: {:.4f}%'.format(a.accuracy))
          
mean_accuracy2 = sum([a.accuracy for a in auto2])/N
print('Mean accuracy for {} models: {}'.format(N, mean_accuracy2))

model_autoencoder: 11.1000%
model_autoencoder: 9.0000%
model_autoencoder: 10.2000%
model_autoencoder: 9.8000%
model_autoencoder: 9.7000%
model_autoencoder: 11.2000%
model_autoencoder: 9.8000%
model_autoencoder: 10.0000%
model_autoencoder: 10.0000%
model_autoencoder: 9.7000%
model_autoencoder: 10.2000%
model_autoencoder: 11.4000%
model_autoencoder: 9.3000%
model_autoencoder: 10.2000%
model_autoencoder: 10.5000%
model_autoencoder: 9.8000%
model_autoencoder: 9.9000%
model_autoencoder: 10.9000%
model_autoencoder: 10.4000%
model_autoencoder: 11.3000%
model_autoencoder: 12.1000%
model_autoencoder: 10.9000%
model_autoencoder: 9.7000%
model_autoencoder: 9.6000%
model_autoencoder: 10.3000%
model_autoencoder: 10.9000%
model_autoencoder: 11.3000%
model_autoencoder: 11.8000%
model_autoencoder: 9.2000%
model_autoencoder: 10.8000%
model_autoencoder: 9.5000%
model_autoencoder: 10.1000%
model_autoencoder: 10.0000%
model_autoencoder: 11.4000%
model_autoencoder: 9.9000%
model_autoencoder: 9.7000%
model_

## Autoencoder reducing to 16 dimensions

In [19]:
k = 16

auto16 = [Autoencoder(X, y, k) for _ in range(N)]
[a.fit() for a in auto16]

[a.build_model() for a in auto16]
[a.train_model() for a in auto16];

In [20]:
for a in auto16:
    print('model_autoencoder: {:.4f}%'.format(a.accuracy))
          
mean_accuracy16 = sum([a.accuracy for a in auto16])/N
print('Mean accuracy for {} models: {}'.format(N, mean_accuracy16))

model_autoencoder: 11.5000%
model_autoencoder: 11.4000%
model_autoencoder: 11.6000%
model_autoencoder: 11.3000%
model_autoencoder: 13.1000%
model_autoencoder: 12.3000%
model_autoencoder: 11.5000%
model_autoencoder: 11.1000%
model_autoencoder: 11.5000%
model_autoencoder: 9.8000%
model_autoencoder: 11.7000%
model_autoencoder: 12.4000%
model_autoencoder: 11.6000%
model_autoencoder: 10.5000%
model_autoencoder: 9.7000%
model_autoencoder: 9.6000%
model_autoencoder: 12.8000%
model_autoencoder: 10.8000%
model_autoencoder: 10.7000%
model_autoencoder: 12.1000%
model_autoencoder: 12.3000%
model_autoencoder: 11.6000%
model_autoencoder: 12.6000%
model_autoencoder: 11.4000%
model_autoencoder: 12.1000%
model_autoencoder: 11.7000%
model_autoencoder: 11.5000%
model_autoencoder: 12.1000%
model_autoencoder: 11.9000%
model_autoencoder: 12.1000%
model_autoencoder: 13.3000%
model_autoencoder: 11.9000%
model_autoencoder: 9.7000%
model_autoencoder: 10.4000%
model_autoencoder: 13.5000%
model_autoencoder: 10.90

## Autoencoder reducing to 256 dimensions

In [21]:
k = 256

auto256 = [Autoencoder(X, y, k) for _ in range(N)]
[a.fit() for a in auto256]

[a.build_model() for a in auto256]
[a.train_model() for a in auto256];

In [22]:
for a in auto256:
    print('model_autoencoder: {:.4f}%'.format(a.accuracy))
          
mean_accuracy256 = sum([a.accuracy for a in auto256])/N
print('Mean accuracy for {} models: {}'.format(N, mean_accuracy256))

model_autoencoder: 30.7000%
model_autoencoder: 28.7000%
model_autoencoder: 29.6000%
model_autoencoder: 29.5000%
model_autoencoder: 29.6000%
model_autoencoder: 30.4000%
model_autoencoder: 26.7000%
model_autoencoder: 29.4000%
model_autoencoder: 29.2000%
model_autoencoder: 30.2000%
model_autoencoder: 25.8000%
model_autoencoder: 27.6000%
model_autoencoder: 28.7000%
model_autoencoder: 28.4000%
model_autoencoder: 28.2000%
model_autoencoder: 26.9000%
model_autoencoder: 26.1000%
model_autoencoder: 27.3000%
model_autoencoder: 28.0000%
model_autoencoder: 27.9000%
model_autoencoder: 30.3000%
model_autoencoder: 26.5000%
model_autoencoder: 25.3000%
model_autoencoder: 27.3000%
model_autoencoder: 29.0000%
model_autoencoder: 26.9000%
model_autoencoder: 29.1000%
model_autoencoder: 26.4000%
model_autoencoder: 29.0000%
model_autoencoder: 28.5000%
model_autoencoder: 26.7000%
model_autoencoder: 29.5000%
model_autoencoder: 26.0000%
model_autoencoder: 27.4000%
model_autoencoder: 27.7000%
model_autoencoder: 3

# References

[1] Ham, J., Lee, D. D., Mika, S., & Sch√∂lkopf, B. (2004, July). A kernel view of the dimensionality reduction of manifolds. In *Proceedings of the Twenty-First International Conference on Machine Learning* (p. 47).