#### Outline
- for each dataset: 
    - load dataset; 
    - for each network: 
        - load network
        - project the test dataset samples (10,000 for MNIST), timing each of 10 repeats
        - save to metric dataframe

In [1]:
# reload packages
%load_ext autoreload
%autoreload 2

### Choose GPU (this may not be needed on your computer)

In [2]:
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=0

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=0


In [3]:
import tensorflow as tf
# Look up the visible GPUs and, if there is at least one, switch on memory
# growth for the first of them; then show what was found.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
if gpu_devices:
    tf.config.experimental.set_memory_growth(gpu_devices[0], True)
print(gpu_devices)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [4]:
import numpy as np
import pickle
import pandas as pd
import time
from umap import UMAP

In [5]:
from tfumap.umap import tfUMAP
import tensorflow as tf
from sklearn.decomposition import PCA
from openTSNE import TSNE



In [6]:
from tqdm.autonotebook import tqdm

In [7]:
from tfumap.paths import ensure_dir, MODEL_DIR, DATA_DIR

In [8]:
# Root folder from which trained encoders are loaded and where test-set
# projections are written.
output_dir = MODEL_DIR.joinpath('projections')

In [9]:
# Empty results table; every timing loop below appends one row per run:
# [method label, embedding dimensions, dataset name, seconds, number of samples].
projection_speeds = pd.DataFrame(
    columns=['method_', 'dimensions', 'dataset', 'speed', 'nex'],
)

### MNIST

In [10]:
# Dataset key (used in paths and result rows) and the per-sample shape
# handed to tfUMAP.
dataset, dims = 'mnist', (28, 28, 1)

##### load dataset

In [11]:
from tensorflow.keras.datasets import mnist

# load dataset
(train_images, Y_train), (test_images, Y_test) = mnist.load_data()

# divide by 255, cast to float32, and flatten every image to a single row
X_train = (train_images/255.).astype('float32')
X_test = (test_images/255.).astype('float32')
# np.prod is used instead of np.product: the latter is deprecated and was
# removed in NumPy 2.0
X_train = X_train.reshape((len(X_train), np.prod(np.shape(X_train)[1:])))
X_test = X_test.reshape((len(X_test), np.prod(np.shape(X_test)[1:])))

# subset a validation set (the last n_valid training samples)
n_valid = 10000
X_valid = X_train[-n_valid:]
Y_valid = Y_train[-n_valid:]
X_train = X_train[:-n_valid]
Y_train = Y_train[:-n_valid]

# flatten X: 2-D (n_samples, n_features) arrays used by UMAP / PCA / t-SNE
# and by embedder.transform below
X_train_flat = X_train.reshape((len(X_train), np.prod(np.shape(X_train)[1:])))
X_test_flat = X_test.reshape((len(X_test), np.prod(np.shape(X_test)[1:])))
X_valid_flat = X_valid.reshape((len(X_valid), np.prod(np.shape(X_valid)[1:])))

# image-shaped copy of the test set, passed directly to the encoders in the
# timing loops; the shape comes from the data and `dims` rather than literals
X_test = X_test.reshape((len(X_test),) + tuple(dims))
print(len(X_train), len(X_valid), len(X_test))

50000 10000 10000


In [12]:
# Confirm the test set is in image layout after the reshape in the previous cell.
np.shape(X_test)

(10000, 28, 28, 1)

#### Network 

##### 2 dims

In [13]:
# Folder of the saved 2-d network model for this dataset.
load_loc = output_dir.joinpath(dataset, 'network')

In [14]:
# tfUMAP wrapper configured with direct_embedding=False; the saved encoder is
# attached to it in the next cell.
embedder = tfUMAP(
    dims=dims,
    batch_size=100,
    training_epochs=5,
    negative_sample_rate=5,
    direct_embedding=False,
    verbose=True,
)

In [15]:
# Load the saved encoder and attach it to the embedder in one step.
encoder = embedder.encoder = tf.keras.models.load_model(
    (load_loc / 'encoder').as_posix()
)

In [16]:
# Time n_repeats forward passes of the encoder over the whole test set and log
# each one as a row of projection_speeds.
# NOTE(review): the first pass is far slower than the rest (see cell output)
# and is recorded as well — presumably warm-up; confirm it should be kept.
n_repeats = 10
times = []
for _rep in tqdm(range(n_repeats)):
    start_time = time.monotonic()
    encoder(X_test)
    end_time = time.monotonic()
    elapsed = end_time - start_time
    print('seconds: ', elapsed)
    times.append(elapsed)
    projection_speeds.loc[len(projection_speeds)] = ['network', 2, dataset, elapsed, len(X_test)]


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

seconds:  1.5903541399165988
seconds:  0.015495437197387218
seconds:  0.016865786164999008
seconds:  0.017199866008013487
seconds:  0.01833019801415503
seconds:  0.017148615093901753
seconds:  0.01729371794499457
seconds:  0.015721864067018032
seconds:  0.012831439962610602
seconds:  0.01708691311068833



In [17]:
# Embed the flattened test set and store the result next to the saved model.
z = embedder.transform(X_test_flat)
np.save(MODEL_DIR.joinpath('projections', dataset, 'network', 'z_test.npy'), z)

In [18]:
# Network CPU: repeat the encoder timing with ops pinned to the CPU device,
# logging rows under the 'network-cpu' label.
with tf.device('/CPU:0'):
    n_repeats = 10
    times = []
    for _rep in tqdm(range(n_repeats)):
        start_time = time.monotonic()
        encoder(X_test)
        end_time = time.monotonic()
        elapsed = end_time - start_time
        print('seconds: ', elapsed)
        times.append(elapsed)
        projection_speeds.loc[len(projection_speeds)] = ['network-cpu', 2, dataset, elapsed, len(X_test)]

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

seconds:  0.2520122160203755
seconds:  0.17392564518377185
seconds:  0.1790161719545722
seconds:  0.17990123806521297
seconds:  0.1807485418394208
seconds:  0.17557449312880635
seconds:  0.17832860280759633
seconds:  0.1805799570865929
seconds:  0.18067234009504318
seconds:  0.1727090000640601



##### 64 dims

In [19]:
# Folder of the saved 64-d network model for this dataset.
load_loc = output_dir.joinpath(dataset, "64", 'network')

In [20]:
# Fresh tfUMAP wrapper for the 64-d model (same settings as the 2-d one); the
# saved encoder is attached in the next cell.
embedder = tfUMAP(
    dims=dims,
    batch_size=100,
    training_epochs=5,
    negative_sample_rate=5,
    direct_embedding=False,
    verbose=True,
)

In [21]:
# Load the saved 64-d encoder and attach it to the embedder.
encoder = embedder.encoder = tf.keras.models.load_model(
    (load_loc / 'encoder').as_posix()
)

In [22]:
# Time n_repeats forward passes of the 64-d encoder over the whole test set;
# one projection_speeds row per pass.
n_repeats = 10
times = []
for _rep in tqdm(range(n_repeats)):
    start_time = time.monotonic()
    encoder(X_test)
    end_time = time.monotonic()
    elapsed = end_time - start_time
    print('seconds: ', elapsed)
    times.append(elapsed)
    projection_speeds.loc[len(projection_speeds)] = ['network', 64, dataset, elapsed, len(X_test)]

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

seconds:  0.05511659989133477
seconds:  0.017414272064343095
seconds:  0.017476384062319994
seconds:  0.017391500994563103
seconds:  0.014657802879810333
seconds:  0.01672888291068375
seconds:  0.018829293083399534
seconds:  0.017721151001751423
seconds:  0.015856026904657483
seconds:  0.018754940945655107



In [23]:
# Embed the flattened test set with the 64-d model and save it beside the model.
z = embedder.transform(X_test_flat)
out = MODEL_DIR.joinpath('projections', dataset, '64', 'network', 'z_test.npy')
np.save(out, z)

##### Network CPU

In [24]:
# Same timing as above for the 64-d encoder, with ops pinned to the CPU device.
with tf.device('/CPU:0'):
    n_repeats = 10
    times = []
    for _rep in tqdm(range(n_repeats)):
        start_time = time.monotonic()
        encoder(X_test)
        end_time = time.monotonic()
        elapsed = end_time - start_time
        print('seconds: ', elapsed)
        times.append(elapsed)
        projection_speeds.loc[len(projection_speeds)] = ['network-cpu', 64, dataset, elapsed, len(X_test)]

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

seconds:  0.2366927161347121
seconds:  0.19222950306721032
seconds:  0.19645501486957073
seconds:  0.20322660100646317
seconds:  0.1947048050351441
seconds:  0.19166660704649985
seconds:  0.19242210895754397
seconds:  0.19317451119422913
seconds:  0.18985186517238617
seconds:  0.18407212803140283



### AE 

##### 2 dims

In [25]:
# Folder of the saved 2-d autoencoder model for this dataset.
load_loc = output_dir.joinpath(dataset, 'autoencoder')

In [26]:
# tfUMAP wrapper for the autoencoder variant (decoding_method="autoencoder");
# encoder and decoder are attached in the following cells.
embedder = tfUMAP(
    dims=dims,
    batch_size=100,
    training_epochs=5,
    negative_sample_rate=5,
    decoding_method="autoencoder",
    direct_embedding=False,
    verbose=True,
)

In [27]:
# Load the saved autoencoder's encoder half and attach it to the embedder.
encoder = embedder.encoder = tf.keras.models.load_model(
    (load_loc / 'encoder').as_posix()
)

In [28]:
# Load the saved autoencoder's decoder half and attach it to the embedder.
decoder = embedder.decoder = tf.keras.models.load_model(
    (load_loc / 'decoder').as_posix()
)

In [29]:
# Time n_repeats forward passes of the 2-d autoencoder's encoder over the whole
# test set; one projection_speeds row per pass.
# Rows are labelled 'autoencoder' (the original reused 'network'), so they can
# be told apart from the plain network-encoder timings recorded earlier.
# NOTE(review): anything downstream that filters on method_ == 'network' will
# no longer pick up these rows — confirm that is the intended grouping.
n_repeats = 10
times = []
for i in tqdm(range(n_repeats)):
    start_time = time.monotonic()
    encoder(X_test)
    end_time = time.monotonic()
    elapsed = end_time - start_time
    print('seconds: ', elapsed)
    times.append(elapsed)
    projection_speeds.loc[len(projection_speeds)] = ['autoencoder', 2, dataset, elapsed, len(X_test)]


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

seconds:  0.06125645595602691
seconds:  0.01973986905068159
seconds:  0.02112095896154642
seconds:  0.016422383952885866
seconds:  0.016192456940189004
seconds:  0.015719482908025384
seconds:  0.016076843021437526
seconds:  0.014994381926953793
seconds:  0.01759963808581233
seconds:  0.016624809009954333



In [30]:
# Embed the flattened test set with the 2-d autoencoder and save it beside the model.
z = embedder.transform(X_test_flat)
np.save(MODEL_DIR.joinpath('projections', dataset, 'autoencoder', 'z_test.npy'), z)

##### 64 dims

In [31]:
# Folder of the saved 64-d autoencoder model for this dataset.
load_loc = output_dir.joinpath(dataset, "64", 'autoencoder')

In [32]:
# Fresh tfUMAP wrapper for the 64-d autoencoder; encoder and decoder are
# attached in the following cells.
embedder = tfUMAP(
    dims=dims,
    batch_size=100,
    training_epochs=5,
    negative_sample_rate=5,
    decoding_method="autoencoder",
    direct_embedding=False,
    verbose=True,
)

In [33]:
# Load the 64-d autoencoder's encoder half and attach it to the embedder.
encoder = embedder.encoder = tf.keras.models.load_model(
    (load_loc / 'encoder').as_posix()
)

In [34]:
# Load the 64-d autoencoder's decoder half and attach it to the embedder.
decoder = embedder.decoder = tf.keras.models.load_model(
    (load_loc / 'decoder').as_posix()
)

In [35]:
# Time n_repeats forward passes of the 64-d autoencoder's encoder over the
# whole test set; one projection_speeds row per pass.
# Rows are labelled 'autoencoder' (the original reused 'network'), so they can
# be told apart from the plain network-encoder timings recorded earlier.
# NOTE(review): anything downstream that filters on method_ == 'network' will
# no longer pick up these rows — confirm that is the intended grouping.
n_repeats = 10
times = []
for i in tqdm(range(n_repeats)):
    start_time = time.monotonic()
    encoder(X_test)
    end_time = time.monotonic()
    elapsed = end_time - start_time
    print('seconds: ', elapsed)
    times.append(elapsed)
    projection_speeds.loc[len(projection_speeds)] = ['autoencoder', 64, dataset, elapsed, len(X_test)]

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

seconds:  0.05627800291404128
seconds:  0.014976382022723556
seconds:  0.01427385094575584
seconds:  0.015335232019424438
seconds:  0.014611941995099187
seconds:  0.012409478891640902
seconds:  0.01584613719023764
seconds:  0.014600470894947648
seconds:  0.013914352050051093
seconds:  0.014002274023368955



In [36]:
# Embed the flattened test set with the 64-d autoencoder and save it beside the model.
z = embedder.transform(X_test_flat)
out = MODEL_DIR.joinpath('projections', dataset, '64', 'autoencoder', 'z_test.npy')
np.save(out, z)

#### UMAP-learn

##### 2 dims

In [37]:
# Fit umap-learn (2 components) on the flattened training set. `embedder` now
# refers to this UMAP object rather than the earlier tfUMAP wrapper.
embedder = UMAP(verbose=True, n_components=2)
z_umap = embedder.fit_transform(X_train_flat)

UMAP(dens_frac=0.0, dens_lambda=0.0, verbose=True)
Construct fuzzy simplicial set
Sat Jul 18 11:51:11 2020 Finding Nearest Neighbors
Sat Jul 18 11:51:11 2020 Building RP forest with 16 trees
Sat Jul 18 11:51:12 2020 parallel NN descent for 16 iterations
	 0  /  16
	 1  /  16
	 2  /  16
	 3  /  16
	 4  /  16
Sat Jul 18 11:51:22 2020 Finished Nearest Neighbor Search
Sat Jul 18 11:51:24 2020 Construct embedding
	completed  0  /  200 epochs
	completed  20  /  200 epochs
	completed  40  /  200 epochs
	completed  60  /  200 epochs
	completed  80  /  200 epochs
	completed  100  /  200 epochs
	completed  120  /  200 epochs
	completed  140  /  200 epochs
	completed  160  /  200 epochs
	completed  180  /  200 epochs
Sat Jul 18 11:51:55 2020 Finished embedding


In [38]:
# Time n_repeats calls of the fitted UMAP's transform() on the flattened test
# set; one projection_speeds row per call.
n_repeats = 10
times = []
for _rep in tqdm(range(n_repeats)):
    start_time = time.monotonic()
    embedder.transform(X_test_flat)
    end_time = time.monotonic()
    elapsed = end_time - start_time
    print('seconds: ', elapsed)
    times.append(elapsed)
    projection_speeds.loc[len(projection_speeds)] = ['umap-learn', 2, dataset, elapsed, len(X_test)]


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs
seconds:  20.360759247094393
	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs
seconds:  6.164869684027508
	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs
seconds:  7.775746206985787
	completed  0  /  

In [39]:
# NOTE(review): leftover inspection cell. At this point `out` still holds the
# path assigned in the 64-dim autoencoder save cell (see the output below), not
# a umap-learn path; it can be dropped.
out

PosixPath('/mnt/cube/tsainbur/Projects/github_repos/umap_tf_networks/models/projections/mnist/64/autoencoder/z_test.npy')

In [40]:
# Project the flattened test set with the fitted 2-d UMAP and save it.
# ensure_dir is a project helper called on the target path before saving —
# presumably it creates the missing parent folder; confirm in tfumap.paths.
z = embedder.transform(X_test_flat)
out = MODEL_DIR.joinpath('projections', dataset, 'umap-learn', 'z_test.npy')
ensure_dir(out)
np.save(out, z)

	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs


##### 64 dims

In [41]:
# Fit umap-learn with 64 components on the flattened training set.
embedder = UMAP(verbose=True, n_components=64)
z_umap = embedder.fit_transform(X_train_flat)

UMAP(dens_frac=0.0, dens_lambda=0.0, n_components=64, verbose=True)
Construct fuzzy simplicial set
Sat Jul 18 11:53:26 2020 Finding Nearest Neighbors
Sat Jul 18 11:53:26 2020 Building RP forest with 16 trees
Sat Jul 18 11:53:27 2020 parallel NN descent for 16 iterations
	 0  /  16
	 1  /  16
	 2  /  16
	 3  /  16
	 4  /  16
Sat Jul 18 11:53:30 2020 Finished Nearest Neighbor Search
Sat Jul 18 11:53:30 2020 Construct embedding
	completed  0  /  200 epochs
	completed  20  /  200 epochs
	completed  40  /  200 epochs
	completed  60  /  200 epochs
	completed  80  /  200 epochs
	completed  100  /  200 epochs
	completed  120  /  200 epochs
	completed  140  /  200 epochs
	completed  160  /  200 epochs
	completed  180  /  200 epochs
Sat Jul 18 11:54:10 2020 Finished embedding


In [42]:
# Time n_repeats calls of the fitted 64-d UMAP's transform() on the flattened
# test set; one projection_speeds row per call.
n_repeats = 10
times = []
for i in tqdm(range(n_repeats)):
    start_time = time.monotonic()
    embedder.transform(X_test_flat)
    end_time = time.monotonic()
    elapsed = end_time - start_time
    print('seconds: ', elapsed)
    # Keep `times` in step with every other timing loop in this notebook; the
    # original omitted this append and left the list empty for this run.
    times.append(elapsed)
    projection_speeds.loc[len(projection_speeds)] = ['umap-learn', 64, dataset, elapsed, len(X_test)]


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs
seconds:  7.449267116840929
	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs
seconds:  6.317165615968406
	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs
seconds:  6.1602853608783334
	completed  0  /  

In [43]:
# Project the flattened test set with the fitted 64-d UMAP and save it.
z = embedder.transform(X_test_flat)
out = MODEL_DIR.joinpath('projections', dataset, '64', 'umap-learn', 'z_test.npy')
ensure_dir(out)
np.save(out, z)

	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs


#### PCA

##### 2 dims

In [44]:
# Fit a 2-component PCA on the flattened training set (z holds the training
# projection and is overwritten by the test projection in a later cell).
pca = PCA(
    n_components=2,
)
z = pca.fit_transform(X_train_flat)

In [45]:
# Time n_repeats calls of the fitted 2-d PCA's transform() on the flattened
# test set; one projection_speeds row per call.
n_repeats = 10
times = []
for _rep in tqdm(range(n_repeats)):
    start_time = time.monotonic()
    pca.transform(X_test_flat)
    end_time = time.monotonic()
    elapsed = end_time - start_time
    print('seconds: ', elapsed)
    times.append(elapsed)
    projection_speeds.loc[len(projection_speeds)] = ['pca', 2, dataset, elapsed, len(X_test)]


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

seconds:  0.025216451147571206
seconds:  0.01997730997391045
seconds:  0.020038530929014087
seconds:  0.01999304909259081
seconds:  0.020689320052042603
seconds:  0.02032667910680175
seconds:  0.020370081067085266
seconds:  0.020031921099871397
seconds:  0.020631727995350957
seconds:  0.020571965957060456



In [46]:
# Project the flattened test set with the 2-d PCA and save it.
z = pca.transform(X_test_flat)
out = MODEL_DIR.joinpath('projections', dataset, 'PCA', 'z_test.npy')
ensure_dir(out)
np.save(out, z)

##### 64 dims

In [47]:
# Fit a 64-component PCA on the flattened training set.
pca = PCA(
    n_components=64,
)
z = pca.fit_transform(X_train_flat)

In [48]:
# Time n_repeats calls of the fitted 64-d PCA's transform() on the flattened
# test set; one projection_speeds row per call.
n_repeats = 10
times = []
for _rep in tqdm(range(n_repeats)):
    start_time = time.monotonic()
    pca.transform(X_test_flat)
    end_time = time.monotonic()
    elapsed = end_time - start_time
    print('seconds: ', elapsed)
    times.append(elapsed)
    projection_speeds.loc[len(projection_speeds)] = ['pca', 64, dataset, elapsed, len(X_test)]


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

seconds:  0.02394376415759325
seconds:  0.02739687403663993
seconds:  0.035712565993890166
seconds:  0.03408693801611662
seconds:  0.034951243782415986
seconds:  0.036046355962753296
seconds:  0.024760747095569968
seconds:  0.024515361059457064
seconds:  0.02491825190372765
seconds:  0.024672525003552437



In [49]:
# Project the flattened test set with the 64-d PCA and save it.
z = pca.transform(X_test_flat)
out = MODEL_DIR.joinpath('projections', dataset, "64", 'PCA', 'z_test.npy')
ensure_dir(out)
np.save(out, z)

#### TSNE

##### 2 dims

In [50]:
# openTSNE estimator: 2 output components, 32 worker jobs, verbose logging.
# NOTE(review): n_jobs=32 is tied to the original machine; adjust as needed.
tsne = TSNE(
    verbose=True,
    n_jobs=32,
    n_components=2,
)

In [51]:
# Fit t-SNE on the flattened training set. The returned object is what the
# later cells call .transform() on to place the test samples.
embedding_train = tsne.fit(X_train_flat)



--------------------------------------------------------------------------------
TSNE(n_jobs=32, neighbors=None, verbose=True)
--------------------------------------------------------------------------------
===> Finding 90 nearest neighbors using Annoy approximate search using euclidean distance...
   --> Time elapsed: 39.66 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.44 seconds
===> Calculating PCA-based initialization...
   --> Time elapsed: 0.44 seconds
===> Running optimization with exaggeration=12.00, lr=4166.67 for 250 iterations...
Iteration   50, KL divergence 6.3483, 50 iterations in 1.6705 sec
Iteration  100, KL divergence 5.8474, 50 iterations in 1.7598 sec
Iteration  150, KL divergence 5.7250, 50 iterations in 1.6233 sec
Iteration  200, KL divergence 5.6724, 50 iterations in 1.6108 sec
Iteration  250, KL divergence 5.6436, 50 iterations in 1.5874 sec
   --> Time elapsed: 8.25 seconds
===> Running optimization with exaggeration=1.00, lr=4166.67 for 50

In [52]:
# Time n_repeats calls of the fitted t-SNE embedding's transform() on the
# flattened test set; one projection_speeds row per call.
n_repeats = 10
times = []
for _rep in tqdm(range(n_repeats)):
    start_time = time.monotonic()
    embedding_train.transform(X_test_flat)
    end_time = time.monotonic()
    elapsed = end_time - start_time
    print('seconds: ', elapsed)
    times.append(elapsed)
    projection_speeds.loc[len(projection_speeds)] = ['TSNE', 2, dataset, elapsed, len(X_test)]

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

===> Finding 15 nearest neighbors in existing embedding using Annoy approximate search...
   --> Time elapsed: 4.32 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.04 seconds
===> Running optimization with exaggeration=4.00, lr=0.10 for 0 iterations...
   --> Time elapsed: 0.00 seconds
===> Running optimization with exaggeration=1.50, lr=0.10 for 250 iterations...
Iteration   50, KL divergence 201003.5997, 50 iterations in 0.2982 sec
Iteration  100, KL divergence 201129.3922, 50 iterations in 0.2801 sec
Iteration  150, KL divergence 201212.6191, 50 iterations in 0.2892 sec
Iteration  200, KL divergence 201282.5353, 50 iterations in 0.2804 sec
Iteration  250, KL divergence 201336.2487, 50 iterations in 0.2806 sec
   --> Time elapsed: 1.43 seconds
seconds:  6.965839101932943
===> Finding 15 nearest neighbors in existing embedding using Annoy approximate search...
   --> Time elapsed: 4.35 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.03 seconds
===

In [53]:
# Project the flattened test set into the fitted t-SNE embedding and save it.
z = embedding_train.transform(X_test_flat)
out = MODEL_DIR.joinpath('projections', dataset, 'TSNE', 'z_test.npy')
ensure_dir(out)
np.save(out, z)

===> Finding 15 nearest neighbors in existing embedding using Annoy approximate search...
   --> Time elapsed: 4.33 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.05 seconds
===> Running optimization with exaggeration=4.00, lr=0.10 for 0 iterations...
   --> Time elapsed: 0.00 seconds
===> Running optimization with exaggeration=1.50, lr=0.10 for 250 iterations...
Iteration   50, KL divergence 201003.5997, 50 iterations in 0.2781 sec
Iteration  100, KL divergence 201129.3922, 50 iterations in 0.2691 sec
Iteration  150, KL divergence 201212.6191, 50 iterations in 0.2832 sec
Iteration  200, KL divergence 201282.5353, 50 iterations in 0.2659 sec
Iteration  250, KL divergence 201336.2487, 50 iterations in 0.2611 sec
   --> Time elapsed: 1.36 seconds


In [54]:
# Display the accumulated timing rows before they are written to disk.
projection_speeds

Unnamed: 0,method_,dimensions,dataset,speed,nex
0,network,2,mnist,1.590354,10000
1,network,2,mnist,0.015495,10000
2,network,2,mnist,0.016866,10000
3,network,2,mnist,0.017200,10000
4,network,2,mnist,0.018330,10000
...,...,...,...,...,...
105,TSNE,2,mnist,5.765349,10000
106,TSNE,2,mnist,5.763799,10000
107,TSNE,2,mnist,5.627446,10000
108,TSNE,2,mnist,5.754782,10000


### Save

In [55]:
# Write the timing table to DATA_DIR/projection_speeds/<dataset>.pickle.
save_loc = DATA_DIR.joinpath('projection_speeds', f'{dataset}.pickle')
ensure_dir(save_loc)
projection_speeds.to_pickle(save_loc)