#### Outline
- for each dataset: 
    - load dataset; 
    - for each network: 
        - load network
        - project the full test set (10,000 samples for FMNIST), timing each repeat
        - save each timing as a row of the projection-speed dataframe

In [1]:
# Load IPython's autoreload extension and switch it to mode 2, so that edited
# project modules are re-imported automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

### Choose GPU (this may not be needed on your computer)

In [2]:
# Order GPUs by PCI bus id and expose only device 0 to CUDA. This cell runs
# before TensorFlow is imported so the setting is in place when the GPU is
# first initialised.
# NOTE: keep comments on their own lines; %env treats the rest of its line as
# the value of the variable.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=0

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=0


In [3]:
import numpy as np
import pickle
import pandas as pd
import time
from umap import UMAP

In [4]:
from tfumap.umap import tfUMAP
import tensorflow as tf
from sklearn.decomposition import PCA
from openTSNE import TSNE



In [5]:
from tqdm.autonotebook import tqdm

In [6]:
from tfumap.paths import ensure_dir, MODEL_DIR, DATA_DIR

In [7]:
# Root folder of the saved projection models; despite the name, this notebook
# only reads from it (the encoders are loaded from sub-folders of this path).
output_dir = MODEL_DIR/'projections' 

In [8]:
# Empty results table; every timed repeat appends one row with:
#   method_    - label of the projection method (e.g. 'network', 'pca')
#   dimensions - number of embedding dimensions (2 or 64)
#   dataset    - dataset name
#   speed      - elapsed wall-clock time of the repeat, in seconds
#   nex        - number of samples projected in the repeat
projection_speeds = pd.DataFrame(
    columns=[
        'method_',
        'dimensions',
        'dataset',
        'speed',
        "nex",
    ]
)

### FMNIST

In [9]:
# Dataset identifier: used as the model sub-folder name, in the `dataset`
# column of the results table, and in the output file name.
dataset = 'fmnist'
# Shape of a single input sample as (height, width, channels).
dims = (28, 28, 1)

##### load dataset

In [10]:
from tensorflow.keras.datasets import fashion_mnist

# load dataset and rescale the pixel values by 1/255 as float32
(train_images, Y_train), (test_images, Y_test) = fashion_mnist.load_data()
X_train = (train_images/255.).astype('float32')
X_test = (test_images/255.).astype('float32')
# flatten every image into a vector
# (np.prod replaces the deprecated alias np.product, which NumPy 2.0 removed)
X_train = X_train.reshape((len(X_train), np.prod(np.shape(X_train)[1:])))
X_test = X_test.reshape((len(X_test), np.prod(np.shape(X_test)[1:])))

# subset a validation set: hold out the last n_valid training samples
n_valid = 10000
X_valid = X_train[-n_valid:]
Y_valid = Y_train[-n_valid:]
X_train = X_train[:-n_valid]
Y_train = Y_train[:-n_valid]

# flatten X (the arrays are already 2-D here, so the shapes do not change);
# the *_flat arrays are the inputs of the UMAP / PCA / TSNE cells
X_train_flat = X_train.reshape((len(X_train), np.prod(np.shape(X_train)[1:])))
X_test_flat = X_test.reshape((len(X_test), np.prod(np.shape(X_test)[1:])))
X_valid_flat= X_valid.reshape((len(X_valid), np.prod(np.shape(X_valid)[1:])))
# give X_test its image shape back for the network encoder; the shape is
# derived from the actual sample count and `dims` rather than hard-coded
X_test = X_test.reshape((len(X_test),) + tuple(dims))
print(len(X_train), len(X_valid), len(X_test))

50000 10000 10000


#### Network 

##### 2 dims

In [11]:
# Folder holding the saved 2-dimensional network model for this dataset.
load_loc = output_dir / dataset / 'network' 

In [12]:
# Wrapper object that receives the pre-trained encoder in the next cell.
# Nothing is trained in this notebook; the timing cells call `encoder` directly.
embedder = tfUMAP(
    dims=dims,
    batch_size=100,
    training_epochs=5,
    negative_sample_rate=5,
    direct_embedding=False,
    verbose=True,
)

In [13]:
# Load the saved encoder from `load_loc / 'encoder'` and attach it to the embedder.
encoder = tf.keras.models.load_model((load_loc / 'encoder').as_posix())
embedder.encoder = encoder

In [14]:
# Time `n_repeats` projections of the whole test set through the 2-D encoder.
#
# TensorFlow may return from an eager GPU op as soon as the kernel has been
# enqueued, so the output is converted to a NumPy array inside the timed
# region. This blocks until the forward pass has really finished and makes the
# measurement end at the same point as the other methods, which all return
# host arrays.
#
# NOTE: the first repeat also pays one-time setup costs (about 1.5 s versus
# about 0.016 s for later repeats in the recorded run) and is still recorded.
n_repeats = 10
times = []
for i in tqdm(range(n_repeats)):
    start_time = time.monotonic()
    np.asarray(encoder(X_test))  # force completion of the projection
    end_time = time.monotonic()
    elapsed = end_time - start_time
    print('seconds: ', elapsed)
    times.append(elapsed)
    # one row per repeat: method, dimensions, dataset, seconds, number of samples
    projection_speeds.loc[len(projection_speeds)] = ['network', 2, dataset, elapsed, len(X_test)]


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

seconds:  1.523000975837931
seconds:  0.0161127548199147
seconds:  0.021156250033527613
seconds:  0.018329129088670015
seconds:  0.016372433165088296
seconds:  0.015420375159010291
seconds:  0.01771849114447832
seconds:  0.015456926077604294
seconds:  0.015544889029115438
seconds:  0.015059004770591855



In [15]:
##### Network CPU
# Repeat the 2-D encoder timing with the computation placed on the CPU.

with tf.device('/CPU:0'):
    n_repeats = 10
    times = []
    for repeat in tqdm(range(n_repeats)):
        tic = time.monotonic()
        encoder(X_test)
        elapsed = time.monotonic() - tic
        print('seconds: ', elapsed)
        times.append(elapsed)
        # one row per repeat: method, dimensions, dataset, seconds, number of samples
        row = ['network-cpu', 2, dataset, elapsed, len(X_test)]
        projection_speeds.loc[len(projection_speeds)] = row

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

seconds:  0.2501369018573314
seconds:  0.1769578589592129
seconds:  0.1868099740240723
seconds:  0.17968848906457424
seconds:  0.182453908957541
seconds:  0.18018855201080441
seconds:  0.176207727054134
seconds:  0.18667540093883872
seconds:  0.18546372489072382
seconds:  0.2119065779261291



##### 64 dims

In [16]:
# Folder holding the saved 64-dimensional network model for this dataset.
load_loc = output_dir / dataset /"64"/ 'network' 

In [17]:
# Fresh wrapper object for the 64-dimensional model; the pre-trained encoder
# is attached in the next cell and nothing is trained in this notebook.
embedder = tfUMAP(
    dims=dims,
    batch_size=100,
    training_epochs=5,
    negative_sample_rate=5,
    direct_embedding=False,
    verbose=True,
)

In [18]:
# Load the saved 64-D encoder from `load_loc / 'encoder'` and attach it to the
# embedder; this rebinds `encoder`, replacing the 2-D model loaded earlier.
encoder = tf.keras.models.load_model((load_loc / 'encoder').as_posix())
embedder.encoder = encoder

In [19]:
# Time `n_repeats` projections of the whole test set through the 64-D encoder.
#
# TensorFlow may return from an eager GPU op as soon as the kernel has been
# enqueued, so the output is converted to a NumPy array inside the timed
# region. This blocks until the forward pass has really finished and makes the
# measurement end at the same point as the other methods, which all return
# host arrays.
#
# NOTE: the first repeat also pays one-time setup costs (about 0.22 s versus
# about 0.018 s for later repeats in the recorded run) and is still recorded.
n_repeats = 10
times = []
for i in tqdm(range(n_repeats)):
    start_time = time.monotonic()
    np.asarray(encoder(X_test))  # force completion of the projection
    end_time = time.monotonic()
    elapsed = end_time - start_time
    print('seconds: ', elapsed)
    times.append(elapsed)
    # one row per repeat: method, dimensions, dataset, seconds, number of samples
    projection_speeds.loc[len(projection_speeds)] = ['network', 64, dataset, elapsed, len(X_test)]

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

seconds:  0.21663769497536123
seconds:  0.018942847149446607
seconds:  0.01586881815455854
seconds:  0.017651899019256234
seconds:  0.018030270002782345
seconds:  0.017754513071849942
seconds:  0.01863340800628066
seconds:  0.018833584152162075
seconds:  0.01667224196717143
seconds:  0.017112073954194784



In [20]:
##### Network CPU
# Repeat the 64-D encoder timing with the computation placed on the CPU.

with tf.device('/CPU:0'):
    n_repeats = 10
    times = []
    for repeat in tqdm(range(n_repeats)):
        tic = time.monotonic()
        encoder(X_test)
        elapsed = time.monotonic() - tic
        print('seconds: ', elapsed)
        times.append(elapsed)
        # one row per repeat: method, dimensions, dataset, seconds, number of samples
        row = ['network-cpu', 64, dataset, elapsed, len(X_test)]
        projection_speeds.loc[len(projection_speeds)] = row

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

seconds:  0.28203710401430726
seconds:  0.18293683184310794
seconds:  0.1842855210416019
seconds:  0.19608442205935717
seconds:  0.19756417511962354
seconds:  0.18991611409001052
seconds:  0.18055140390060842
seconds:  0.17655440815724432
seconds:  0.17609573411755264
seconds:  0.1744745068717748



#### UMAP-learn

##### 2 dims

In [21]:
# Fit umap-learn with 2 output dimensions on the flattened training set.
# NOTE: this rebinds `embedder`, which until now referred to the tfUMAP wrapper.
embedder = UMAP(n_components = 2, verbose=True)
z_umap = embedder.fit_transform(X_train_flat)

UMAP(dens_frac=0.0, dens_lambda=0.0, verbose=True)
Construct fuzzy simplicial set
Sat Jul 18 12:14:13 2020 Finding Nearest Neighbors
Sat Jul 18 12:14:13 2020 Building RP forest with 16 trees
Sat Jul 18 12:14:15 2020 parallel NN descent for 16 iterations
	 0  /  16
	 1  /  16
	 2  /  16
	 3  /  16
Sat Jul 18 12:14:25 2020 Finished Nearest Neighbor Search
Sat Jul 18 12:14:27 2020 Construct embedding
	completed  0  /  200 epochs
	completed  20  /  200 epochs
	completed  40  /  200 epochs
	completed  60  /  200 epochs
	completed  80  /  200 epochs
	completed  100  /  200 epochs
	completed  120  /  200 epochs
	completed  140  /  200 epochs
	completed  160  /  200 epochs
	completed  180  /  200 epochs
Sat Jul 18 12:14:57 2020 Finished embedding


In [22]:
# Time `n_repeats` umap-learn projections (2-D) of the flattened test set.
# NOTE: the first repeat was much slower than the rest in the recorded run
# (about 17.6 s versus about 5 s) and is recorded like any other repeat.
n_repeats = 10
times = []
for repeat in tqdm(range(n_repeats)):
    tic = time.monotonic()
    embedder.transform(X_test_flat)
    elapsed = time.monotonic() - tic
    print('seconds: ', elapsed)
    times.append(elapsed)
    # one row per repeat: method, dimensions, dataset, seconds, number of samples
    row = ['umap-learn', 2, dataset, elapsed, len(X_test)]
    projection_speeds.loc[len(projection_speeds)] = row


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs
seconds:  17.635929187992588
	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs
seconds:  4.941008719149977
	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs
seconds:  5.159472896018997
	completed  0  /  

In [23]:
# Display the timing rows collected so far (one row per timed repeat).
projection_speeds

Unnamed: 0,method_,dimensions,dataset,speed,nex
0,network,2,fmnist,1.523001,10000
1,network,2,fmnist,0.016113,10000
2,network,2,fmnist,0.021156,10000
3,network,2,fmnist,0.018329,10000
4,network,2,fmnist,0.016372,10000
5,network,2,fmnist,0.01542,10000
6,network,2,fmnist,0.017718,10000
7,network,2,fmnist,0.015457,10000
8,network,2,fmnist,0.015545,10000
9,network,2,fmnist,0.015059,10000


##### 64 dims

In [24]:
# Fit umap-learn with 64 output dimensions on the flattened training set;
# this rebinds `embedder` and `z_umap` from the 2-D run.
embedder = UMAP(n_components = 64, verbose=True)
z_umap = embedder.fit_transform(X_train_flat)

UMAP(dens_frac=0.0, dens_lambda=0.0, n_components=64, verbose=True)
Construct fuzzy simplicial set
Sat Jul 18 12:16:02 2020 Finding Nearest Neighbors
Sat Jul 18 12:16:02 2020 Building RP forest with 16 trees
Sat Jul 18 12:16:03 2020 parallel NN descent for 16 iterations
	 0  /  16
	 1  /  16
	 2  /  16
	 3  /  16
Sat Jul 18 12:16:05 2020 Finished Nearest Neighbor Search
Sat Jul 18 12:16:05 2020 Construct embedding
	completed  0  /  200 epochs
	completed  20  /  200 epochs
	completed  40  /  200 epochs
	completed  60  /  200 epochs
	completed  80  /  200 epochs
	completed  100  /  200 epochs
	completed  120  /  200 epochs
	completed  140  /  200 epochs
	completed  160  /  200 epochs
	completed  180  /  200 epochs
Sat Jul 18 12:16:49 2020 Finished embedding


In [25]:
# Time `n_repeats` umap-learn projections (64-D) of the flattened test set.
n_repeats = 10
times = []
for i in tqdm(range(n_repeats)):
    start_time = time.monotonic()
    embedder.transform(X_test_flat);
    end_time = time.monotonic()
    print('seconds: ', end_time - start_time)
    # keep `times` filled, as every other timing cell in this notebook does
    # (this cell used to leave the list empty)
    times.append(end_time - start_time)
    # one row per repeat: method, dimensions, dataset, seconds, number of samples
    projection_speeds.loc[len(projection_speeds)] = ['umap-learn', 64, dataset, end_time - start_time, len(X_test)]


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs
seconds:  6.194998310878873
	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs
seconds:  5.266623386880383
	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs
seconds:  5.183475145837292
	completed  0  /  1

In [26]:
# Display the timing rows collected so far (one row per timed repeat).
projection_speeds

Unnamed: 0,method_,dimensions,dataset,speed,nex
0,network,2,fmnist,1.523001,10000
1,network,2,fmnist,0.016113,10000
2,network,2,fmnist,0.021156,10000
3,network,2,fmnist,0.018329,10000
4,network,2,fmnist,0.016372,10000
5,network,2,fmnist,0.01542,10000
6,network,2,fmnist,0.017718,10000
7,network,2,fmnist,0.015457,10000
8,network,2,fmnist,0.015545,10000
9,network,2,fmnist,0.015059,10000


#### PCA

##### 2 dims

In [27]:
# Fit a 2-component PCA on the flattened training set; `z` (the projected
# training data) is not used by the timing cell that follows.
pca = PCA(n_components=2)
z = pca.fit_transform(X_train_flat)

In [28]:
# Time `n_repeats` PCA projections (2 components) of the flattened test set.
n_repeats = 10
times = []
for repeat in tqdm(range(n_repeats)):
    tic = time.monotonic()
    pca.transform(X_test_flat)
    elapsed = time.monotonic() - tic
    print('seconds: ', elapsed)
    times.append(elapsed)
    # one row per repeat: method, dimensions, dataset, seconds, number of samples
    row = ['pca', 2, dataset, elapsed, len(X_test)]
    projection_speeds.loc[len(projection_speeds)] = row


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

seconds:  0.01146728009916842
seconds:  0.023875259095802903
seconds:  0.023056175094097853
seconds:  0.023046415066346526
seconds:  0.023067115806043148
seconds:  0.021578952902927995
seconds:  0.023042334942147136
seconds:  0.023047995986416936
seconds:  0.023027664981782436
seconds:  0.0240302630700171



In [29]:
# Display the timing rows collected so far (one row per timed repeat).
projection_speeds

Unnamed: 0,method_,dimensions,dataset,speed,nex
0,network,2,fmnist,1.523001,10000
1,network,2,fmnist,0.016113,10000
2,network,2,fmnist,0.021156,10000
3,network,2,fmnist,0.018329,10000
4,network,2,fmnist,0.016372,10000
...,...,...,...,...,...
65,pca,2,fmnist,0.021579,10000
66,pca,2,fmnist,0.023042,10000
67,pca,2,fmnist,0.023048,10000
68,pca,2,fmnist,0.023028,10000


##### 64 dims

In [30]:
# Fit a 64-component PCA on the flattened training set; this rebinds `pca`
# and `z` from the 2-component run.
pca = PCA(n_components=64)
z = pca.fit_transform(X_train_flat)

In [31]:
# Time `n_repeats` PCA projections (64 components) of the flattened test set.
n_repeats = 10
times = []
for repeat in tqdm(range(n_repeats)):
    tic = time.monotonic()
    pca.transform(X_test_flat)
    elapsed = time.monotonic() - tic
    print('seconds: ', elapsed)
    times.append(elapsed)
    # one row per repeat: method, dimensions, dataset, seconds, number of samples
    row = ['pca', 64, dataset, elapsed, len(X_test)]
    projection_speeds.loc[len(projection_speeds)] = row


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

seconds:  0.01972310896962881
seconds:  0.020001007011160254
seconds:  0.019843593006953597
seconds:  0.02003844897262752
seconds:  0.020194053184241056
seconds:  0.020864831982180476
seconds:  0.019978526048362255
seconds:  0.02000176697038114
seconds:  0.019965047016739845
seconds:  0.020434258971363306



In [32]:
# Display the timing rows collected so far (one row per timed repeat).
projection_speeds

Unnamed: 0,method_,dimensions,dataset,speed,nex
0,network,2,fmnist,1.523001,10000
1,network,2,fmnist,0.016113,10000
2,network,2,fmnist,0.021156,10000
3,network,2,fmnist,0.018329,10000
4,network,2,fmnist,0.016372,10000
...,...,...,...,...,...
75,pca,64,fmnist,0.020865,10000
76,pca,64,fmnist,0.019979,10000
77,pca,64,fmnist,0.020002,10000
78,pca,64,fmnist,0.019965,10000


#### TSNE

##### 2 dims

In [33]:
# openTSNE estimator for a 2-D embedding.
# NOTE(review): n_jobs=32 is hard-coded for the original machine; adjust it to
# the number of cores available.
tsne = TSNE(
    verbose=True,
    n_jobs=32,
    n_components=2,
)

In [34]:
# Fit t-SNE on the flattened training set; the returned embedding object is
# what the timing cell below calls `.transform` on.
embedding_train = tsne.fit(X_train_flat)

--------------------------------------------------------------------------------
TSNE(n_jobs=32, neighbors=None, verbose=True)
--------------------------------------------------------------------------------
===> Finding 90 nearest neighbors using Annoy approximate search using euclidean distance...




   --> Time elapsed: 36.42 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.48 seconds
===> Calculating PCA-based initialization...
   --> Time elapsed: 0.48 seconds
===> Running optimization with exaggeration=12.00, lr=4166.67 for 250 iterations...
Iteration   50, KL divergence 5.9437, 50 iterations in 1.6770 sec
Iteration  100, KL divergence 5.4383, 50 iterations in 1.6499 sec
Iteration  150, KL divergence 5.3136, 50 iterations in 1.6637 sec
Iteration  200, KL divergence 5.2569, 50 iterations in 1.6556 sec
Iteration  250, KL divergence 5.2247, 50 iterations in 1.7098 sec
   --> Time elapsed: 8.36 seconds
===> Running optimization with exaggeration=1.00, lr=4166.67 for 500 iterations...
Iteration   50, KL divergence 3.8582, 50 iterations in 1.5851 sec
Iteration  100, KL divergence 3.3417, 50 iterations in 1.6268 sec
Iteration  150, KL divergence 3.0760, 50 iterations in 2.1215 sec
Iteration  200, KL divergence 2.9086, 50 iterations in 2.9020 sec
Iteration  250, KL di

In [35]:
# Time `n_repeats` projections of the flattened test set into the fitted
# t-SNE embedding.
n_repeats = 10
times = []
for repeat in tqdm(range(n_repeats)):
    tic = time.monotonic()
    embedding_train.transform(X_test_flat)
    elapsed = time.monotonic() - tic
    print('seconds: ', elapsed)
    times.append(elapsed)
    # one row per repeat: method, dimensions, dataset, seconds, number of samples
    row = ['TSNE', 2, dataset, elapsed, len(X_test)]
    projection_speeds.loc[len(projection_speeds)] = row

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

===> Finding 15 nearest neighbors in existing embedding using Annoy approximate search...
   --> Time elapsed: 4.39 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.03 seconds
===> Running optimization with exaggeration=4.00, lr=0.10 for 0 iterations...
   --> Time elapsed: 0.00 seconds
===> Running optimization with exaggeration=1.50, lr=0.10 for 250 iterations...
Iteration   50, KL divergence 197902.7177, 50 iterations in 0.2684 sec
Iteration  100, KL divergence 198133.2713, 50 iterations in 0.3065 sec
Iteration  150, KL divergence 198263.6537, 50 iterations in 0.2768 sec
Iteration  200, KL divergence 198370.6803, 50 iterations in 0.3042 sec
Iteration  250, KL divergence 198443.4647, 50 iterations in 0.3006 sec
   --> Time elapsed: 1.46 seconds
seconds:  6.186875977087766
===> Finding 15 nearest neighbors in existing embedding using Annoy approximate search...
   --> Time elapsed: 4.43 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.03 seconds
===

In [36]:
# Display the timing rows collected so far (one row per timed repeat).
projection_speeds

Unnamed: 0,method_,dimensions,dataset,speed,nex
0,network,2,fmnist,1.523001,10000
1,network,2,fmnist,0.016113,10000
2,network,2,fmnist,0.021156,10000
3,network,2,fmnist,0.018329,10000
4,network,2,fmnist,0.016372,10000
...,...,...,...,...,...
85,TSNE,2,fmnist,5.871438,10000
86,TSNE,2,fmnist,5.966355,10000
87,TSNE,2,fmnist,5.735305,10000
88,TSNE,2,fmnist,5.793795,10000


### Save

In [37]:
# Persist the collected timings to DATA_DIR/projection_speeds/<dataset>.pickle
pickle_name = dataset + '.pickle'
save_loc = DATA_DIR / 'projection_speeds' / pickle_name
# project helper; presumably creates the missing parent folders - TODO confirm
ensure_dir(save_loc)
projection_speeds.to_pickle(save_loc)