#### Outline
- for each dataset: 
    - load dataset; 
    - for each network: 
        - load network
        - project all test dataset samples (10,000 here) and time each run
        - save to metric dataframe

In [1]:
# reload packages
%load_ext autoreload
%autoreload 2

### Choose GPU (this may not be needed on your computer)

In [2]:
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=0

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=0


In [3]:
import tensorflow as tf
# List the GPUs TensorFlow can see and, when at least one exists, switch the
# first one to on-demand memory growth. The device list is shown as the cell output.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
if gpu_devices:
    first_gpu = gpu_devices[0]
    tf.config.experimental.set_memory_growth(first_gpu, True)
gpu_devices

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [4]:
import numpy as np
import pickle
import pandas as pd
import time
from umap import UMAP

In [5]:
from tfumap.umap import tfUMAP
import tensorflow as tf
from sklearn.decomposition import PCA
from openTSNE import TSNE



In [6]:
from tqdm.autonotebook import tqdm

In [7]:
from tfumap.paths import ensure_dir, MODEL_DIR, DATA_DIR

In [8]:
# Root directory from which the saved encoders are loaded in the cells below.
output_dir = MODEL_DIR/'projections' 

In [9]:
# Empty results table; every timed projection run appends one row to it.
speed_columns = ['method_', 'dimensions', 'dataset', 'speed', "nex"]
projection_speeds = pd.DataFrame(columns=speed_columns)

### CIFAR10

In [10]:
# Dataset key (used in model/result paths and in every results row) and the
# per-sample input shape handed to the network.
dataset, dims = 'cifar10', (32, 32, 3)

##### load dataset

In [11]:
from tensorflow.keras.datasets import cifar10

# Load CIFAR-10 and rescale by 1/255 to float32
# (presumably mapping uint8 pixel values into [0, 1] — confirm).
(train_images, Y_train), (test_images, Y_test) = cifar10.load_data()
X_train = (train_images/255.).astype('float32')
X_test = (test_images/255.).astype('float32')

# Flattened (n_samples, n_features) copies for the non-network methods.
# np.prod is used because np.product is deprecated and removed in NumPy 2.0.
# NOTE(review): X_train_flat is built BEFORE the validation split below, so it
# still contains the last `n_valid` training samples — confirm this is intended.
X_train_flat = X_train.reshape((len(X_train), np.prod(np.shape(X_train)[1:])))
X_test_flat = X_test.reshape((len(X_test), np.prod(np.shape(X_test)[1:])))

# subset a validation set: the last `n_valid` training samples are held out
n_valid = 10000
X_valid = X_train[-n_valid:]
Y_valid = Y_train[-n_valid:].flatten()
X_train = X_train[:-n_valid]
Y_train = Y_train[:-n_valid].flatten()
Y_test = Y_test.flatten()

print(len(X_train), len(X_valid), len(X_test))

# Give the test set the image layout fed to the encoder: (n_samples, *dims).
# Derived from the data and `dims` instead of the hard-coded (10000, 32, 32, 3).
X_test = X_test.reshape((len(X_test),) + tuple(dims))

40000 10000 10000


#### Network 

##### 2 dims

In [12]:
# Directory holding the saved network model for this dataset (2-dim section).
load_loc = output_dir / dataset / 'network' 

In [13]:
# Build a tfUMAP wrapper for inputs of shape `dims`; the saved encoder is
# attached to it in the next cell.
# NOTE(review): direct_embedding=False presumably selects the network-based
# variant — confirm against tfUMAP.
embedder_kwargs = dict(
    direct_embedding=False,
    verbose=True,
    negative_sample_rate=5,
    training_epochs=5,
    batch_size=100,
    dims=dims,
)
embedder = tfUMAP(**embedder_kwargs)

In [14]:
# Load the saved Keras encoder found under `load_loc` and attach it to the embedder.
encoder_path = (load_loc / 'encoder').as_posix()
encoder = tf.keras.models.load_model(encoder_path)
embedder.encoder = encoder

In [15]:
# Time `n_repeats` full passes of the test set through the loaded encoder and
# log one row per pass in `projection_speeds`.
# Note: in the recorded output the first pass is much slower than the rest
# (~1.69 s vs ~0.14 s); it is logged like every other pass.
n_repeats = 10
times = []
for i in tqdm(range(n_repeats)):
    start_time = time.monotonic()
    encoder(X_test)
    end_time = time.monotonic()
    elapsed = end_time - start_time
    print('seconds: ', elapsed)
    times.append(elapsed)
    projection_speeds.loc[len(projection_speeds)] = ['network', 2, dataset, elapsed, len(X_test)]


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

seconds:  1.6944030558224767
seconds:  0.1331504750996828
seconds:  0.137508912011981
seconds:  0.13732270617038012
seconds:  0.13475496205501258
seconds:  0.1383017050102353
seconds:  0.13622579514048994
seconds:  0.13965108385309577
seconds:  0.13799251592718065
seconds:  0.1371516310609877



In [16]:
# Network on CPU: same benchmark as the previous cell, but the encoder calls
# are issued inside a tf.device('/CPU:0') scope. One row per pass is logged
# under the 'network-cpu' label.
n_repeats = 10
times = []
with tf.device('/CPU:0'):
    for i in tqdm(range(n_repeats)):
        start_time = time.monotonic()
        encoder(X_test)
        end_time = time.monotonic()
        elapsed = end_time - start_time
        print('seconds: ', elapsed)
        times.append(elapsed)
        projection_speeds.loc[len(projection_speeds)] = ['network-cpu', 2, dataset, elapsed, len(X_test)]

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

seconds:  0.29402544419281185
seconds:  0.2258605498354882
seconds:  0.22414181102067232
seconds:  0.22331379586830735
seconds:  0.22412178991362453
seconds:  0.22888335911557078
seconds:  0.22352040209807456
seconds:  0.22269204817712307
seconds:  0.22520092106424272
seconds:  0.22392825409770012



##### 64 dims

In [17]:
# Directory holding the saved network model stored under the "64" subfolder (64-dim section).
load_loc = output_dir / dataset /"64"/ 'network' 

In [18]:
# Fresh tfUMAP wrapper for the 64-dim section, built with the same arguments
# as the 2-dim one; the saved encoder is attached to it in the next cell.
embedder_kwargs = dict(
    direct_embedding=False,
    verbose=True,
    negative_sample_rate=5,
    training_epochs=5,
    batch_size=100,
    dims=dims,
)
embedder = tfUMAP(**embedder_kwargs)

In [19]:
# Load the saved Keras encoder found under the current `load_loc` and attach it
# to the embedder.
encoder_path = (load_loc / 'encoder').as_posix()
encoder = tf.keras.models.load_model(encoder_path)
embedder.encoder = encoder

In [20]:
# Time `n_repeats` full passes of the test set through the currently loaded
# encoder; each pass is logged as a ('network', 64) row in `projection_speeds`.
n_repeats = 10
times = []
for i in tqdm(range(n_repeats)):
    start_time = time.monotonic()
    encoder(X_test)
    end_time = time.monotonic()
    elapsed = end_time - start_time
    print('seconds: ', elapsed)
    times.append(elapsed)
    projection_speeds.loc[len(projection_speeds)] = ['network', 64, dataset, elapsed, len(X_test)]

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

seconds:  0.1890984659548849
seconds:  0.1351398630067706
seconds:  0.13791371299885213
seconds:  0.13186913914978504
seconds:  0.13407498295418918
seconds:  0.13538287999108434
seconds:  0.13897335389629006
seconds:  0.1379034430719912
seconds:  0.13806657795794308
seconds:  0.13495389791205525



In [21]:
# Network on CPU: repeat the encoder benchmark inside a tf.device('/CPU:0')
# scope; each pass is logged as a ('network-cpu', 64) row.
n_repeats = 10
times = []
with tf.device('/CPU:0'):
    for i in tqdm(range(n_repeats)):
        start_time = time.monotonic()
        encoder(X_test)
        end_time = time.monotonic()
        elapsed = end_time - start_time
        print('seconds: ', elapsed)
        times.append(elapsed)
        projection_speeds.loc[len(projection_speeds)] = ['network-cpu', 64, dataset, elapsed, len(X_test)]

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

seconds:  0.26436580484732985
seconds:  0.22410616022534668
seconds:  0.22713676700368524
seconds:  0.22316977311857045
seconds:  0.22398739703930914
seconds:  0.22468416602350771
seconds:  0.22367100697010756
seconds:  0.2228943738155067
seconds:  0.22649329900741577
seconds:  0.2234757710248232



#### UMAP-learn

##### 2 dims

In [22]:
# Fit umap-learn with 2 output components on the flattened training images;
# the fitted model's transform() is what gets timed in the next cell.
# Note: this rebinds `embedder`, which previously held the tfUMAP wrapper.
# NOTE(review): X_train_flat was built before the validation split, so it also
# contains the validation samples — confirm this is intended.
embedder = UMAP(n_components = 2, verbose=True)
z_umap = embedder.fit_transform(X_train_flat)

UMAP(dens_frac=0.0, dens_lambda=0.0, verbose=True)
Construct fuzzy simplicial set
Sat Jul 18 13:36:55 2020 Finding Nearest Neighbors
Sat Jul 18 13:36:55 2020 Building RP forest with 16 trees
Sat Jul 18 13:36:58 2020 parallel NN descent for 16 iterations
	 0  /  16
	 1  /  16
	 2  /  16
	 3  /  16
	 4  /  16
	 5  /  16
Sat Jul 18 13:37:11 2020 Finished Nearest Neighbor Search
Sat Jul 18 13:37:14 2020 Construct embedding
	completed  0  /  200 epochs
	completed  20  /  200 epochs
	completed  40  /  200 epochs
	completed  60  /  200 epochs
	completed  80  /  200 epochs
	completed  100  /  200 epochs
	completed  120  /  200 epochs
	completed  140  /  200 epochs
	completed  160  /  200 epochs
	completed  180  /  200 epochs
Sat Jul 18 13:38:04 2020 Finished embedding


In [23]:
# Time `n_repeats` calls of the fitted UMAP's transform() on the flattened test
# set; each call is logged as a ('umap-learn', 2) row.
# Note: in the recorded output the first call is roughly twice as slow as the
# following ones (~24 s vs ~11-12 s); it is logged like every other call.
n_repeats = 10
times = []
for i in tqdm(range(n_repeats)):
    start_time = time.monotonic()
    embedder.transform(X_test_flat)
    end_time = time.monotonic()
    elapsed = end_time - start_time
    print('seconds: ', elapsed)
    times.append(elapsed)
    projection_speeds.loc[len(projection_speeds)] = ['umap-learn', 2, dataset, elapsed, len(X_test)]

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs
seconds:  24.390005162917078
	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs
seconds:  12.295497893122956
	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs
seconds:  11.066727950936183
	completed  0  /

In [24]:
# Display the timing rows collected so far.
projection_speeds

Unnamed: 0,method_,dimensions,dataset,speed,nex
0,network,2,cifar10,1.694403,10000
1,network,2,cifar10,0.13315,10000
2,network,2,cifar10,0.137509,10000
3,network,2,cifar10,0.137323,10000
4,network,2,cifar10,0.134755,10000
5,network,2,cifar10,0.138302,10000
6,network,2,cifar10,0.136226,10000
7,network,2,cifar10,0.139651,10000
8,network,2,cifar10,0.137993,10000
9,network,2,cifar10,0.137152,10000


##### 64 dims

In [25]:
# Fit umap-learn with 64 output components on the flattened training images;
# the fitted model's transform() is what gets timed in the next cell.
# NOTE(review): X_train_flat was built before the validation split, so it also
# contains the validation samples — confirm this is intended.
embedder = UMAP(n_components = 64, verbose=True)
z_umap = embedder.fit_transform(X_train_flat)

UMAP(dens_frac=0.0, dens_lambda=0.0, n_components=64, verbose=True)
Construct fuzzy simplicial set
Sat Jul 18 13:40:17 2020 Finding Nearest Neighbors
Sat Jul 18 13:40:17 2020 Building RP forest with 16 trees
Sat Jul 18 13:40:19 2020 parallel NN descent for 16 iterations
	 0  /  16
	 1  /  16
	 2  /  16
	 3  /  16
	 4  /  16
	 5  /  16
Sat Jul 18 13:40:24 2020 Finished Nearest Neighbor Search
Sat Jul 18 13:40:24 2020 Construct embedding
	completed  0  /  200 epochs
	completed  20  /  200 epochs
	completed  40  /  200 epochs
	completed  60  /  200 epochs
	completed  80  /  200 epochs
	completed  100  /  200 epochs
	completed  120  /  200 epochs
	completed  140  /  200 epochs
	completed  160  /  200 epochs
	completed  180  /  200 epochs
Sat Jul 18 13:41:06 2020 Finished embedding


In [26]:
# Time `n_repeats` calls of the fitted 64-component UMAP's transform() on the
# flattened test set; each call is logged as a ('umap-learn', 64) row.
# The elapsed time is now also appended to `times`, matching every other
# timing cell in this notebook (this cell initialised the list but never
# filled it).
n_repeats = 10
times = []
for i in tqdm(range(n_repeats)):
    start_time = time.monotonic()
    embedder.transform(X_test_flat)
    end_time = time.monotonic()
    elapsed = end_time - start_time
    print('seconds: ', elapsed)
    times.append(elapsed)
    projection_speeds.loc[len(projection_speeds)] = ['umap-learn', 64, dataset, elapsed, len(X_test)]

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs
seconds:  13.350383965997025
	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs
seconds:  11.229863347951323
	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs
seconds:  10.996063767001033
	completed  0  /

In [27]:
# Display the timing rows collected so far.
projection_speeds

Unnamed: 0,method_,dimensions,dataset,speed,nex
0,network,2,cifar10,1.694403,10000
1,network,2,cifar10,0.13315,10000
2,network,2,cifar10,0.137509,10000
3,network,2,cifar10,0.137323,10000
4,network,2,cifar10,0.134755,10000
5,network,2,cifar10,0.138302,10000
6,network,2,cifar10,0.136226,10000
7,network,2,cifar10,0.139651,10000
8,network,2,cifar10,0.137993,10000
9,network,2,cifar10,0.137152,10000


#### PCA

##### 2 dims

In [28]:
# Fit a 2-component PCA on the flattened training images; only the fitted
# `pca` object is used afterwards (its transform() is timed in the next cell).
pca = PCA(n_components=2)
z = pca.fit_transform(X_train_flat)

In [29]:
# Time `n_repeats` calls of the fitted PCA's transform() on the flattened test
# set; each call is logged as a ('pca', 2) row.
n_repeats = 10
times = []
for i in tqdm(range(n_repeats)):
    start_time = time.monotonic()
    pca.transform(X_test_flat)
    end_time = time.monotonic()
    elapsed = end_time - start_time
    print('seconds: ', elapsed)
    times.append(elapsed)
    projection_speeds.loc[len(projection_speeds)] = ['pca', 2, dataset, elapsed, len(X_test)]

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

seconds:  0.06944070011377335
seconds:  0.06099954596720636
seconds:  0.062307962914928794
seconds:  0.06693859794177115
seconds:  0.07157812104560435
seconds:  0.0635866008233279
seconds:  0.08393564890138805
seconds:  0.0515290021430701
seconds:  0.06470321281813085
seconds:  0.06381637696176767



In [30]:
# Display the timing rows collected so far.
projection_speeds

Unnamed: 0,method_,dimensions,dataset,speed,nex
0,network,2,cifar10,1.694403,10000
1,network,2,cifar10,0.133150,10000
2,network,2,cifar10,0.137509,10000
3,network,2,cifar10,0.137323,10000
4,network,2,cifar10,0.134755,10000
...,...,...,...,...,...
65,pca,2,cifar10,0.063587,10000
66,pca,2,cifar10,0.083936,10000
67,pca,2,cifar10,0.051529,10000
68,pca,2,cifar10,0.064703,10000


##### 64 dims

In [31]:
# Fit a 64-component PCA on the flattened training images; only the fitted
# `pca` object is used afterwards (its transform() is timed in the next cell).
pca = PCA(n_components=64)
z = pca.fit_transform(X_train_flat)

In [32]:
# Time `n_repeats` calls of the fitted 64-component PCA's transform() on the
# flattened test set; each call is logged as a ('pca', 64) row.
n_repeats = 10
times = []
for i in tqdm(range(n_repeats)):
    start_time = time.monotonic()
    pca.transform(X_test_flat)
    end_time = time.monotonic()
    elapsed = end_time - start_time
    print('seconds: ', elapsed)
    times.append(elapsed)
    projection_speeds.loc[len(projection_speeds)] = ['pca', 64, dataset, elapsed, len(X_test)]

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

seconds:  0.07126907305791974
seconds:  0.08022611192427576
seconds:  0.07506861304864287
seconds:  0.08183348900638521
seconds:  0.10386767704039812
seconds:  0.09858391317538917
seconds:  0.09864274505525827
seconds:  0.10220566904172301
seconds:  0.09957754309289157
seconds:  0.09958715294487774



In [33]:
# Display the timing rows collected so far.
projection_speeds

Unnamed: 0,method_,dimensions,dataset,speed,nex
0,network,2,cifar10,1.694403,10000
1,network,2,cifar10,0.133150,10000
2,network,2,cifar10,0.137509,10000
3,network,2,cifar10,0.137323,10000
4,network,2,cifar10,0.134755,10000
...,...,...,...,...,...
75,pca,64,cifar10,0.098584,10000
76,pca,64,cifar10,0.098643,10000
77,pca,64,cifar10,0.102206,10000
78,pca,64,cifar10,0.099578,10000


#### TSNE

##### 2 dims

In [34]:
# Configure openTSNE for a 2-component embedding with verbose logging.
# NOTE(review): n_jobs=32 is a hard-coded, machine-specific value — confirm it
# suits the machine this notebook runs on.
tsne = TSNE(
    n_components = 2,
    n_jobs=32,
    verbose=True
)

In [35]:
# Fit t-SNE on the flattened training images; the returned object's
# transform() is what gets timed in the next cell.
embedding_train = tsne.fit(X_train_flat)

--------------------------------------------------------------------------------
TSNE(n_jobs=32, neighbors=None, verbose=True)
--------------------------------------------------------------------------------
===> Finding 90 nearest neighbors using Annoy approximate search using euclidean distance...




   --> Time elapsed: 103.79 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.44 seconds
===> Calculating PCA-based initialization...
   --> Time elapsed: 1.59 seconds
===> Running optimization with exaggeration=12.00, lr=4166.67 for 250 iterations...
Iteration   50, KL divergence 6.9787, 50 iterations in 2.6563 sec
Iteration  100, KL divergence 6.5663, 50 iterations in 9.8989 sec
Iteration  150, KL divergence 6.5641, 50 iterations in 7.4331 sec
Iteration  200, KL divergence 6.6146, 50 iterations in 49.9821 sec
Iteration  250, KL divergence 6.5808, 50 iterations in 19.8111 sec
   --> Time elapsed: 89.78 seconds
===> Running optimization with exaggeration=1.00, lr=4166.67 for 500 iterations...
Iteration   50, KL divergence 4.7555, 50 iterations in 4.9346 sec
Iteration  100, KL divergence 4.4596, 50 iterations in 1.6660 sec
Iteration  150, KL divergence 4.3322, 50 iterations in 1.5989 sec
Iteration  200, KL divergence 4.2528, 50 iterations in 1.9477 sec
Iteration  250, K

In [36]:
# Time `n_repeats` calls of the fitted t-SNE embedding's transform() on the
# flattened test set; each call is logged as a ('TSNE', 2) row.
n_repeats = 10
times = []
for i in tqdm(range(n_repeats)):
    start_time = time.monotonic()
    embedding_train.transform(X_test_flat)
    end_time = time.monotonic()
    elapsed = end_time - start_time
    print('seconds: ', elapsed)
    times.append(elapsed)
    projection_speeds.loc[len(projection_speeds)] = ['TSNE', 2, dataset, elapsed, len(X_test)]

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

===> Finding 15 nearest neighbors in existing embedding using Annoy approximate search...
   --> Time elapsed: 10.21 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.03 seconds
===> Running optimization with exaggeration=4.00, lr=0.10 for 0 iterations...
   --> Time elapsed: 0.00 seconds
===> Running optimization with exaggeration=1.50, lr=0.10 for 250 iterations...
Iteration   50, KL divergence 196374.3910, 50 iterations in 0.2748 sec
Iteration  100, KL divergence 196366.9843, 50 iterations in 0.2674 sec
Iteration  150, KL divergence 196409.3210, 50 iterations in 0.2626 sec
Iteration  200, KL divergence 196440.0223, 50 iterations in 0.2684 sec
Iteration  250, KL divergence 196453.1957, 50 iterations in 0.2622 sec
   --> Time elapsed: 1.34 seconds
seconds:  11.800203969003633
===> Finding 15 nearest neighbors in existing embedding using Annoy approximate search...
   --> Time elapsed: 10.43 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.04 seconds


In [37]:
# Display the timing rows collected so far.
projection_speeds

Unnamed: 0,method_,dimensions,dataset,speed,nex
0,network,2,cifar10,1.694403,10000
1,network,2,cifar10,0.133150,10000
2,network,2,cifar10,0.137509,10000
3,network,2,cifar10,0.137323,10000
4,network,2,cifar10,0.134755,10000
...,...,...,...,...,...
85,TSNE,2,cifar10,11.863082,10000
86,TSNE,2,cifar10,11.653182,10000
87,TSNE,2,cifar10,11.819419,10000
88,TSNE,2,cifar10,11.761753,10000


### Save

In [38]:
# Pickle the collected timings to DATA_DIR/projection_speeds/<dataset>.pickle.
# NOTE(review): ensure_dir is assumed to create the missing parent directory of
# the given path — confirm in tfumap.paths.
pickle_name = dataset + '.pickle'
save_loc = DATA_DIR / 'projection_speeds' / pickle_name
ensure_dir(save_loc)
projection_speeds.to_pickle(save_loc)