#### Outline
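- for each dataset:
    - load dataset
    - for each projection method (network, UMAP-learn, PCA, TSNE):
        - load (network) or fit (other methods) the model
        - time the projection of the 1000 test dataset samples (10 repeats)
        - save each timing to the metric dataframe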

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Choose GPU (this may not be needed on your computer)

In [2]:
# Order CUDA devices by PCI bus ID and expose only device 0 to this process.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=0

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=0


In [3]:
import tensorflow as tf
# Turn on memory growth for the first visible GPU, if there is one.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
if gpu_devices:
    first_gpu = gpu_devices[0]
    tf.config.experimental.set_memory_growth(first_gpu, True)

In [4]:
import numpy as np
import pickle
import pandas as pd
import time
from umap import UMAP

In [5]:
from tfumap.umap import tfUMAP
import tensorflow as tf
from sklearn.decomposition import PCA
from openTSNE import TSNE



In [6]:
from tqdm.autonotebook import tqdm

In [7]:
from tfumap.paths import ensure_dir, MODEL_DIR, DATA_DIR

In [8]:
# Root folder of the saved projection models; per-dataset sub-folders are
# resolved from it when the encoders are loaded.
output_dir = MODEL_DIR / "projections"

In [9]:
# Empty results table: one row is appended per timed projection call.
speed_columns = ["method_", "dimensions", "dataset", "speed", "nex"]
projection_speeds = pd.DataFrame(columns=speed_columns)

### cassins

In [10]:
# Dataset identifier (used to build model and result paths) and the input
# shape passed to tfUMAP.
# NOTE(review): the spectrograms are padded with one zero column and the test
# set is reshaped to (n, 32, 32, 1) further down, whereas dims is (32, 31, 1)
# -- confirm which shape the saved encoder expects.
dataset = "cassins_dtw"
dims = (32, 31, 1)

##### load dataset

In [11]:
from tfumap.paths import ensure_dir, MODEL_DIR, DATA_DIR

# Load the pickled syllable table. Unpickling can execute arbitrary code, so
# only read files from a trusted source.
syllable_df = pd.read_pickle(DATA_DIR/'cassins'/ 'cassins.pickle')

# Count how often each label occurs and keep the 20 most frequent ones.
# The result is a one-row frame whose columns are the retained labels.
top_labels = (
    pd.DataFrame(
        {i: [np.sum(syllable_df.labels.values == i)] for i in syllable_df.labels.unique()}
    )
    .T.sort_values(by=0, ascending=False)[:20]
    .T
)

# Keep only syllables with a retained label, then renumber the rows 0..n-1 so
# the label-based split below selects the intended rows.
syllable_df = syllable_df[syllable_df.labels.isin(top_labels.columns)]
syllable_df = syllable_df.reset_index()

# Split: rows 0-999 -> valid, rows 1000-1999 -> test, everything else -> train.
# .loc slices include BOTH end points, hence 999 (not 1000) as the upper bound.
syllable_df['subset'] = 'train'
syllable_df.loc[:999, 'subset'] = 'valid'
syllable_df.loc[1000:1999, 'subset'] = 'test'

# Prepend one column of zeros to every spectrogram (each must have 32 rows).
specs = np.array(list(syllable_df.spectrogram.values))
specs = np.array([np.concatenate([np.zeros((32,1)), i], axis=1) for i in tqdm(specs)])
# The reshape of X_test at the end of this cell requires 32x32 inputs; fail
# early with a readable message if the padded spectrograms have another shape.
assert specs.shape[1:] == (32, 32), f"unexpected spectrogram shape {specs.shape}"

# Store the padded arrays back, one 2-D array per row (object column).
syllable_df['spectrogram'] = syllable_df['spectrogram'].astype('object')
syllable_df['spectrogram'] = list(specs)

# Boolean row masks for the three subsets.
train_mask = (syllable_df.subset == 'train').values
valid_mask = (syllable_df.subset == 'valid').values
test_mask = (syllable_df.subset == 'test').values

Y_train = np.array(list(syllable_df.labels.values[train_mask]))
Y_valid = np.array(list(syllable_df.labels.values[valid_mask]))
Y_test = np.array(list(syllable_df.labels.values[test_mask]))

X_train = np.array(list(syllable_df.spectrogram.values[train_mask]))
X_valid = np.array(list(syllable_df.spectrogram.values[valid_mask]))
X_test = np.array(list(syllable_df.spectrogram.values[test_mask]))

# Flattened (n_samples, n_features) views for UMAP / PCA / TSNE.
# np.prod replaces np.product, which was removed in NumPy 2.0.
X_train_flat = X_train.reshape((len(X_train), np.prod(np.shape(X_train)[1:])))
X_test_flat = X_test.reshape((len(X_test), np.prod(np.shape(X_test)[1:])))

# Integer-encode the training labels.
# NOTE(review): Y_valid and Y_test are left as raw labels -- confirm that is intended.
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder()
Y_train = enc.fit_transform([[i] for i in Y_train]).astype('int').flatten()
# Add the trailing channel axis for the test set passed to the encoder network.
X_test = X_test.reshape((len(X_test), 32,32,1))

HBox(children=(IntProgress(value=0, max=26984), HTML(value='')))




#### Network 

##### 2 dims

In [12]:
# Folder holding the saved 2-dimensional network model for this dataset.
load_loc = output_dir / dataset / "network"

In [13]:
# tfUMAP instance configured with direct_embedding=False; the saved encoder is
# attached to it after loading. The timing loop calls the encoder directly.
embedder = tfUMAP(
    dims=dims,
    direct_embedding=False,
    negative_sample_rate=5,
    training_epochs=5,
    batch_size=100,
    verbose=True,
)

In [14]:
# Restore the saved Keras encoder from disk and attach it to the embedder.
encoder_path = (load_loc / 'encoder').as_posix()
encoder = tf.keras.models.load_model(encoder_path)
embedder.encoder = encoder

In [15]:
# Time n_repeats forward passes of the 2-d encoder over the whole test set and
# append one row per repeat to projection_speeds.
# NOTE(review): the first repeat is far slower than the rest in the recorded
# output (one-off warm-up) and is stored like any other measurement.
# NOTE(review): the encoder output is discarded without being read back;
# confirm the call has finished all device work before end_time is taken.
n_repeats = 10
times = []
for i in tqdm(range(n_repeats)):
    start_time = time.monotonic()
    encoder(X_test)
    end_time = time.monotonic()
    elapsed = end_time - start_time
    print('seconds: ', elapsed)
    times.append(elapsed)
    row = ['network', 2, dataset, elapsed, len(X_test)]
    projection_speeds.loc[len(projection_speeds)] = row


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

seconds:  1.9986373430583626
seconds:  0.007758233929052949
seconds:  0.008959567872807384
seconds:  0.0075960601679980755
seconds:  0.009051150875166059
seconds:  0.00730384117923677
seconds:  0.007466196082532406
seconds:  0.0073777439538389444
seconds:  0.007055864203721285
seconds:  0.008038301952183247



In [16]:
##### Network CPU

# Same measurement as the previous network loop, but with the encoder call
# placed on the CPU device; rows are tagged 'network-cpu'.
# NOTE(review): the first repeat is slower than the rest in the recorded
# output and is stored like any other measurement.
with tf.device('/CPU:0'):
    n_repeats = 10
    times = []
    for i in tqdm(range(n_repeats)):
        start_time = time.monotonic()
        encoder(X_test)
        end_time = time.monotonic()
        elapsed = end_time - start_time
        print('seconds: ', elapsed)
        times.append(elapsed)
        row = ['network-cpu', 2, dataset, elapsed, len(X_test)]
        projection_speeds.loc[len(projection_speeds)] = row

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

seconds:  0.22179583087563515
seconds:  0.05232092319056392
seconds:  0.05061970301903784
seconds:  0.052772385999560356
seconds:  0.05594308814033866
seconds:  0.05203477409668267
seconds:  0.04802542901597917
seconds:  0.04842807003296912
seconds:  0.048062809044495225
seconds:  0.047522274078801274



##### 64 dims

In [17]:
# Folder holding the saved 64-dimensional network model for this dataset.
load_loc = output_dir / dataset / "64" / "network"

In [18]:
# Fresh tfUMAP instance (direct_embedding=False) that will hold the 64-d
# encoder; the timing loop calls the encoder directly.
embedder = tfUMAP(
    dims=dims,
    direct_embedding=False,
    negative_sample_rate=5,
    training_epochs=5,
    batch_size=100,
    verbose=True,
)

In [19]:
# Restore the saved 64-d Keras encoder from disk and attach it to the embedder.
encoder_path = (load_loc / 'encoder').as_posix()
encoder = tf.keras.models.load_model(encoder_path)
embedder.encoder = encoder

In [20]:
# Time n_repeats forward passes of the 64-d encoder over the whole test set
# and append one row per repeat to projection_speeds.
# NOTE(review): the first repeat is far slower than the rest in the recorded
# output (one-off warm-up) and is stored like any other measurement.
# NOTE(review): the encoder output is discarded without being read back;
# confirm the call has finished all device work before end_time is taken.
n_repeats = 10
times = []
for i in tqdm(range(n_repeats)):
    start_time = time.monotonic()
    encoder(X_test)
    end_time = time.monotonic()
    elapsed = end_time - start_time
    print('seconds: ', elapsed)
    times.append(elapsed)
    row = ['network', 64, dataset, elapsed, len(X_test)]
    projection_speeds.loc[len(projection_speeds)] = row

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

seconds:  0.20757481991313398
seconds:  0.008206608006730676
seconds:  0.010095322038978338
seconds:  0.007917959010228515
seconds:  0.00722349900752306
seconds:  0.007565918145701289
seconds:  0.007455145008862019
seconds:  0.007130536017939448
seconds:  0.007478926097974181
seconds:  0.00735634402371943



In [22]:
##### Network CPU

# Same measurement as the previous 64-d network loop, but with the encoder
# call placed on the CPU device; rows are tagged 'network-cpu'.
# NOTE(review): the first repeat is slower than the rest in the recorded
# output and is stored like any other measurement.
with tf.device('/CPU:0'):
    n_repeats = 10
    times = []
    for i in tqdm(range(n_repeats)):
        start_time = time.monotonic()
        encoder(X_test)
        end_time = time.monotonic()
        elapsed = end_time - start_time
        print('seconds: ', elapsed)
        times.append(elapsed)
        row = ['network-cpu', 64, dataset, elapsed, len(X_test)]
        projection_speeds.loc[len(projection_speeds)] = row

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

seconds:  0.20587171893566847
seconds:  0.04768748488277197
seconds:  0.04786087106913328
seconds:  0.04897981300018728
seconds:  0.048601401038467884
seconds:  0.046968074049800634
seconds:  0.048188708955422044
seconds:  0.04788177995942533
seconds:  0.04846215690486133
seconds:  0.04742997791618109



#### UMAP-learn

##### 2 dims

In [23]:
# Fit umap-learn (2 output dimensions) on the flattened training spectrograms.
# NOTE(review): no random_state is passed, so the fit may differ between runs.
embedder = UMAP(verbose=True, n_components=2)
z_umap = embedder.fit_transform(X_train_flat)

UMAP(dens_frac=0.0, dens_lambda=0.0, verbose=True)
Construct fuzzy simplicial set
Sat Jul 18 13:23:48 2020 Finding Nearest Neighbors
Sat Jul 18 13:23:48 2020 Building RP forest with 13 trees
Sat Jul 18 13:23:49 2020 parallel NN descent for 15 iterations
	 0  /  15
	 1  /  15
	 2  /  15
	 3  /  15
Sat Jul 18 13:23:58 2020 Finished Nearest Neighbor Search
Sat Jul 18 13:24:01 2020 Construct embedding
	completed  0  /  200 epochs
	completed  20  /  200 epochs
	completed  40  /  200 epochs
	completed  60  /  200 epochs
	completed  80  /  200 epochs
	completed  100  /  200 epochs
	completed  120  /  200 epochs
	completed  140  /  200 epochs
	completed  160  /  200 epochs
	completed  180  /  200 epochs
Sat Jul 18 13:24:25 2020 Finished embedding


In [24]:
# Time n_repeats calls of the fitted 2-d UMAP's transform on the flattened
# test set and append one row per repeat to projection_speeds.
# NOTE(review): the first repeat is several times slower than the rest in the
# recorded output and is stored like any other measurement.
n_repeats = 10
times = []
for i in tqdm(range(n_repeats)):
    start_time = time.monotonic()
    embedder.transform(X_test_flat)
    end_time = time.monotonic()
    elapsed = end_time - start_time
    print('seconds: ', elapsed)
    times.append(elapsed)
    row = ['umap-learn', 2, dataset, elapsed, len(X_test)]
    projection_speeds.loc[len(projection_speeds)] = row


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs
seconds:  14.819833575980738
	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs
seconds:  3.252251919126138
	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs
seconds:  2.8433641069568694
	completed  0  / 

In [25]:
# Display the timing table collected so far.
projection_speeds

Unnamed: 0,method_,dimensions,dataset,speed,nex
0,network,2,cassins_dtw,1.998637,1000
1,network,2,cassins_dtw,0.007758,1000
2,network,2,cassins_dtw,0.00896,1000
3,network,2,cassins_dtw,0.007596,1000
4,network,2,cassins_dtw,0.009051,1000
5,network,2,cassins_dtw,0.007304,1000
6,network,2,cassins_dtw,0.007466,1000
7,network,2,cassins_dtw,0.007378,1000
8,network,2,cassins_dtw,0.007056,1000
9,network,2,cassins_dtw,0.008038,1000


##### 64 dims

In [26]:
# Fit umap-learn (64 output dimensions) on the flattened training spectrograms.
# NOTE(review): no random_state is passed, so the fit may differ between runs.
embedder = UMAP(verbose=True, n_components=64)
z_umap = embedder.fit_transform(X_train_flat)

UMAP(dens_frac=0.0, dens_lambda=0.0, n_components=64, verbose=True)
Construct fuzzy simplicial set
Sat Jul 18 13:25:09 2020 Finding Nearest Neighbors
Sat Jul 18 13:25:09 2020 Building RP forest with 13 trees
Sat Jul 18 13:25:09 2020 parallel NN descent for 15 iterations
	 0  /  15
	 1  /  15
	 2  /  15
	 3  /  15
Sat Jul 18 13:25:10 2020 Finished Nearest Neighbor Search
Sat Jul 18 13:25:11 2020 Construct embedding
	completed  0  /  200 epochs
	completed  20  /  200 epochs
	completed  40  /  200 epochs
	completed  60  /  200 epochs
	completed  80  /  200 epochs
	completed  100  /  200 epochs
	completed  120  /  200 epochs
	completed  140  /  200 epochs
	completed  160  /  200 epochs
	completed  180  /  200 epochs
Sat Jul 18 13:25:33 2020 Finished embedding


In [27]:
# Time n_repeats calls of the fitted 64-d UMAP's transform on the flattened
# test set and append one row per repeat to projection_speeds.
n_repeats = 10
times = []
for i in tqdm(range(n_repeats)):
    start_time = time.monotonic()
    embedder.transform(X_test_flat)
    end_time = time.monotonic()
    elapsed = end_time - start_time
    print('seconds: ', elapsed)
    # Record the duration in `times` as every other timing loop in this
    # notebook does; this loop previously left the list empty.
    times.append(elapsed)
    projection_speeds.loc[len(projection_speeds)] = ['umap-learn', 64, dataset, elapsed, len(X_test)]


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs
seconds:  3.8669534591026604
	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs
seconds:  2.920220490079373
	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs
seconds:  3.1979359490796924
	completed  0  / 

In [28]:
# Display the timing table collected so far.
projection_speeds

Unnamed: 0,method_,dimensions,dataset,speed,nex
0,network,2,cassins_dtw,1.998637,1000
1,network,2,cassins_dtw,0.007758,1000
2,network,2,cassins_dtw,0.00896,1000
3,network,2,cassins_dtw,0.007596,1000
4,network,2,cassins_dtw,0.009051,1000
5,network,2,cassins_dtw,0.007304,1000
6,network,2,cassins_dtw,0.007466,1000
7,network,2,cassins_dtw,0.007378,1000
8,network,2,cassins_dtw,0.007056,1000
9,network,2,cassins_dtw,0.008038,1000


#### PCA

##### 2 dims

In [29]:
# Fit a 2-component PCA on the flattened training spectrograms; `z` holds the
# projected training data.
pca = PCA(n_components=2)
pca.fit(X_train_flat)
z = pca.transform(X_train_flat)

In [30]:
# Time n_repeats calls of the fitted 2-d PCA's transform on the flattened test
# set and append one row per repeat to projection_speeds.
n_repeats = 10
times = []
for i in tqdm(range(n_repeats)):
    start_time = time.monotonic()
    pca.transform(X_test_flat)
    end_time = time.monotonic()
    elapsed = end_time - start_time
    print('seconds: ', elapsed)
    times.append(elapsed)
    row = ['pca', 2, dataset, elapsed, len(X_test)]
    projection_speeds.loc[len(projection_speeds)] = row


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

seconds:  0.004573963116854429
seconds:  0.0063557850662618876
seconds:  0.004374526906758547
seconds:  0.004209551960229874
seconds:  0.004284874070435762
seconds:  0.004268313990905881
seconds:  0.004245693096891046
seconds:  0.004172991029918194
seconds:  0.004197912057861686
seconds:  0.004239782923832536



In [31]:
# Display the timing table collected so far.
projection_speeds

Unnamed: 0,method_,dimensions,dataset,speed,nex
0,network,2,cassins_dtw,1.998637,1000
1,network,2,cassins_dtw,0.007758,1000
2,network,2,cassins_dtw,0.008960,1000
3,network,2,cassins_dtw,0.007596,1000
4,network,2,cassins_dtw,0.009051,1000
...,...,...,...,...,...
65,pca,2,cassins_dtw,0.004268,1000
66,pca,2,cassins_dtw,0.004246,1000
67,pca,2,cassins_dtw,0.004173,1000
68,pca,2,cassins_dtw,0.004198,1000


##### 64 dims

In [32]:
# Fit a 64-component PCA on the flattened training spectrograms; `z` holds the
# projected training data.
pca = PCA(n_components=64)
pca.fit(X_train_flat)
z = pca.transform(X_train_flat)

In [33]:
# Time n_repeats calls of the fitted 64-d PCA's transform on the flattened
# test set and append one row per repeat to projection_speeds.
n_repeats = 10
times = []
for i in tqdm(range(n_repeats)):
    start_time = time.monotonic()
    pca.transform(X_test_flat)
    end_time = time.monotonic()
    elapsed = end_time - start_time
    print('seconds: ', elapsed)
    times.append(elapsed)
    row = ['pca', 64, dataset, elapsed, len(X_test)]
    projection_speeds.loc[len(projection_speeds)] = row


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

seconds:  0.005040086107328534
seconds:  0.005261223064735532
seconds:  0.005180489970371127
seconds:  0.00519216014072299
seconds:  0.005373456049710512
seconds:  0.0051990600768476725
seconds:  0.005407147808000445
seconds:  0.0051269689574837685
seconds:  0.005323165096342564
seconds:  0.005099037894979119



In [34]:
# Display the timing table collected so far.
projection_speeds

Unnamed: 0,method_,dimensions,dataset,speed,nex
0,network,2,cassins_dtw,1.998637,1000
1,network,2,cassins_dtw,0.007758,1000
2,network,2,cassins_dtw,0.008960,1000
3,network,2,cassins_dtw,0.007596,1000
4,network,2,cassins_dtw,0.009051,1000
...,...,...,...,...,...
75,pca,64,cassins_dtw,0.005199,1000
76,pca,64,cassins_dtw,0.005407,1000
77,pca,64,cassins_dtw,0.005127,1000
78,pca,64,cassins_dtw,0.005323,1000


#### TSNE

##### 2 dims

In [35]:
# openTSNE estimator for a 2-d embedding.
# NOTE(review): n_jobs=32 is hard-coded; confirm it suits the machine this runs on.
tsne = TSNE(n_components=2, n_jobs=32, verbose=True)

In [36]:
# Fit t-SNE on the flattened training spectrograms; the returned object is
# what the timing loop calls .transform on.
embedding_train = tsne.fit(X_train_flat)



--------------------------------------------------------------------------------
TSNE(n_jobs=32, neighbors=None, verbose=True)
--------------------------------------------------------------------------------
===> Finding 90 nearest neighbors using Annoy approximate search using euclidean distance...
   --> Time elapsed: 18.60 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.25 seconds
===> Calculating PCA-based initialization...
   --> Time elapsed: 0.45 seconds
===> Running optimization with exaggeration=12.00, lr=2082.00 for 250 iterations...
Iteration   50, KL divergence 5.1153, 50 iterations in 1.4122 sec
Iteration  100, KL divergence 4.3927, 50 iterations in 1.4483 sec
Iteration  150, KL divergence 4.1636, 50 iterations in 1.4167 sec
Iteration  200, KL divergence 4.0462, 50 iterations in 1.4677 sec
Iteration  250, KL divergence 3.9724, 50 iterations in 1.4516 sec
   --> Time elapsed: 7.20 seconds
===> Running optimization with exaggeration=1.00, lr=2082.00 for 50

In [37]:
# Time n_repeats calls of the fitted t-SNE embedding's transform on the
# flattened test set and append one row per repeat to projection_speeds.
n_repeats = 10
times = []
for i in tqdm(range(n_repeats)):
    start_time = time.monotonic()
    embedding_train.transform(X_test_flat)
    end_time = time.monotonic()
    elapsed = end_time - start_time
    print('seconds: ', elapsed)
    times.append(elapsed)
    row = ['TSNE', 2, dataset, elapsed, len(X_test)]
    projection_speeds.loc[len(projection_speeds)] = row

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

===> Finding 15 nearest neighbors in existing embedding using Annoy approximate search...
   --> Time elapsed: 0.42 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.02 seconds
===> Running optimization with exaggeration=4.00, lr=0.10 for 0 iterations...
   --> Time elapsed: 0.00 seconds
===> Running optimization with exaggeration=1.50, lr=0.10 for 250 iterations...
Iteration   50, KL divergence 17068.6992, 50 iterations in 0.0636 sec
Iteration  100, KL divergence 17098.0039, 50 iterations in 0.0627 sec
Iteration  150, KL divergence 17113.9243, 50 iterations in 0.0625 sec
Iteration  200, KL divergence 17126.0292, 50 iterations in 0.0620 sec
Iteration  250, KL divergence 17137.4071, 50 iterations in 0.0642 sec
   --> Time elapsed: 0.32 seconds
seconds:  1.0166890290565789
===> Finding 15 nearest neighbors in existing embedding using Annoy approximate search...
   --> Time elapsed: 0.68 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.01 seconds
===> Ru

In [38]:
# Display the timing table collected so far.
projection_speeds

Unnamed: 0,method_,dimensions,dataset,speed,nex
0,network,2,cassins_dtw,1.998637,1000
1,network,2,cassins_dtw,0.007758,1000
2,network,2,cassins_dtw,0.008960,1000
3,network,2,cassins_dtw,0.007596,1000
4,network,2,cassins_dtw,0.009051,1000
...,...,...,...,...,...
85,TSNE,2,cassins_dtw,1.017818,1000
86,TSNE,2,cassins_dtw,0.988583,1000
87,TSNE,2,cassins_dtw,0.998336,1000
88,TSNE,2,cassins_dtw,1.001547,1000


### Save

In [39]:
# Pickle the collected timings to DATA_DIR/projection_speeds/<dataset>.pickle.
# ensure_dir is a project helper; presumably it creates the missing parent
# directory -- confirm against tfumap.paths.
speeds_dir = DATA_DIR / 'projection_speeds'
save_loc = speeds_dir / (dataset + '.pickle')
ensure_dir(save_loc)
projection_speeds.to_pickle(save_loc)