#### Outline
- for each dataset: 
    - load dataset; 
    - for each network: 
        - load network
        - project the held-out test samples (the full test split, repeated 10 times) and time each run
        - save the timings to the projection-speed dataframe

In [1]:
# reload packages
%load_ext autoreload
%autoreload 2

### Choose GPU (this may not be needed on your computer)

In [2]:
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=0

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=0


In [3]:
import tensorflow as tf
# Enable on-demand GPU memory growth for the first visible GPU (if any),
# instead of letting TensorFlow reserve the whole device up front.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
if gpu_devices:
    first_gpu = gpu_devices[0]
    tf.config.experimental.set_memory_growth(first_gpu, True)

In [4]:
import numpy as np
import pickle
import pandas as pd
import time
from umap import UMAP

In [5]:
from tfumap.umap import tfUMAP
import tensorflow as tf
from sklearn.decomposition import PCA
from openTSNE import TSNE



In [6]:
from tqdm.autonotebook import tqdm

In [7]:
from tfumap.paths import ensure_dir, MODEL_DIR, DATA_DIR

In [8]:
# Root folder of the saved projection models; the per-dataset network
# directories loaded later in this notebook live underneath it.
output_dir = MODEL_DIR/'projections' 

In [9]:
# Result table: one row per timed projection call.
#   method_    - label of the projection method (e.g. 'network', 'pca')
#   dimensions - embedding dimensionality the row is labelled with
#   dataset    - dataset name
#   speed      - elapsed wall-clock time of the call, in seconds
#   nex        - number of examples projected in the call
speed_columns = ['method_', 'dimensions', 'dataset', 'speed', "nex"]
projection_speeds = pd.DataFrame(columns=speed_columns)

### macosko2015

In [10]:
# Dataset name: used in the model load paths, in every result row and in the
# name of the saved results file.
dataset = 'macosko2015'
# Passed as `dims` to tfUMAP; presumably the shape of one input sample
# (the loaded data has 50 features) -- TODO confirm against tfUMAP.
dims = [50]

##### load dataset

In [11]:
from tfumap.paths import ensure_dir, MODEL_DIR, DATA_DIR

#dataset_address = 'http://file.biolab.si/opentsne/macosko_2015.pkl.gz'
# https://opentsne.readthedocs.io/en/latest/examples/01_simple_usage/01_simple_usage.html
# also see https://github.com/berenslab/rna-seq-tsne/blob/master/umi-datasets.ipynb

import gzip
import pickle

# NOTE(review): pickle.load can execute arbitrary code embedded in the file.
# Only load this archive if it was obtained from a trusted source.
with gzip.open(DATA_DIR / 'macosko_2015.pkl.gz', "rb") as fh:
    data = pickle.load(fh)

# Features come from the "pca_50" entry; labels are the "CellType1" entry cast to strings.
x, y = data["pca_50"], data["CellType1"].astype(str)

print("Data set contains %d samples with %d features" % x.shape)

from sklearn.model_selection import train_test_split

def zero_one_norm(x):
    """Rescale each column of ``x`` to the range [0, 1] (min-max normalisation).

    The minimum and maximum are taken along axis 0, i.e. per feature column.

    Parameters
    ----------
    x : array-like
        Samples along axis 0, features along the remaining axes.

    Returns
    -------
    Array of the same shape as ``x``. A column whose values are all equal
    maps to 0 rather than NaN.
    """
    x_min = np.min(x, axis=0)
    x_range = np.max(x, axis=0) - x_min
    # A constant column has zero range; dividing by it would give 0/0 = NaN.
    # Divide by 1 instead so the (all-zero) numerator stays 0.
    safe_range = np.where(x_range == 0, 1, x_range)
    return (x - x_min) / safe_range

# Scale every feature to [0, 1] before splitting.
x_norm = zero_one_norm(x)

# Hold out 10% of the samples as the test split; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(x_norm, y, test_size=.1, random_state=42)

# Carve the validation split off the tail of the training split.
n_valid = 10000
X_train, X_valid = X_train[:-n_valid], X_train[-n_valid:]
Y_train, Y_valid = Y_train[:-n_valid], Y_train[-n_valid:]

from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder()

# Only the training labels are integer-encoded here; Y_valid and Y_test keep
# their original string labels.
Y_train = enc.fit_transform([[label] for label in Y_train]).flatten()

# Aliases under which the train/test features are used by the benchmark cells.
X_train_flat = X_train
X_test_flat = X_test
print(len(X_train), len(X_test), len(X_valid))

Data set contains 44808 samples with 50 features
30327 4481 10000


#### Network 

##### 2 dims

In [12]:
# Directory of the saved network for this dataset; the encoder is loaded from
# its 'encoder' subfolder and its timings are recorded with dimensions=2.
load_loc = output_dir / dataset / 'network' 

In [13]:
# tfUMAP instance that receives the pre-trained encoder loaded from disk;
# it is not fitted in the cells shown here.
embedder = tfUMAP(
    dims=dims,
    direct_embedding=False,
    batch_size=100,
    training_epochs=5,
    negative_sample_rate=5,
    verbose=True,
)

In [14]:
# Load the saved Keras encoder from `load_loc` and attach it to the embedder.
encoder_dir = load_loc / 'encoder'
encoder = tf.keras.models.load_model(encoder_dir.as_posix())
embedder.encoder = encoder

In [15]:
# Time the saved encoder on the whole test split, one result row per repeat.
# NOTE: the first repeat also pays one-time warm-up cost (it is visibly slower
# than the others in the recorded output).
n_repeats = 10
times = []
for i in tqdm(range(n_repeats)):
    # perf_counter is the highest-resolution clock for short durations;
    # time.monotonic can be too coarse for millisecond-scale calls on some platforms.
    start_time = time.perf_counter()
    # TensorFlow may return from a GPU op once it has been enqueued on the device.
    # .numpy() forces the result to be computed and copied to host memory, so the
    # timer covers the actual projection (as it does for the numpy-returning methods).
    encoder(X_test).numpy()
    end_time = time.perf_counter()
    elapsed = end_time - start_time
    print('seconds: ', elapsed)
    times.append(elapsed)
    projection_speeds.loc[len(projection_speeds)] = ['network', 2, dataset, elapsed, len(X_test)]


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

seconds:  0.2574630610179156
seconds:  0.002170303137972951
seconds:  0.0017843919340521097
seconds:  0.00172641989775002
seconds:  0.0017111101187765598
seconds:  0.002237174892798066
seconds:  0.0022244241554290056
seconds:  0.0022942570503801107
seconds:  0.0022107430268079042
seconds:  0.0017905519343912601



In [16]:
##### Network CPU

# Same benchmark as the default-device cell, but with the encoder call placed on the CPU.
with tf.device('/CPU:0'):
    n_repeats = 10
    times = []
    for i in tqdm(range(n_repeats)):
        # perf_counter is the highest-resolution clock for short durations;
        # time.monotonic can be too coarse for millisecond-scale calls on some platforms.
        start_time = time.perf_counter()
        # .numpy() makes sure the result is fully computed and available as a
        # host array before the clock is stopped.
        encoder(X_test).numpy()
        end_time = time.perf_counter()
        elapsed = end_time - start_time
        print('seconds: ', elapsed)
        times.append(elapsed)
        projection_speeds.loc[len(projection_speeds)] = ['network-cpu', 2, dataset, elapsed, len(X_test)]

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

seconds:  0.05977769708260894
seconds:  0.004847739823162556
seconds:  0.005114718107506633
seconds:  0.005286332918331027
seconds:  0.0054284571669995785
seconds:  0.005149489035829902
seconds:  0.005041016032919288
seconds:  0.005160759901627898
seconds:  0.004493069835007191
seconds:  0.004703545942902565



##### 64 dims

In [17]:
# Directory of the saved network stored under the "64" subfolder for this
# dataset; its timings are recorded with dimensions=64.
load_loc = output_dir / dataset /"64"/ 'network' 

In [18]:
# Fresh tfUMAP instance that receives the encoder loaded from the "64" model
# directory; it is not fitted in the cells shown here.
embedder = tfUMAP(
    dims=dims,
    direct_embedding=False,
    batch_size=100,
    training_epochs=5,
    negative_sample_rate=5,
    verbose=True,
)

In [19]:
# Load the saved Keras encoder from the current `load_loc` and attach it to the embedder.
encoder_dir = load_loc / 'encoder'
encoder = tf.keras.models.load_model(encoder_dir.as_posix())
embedder.encoder = encoder

In [20]:
# Time the currently loaded encoder on the whole test split, one result row per repeat.
# NOTE: the first repeat also pays one-time warm-up cost (it is visibly slower
# than the others in the recorded output).
n_repeats = 10
times = []
for i in tqdm(range(n_repeats)):
    # perf_counter is the highest-resolution clock for short durations;
    # time.monotonic can be too coarse for millisecond-scale calls on some platforms.
    start_time = time.perf_counter()
    # TensorFlow may return from a GPU op once it has been enqueued on the device.
    # .numpy() forces the result to be computed and copied to host memory, so the
    # timer covers the actual projection.
    encoder(X_test).numpy()
    end_time = time.perf_counter()
    elapsed = end_time - start_time
    print('seconds: ', elapsed)
    times.append(elapsed)
    projection_speeds.loc[len(projection_speeds)] = ['network', 64, dataset, elapsed, len(X_test)]

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

seconds:  0.04194766306318343
seconds:  0.002163493074476719
seconds:  0.0023062070831656456
seconds:  0.002255275147035718
seconds:  0.0022284151054918766
seconds:  0.0018087918870151043
seconds:  0.0018007908947765827
seconds:  0.002295226091518998
seconds:  0.0023006570991128683
seconds:  0.001860274001955986



In [21]:
##### Network CPU

# Same benchmark as the default-device cell, but with the encoder call placed on the CPU.
with tf.device('/CPU:0'):
    n_repeats = 10
    times = []
    for i in tqdm(range(n_repeats)):
        # perf_counter is the highest-resolution clock for short durations;
        # time.monotonic can be too coarse for millisecond-scale calls on some platforms.
        start_time = time.perf_counter()
        # .numpy() makes sure the result is fully computed and available as a
        # host array before the clock is stopped.
        encoder(X_test).numpy()
        end_time = time.perf_counter()
        elapsed = end_time - start_time
        print('seconds: ', elapsed)
        times.append(elapsed)
        projection_speeds.loc[len(projection_speeds)] = ['network-cpu', 64, dataset, elapsed, len(X_test)]

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

seconds:  0.03173177712596953
seconds:  0.0062287000473588705
seconds:  0.00512987794354558
seconds:  0.0049795531667768955
seconds:  0.005220860941335559
seconds:  0.005381984869018197
seconds:  0.006303942063823342
seconds:  0.006635800935328007
seconds:  0.005075856810435653
seconds:  0.004542571026831865



#### UMAP-learn

##### 2 dims

In [22]:
# Fit umap-learn with a 2-component embedding on the training split.
# NOTE: this rebinds `embedder`, which previously held the tfUMAP instance.
# Only the fitted model is used afterwards (for transform timing).
embedder = UMAP(n_components = 2, verbose=True)
z_umap = embedder.fit_transform(X_train_flat)

UMAP(dens_frac=0.0, dens_lambda=0.0, verbose=True)
Construct fuzzy simplicial set
Sat Jul 18 13:05:34 2020 Finding Nearest Neighbors
Sat Jul 18 13:05:34 2020 Building RP forest with 14 trees
Sat Jul 18 13:05:35 2020 parallel NN descent for 15 iterations
	 0  /  15
	 1  /  15
	 2  /  15
	 3  /  15
	 4  /  15
	 5  /  15
Sat Jul 18 13:05:44 2020 Finished Nearest Neighbor Search
Sat Jul 18 13:05:47 2020 Construct embedding
	completed  0  /  200 epochs
	completed  20  /  200 epochs
	completed  40  /  200 epochs
	completed  60  /  200 epochs
	completed  80  /  200 epochs
	completed  100  /  200 epochs
	completed  120  /  200 epochs
	completed  140  /  200 epochs
	completed  160  /  200 epochs
	completed  180  /  200 epochs
Sat Jul 18 13:06:18 2020 Finished embedding


In [23]:
# Time umap-learn's transform of the test split with the fitted 2-component
# model, adding one result row per repeat.
n_repeats = 10
times = []
for repeat in tqdm(range(n_repeats)):
    start_time = time.monotonic()
    embedder.transform(X_test_flat)
    end_time = time.monotonic()
    elapsed = end_time - start_time
    print('seconds: ', elapsed)
    times.append(elapsed)
    projection_speeds.loc[len(projection_speeds)] = ['umap-learn', 2, dataset, elapsed, len(X_test)]


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs
seconds:  15.667166848899797
	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs
seconds:  4.18031515693292
	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs
seconds:  3.8012523220386356
	completed  0  /  

In [24]:
projection_speeds

Unnamed: 0,method_,dimensions,dataset,speed,nex
0,network,2,macosko2015,0.257463,4481
1,network,2,macosko2015,0.00217,4481
2,network,2,macosko2015,0.001784,4481
3,network,2,macosko2015,0.001726,4481
4,network,2,macosko2015,0.001711,4481
5,network,2,macosko2015,0.002237,4481
6,network,2,macosko2015,0.002224,4481
7,network,2,macosko2015,0.002294,4481
8,network,2,macosko2015,0.002211,4481
9,network,2,macosko2015,0.001791,4481


##### 64 dims

In [25]:
# Fit umap-learn with a 64-component embedding on the training split.
# Only the fitted model is used afterwards (for transform timing).
embedder = UMAP(n_components = 64, verbose=True)
z_umap = embedder.fit_transform(X_train_flat)

UMAP(dens_frac=0.0, dens_lambda=0.0, n_components=64, verbose=True)
Construct fuzzy simplicial set
Sat Jul 18 13:07:11 2020 Finding Nearest Neighbors
Sat Jul 18 13:07:11 2020 Building RP forest with 14 trees
Sat Jul 18 13:07:11 2020 parallel NN descent for 15 iterations
	 0  /  15
	 1  /  15
	 2  /  15
	 3  /  15
	 4  /  15
	 5  /  15
Sat Jul 18 13:07:13 2020 Finished Nearest Neighbor Search
Sat Jul 18 13:07:13 2020 Construct embedding
	completed  0  /  200 epochs
	completed  20  /  200 epochs
	completed  40  /  200 epochs
	completed  60  /  200 epochs
	completed  80  /  200 epochs
	completed  100  /  200 epochs
	completed  120  /  200 epochs
	completed  140  /  200 epochs
	completed  160  /  200 epochs
	completed  180  /  200 epochs
Sat Jul 18 13:07:41 2020 Finished embedding


In [26]:
# Time umap-learn's transform of the test split with the fitted 64-component
# model, adding one result row per repeat.
n_repeats = 10
times = []
for i in tqdm(range(n_repeats)):
    start_time = time.monotonic()
    embedder.transform(X_test_flat)
    end_time = time.monotonic()
    elapsed = end_time - start_time
    print('seconds: ', elapsed)
    # Keep `times` in step with the result table, as every other timing loop does.
    times.append(elapsed)
    projection_speeds.loc[len(projection_speeds)] = ['umap-learn', 64, dataset, elapsed, len(X_test)]


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs
seconds:  4.355257257819176
	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs
seconds:  3.864216635003686
	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs
seconds:  3.6083944719284773
	completed  0  /  

In [27]:
projection_speeds

Unnamed: 0,method_,dimensions,dataset,speed,nex
0,network,2,macosko2015,0.257463,4481
1,network,2,macosko2015,0.00217,4481
2,network,2,macosko2015,0.001784,4481
3,network,2,macosko2015,0.001726,4481
4,network,2,macosko2015,0.001711,4481
5,network,2,macosko2015,0.002237,4481
6,network,2,macosko2015,0.002224,4481
7,network,2,macosko2015,0.002294,4481
8,network,2,macosko2015,0.002211,4481
9,network,2,macosko2015,0.001791,4481


#### PCA

##### 2 dims

In [28]:
# Fit a 2-component PCA on the training split; only the fitted `pca` is used
# afterwards (for transform timing).
pca = PCA(n_components=2)
z = pca.fit_transform(X_train_flat)

In [29]:
# Time the 2-component PCA projection of the test split, one result row per repeat.
n_repeats = 10
times = []
for i in tqdm(range(n_repeats)):
    # Each call takes about a millisecond: use perf_counter, the highest-resolution
    # clock for short durations (time.monotonic can be too coarse on some platforms).
    start_time = time.perf_counter()
    pca.transform(X_test_flat)
    end_time = time.perf_counter()
    elapsed = end_time - start_time
    print('seconds: ', elapsed)
    times.append(elapsed)
    projection_speeds.loc[len(projection_speeds)] = ['pca', 2, dataset, elapsed, len(X_test)]


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

seconds:  0.0014454619958996773
seconds:  0.0013370581436902285
seconds:  0.0011735039297491312
seconds:  0.0011858840007334948
seconds:  0.0011814238969236612
seconds:  0.0011741940397769213
seconds:  0.0011680638417601585
seconds:  0.0011674531269818544
seconds:  0.0012008449994027615
seconds:  0.0011656039860099554



In [30]:
projection_speeds

Unnamed: 0,method_,dimensions,dataset,speed,nex
0,network,2,macosko2015,0.257463,4481
1,network,2,macosko2015,0.002170,4481
2,network,2,macosko2015,0.001784,4481
3,network,2,macosko2015,0.001726,4481
4,network,2,macosko2015,0.001711,4481
...,...,...,...,...,...
65,pca,2,macosko2015,0.001174,4481
66,pca,2,macosko2015,0.001168,4481
67,pca,2,macosko2015,0.001167,4481
68,pca,2,macosko2015,0.001201,4481


##### 64 dims

In [31]:
# PCA cannot return more components than there are input features, so the
# features are zero-padded up to the 64 components requested for this section.
# The pad width is derived from the data rather than hard-coded (50 features -> 14 columns).
target_dims = 64
n_pad = max(0, target_dims - X_train_flat.shape[1])
x_train_flat_padded = np.concatenate([X_train_flat, np.zeros((len(X_train_flat), n_pad))], axis=1)
X_test_flat_padded = np.concatenate([X_test_flat, np.zeros((len(X_test_flat), n_pad))], axis=1)

In [32]:
# Fit a 64-component PCA on the zero-padded training split; only the fitted
# `pca` is used afterwards (for transform timing).
pca = PCA(n_components=64)
z = pca.fit_transform(x_train_flat_padded)

In [33]:
# Time the 64-component PCA projection of the zero-padded test split,
# one result row per repeat.
n_repeats = 10
times = []
for i in tqdm(range(n_repeats)):
    # Each call takes about a millisecond: use perf_counter, the highest-resolution
    # clock for short durations (time.monotonic can be too coarse on some platforms).
    start_time = time.perf_counter()
    pca.transform(X_test_flat_padded)
    end_time = time.perf_counter()
    elapsed = end_time - start_time
    print('seconds: ', elapsed)
    times.append(elapsed)
    projection_speeds.loc[len(projection_speeds)] = ['pca', 64, dataset, elapsed, len(X_test)]


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

seconds:  0.0028941440396010876
seconds:  0.0017902308609336615
seconds:  0.0018056221306324005
seconds:  0.001770860981196165
seconds:  0.0017668809741735458
seconds:  0.001783912070095539
seconds:  0.001784980995580554
seconds:  0.0017701711039990187
seconds:  0.0017756109591573477
seconds:  0.0017782710492610931



In [34]:
projection_speeds

Unnamed: 0,method_,dimensions,dataset,speed,nex
0,network,2,macosko2015,0.257463,4481
1,network,2,macosko2015,0.002170,4481
2,network,2,macosko2015,0.001784,4481
3,network,2,macosko2015,0.001726,4481
4,network,2,macosko2015,0.001711,4481
...,...,...,...,...,...
75,pca,64,macosko2015,0.001784,4481
76,pca,64,macosko2015,0.001785,4481
77,pca,64,macosko2015,0.001770,4481
78,pca,64,macosko2015,0.001776,4481


#### TSNE

##### 2 dims

In [35]:
# openTSNE model for a 2-component embedding.
# NOTE(review): n_jobs=32 presumably matches the original benchmark machine --
# adjust to the number of cores available.
tsne = TSNE(n_components=2, n_jobs=32, verbose=True)

In [36]:
# Fit t-SNE on the training split. The returned embedding object is kept
# because its .transform() is what gets timed on the test split.
embedding_train = tsne.fit(X_train_flat)

--------------------------------------------------------------------------------
TSNE(n_jobs=32, neighbors=None, verbose=True)
--------------------------------------------------------------------------------
===> Finding 90 nearest neighbors using Annoy approximate search using euclidean distance...




   --> Time elapsed: 10.11 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.38 seconds
===> Calculating PCA-based initialization...
   --> Time elapsed: 0.09 seconds
===> Running optimization with exaggeration=12.00, lr=2527.25 for 250 iterations...
Iteration   50, KL divergence 5.8205, 50 iterations in 1.3960 sec
Iteration  100, KL divergence 5.3222, 50 iterations in 1.2890 sec
Iteration  150, KL divergence 5.2496, 50 iterations in 1.2600 sec
Iteration  200, KL divergence 5.2204, 50 iterations in 1.2814 sec
Iteration  250, KL divergence 5.2056, 50 iterations in 1.2783 sec
   --> Time elapsed: 6.51 seconds
===> Running optimization with exaggeration=1.00, lr=2527.25 for 500 iterations...
Iteration   50, KL divergence 3.6400, 50 iterations in 1.2393 sec
Iteration  100, KL divergence 3.2482, 50 iterations in 1.2282 sec
Iteration  150, KL divergence 3.0520, 50 iterations in 1.5182 sec
Iteration  200, KL divergence 2.9298, 50 iterations in 2.1680 sec
Iteration  250, KL di

In [37]:
# Time projecting the test split into the fitted t-SNE embedding,
# adding one result row per repeat.
n_repeats = 10
times = []
for repeat in tqdm(range(n_repeats)):
    start_time = time.monotonic()
    embedding_train.transform(X_test_flat)
    end_time = time.monotonic()
    elapsed = end_time - start_time
    print('seconds: ', elapsed)
    times.append(elapsed)
    projection_speeds.loc[len(projection_speeds)] = ['TSNE', 2, dataset, elapsed, len(X_test)]

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

===> Finding 15 nearest neighbors in existing embedding using Annoy approximate search...
   --> Time elapsed: 0.83 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.02 seconds
===> Running optimization with exaggeration=4.00, lr=0.10 for 0 iterations...
   --> Time elapsed: 0.00 seconds
===> Running optimization with exaggeration=1.50, lr=0.10 for 250 iterations...
Iteration   50, KL divergence 83006.5255, 50 iterations in 0.1605 sec
Iteration  100, KL divergence 83072.7557, 50 iterations in 0.1370 sec
Iteration  150, KL divergence 83118.0247, 50 iterations in 0.1354 sec
Iteration  200, KL divergence 83143.5822, 50 iterations in 0.1354 sec
Iteration  250, KL divergence 83165.7081, 50 iterations in 0.1370 sec
   --> Time elapsed: 0.71 seconds
seconds:  2.3207843399140984
===> Finding 15 nearest neighbors in existing embedding using Annoy approximate search...
   --> Time elapsed: 0.94 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 0.02 seconds
===> Ru

In [38]:
projection_speeds

Unnamed: 0,method_,dimensions,dataset,speed,nex
0,network,2,macosko2015,0.257463,4481
1,network,2,macosko2015,0.002170,4481
2,network,2,macosko2015,0.001784,4481
3,network,2,macosko2015,0.001726,4481
4,network,2,macosko2015,0.001711,4481
...,...,...,...,...,...
85,TSNE,2,macosko2015,1.706167,4481
86,TSNE,2,macosko2015,1.553048,4481
87,TSNE,2,macosko2015,1.679678,4481
88,TSNE,2,macosko2015,1.695992,4481


### Save

In [39]:
# Persist the collected timings as <DATA_DIR>/projection_speeds/<dataset>.pickle.
speeds_dir = DATA_DIR / 'projection_speeds'
save_loc = speeds_dir / (dataset + '.pickle')
# ensure_dir presumably creates the missing parent folder -- confirm in tfumap.paths.
ensure_dir(save_loc)
projection_speeds.to_pickle(save_loc)