In [1]:
import nptsne
from nptsne import hsne_analysis
import multiscale_phate as mp

import time
import os
import scprep
import demap
import math
import random
import numpy as np
import pandas as pd
import hierarchical_umap as h_umap
import matplotlib.pyplot as plt

from tqdm import tqdm

from sklearn.neighbors import NearestNeighbors
from sklearn.utils import check_array
from sklearn.preprocessing import normalize, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import euclidean_distances
from sklearn.neighbors import NearestNeighbors

from scipy.stats import pearsonr, spearmanr

In [2]:
def load_fmnist():
    """Load the Fashion-MNIST training table from ``data/fashion-train.csv``.

    Column 1 of the CSV is used as the integer class label and columns 2
    onwards as the feature values. The features are passed through
    ``sklearn.preprocessing.normalize`` (default settings) and then converted
    to a C-ordered ``float32`` array by ``check_array``.

    Returns:
        tuple: ``(X, y)`` -- the normalised feature matrix and the labels.
    """
    table = pd.read_csv('data/fashion-train.csv').values

    labels = table[:, 1].astype(int)
    features = normalize(table[:, 2:])

    # An optional PCA(n_components=15) reduction used to sit here; it is disabled.
    features = check_array(features, dtype=np.float32, accept_sparse='csr', order='C')

    return features, labels

def load_mnist():
    """Load MNIST features and labels from the ``.npy`` files under ``./data``.

    The features are passed through ``sklearn.preprocessing.normalize``
    (default settings) and converted to a C-ordered ``float32`` array by
    ``check_array``; the labels are cast to ``int``.

    Returns:
        tuple: ``(X, y)`` -- the normalised feature matrix and the labels.
    """
    features = np.load('./data/MNIST_70000.npy')
    labels = np.load('./data/MNIST_70000_label.npy').astype(int)

    # An optional PCA(n_components=15) reduction used to sit here; it is disabled.
    features = check_array(
        normalize(features), dtype=np.float32, accept_sparse='csr', order='C'
    )

    return features, labels

def load_scRNAseq(download_path=None, random_state=None):
    """Load, filter and PCA-reduce the five scRNA-seq time-course batches.

    Each 10X batch is read from ``<download_path>/scRNAseq/<batch dir>``,
    filtered by library size (cells between the 20th and 75th percentile are
    kept), and the batches are combined with one time-point label per batch.
    The combined counts then go through rare-gene filtering, library-size
    normalisation, removal of the cells in the top 10% of mitochondrial gene
    expression, a square-root transform and a 50-component PCA.

    Args:
        download_path: Directory that contains the ``scRNAseq`` folder. When
            ``None`` the location used by the original experiments
            (``~/Documentos/HierarchicalUMAP/umap-cpp/umap/cpp/data``) is used,
            so existing calls keep working.
        random_state: Forwarded to ``PCA``; ``None`` keeps the previous
            (unseeded) behaviour.

    Returns:
        tuple: ``(X, labels)`` -- a C-ordered ``float32`` array with 50
        principal components per cell and the integer-encoded time-point
        label of every cell.
    """
    if download_path is None:
        # Machine-specific default kept only for backward compatibility.
        download_path = os.path.expanduser("~/Documentos/HierarchicalUMAP/umap-cpp/umap/cpp/data")

    # Batch folder -> time-point label, in the order the batches are combined.
    batches = [
        ("T0_1A", "Day 00-03"),
        ("T2_3B", "Day 06-09"),
        ("T4_5C", "Day 12-15"),
        ("T6_7D", "Day 18-21"),
        ("T8_9E", "Day 24-27"),
    ]

    filtered_batches = []
    for batch_dir, _ in batches:
        batch = scprep.io.load_10X(
            os.path.join(download_path, "scRNAseq", batch_dir),
            sparse=True,
            gene_labels='both',
        )
        # Drop the cells with the smallest and the largest library sizes.
        batch = scprep.filter.filter_library_size(batch, percentile=20, keep_cells='above')
        batch = scprep.filter.filter_library_size(batch, percentile=75, keep_cells='below')
        filtered_batches.append(batch)

    EBT_counts, sample_labels = scprep.utils.combine_batches(
        filtered_batches,
        [label for _, label in batches],
        append_to_cell_names=True
    )
    del filtered_batches  # release the per-batch tables

    EBT_counts = scprep.filter.filter_rare_genes(EBT_counts, min_cells=10)
    EBT_counts = scprep.normalize.library_size_normalize(EBT_counts)

    # Genes whose name starts with "MT-" (the original author notes there are 14).
    mito_genes = scprep.select.get_gene_set(EBT_counts, starts_with="MT-")
    EBT_counts, sample_labels = scprep.filter.filter_gene_set_expression(
        EBT_counts, sample_labels, genes=mito_genes,
        percentile=90, keep_cells='below')
    EBT_counts = scprep.transform.sqrt(EBT_counts)

    labels = LabelEncoder().fit(sample_labels).transform(sample_labels)

    X = PCA(n_components=50, random_state=random_state).fit_transform(EBT_counts.values)
    X = check_array(X, dtype=np.float32, accept_sparse='csr', order='C')
    return X, labels

def load_mammals():
    """Load the mammals dataset from the text files under ``data/``.

    The feature matrix is passed through ``sklearn.preprocessing.normalize``
    (default settings); the class values are returned exactly as
    ``np.loadtxt`` parses them.

    Returns:
        tuple: ``(X, y)`` -- the normalised features and the class values.
    """
    features = np.loadtxt("data/mammals-20000_features.txt")
    classes = np.loadtxt("data/mammals-20000_classes.txt")

    return normalize(features), classes

In [3]:
# --- Experiment configuration ------------------------------------------------
n_executions = 3  # exclusive upper bound of the execution indices that are run
levels = 3        # presumably the number of hierarchy levels -- TODO confirm

# One (initially empty) result table per technique and per embedded level.
# Every name gets its own DataFrame object.
df_humap_level2, df_humap_level0 = pd.DataFrame(), pd.DataFrame()
df_humapFLANN_level2, df_humapFLANN_level0 = pd.DataFrame(), pd.DataFrame()
df_humapKDTREE_NN_level2, df_humapKDTREE_NN_level0 = pd.DataFrame(), pd.DataFrame()
df_hsneCPU_level2, df_hsneCPU_level0 = pd.DataFrame(), pd.DataFrame()
df_hsneGPU_level2, df_hsneGPU_level0 = pd.DataFrame(), pd.DataFrame()
df_mphate_level2, df_mphate_level0 = pd.DataFrame(), pd.DataFrame()

# Datasets to process: each entry pairs a loader with the name used for its
# output folder. Uncomment an entry to include that dataset.
datasets = [
    # {'load': load_scRNAseq, 'name': 'scRNAseq'},
    {'load': load_mammals, 'name': 'mammals'},
    # {'load': load_fmnist, 'name': 'fmnist'},
    # {'load': load_mnist, 'name': 'mnist'},
]
    
    
HSNE_ITERATIONS = 500  # optimisation iterations run for every HSNE embedding
N_SCALES = 3           # HSNE hierarchy depth; scales 0, 1 and 2 are read below


def _embedding_frame(labels, embedding, inds, execution):
    """Build one result table whose columns are suffixed with the execution index.

    Columns are ``label<i>``, ``x<i>``, ``y<i>`` and ``inds<i>``; the first two
    columns of ``embedding`` provide the x and y coordinates.
    """
    suffix = str(execution)
    frame = pd.DataFrame()
    frame['label' + suffix] = labels
    frame['x' + suffix] = embedding[:, 0]
    frame['y' + suffix] = embedding[:, 1]
    frame['inds' + suffix] = inds
    return frame


def _run_hsne(hsne, embedder_type, n_level1, n_level2, labels, execution):
    """Embed the top scale and scale 0 of a fitted HSNE hierarchy.

    Returns ``(frame_level2, frame_level0, seconds_level2, seconds_level0)``.
    The level-2 time covers building the analysis model plus the iterations of
    the top analysis; the level-0 time covers creating the scale-0 analysis
    plus its iterations.
    """
    tic = time.time()
    container = hsne_analysis.AnalysisModel(hsne, embedder_type)
    analysis_level2 = container.top_analysis
    for _ in range(HSNE_ITERATIONS):
        analysis_level2.do_iteration()
    seconds_level2 = time.time() - tic

    # The intermediate analysis is created from all n_level2 top-level
    # landmarks but is never iterated, and its creation is not timed.
    analysis_level1 = container.add_new_analysis(analysis_level2, np.arange(n_level2))

    tic = time.time()
    analysis_level0 = container.add_new_analysis(analysis_level1, np.arange(n_level1))
    for _ in range(HSNE_ITERATIONS):
        analysis_level0.do_iteration()
    seconds_level0 = time.time() - tic

    # Build the tables while `container` is still alive.
    frame_level2 = _embedding_frame(
        labels[analysis_level2.landmark_orig_indexes],
        analysis_level2.embedding,
        analysis_level2.landmark_orig_indexes,
        execution,
    )
    frame_level0 = _embedding_frame(
        labels[analysis_level0.landmark_orig_indexes],
        analysis_level0.embedding,
        analysis_level0.landmark_orig_indexes,
        execution,
    )
    return frame_level2, frame_level0, seconds_level2, seconds_level0


def _run_humap(knn_algorithm, level_fractions, X, y, execution):
    """Fit one HUMAP variant and embed levels 2 and 0, timing every stage separately.

    Returns ``(model, frame_level2, frame_level0, (fit_s, level2_s, level0_s))``.
    """
    model = h_umap.HUMAP('precomputed', np.array(level_fractions), 100, 0.15, knn_algorithm, 0.0, True)
    model.set_distance_similarity(False)
    model.set_path_increment(False)
    model.set_influence_neighborhood(0)

    tic = time.time()
    model.fit(X, y)
    seconds_fit = time.time() - tic

    tic = time.time()
    embedding_level2 = model.transform(2)
    seconds_level2 = time.time() - tic

    # Restart the timer so the level-0 figure does not include the level-2 transform.
    tic = time.time()
    embedding_level0 = model.transform(0)
    seconds_level0 = time.time() - tic

    frame_level2 = _embedding_frame(
        model.get_labels(2), embedding_level2, model.get_original_indices(2), execution
    )
    frame_level0 = _embedding_frame(y, embedding_level0, np.arange(len(y)), execution)
    return model, frame_level2, frame_level0, (seconds_fit, seconds_level2, seconds_level0)


for dataset in datasets:
    print("Loading %s dataset..." % (dataset['name']))
    X, y = dataset['load']()
    print("Done.")

    # First execution index to run. It is hard-coded to 2, so with
    # n_executions = 3 only one execution is performed; set it to 0 for a
    # complete run.
    init = 2
    print("initing with %d" % (init))
    print("Initing at %d" % (init))

    out_dir = "experiments/comparison/" + dataset['name']
    # Create the output folder up front so a missing directory cannot abort
    # the run after the expensive computations have finished.
    os.makedirs(out_dir, exist_ok=True)

    level2 = 0
    for execution in tqdm(range(init, n_executions)):

        # ---- Build both HSNE hierarchies (timed as the "Fit" stage) ----
        hsneCPU = nptsne.HSne(True)
        tic = time.time()
        hsneCPU.create_hsne(X, N_SCALES)
        execution_hsneCPU_fit = time.time() - tic

        hsneGPU = nptsne.HSne(True)
        tic = time.time()
        hsneGPU.create_hsne(X, N_SCALES)
        execution_hsneGPU_fit = time.time() - tic

        n_level0 = hsneGPU.get_scale(0).num_points
        n_level1 = hsneGPU.get_scale(1).num_points
        n_level2 = hsneGPU.get_scale(2).num_points

        # HUMAP is asked to keep the same fraction of points per level as HSNE did.
        level_fractions = [n_level1 / n_level0, n_level2 / n_level1]

        # ---- Multiscale PHATE (skipped for mnist and fmnist) ----
        executed_mphate = False
        if dataset['name'] != 'mnist' and dataset['name'] != 'fmnist':
            mp_op = mp.Multiscale_PHATE(n_jobs=10)

            tic = time.time()
            # Stored under its own name so the module-level `levels` setting is not overwritten.
            mphate_levels = mp_op.fit(X)
            execution_mphate_fit = time.time() - tic

            # Pick the level whose number of distinct ids is closest to the
            # size of HSNE's top scale (the first such level wins ties).
            # NOTE(review): only indices 0..len(mphate_levels)-1 of NxTs are
            # inspected, not the level values returned by fit() -- confirm
            # this is intended.
            level2 = 0
            level2_ids = np.unique(mp_op.NxTs[0])
            for level in range(len(mphate_levels)):
                level_ids = np.unique(mp_op.NxTs[level])
                if np.abs(len(level_ids) - n_level2) < np.abs(len(level2_ids) - n_level2):
                    level2 = level
                    level2_ids = level_ids
            print("level: %d, n: %d, dif: %d" % (level2, len(level2_ids), len(level2_ids) - n_level2))

            tic = time.time()
            embedding2, _, _ = mp_op.transform(level2, level2)
            execution_mphate_level2 = time.time() - tic

            tic = time.time()
            embedding0, _, _ = mp_op.transform(0, 0)
            execution_mphate_level0 = time.time() - tic

            # NOTE(review): the distinct NxTs values are used directly as row
            # indices into y -- confirm they are original point indices.
            df_mphate_level2 = _embedding_frame(y[level2_ids], embedding2, level2_ids, execution)
            df_mphate_level0 = _embedding_frame(y, embedding0, np.arange(len(y)), execution)

            df_mphate_level2.to_csv('%s/mphate_it%d_level2.csv' % (out_dir, execution), index=False)
            df_mphate_level0.to_csv('%s/mphate_it%d_level0.csv' % (out_dir, execution), index=False)
            executed_mphate = True

        # ---- HSNE embeddings on CPU and GPU ----
        (df_hsneCPU_level2, df_hsneCPU_level0,
         execution_hsneCPU_level2, execution_hsneCPU_level0) = _run_hsne(
            hsneCPU, hsne_analysis.EmbedderType.CPU, n_level1, n_level2, y, execution)

        (df_hsneGPU_level2, df_hsneGPU_level0,
         execution_hsneGPU_level2, execution_hsneGPU_level0) = _run_hsne(
            hsneGPU, hsne_analysis.EmbedderType.GPU, n_level1, n_level2, y, execution)

        # ---- HUMAP with three nearest-neighbour back ends ----
        hUmap, df_humap_level2, df_humap_level0, humap_times = _run_humap(
            "NNDescent", level_fractions, X, y, execution)
        hUmapFLANN, df_humapFLANN_level2, df_humapFLANN_level0, humapFLANN_times = _run_humap(
            "FLANN", level_fractions, X, y, execution)
        hUmapKDTREE_NN, df_humapKDTREE_NN_level2, df_humapKDTREE_NN_level0, humapKDTREE_NN_times = _run_humap(
            "KDTree_NNDescent", level_fractions, X, y, execution)

        # ---- Persist the embeddings of this execution ----
        # The df_* names keep pointing at the tables of the latest execution.
        results = [
            ('hsneCPU', df_hsneCPU_level2, df_hsneCPU_level0),
            ('hsneGPU', df_hsneGPU_level2, df_hsneGPU_level0),
            ('humap', df_humap_level2, df_humap_level0),
            ('humapFLANN', df_humapFLANN_level2, df_humapFLANN_level0),
            ('humapKDTREE_NN', df_humapKDTREE_NN_level2, df_humapKDTREE_NN_level0),
        ]
        for method, frame_level2, frame_level0 in results:
            frame_level2.to_csv('%s/%s_it%d_level2.csv' % (out_dir, method, execution), index=False)
            frame_level0.to_csv('%s/%s_it%d_level0.csv' % (out_dir, method, execution), index=False)

        # ---- Append the timings: (fit, level 2, level 0) seconds per method ----
        timings = [
            ('HSNE CPU', (execution_hsneCPU_fit, execution_hsneCPU_level2, execution_hsneCPU_level0)),
            ('HSNE GPU', (execution_hsneGPU_fit, execution_hsneGPU_level2, execution_hsneGPU_level0)),
            ('HUMAP', humap_times),
            ('HUMAP FLANN', humapFLANN_times),
            ('HUMAP KDTree + NNDescent', humapKDTREE_NN_times),
        ]

        # Context managers guarantee both files are closed even if a write fails.
        with open(out_dir + '/run-time.csv', 'a') as time_file, \
                open(out_dir + '/mphate_size_level2.csv', 'a') as size_file:
            for stage_index, stage in enumerate(('Fit', 'Level 2', 'Level 0')):
                for method, seconds in timings:
                    time_file.write(method + ',' + stage + ',' + str(seconds[stage_index]) + '\n')

            if executed_mphate:
                time_file.write('Multiscale PHATE,Fit,' + str(execution_mphate_fit) + '\n')
                time_file.write('Multiscale PHATE,Level 2,' + str(execution_mphate_level2) + '\n')
                time_file.write('Multiscale PHATE,Level 0,' + str(execution_mphate_level0) + '\n')
                size_file.write(str(level2) + ',' + str(len(level2_ids)) + '\n')
        
        
        
            
#     df_times = pd.DataFrame({
#         'HSNE CPU': hsneCPU_time,
#         'HSNE GPU': hsneGPU_time,
#         'HUMAP': humap_time,
#         'Multiscale PHATE': mphate_time
#     })
    
#     df_sizes = pd.DataFrame({
#         'Size': mphate_sizes
#     })
    
    
#     df_times.to_csv("experiments/comparison/"+dataset['name']+'/time_execution.csv', index=False)
#     df_sizes.to_csv("experiments/comparison/"+dataset['name']+'/mphate_size_level2.csv', index=False)
    
    
    
    
    
    
    
    

Loading mammals dataset...


  0%|          | 0/1 [00:00<?, ?it/s]

Done.
initing with 2
Initing at 2
Calculating Multiscale PHATE tree...
  Calculating PCA...
  Calculated PCA in 0.25 seconds.
  Calculating diffusion potential...




  Calculated diffusion potential in 19.35 seconds.
  Setting epsilon to 7.862
  Setting merge threshold to 0.001
  Calculating condensation...
  Calculated condensation in 40.68 seconds.
Calculated Multiscale PHATE tree in 61.75 seconds.
Computing gradient...
Identifying salient levels of resolution...
level: 1, n: 922, dif: 268


100%|██████████| 1/1 [08:42<00:00, 522.19s/it]


In [1]:
# TODO: test HUMAP with FAISS, KDTree and FLANN

In [None]:
# The in-memory frame is not guaranteed to hold 'x0'/'y0'/'label0' columns once
# the comparison loop has finished (columns are suffixed with the execution
# index and the loop starts at `init`), so plot the CSV the loop persisted for
# its last execution. `dataset` and `execution` are the variables left over
# from the comparison loop.
hsneCPU_level0_saved = pd.read_csv(
    "experiments/comparison/" + dataset['name'] + '/hsneCPU_it' + str(execution) + '_level0.csv'
)
plt.scatter(
    hsneCPU_level0_saved['x' + str(execution)].values,
    hsneCPU_level0_saved['y' + str(execution)].values,
    c=hsneCPU_level0_saved['label' + str(execution)].values,
)

In [None]:
# The in-memory frame is not guaranteed to hold 'x0'/'y0'/'label0' columns once
# the comparison loop has finished (columns are suffixed with the execution
# index and the loop starts at `init`), so plot the CSV the loop persisted for
# its last execution. `dataset` and `execution` are the variables left over
# from the comparison loop.
hsneGPU_level2_saved = pd.read_csv(
    "experiments/comparison/" + dataset['name'] + '/hsneGPU_it' + str(execution) + '_level2.csv'
)
plt.scatter(
    hsneGPU_level2_saved['x' + str(execution)].values,
    hsneGPU_level2_saved['y' + str(execution)].values,
    c=hsneGPU_level2_saved['label' + str(execution)].values,
)

In [None]:
# The in-memory frame is not guaranteed to hold 'x0'/'y0'/'label0' columns once
# the comparison loop has finished (columns are suffixed with the execution
# index and the loop starts at `init`), so plot the CSV the loop persisted for
# its last execution. `dataset` and `execution` are the variables left over
# from the comparison loop.
hsneGPU_level0_saved = pd.read_csv(
    "experiments/comparison/" + dataset['name'] + '/hsneGPU_it' + str(execution) + '_level0.csv'
)
plt.scatter(
    hsneGPU_level0_saved['x' + str(execution)].values,
    hsneGPU_level0_saved['y' + str(execution)].values,
    c=hsneGPU_level0_saved['label' + str(execution)].values,
)

In [None]:
# The in-memory frame is not guaranteed to hold 'x0'/'y0'/'label0' columns once
# the comparison loop has finished (columns are suffixed with the execution
# index and the loop starts at `init`), so plot the CSV the loop persisted for
# its last execution. `dataset` and `execution` are the variables left over
# from the comparison loop.
humap_level2_saved = pd.read_csv(
    "experiments/comparison/" + dataset['name'] + '/humap_it' + str(execution) + '_level2.csv'
)
plt.scatter(
    humap_level2_saved['x' + str(execution)].values,
    humap_level2_saved['y' + str(execution)].values,
    c=humap_level2_saved['label' + str(execution)].values,
)

In [None]:
# The in-memory frame is not guaranteed to hold 'x0'/'y0'/'label0' columns once
# the comparison loop has finished (columns are suffixed with the execution
# index and the loop starts at `init`), so plot the CSV the loop persisted for
# its last execution. `dataset` and `execution` are the variables left over
# from the comparison loop.
humap_level0_saved = pd.read_csv(
    "experiments/comparison/" + dataset['name'] + '/humap_it' + str(execution) + '_level0.csv'
)
plt.scatter(
    humap_level0_saved['x' + str(execution)].values,
    humap_level0_saved['y' + str(execution)].values,
    c=humap_level0_saved['label' + str(execution)].values,
)

In [None]:
# The in-memory frame is not guaranteed to hold 'x0'/'y0'/'label0' columns once
# the comparison loop has finished (columns are suffixed with the execution
# index and the loop starts at `init`), so plot the CSV the loop persisted for
# its last execution. `dataset` and `execution` are the variables left over
# from the comparison loop. The Multiscale PHATE file only exists for datasets
# other than 'mnist' and 'fmnist'.
mphate_level2_saved = pd.read_csv(
    "experiments/comparison/" + dataset['name'] + '/mphate_it' + str(execution) + '_level2.csv'
)
plt.scatter(
    mphate_level2_saved['x' + str(execution)].values,
    mphate_level2_saved['y' + str(execution)].values,
    c=mphate_level2_saved['label' + str(execution)].values,
)

In [None]:
# The in-memory frame is not guaranteed to hold 'x0'/'y0'/'label0' columns once
# the comparison loop has finished (columns are suffixed with the execution
# index and the loop starts at `init`), so plot the CSV the loop persisted for
# its last execution. `dataset` and `execution` are the variables left over
# from the comparison loop. The Multiscale PHATE file only exists for datasets
# other than 'mnist' and 'fmnist'.
mphate_level0_saved = pd.read_csv(
    "experiments/comparison/" + dataset['name'] + '/mphate_it' + str(execution) + '_level0.csv'
)
plt.scatter(
    mphate_level0_saved['x' + str(execution)].values,
    mphate_level0_saved['y' + str(execution)].values,
    c=mphate_level0_saved['label' + str(execution)].values,
)