In [1]:
import nptsne
from nptsne import hsne_analysis
import multiscale_phate as mp

import time
import os
import scprep
import demap
import math
import random
import numpy as np
import pandas as pd
import humap
import matplotlib.pyplot as plt

from tqdm import tqdm

from sklearn.neighbors import NearestNeighbors
from sklearn.utils import check_array
from sklearn.preprocessing import normalize, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import euclidean_distances
from sklearn.neighbors import NearestNeighbors

from scipy.stats import pearsonr, spearmanr

In [2]:
def load_fmnist():
    """Load the Fashion-MNIST training split from CSV.

    Reads ``./../data/fashion-train.csv``, takes column 1 as the integer
    class label and columns 2 onward as the feature vector, rescales every
    sample with ``sklearn.preprocessing.normalize`` and returns the features
    as a C-ordered float32 array.

    Returns
    -------
    (X, y) : tuple
        ``X`` is the normalized float32 feature matrix, ``y`` the int labels.
    """
    table = pd.read_csv('./../data/fashion-train.csv').values

    features = table[:, 2:]
    labels = table[:, 1].astype(int)

    # A PCA reduction to 15 components was tried here at some point and is
    # intentionally left disabled.
    features = check_array(
        normalize(features),
        dtype=np.float32,
        accept_sparse='csr',
        order='C',
    )

    return features, labels

def load_mnist():
    """Load the MNIST arrays stored as ``.npy`` files.

    Features come from ``./../data/MNIST_70000.npy`` and labels from
    ``./../data/MNIST_70000_label.npy`` (cast to int). Every sample is
    rescaled with ``sklearn.preprocessing.normalize`` and the feature matrix
    is returned as a C-ordered float32 array.

    Returns
    -------
    (X, y) : tuple
        Normalized float32 features and integer labels.
    """
    raw_features = np.load('./../data/MNIST_70000.npy')
    labels = np.load('./../data/MNIST_70000_label.npy').astype(int)

    # A PCA reduction to 15 components was tried here at some point and is
    # intentionally left disabled.
    features = check_array(
        normalize(raw_features),
        dtype=np.float32,
        accept_sparse='csr',
        order='C',
    )

    return features, labels

def load_scRNAseq(data_dir=None):
    """Load and preprocess the five-batch scRNA-seq time course.

    Each 10X batch found under ``<data_dir>/scRNAseq/<batch>`` is loaded and
    trimmed by library size (cells above the 20th percentile, then cells
    below the 75th percentile of what remains). The batches are combined,
    genes seen in fewer than 10 cells are dropped, counts are library-size
    normalized, cells in the top 10% of mitochondrial ("MT-") expression are
    removed, a square-root transform is applied and the result is reduced to
    50 principal components.

    Parameters
    ----------
    data_dir : str, optional
        Directory that contains the ``scRNAseq`` folder. Defaults to the
        location the notebook was originally run from
        (``~/Documentos/HierarchicalUMAP/umap-cpp/umap/cpp/data``), so
        existing calls without arguments behave as before.

    Returns
    -------
    (X, labels) : tuple
        ``X`` is the C-ordered float32 PCA projection; ``labels`` are the
        sample (time-point) labels encoded as integers by ``LabelEncoder``.
    """
    if data_dir is None:
        data_dir = os.path.expanduser("~/Documentos/HierarchicalUMAP/umap-cpp/umap/cpp/data")

    batch_folders = ["T0_1A", "T2_3B", "T4_5C", "T6_7D", "T8_9E"]
    batch_names = ["Day 00-03", "Day 06-09", "Day 12-15", "Day 18-21", "Day 24-27"]

    # Load and filter one batch at a time so the unfiltered matrices do not
    # all have to be held in memory simultaneously.
    filtered_batches = []
    for folder in batch_folders:
        batch = scprep.io.load_10X(
            os.path.join(data_dir, "scRNAseq", folder),
            sparse=True,
            gene_labels='both',
        )
        batch = scprep.filter.filter_library_size(batch, percentile=20, keep_cells='above')
        batch = scprep.filter.filter_library_size(batch, percentile=75, keep_cells='below')
        filtered_batches.append(batch)

    EBT_counts, sample_labels = scprep.utils.combine_batches(
        filtered_batches,
        batch_names,
        append_to_cell_names=True
    )
    del filtered_batches  # release the per-batch matrices

    EBT_counts = scprep.filter.filter_rare_genes(EBT_counts, min_cells=10)
    EBT_counts = scprep.normalize.library_size_normalize(EBT_counts)

    # Mitochondrial genes are selected by their "MT-" prefix (the original
    # author notes there are 14 of them).
    mito_genes = scprep.select.get_gene_set(EBT_counts, starts_with="MT-")
    EBT_counts, sample_labels = scprep.filter.filter_gene_set_expression(
        EBT_counts, sample_labels, genes=mito_genes,
        percentile=90, keep_cells='below')
    EBT_counts = scprep.transform.sqrt(EBT_counts)

    labels = LabelEncoder().fit(sample_labels).transform(sample_labels)

    X = PCA(n_components=50).fit_transform(EBT_counts.values)
    X = check_array(X, dtype=np.float32, accept_sparse='csr', order='C')
    return X, labels

def load_mammals():
    """Load the 20000-sample mammals dataset from whitespace-delimited text.

    Features are read from ``./../data/mammals-20000_features.txt`` and
    classes from ``./../data/mammals-20000_classes.txt``; the features are
    rescaled with ``sklearn.preprocessing.normalize``.

    Returns
    -------
    (X, y) : tuple
        Normalized features and the class values exactly as parsed by
        ``np.loadtxt``.
    """
    raw_features = np.loadtxt("./../data/mammals-20000_features.txt")
    classes = np.loadtxt("./../data/mammals-20000_classes.txt")

    # NOTE(review): unlike the other loaders this one neither casts the
    # features to float32 via check_array nor casts the labels to int --
    # confirm whether that difference is intentional.
    return normalize(raw_features), classes

In [3]:
# ---------------------------------------------------------------------------
# Benchmark driver: for every dataset, repeatedly build a three-level HSNE
# hierarchy and five HUMAP hierarchies (one per influence-neighborhood
# setting), embed every level, and store the embeddings and wall-clock
# timings under comparison-np/<dataset name>/.
# ---------------------------------------------------------------------------
n_executions = 20
levels = 3  # HSNE scales to build; the per-level code below assumes exactly three

humap_influences = [0, 30, 50, 70, 100]   # values passed to set_influence_neighborhood
embedding_iterations = 500                # do_iteration() calls per HSNE analysis
output_root = "comparison-np"
level_names = ('TOP', 'level1', 'level0')            # file-name suffixes
timing_stages = ('Fit', 'Top', 'Level 1', 'Level 0') # row order in run-time.csv


def _embedding_frame(execution, labels, embedding, inds):
    """Build one result table with columns label<k>, x<k>, y<k>, inds<k>."""
    suffix = str(execution)
    frame = pd.DataFrame()
    frame['label' + suffix] = labels
    frame['x' + suffix] = embedding[:, 0]
    frame['y' + suffix] = embedding[:, 1]
    frame['inds' + suffix] = inds
    return frame


def _iterate(analysis, n_iterations):
    """Advance an HSNE analysis by ``n_iterations`` embedding iterations."""
    for _ in range(n_iterations):
        analysis.do_iteration()


def _run_hsne(X, y, execution):
    """Fit HSNE and embed its three scales.

    Returns (frames, timings, scale_sizes): ``frames`` maps each entry of
    ``level_names`` to its result table, ``timings`` maps each entry of
    ``timing_stages`` to seconds, and ``scale_sizes`` holds the number of
    points at scales 0, 1 and 2.
    """
    timings = {}

    hsne = nptsne.HSne(True)
    tic = time.time()
    hsne.create_hsne(X, levels)
    timings['Fit'] = time.time() - tic

    n_level0 = hsne.get_scale(0).num_points
    n_level1 = hsne.get_scale(1).num_points
    n_level2 = hsne.get_scale(2).num_points

    # The embeddings are computed with the GPU embedder. Each timing includes
    # the construction of the analysis as well as its iterations.
    tic = time.time()
    container = hsne_analysis.AnalysisModel(hsne, hsne_analysis.EmbedderType.GPU)
    analysis_level2 = container.top_analysis
    _iterate(analysis_level2, embedding_iterations)
    timings['Top'] = time.time() - tic

    # Drill down by selecting every landmark of the parent analysis.
    tic = time.time()
    analysis_level1 = container.add_new_analysis(analysis_level2, np.arange(n_level2))
    _iterate(analysis_level1, embedding_iterations)
    timings['Level 1'] = time.time() - tic

    tic = time.time()
    analysis_level0 = container.add_new_analysis(analysis_level1, np.arange(n_level1))
    _iterate(analysis_level0, embedding_iterations)
    timings['Level 0'] = time.time() - tic

    frames = {}
    for name, analysis in zip(level_names, (analysis_level2, analysis_level1, analysis_level0)):
        frames[name] = _embedding_frame(
            execution,
            y[analysis.landmark_orig_indexes],
            analysis.embedding,
            analysis.landmark_orig_indexes,
        )

    return frames, timings, (n_level0, n_level1, n_level2)


def _run_humap(X, y, execution, influence, scale_sizes):
    """Fit HUMAP with the given influence neighborhood and embed its levels.

    The hierarchy-level size ratios are derived from the HSNE scale sizes so
    that both methods are configured from the same numbers.
    Returns (frames, timings) shaped like the ones from ``_run_hsne``.
    """
    n_level0, n_level1, n_level2 = scale_sizes
    timings = {}

    model = humap.HUMAP(np.array([n_level1 / n_level0, n_level2 / n_level1]))
    model.set_distance_similarity(False)
    model.set_path_increment(False)
    model.set_influence_neighborhood(influence)

    tic = time.time()
    model.fit(X, y)
    timings['Fit'] = time.time() - tic

    tic = time.time()
    embedding2 = model.transform(2)
    timings['Top'] = time.time() - tic

    tic = time.time()
    embedding1 = model.transform(1)
    timings['Level 1'] = time.time() - tic

    tic = time.time()
    embedding0 = model.transform(0)
    timings['Level 0'] = time.time() - tic

    frames = {
        'TOP': _embedding_frame(execution, model.labels(2), embedding2, model.original_indices(2)),
        'level1': _embedding_frame(execution, model.labels(1), embedding1, model.original_indices(1)),
        # Level 0 is the full dataset, so labels and indices come from y directly.
        'level0': _embedding_frame(execution, y, embedding0, np.arange(len(y))),
    }
    return frames, timings


datasets = []

# Uncomment to include further datasets in the benchmark.
# datasets.append({
#    'load': load_scRNAseq,
#    'name': 'scRNAseq'
# })
# datasets.append({
#    'load': load_mammals,
#    'name': 'mammals-drill'
# })
# datasets.append({
#     'load': load_fmnist,
#     'name': 'fmnist-drill'
# })
datasets.append({
    'load': load_mnist,
    'name': 'mnist-np'
})


for dataset in datasets:
    print("Loading %s dataset..." % (dataset['name']))
    X, y = dataset['load']()
    print("Done.")

    output_dir = output_root + "/" + dataset['name']
    # makedirs also creates the parent folder, and does not fail on re-runs.
    os.makedirs(output_dir, exist_ok=True)

    # Resume after the last execution whose final output file already exists
    # (the level-0 table of the last HUMAP configuration is written last).
    resume_marker = 'humap' + str(humap_influences[-1])
    init = 0
    for i in range(n_executions):
        if os.path.exists(output_dir + '/' + resume_marker + '_it' + str(i) + '_level0.csv'):
            init = i + 1

    print("Initing at %d" % (init))

    for execution in tqdm(range(init, n_executions)):
        # Each entry: (file-name prefix, label used in run-time.csv, frames, timings)
        results = []

        hsne_frames, hsne_timings, scale_sizes = _run_hsne(X, y, execution)
        results.append(('hsne', 'HSNE', hsne_frames, hsne_timings))

        for influence in humap_influences:
            humap_frames, humap_timings = _run_humap(X, y, execution, influence, scale_sizes)
            results.append((
                'humap' + str(influence),
                'HUMAP ' + str(influence),
                humap_frames,
                humap_timings,
            ))

        # Write all tables only after every method has finished, HSNE first
        # and the resume-marker file last.
        for prefix, _, frames, _ in results:
            for name in level_names:
                frames[name].to_csv(
                    output_dir + '/' + prefix + '_it' + str(execution) + '_' + name + '.csv',
                    index=False,
                )

        # Append timings grouped by stage; the context manager guarantees the
        # file is flushed and closed after every execution.
        with open(output_dir + '/run-time.csv', 'a') as time_file:
            for stage in timing_stages:
                for _, label, _, timings in results:
                    time_file.write(label + ',' + stage + ',' + str(timings[stage]) + '\n')
    
    
    
    
    

Loading mnist-np dataset...


  0%|          | 0/7 [00:00<?, ?it/s]

Done.
Initing at 13


100%|██████████| 7/7 [37:18<00:00, 319.80s/it]


In [1]:
# TODO: test HUMAP with FAISS, KD-tree and FLANN

In [None]:
# Scatter plot of the 'x0'/'y0' columns, colored by the 'label0' column.
# NOTE(review): `df_hsneCPU_level0` is not defined in any visible cell of this
# notebook -- confirm where it comes from before running on a fresh kernel.
plt.scatter(df_hsneCPU_level0['x0'].values, df_hsneCPU_level0['y0'].values, c=df_hsneCPU_level0['label0'].values)

In [None]:
# Scatter plot of the 'x0'/'y0' columns, colored by the 'label0' column.
# NOTE(review): `df_hsneGPU_level2` is not defined in any visible cell of this
# notebook -- confirm where it comes from before running on a fresh kernel.
plt.scatter(df_hsneGPU_level2['x0'].values, df_hsneGPU_level2['y0'].values, c=df_hsneGPU_level2['label0'].values)

In [None]:
# Scatter plot of the 'x0'/'y0' columns, colored by the 'label0' column.
# NOTE(review): `df_hsneGPU_level0` is not defined in any visible cell of this
# notebook -- confirm where it comes from before running on a fresh kernel.
plt.scatter(df_hsneGPU_level0['x0'].values, df_hsneGPU_level0['y0'].values, c=df_hsneGPU_level0['label0'].values)

In [None]:
# Scatter plot of the 'x0'/'y0' columns, colored by the 'label0' column.
# NOTE(review): `df_humap_level2` is not defined in any visible cell of this
# notebook -- confirm where it comes from before running on a fresh kernel.
plt.scatter(df_humap_level2['x0'].values, df_humap_level2['y0'].values, c=df_humap_level2['label0'].values)

In [None]:
# Scatter plot of the 'x0'/'y0' columns, colored by the 'label0' column.
# NOTE(review): `df_humap_level0` is not defined in any visible cell of this
# notebook -- confirm where it comes from before running on a fresh kernel.
plt.scatter(df_humap_level0['x0'].values, df_humap_level0['y0'].values, c=df_humap_level0['label0'].values)

In [None]:
# Scatter plot of the 'x0'/'y0' columns, colored by the 'label0' column.
# NOTE(review): `df_mphate_level2` is not defined in any visible cell of this
# notebook -- confirm where it comes from before running on a fresh kernel.
plt.scatter(df_mphate_level2['x0'].values, df_mphate_level2['y0'].values, c=df_mphate_level2['label0'].values)

In [None]:
# Scatter plot of the 'x0'/'y0' columns, colored by the 'label0' column.
# NOTE(review): `df_mphate_level0` is not defined in any visible cell of this
# notebook -- confirm where it comes from before running on a fresh kernel.
plt.scatter(df_mphate_level0['x0'].values, df_mphate_level0['y0'].values, c=df_mphate_level0['label0'].values)