In [5]:
import nptsne
from nptsne import hsne_analysis
import multiscale_phate as mp

import time
import os
import scprep
import demap
import math
import random
import numpy as np
import pandas as pd
import hierarchical_umap as h_umap
import matplotlib.pyplot as plt

from tqdm import tqdm

from sklearn.neighbors import NearestNeighbors
from sklearn.utils import check_array
from sklearn.preprocessing import normalize, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import euclidean_distances
from sklearn.neighbors import NearestNeighbors

from scipy.stats import pearsonr, spearmanr

In [6]:
def load_fmnist():
    """Load the Fashion-MNIST training table from ``data/fashion-train.csv``.

    Column 1 of the CSV is taken as the integer class label and columns 2
    onward as the features; column 0 is ignored. The features are passed
    through scikit-learn's ``normalize`` (default settings) and then
    validated/converted to a C-ordered ``float32`` array.

    Returns
    -------
    X : features after normalisation, as float32.
    y : integer labels, one per row of ``X``.
    """
    table = pd.read_csv('data/fashion-train.csv').values

    raw_features = table[:, 2:]
    labels = table[:, 1].astype(int)

    features = check_array(
        normalize(raw_features),
        dtype=np.float32,
        accept_sparse='csr',
        order='C',
    )
    return features, labels

def load_mnist():
    """Load MNIST features and labels from the ``.npy`` files under ``./data``.

    The features are passed through scikit-learn's ``normalize`` (default
    settings) and then validated/converted to a C-ordered ``float32`` array;
    the labels are cast to ``int``.

    Returns
    -------
    X : features after normalisation, as float32.
    y : integer labels.
    """
    raw_features = np.load('./data/MNIST_70000.npy')
    labels = np.load('./data/MNIST_70000_label.npy').astype(int)

    features = check_array(
        normalize(raw_features),
        dtype=np.float32,
        accept_sparse='csr',
        order='C',
    )
    return features, labels

def load_scRNAseq(download_path=None, random_state=None):
    """Load, filter and PCA-reduce the five-batch scRNA-seq time course.

    Each 10X batch directory under ``<download_path>/scRNAseq`` is loaded and
    filtered by library size (cells kept between the 20th and 75th
    percentiles). The batches are then combined, rare genes are removed, counts
    are library-size normalised, cells in the top 10% of mitochondrial gene
    expression are dropped, and a square-root transform is applied. Finally the
    matrix is reduced to 50 principal components.

    Parameters
    ----------
    download_path : str, optional
        Directory that contains the ``scRNAseq`` folder. Defaults to the
        location used when this notebook was written
        (``~/Documentos/HierarchicalUMAP/umap-cpp/umap/cpp/data``).
    random_state : int or None, optional
        Forwarded to ``PCA`` so the reduction can be made repeatable.
        ``None`` (the default) leaves PCA's own default in place.

    Returns
    -------
    X : float32 array with 50 columns (the PCA scores).
    labels : integer-encoded time-point label for every row of ``X``.
    """
    if download_path is None:
        download_path = os.path.expanduser("~/Documentos/HierarchicalUMAP/umap-cpp/umap/cpp/data")

    # Batch directory -> time-point label, in the order they are combined.
    batch_dirs = ["T0_1A", "T2_3B", "T4_5C", "T6_7D", "T8_9E"]
    batch_labels = ["Day 00-03", "Day 06-09", "Day 12-15", "Day 18-21", "Day 24-27"]

    # Load and filter one batch at a time so the unfiltered batches do not all
    # have to be held in memory together.
    filtered_batches = []
    for batch_dir in batch_dirs:
        batch = scprep.io.load_10X(
            os.path.join(download_path, "scRNAseq", batch_dir),
            sparse=True,
            gene_labels='both',
        )
        batch = scprep.filter.filter_library_size(batch, percentile=20, keep_cells='above')
        batch = scprep.filter.filter_library_size(batch, percentile=75, keep_cells='below')
        filtered_batches.append(batch)

    EBT_counts, sample_labels = scprep.utils.combine_batches(
        filtered_batches,
        batch_labels,
        append_to_cell_names=True
    )
    del filtered_batches  # removes objects from memory

    EBT_counts = scprep.filter.filter_rare_genes(EBT_counts, min_cells=10)
    EBT_counts = scprep.normalize.library_size_normalize(EBT_counts)

    # Genes whose name starts with "MT-" (mitochondrial; the original author
    # notes there are 14 of them).
    mito_genes = scprep.select.get_gene_set(EBT_counts, starts_with="MT-")
    EBT_counts, sample_labels = scprep.filter.filter_gene_set_expression(
        EBT_counts, sample_labels, genes=mito_genes,
        percentile=90, keep_cells='below')
    EBT_counts = scprep.transform.sqrt(EBT_counts)

    # Encode the time-point strings as integers.
    labels = LabelEncoder().fit_transform(sample_labels)

    X = PCA(n_components=50, random_state=random_state).fit_transform(EBT_counts.values)
    X = check_array(X, dtype=np.float32, accept_sparse='csr', order='C')
    return X, labels

def load_mammals():
    """Load the mammals dataset from the text files under ``data/``.

    The features are passed through scikit-learn's ``normalize`` (default
    settings) and then validated/converted to a C-ordered ``float32`` array,
    and the class labels are cast to ``int``, so the return types match the
    other loaders in this notebook.

    Returns
    -------
    X : features after normalisation, as float32.
    y : integer class labels.
    """
    X = np.loadtxt("data/mammals-20000_features.txt")
    # np.loadtxt yields floats by default; the other loaders return int labels.
    y = np.loadtxt("data/mammals-20000_classes.txt").astype(int)

    X = normalize(X)
    X = check_array(X, dtype=np.float32, accept_sparse='csr', order='C')

    return X, y

In [10]:
# Experiment configuration.
n_executions = 20  # number of repeated runs per dataset
levels = 3         # number of HSNE scales requested; the scale indices 0-2 used below assume 3

# Datasets to benchmark: each entry pairs a loader returning (X, y) with the
# directory name used under experiments/comparison/.
datasets = [
    {'load': load_scRNAseq, 'name': 'scRNAseq'},
    {'load': load_mammals, 'name': 'mammals'},
    {'load': load_fmnist, 'name': 'fmnist'},
    {'load': load_mnist, 'name': 'mnist'},
]

# HUMAP variants compared in every run:
# (kNN method string passed to HUMAP, output file prefix, row label in run-time.csv)
humap_variants = [
    ("FLANN", "humapFLANN", "HUMAP FLANN"),
    ("KDTree_NNDescent", "humapKDTREE_NN", "HUMAP KDTree + NNDescent"),
]


def _run_humap(knn_method, level_fractions, X, y, execution):
    """Fit one HUMAP hierarchy and embed its levels 2 and 0.

    Parameters
    ----------
    knn_method : str
        kNN method name forwarded to the ``HUMAP`` constructor.
    level_fractions : sequence of float
        Converted to a NumPy array and forwarded as HUMAP's second argument.
    X, y : arrays
        Data and labels passed to ``fit``.
    execution : int
        Run index; appended to every column name of the returned frames.

    Returns
    -------
    (df_level2, df_level0, elapsed)
        Two DataFrames with ``label<k>``, ``x<k>``, ``y<k>`` and ``inds<k>``
        columns, and the wall-clock seconds spent in ``fit`` plus both
        ``transform`` calls.
    """
    model = h_umap.HUMAP('precomputed', np.array(level_fractions), 100, 0.15, knn_method, 0.0, True)
    model.set_distance_similarity(False)
    model.set_path_increment(False)
    model.set_influence_neighborhood(0)

    # The timed region covers fitting and both projections.
    tic = time.time()
    model.fit(X, y)
    embedding2 = model.transform(2)
    embedding0 = model.transform(0)
    elapsed = time.time() - tic

    suffix = str(execution)
    df_level2 = pd.DataFrame({
        'label' + suffix: model.get_labels(2),
        'x' + suffix: embedding2[:, 0],
        'y' + suffix: embedding2[:, 1],
        'inds' + suffix: model.get_original_indices(2),
    })
    df_level0 = pd.DataFrame({
        'label' + suffix: y,
        'x' + suffix: embedding0[:, 0],
        'y' + suffix: embedding0[:, 1],
        'inds' + suffix: np.arange(len(y)),
    })
    return df_level2, df_level0, elapsed


for dataset in datasets:
    print("Loading %s dataset..." % (dataset['name']))
    X, y = dataset['load']()
    print("Done.")
    init = 0  # first run index; raise it to resume an interrupted experiment

    # Create the output directory up front so a missing folder cannot fail
    # the run after the expensive computations below.
    output_dir = "experiments/comparison/" + dataset['name']
    os.makedirs(output_dir, exist_ok=True)

    # Build an HSNE hierarchy only to read the number of points per scale.
    hsneGPU = nptsne.HSne(True)
    hsneGPU.create_hsne(X, levels)

    n_level0 = hsneGPU.get_scale(0).num_points
    n_level1 = hsneGPU.get_scale(1).num_points
    n_level2 = hsneGPU.get_scale(2).num_points

    # Size of each level relative to the level below it. The second entry was
    # previously n_level2/n_level2, which is always 1.0.
    # NOTE(review): assumes HUMAP expects per-level fractions of the previous
    # level — confirm against the HUMAP API.
    level_fractions = [n_level1 / n_level0, n_level2 / n_level1]

    for execution in tqdm(range(init, n_executions)):
        # Run every variant first, then persist, so a failing variant leaves
        # no partial output for this run.
        results = []
        for knn_method, prefix, time_label in humap_variants:
            df_level2, df_level0, elapsed = _run_humap(knn_method, level_fractions, X, y, execution)
            results.append((prefix, time_label, df_level2, df_level0, elapsed))

        for prefix, _, df_level2, df_level0, _ in results:
            stem = output_dir + '/' + prefix + '_it' + str(execution)
            df_level2.to_csv(stem + '_level2.csv', index=False)
            df_level0.to_csv(stem + '_level0.csv', index=False)

        # Append this run's timings; the context manager closes the file.
        with open(output_dir + '/run-time.csv', 'a') as time_file:
            for _, time_label, _, _, elapsed in results:
                time_file.write(time_label + ',' + str(elapsed) + '\n')
        
        
            
#     df_times = pd.DataFrame({
#         'HSNE CPU': hsneCPU_time,
#         'HSNE GPU': hsneGPU_time,
#         'HUMAP': humap_time,
#         'Multiscale PHATE': mphate_time
#     })
    
#     df_sizes = pd.DataFrame({
#         'Size': mphate_sizes
#     })
    
    
#     df_times.to_csv("experiments/comparison/"+dataset['name']+'/time_execution.csv', index=False)
#     df_sizes.to_csv("experiments/comparison/"+dataset['name']+'/mphate_size_level2.csv', index=False)
    
    
    
    
    
    
    
    

Loading scRNAseq dataset...


  0%|          | 0/20 [00:00<?, ?it/s]

Done.


100%|██████████| 20/20 [19:47<00:00, 59.38s/it]


Loading mammals dataset...
Done.


100%|██████████| 20/20 [24:15<00:00, 72.77s/it]


Loading fmnist dataset...


  0%|          | 0/20 [00:00<?, ?it/s]

Done.


100%|██████████| 20/20 [1:29:28<00:00, 268.44s/it]


Loading mnist dataset...


  0%|          | 0/20 [00:00<?, ?it/s]

Done.


100%|██████████| 20/20 [1:52:13<00:00, 336.70s/it]


In [None]:
# TODO: test HUMAP with FAISS, KDTree and FLANN

In [None]:
# Scatter the 'x0'/'y0' columns of df_hsneCPU_level0, coloured by 'label0'.
# NOTE(review): df_hsneCPU_level0 is not created by any cell visible in this
# notebook — confirm where it comes from before running.
plt.scatter(
    df_hsneCPU_level0['x0'].values,
    df_hsneCPU_level0['y0'].values,
    c=df_hsneCPU_level0['label0'].values,
)

In [None]:
# Scatter the 'x0'/'y0' columns of df_hsneGPU_level2, coloured by 'label0'.
# NOTE(review): df_hsneGPU_level2 is not created by any cell visible in this
# notebook — confirm where it comes from before running.
plt.scatter(
    df_hsneGPU_level2['x0'].values,
    df_hsneGPU_level2['y0'].values,
    c=df_hsneGPU_level2['label0'].values,
)

In [None]:
# Scatter the 'x0'/'y0' columns of df_hsneGPU_level0, coloured by 'label0'.
# NOTE(review): df_hsneGPU_level0 is not created by any cell visible in this
# notebook — confirm where it comes from before running.
plt.scatter(
    df_hsneGPU_level0['x0'].values,
    df_hsneGPU_level0['y0'].values,
    c=df_hsneGPU_level0['label0'].values,
)

In [None]:
# Scatter the 'x0'/'y0' columns of df_humap_level2, coloured by 'label0'.
# NOTE(review): df_humap_level2 is not created by any cell visible in this
# notebook — confirm where it comes from before running.
plt.scatter(
    df_humap_level2['x0'].values,
    df_humap_level2['y0'].values,
    c=df_humap_level2['label0'].values,
)

In [None]:
# Scatter the 'x0'/'y0' columns of df_humap_level0, coloured by 'label0'.
# NOTE(review): df_humap_level0 is not created by any cell visible in this
# notebook — confirm where it comes from before running.
plt.scatter(
    df_humap_level0['x0'].values,
    df_humap_level0['y0'].values,
    c=df_humap_level0['label0'].values,
)

In [None]:
# Scatter the 'x0'/'y0' columns of df_mphate_level2, coloured by 'label0'.
# NOTE(review): df_mphate_level2 is not created by any cell visible in this
# notebook — confirm where it comes from before running.
plt.scatter(
    df_mphate_level2['x0'].values,
    df_mphate_level2['y0'].values,
    c=df_mphate_level2['label0'].values,
)

In [None]:
# Scatter the 'x0'/'y0' columns of df_mphate_level0, coloured by 'label0'.
# NOTE(review): df_mphate_level0 is not created by any cell visible in this
# notebook — confirm where it comes from before running.
plt.scatter(
    df_mphate_level0['x0'].values,
    df_mphate_level0['y0'].values,
    c=df_mphate_level0['label0'].values,
)