In [15]:
import os
import scprep
import demap
import math
import random
import numpy as np
import pandas as pd
import hierarchical_umap as h_umap
import matplotlib.pyplot as plt

from tqdm import tqdm
from sklearn.neighbors import NearestNeighbors
from sklearn.utils import check_array
from sklearn.preprocessing import normalize, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import euclidean_distances
from sklearn.neighbors import NearestNeighbors

from scipy.stats import pearsonr, spearmanr

In [16]:
def correlation(X, X_emb):
    
    dist_orig = np.square(euclidean_distances(X, X)).flatten()
    dist_emb = np.square(euclidean_distances(X_emb, X_emb)).flatten()
    
    
    coef, p = spearmanr(dist_orig, dist_emb)
    return coef

def stress(X, X_emb):
    
    DE = euclidean_distances(X_emb)
    DE = DE/np.max(DE)
    DH = euclidean_distances(X)
    DH = DH/np.max(DH)
    stress = 0.5 * np.sum((DE - DH)**2)
    
    return np.sqrt(stress/(0.5*np.sum(DH**2)))
    

def neighborhood_preservation(X, X_emb, Khigh=30):
    
    neigh_high = NearestNeighbors(n_neighbors=Khigh+1, n_jobs=-1)
    neigh_high.fit(X)
    high_dists, high_indices = neigh_high.kneighbors(X)


    neigh_emb = NearestNeighbors(n_neighbors=Khigh+1, n_jobs=-1)
    neigh_emb.fit(X_emb)
    emb_dists, emb_indices = neigh_emb.kneighbors(X_emb)

    npres = np.zeros(Khigh)
    
    for k in range(1, Khigh+1):
        for i in range(X.shape[0]):
            high_current = high_indices[i][1:k+1]
            emb_current = emb_indices[i][1:k+1]
            
            tp = len(np.intersect1d(high_current, emb_current))
            
            npres[k-1] += (tp/k)
        
        
    npres /= float(X.shape[0])
    
    return npres

In [17]:
def load_fmnist():
    fashionTrain = pd.read_csv('data/fashion-train.csv')

    fashionX = fashionTrain.values[:,2:]
    fashionY = fashionTrain.values[:, 1].astype(int)

    X = normalize(fashionX)
    y = fashionY

    X = check_array(X, dtype=np.float32, accept_sparse='csr', order='C')
    
    return X, y

def load_mnist():
    X = np.load('./data/MNIST_70000.npy')
    y = np.load('./data/MNIST_70000_label.npy').astype(int)
    X = normalize(X)
    X = check_array(X, dtype=np.float32, accept_sparse='csr', order='C')
    
    return X, y

def load_scRNAseq():
    download_path = os.path.expanduser("~/Documentos/HierarchicalUMAP/umap-cpp/umap/cpp/data")
    sparse=True
    T1 = scprep.io.load_10X(os.path.join(download_path, "scRNAseq", "T0_1A"), sparse=sparse, gene_labels='both')
    T2 = scprep.io.load_10X(os.path.join(download_path, "scRNAseq", "T2_3B"), sparse=sparse, gene_labels='both')
    T3 = scprep.io.load_10X(os.path.join(download_path, "scRNAseq", "T4_5C"), sparse=sparse, gene_labels='both')
    T4 = scprep.io.load_10X(os.path.join(download_path, "scRNAseq", "T6_7D"), sparse=sparse, gene_labels='both')
    T5 = scprep.io.load_10X(os.path.join(download_path, "scRNAseq", "T8_9E"), sparse=sparse, gene_labels='both')
    filtered_batches = []
    for batch in [T1, T2, T3, T4, T5]:
        batch = scprep.filter.filter_library_size(batch, percentile=20, keep_cells='above')
        batch = scprep.filter.filter_library_size(batch, percentile=75, keep_cells='below')
        filtered_batches.append(batch)
    del T1, T2, T3, T4, T5
    EBT_counts, sample_labels = scprep.utils.combine_batches(
        filtered_batches, 
        ["Day 00-03", "Day 06-09", "Day 12-15", "Day 18-21", "Day 24-27"],
        append_to_cell_names=True
    )
    del filtered_batches # removes objects from memory
    EBT_counts = scprep.filter.filter_rare_genes(EBT_counts, min_cells=10)
    EBT_counts = scprep.normalize.library_size_normalize(EBT_counts)
    mito_genes = scprep.select.get_gene_set(EBT_counts, starts_with="MT-") # Get all mitochondrial genes. There are 14, FYI.
    EBT_counts, sample_labels = scprep.filter.filter_gene_set_expression(
    EBT_counts, sample_labels, genes=mito_genes, 
    percentile=90, keep_cells='below')
    EBT_counts = scprep.transform.sqrt(EBT_counts)
    
    le = LabelEncoder()
    le.fit(sample_labels)
    labels = le.transform(sample_labels)
    X = PCA(n_components=50).fit_transform(EBT_counts.values)
    X = check_array(X, dtype=np.float32, accept_sparse='csr', order='C')
    return X, labels

def load_mammals():
    X = np.loadtxt("data/mammals-20000_features.txt")
    y = np.loadtxt("data/mammals-20000_classes.txt")
    X = normalize(X)
    
    return X, y

In [18]:
datasets = []
datasets.append({
    'load': load_mammals,
    'name': 'mammals'
})
datasets.append({
    'load': load_scRNAseq,
    'name': 'scRNAseq'
})
datasets.append({
    'load': load_fmnist,
    'name': 'fmnist'
})
datasets.append({
    'load': load_mnist,
    'name': 'mnist'
})

In [33]:
def compute_metrics(datasets, techniques = ['hsneCPU', 'hsneGPU', 'mphate', 'humap', 'humapFLANN', 'humapKDTREE_NN'], n_executions = 20):
    map_name = {}
    map_name['hsneCPU'] = 'HSNE CPU'
    map_name['hsneGPU'] = 'HSNE GPU'
    map_name['mphate'] = 'Multiscale PHATE'
    map_name['humap'] = 'HUMAP'
    map_name['humapFLANN'] = 'HUMAP FLANN'
    map_name['humapKDTREE_NN'] = 'HUMAP KDTree + NNDescent'
    
    
    corr_values = []
    corr_level = []
    corr_technique = []
    
    demap_values = []
    demap_level = []
    demap_technique = []
    
    neighborhood_values = []
    np_values = []
    np_level = []
    np_technique = []
    
    dataset_values_corr = []
    dataset_values_demap = []
    dataset_values_np = []
    
    size_before_corr = 0
    size_before_demap = 0
    size_before_np = 0
    
    for dataset in datasets:
        
        path = 'experiments/comparison/'+dataset['name']
        X, y = dataset['load']()
        
        print("DATASET: %s" % (dataset['name']))
        
                
        for technique in techniques:
            print("Technique: %s" % (technique))
            
            if (dataset['name'] == 'fmnist' or dataset['name'] == 'mnist') and technique == 'mphate':
                continue
                
            for i, level in enumerate(['_level2.csv', '_level0.csv']):
                
                print("Level: %s" % (level))
                
                demap_list = []
                correlation_list = []
                np_list = []

                for execution in tqdm(range(n_executions)):
                    it = str(execution)
                    
                    df = pd.read_csv(path+'/'+technique+'_it'+it+level)
                    x = df['x'+it].values
                    y = df['y'+it].values
                    indices = df['inds'+it].values
                    
                    emb = np.stack((x, y), axis=-1)
                    sample = random.sample(range(0, len(x)), min(2000, len(x)))
                    
                    subset_emb = emb[sample]
                    subset_X = X[indices][sample]
                    
                    demap_value = demap.DEMaP(subset_X, subset_emb)
                    demap_list.append(demap_value)
                    
                    corr_value = correlation(subset_X, subset_emb)
                    correlation_list.append(corr_value)
                    
                    npres_values = neighborhood_preservation(subset_X, subset_emb)
                    np_list = np_list + npres_values.tolist()
                    
                    
                corr_values = corr_values + correlation_list
                corr_technique = corr_technique + [map_name[technique]]*len(correlation_list)
                corr_level = corr_level + ['Level '+str(i)]*len(correlation_list)
                
                demap_values = demap_values + demap_list
                demap_technique = demap_technique + [map_name[technique]]*len(demap_list)
                demap_level = demap_level + ['Level '+str(i)]*len(demap_list)
                
                np_values = np_values + np_list
                neighborhood_values = neighborhood_values + list(range(30))*n_executions
                np_technique = np_technique + [map_name[technique]]*len(np_list)
                np_level = np_level + ['Level '+str(i)]*len(np_list)
        
        
        dataset_values_corr = dataset_values_corr + [dataset['name']]*(len(corr_values) - size_before_corr)
        dataset_values_demap = dataset_values_demap + [dataset['name']]*(len(demap_values) - size_before_demap)
        dataset_values_np = dataset_values_np + [dataset['name']]*(len(np_values) - size_before_np)
        
        size_before_corr = len(corr_values)
        size_before_demap = len(demap_values)
        size_before_np = len(np_values)
        
        
        
    
    print(len(dataset_values_corr), len(corr_technique), len(corr_values))
    
    df_correlation = pd.DataFrame({
        'dataset': dataset_values_corr,
        'technique': corr_technique,
        'level': corr_level,
        'values': corr_values        
    })
    
    print(len(dataset_values_demap), len(demap_technique), len(demap_values))
    
    df_demap = pd.DataFrame({
        'dataset': dataset_values_demap,
        'technique': demap_technique,
        'level': demap_level,
        'values': demap_values
    })
    
    print(len(dataset_values_np), len(np_technique), len(neighborhood_values), len(np_values))
    
    
    df_np = pd.DataFrame({
        'dataset': dataset_values_np,
        'technique': np_technique,
        'level': np_level,
        'neighbors': neighborhood_values,
        'np': np_values
    })
        
    return df_correlation, df_demap, df_np            
                

In [34]:
df_correlation, df_demap, df_np = compute_metrics(datasets)

  0%|          | 0/20 [00:00<?, ?it/s]

DATASET: mammals
Technique: hsneCPU
Level: _level2.csv


100%|██████████| 20/20 [00:15<00:00,  1.26it/s]
  0%|          | 0/20 [00:00<?, ?it/s]

Level: _level0.csv


100%|██████████| 20/20 [01:08<00:00,  3.44s/it]
  0%|          | 0/20 [00:00<?, ?it/s]

Technique: hsneGPU
Level: _level2.csv


100%|██████████| 20/20 [00:16<00:00,  1.19it/s]
  0%|          | 0/20 [00:00<?, ?it/s]

Level: _level0.csv


100%|██████████| 20/20 [01:10<00:00,  3.55s/it]
  0%|          | 0/20 [00:00<?, ?it/s]

Technique: mphate
Level: _level2.csv


100%|██████████| 20/20 [00:24<00:00,  1.23s/it]
  0%|          | 0/20 [00:00<?, ?it/s]

Level: _level0.csv


100%|██████████| 20/20 [01:06<00:00,  3.32s/it]
  0%|          | 0/20 [00:00<?, ?it/s]

Technique: humap
Level: _level2.csv


100%|██████████| 20/20 [01:06<00:00,  3.31s/it]
  0%|          | 0/20 [00:00<?, ?it/s]

Level: _level0.csv


100%|██████████| 20/20 [01:07<00:00,  3.35s/it]
  0%|          | 0/20 [00:00<?, ?it/s]

Technique: humapFLANN
Level: _level2.csv


100%|██████████| 20/20 [01:06<00:00,  3.34s/it]
  0%|          | 0/20 [00:00<?, ?it/s]

Level: _level0.csv


100%|██████████| 20/20 [01:06<00:00,  3.34s/it]
  0%|          | 0/20 [00:00<?, ?it/s]

Technique: humapKDTREE_NN
Level: _level2.csv


100%|██████████| 20/20 [01:06<00:00,  3.32s/it]
  0%|          | 0/20 [00:00<?, ?it/s]

Level: _level0.csv


100%|██████████| 20/20 [01:06<00:00,  3.34s/it]
  0%|          | 0/20 [00:00<?, ?it/s]

DATASET: scRNAseq
Technique: hsneCPU
Level: _level2.csv


100%|██████████| 20/20 [00:19<00:00,  1.02it/s]
  0%|          | 0/20 [00:00<?, ?it/s]

Level: _level0.csv


100%|██████████| 20/20 [01:39<00:00,  4.99s/it]
  0%|          | 0/20 [00:00<?, ?it/s]

Technique: hsneGPU
Level: _level2.csv


100%|██████████| 20/20 [00:19<00:00,  1.03it/s]
  0%|          | 0/20 [00:00<?, ?it/s]

Level: _level0.csv


100%|██████████| 20/20 [01:44<00:00,  5.22s/it]
  0%|          | 0/20 [00:00<?, ?it/s]

Technique: mphate
Level: _level2.csv


100%|██████████| 20/20 [01:40<00:00,  5.04s/it]
  0%|          | 0/20 [00:00<?, ?it/s]

Level: _level0.csv


100%|██████████| 20/20 [01:43<00:00,  5.17s/it]
  0%|          | 0/20 [00:00<?, ?it/s]

Technique: humap
Level: _level2.csv


100%|██████████| 20/20 [01:39<00:00,  4.96s/it]
  0%|          | 0/20 [00:00<?, ?it/s]

Level: _level0.csv


100%|██████████| 20/20 [01:42<00:00,  5.10s/it]
  0%|          | 0/20 [00:00<?, ?it/s]

Technique: humapFLANN
Level: _level2.csv


100%|██████████| 20/20 [01:38<00:00,  4.92s/it]
  0%|          | 0/20 [00:00<?, ?it/s]

Level: _level0.csv


100%|██████████| 20/20 [01:43<00:00,  5.19s/it]
  0%|          | 0/20 [00:00<?, ?it/s]

Technique: humapKDTREE_NN
Level: _level2.csv


100%|██████████| 20/20 [01:41<00:00,  5.06s/it]
  0%|          | 0/20 [00:00<?, ?it/s]

Level: _level0.csv


100%|██████████| 20/20 [01:44<00:00,  5.23s/it]
  0%|          | 0/20 [00:00<?, ?it/s]

DATASET: fmnist
Technique: hsneCPU
Level: _level2.csv


100%|██████████| 20/20 [01:54<00:00,  5.75s/it]


Level: _level0.csv


100%|██████████| 20/20 [02:05<00:00,  6.30s/it]


Technique: hsneGPU
Level: _level2.csv


100%|██████████| 20/20 [01:55<00:00,  5.78s/it]


Level: _level0.csv


100%|██████████| 20/20 [02:09<00:00,  6.48s/it]


Technique: mphate
Technique: humap
Level: _level2.csv


100%|██████████| 20/20 [01:55<00:00,  5.75s/it]


Level: _level0.csv


100%|██████████| 20/20 [02:05<00:00,  6.29s/it]


Technique: humapFLANN
Level: _level2.csv


100%|██████████| 20/20 [01:52<00:00,  5.65s/it]


Level: _level0.csv


100%|██████████| 20/20 [02:06<00:00,  6.34s/it]


Technique: humapKDTREE_NN
Level: _level2.csv


100%|██████████| 20/20 [01:54<00:00,  5.74s/it]


Level: _level0.csv


100%|██████████| 20/20 [02:06<00:00,  6.31s/it]
  0%|          | 0/20 [00:00<?, ?it/s]

DATASET: mnist
Technique: hsneCPU
Level: _level2.csv


100%|██████████| 20/20 [02:04<00:00,  6.25s/it]


Level: _level0.csv


100%|██████████| 20/20 [02:06<00:00,  6.34s/it]


Technique: hsneGPU
Level: _level2.csv


100%|██████████| 20/20 [01:59<00:00,  6.00s/it]


Level: _level0.csv


100%|██████████| 20/20 [02:02<00:00,  6.14s/it]


Technique: mphate
Technique: humap
Level: _level2.csv


100%|██████████| 20/20 [01:59<00:00,  5.96s/it]


Level: _level0.csv


100%|██████████| 20/20 [02:00<00:00,  6.01s/it]


Technique: humapFLANN
Level: _level2.csv


100%|██████████| 20/20 [01:59<00:00,  5.96s/it]


Level: _level0.csv


100%|██████████| 20/20 [02:07<00:00,  6.38s/it]


Technique: humapKDTREE_NN
Level: _level2.csv


100%|██████████| 20/20 [02:01<00:00,  6.09s/it]


Level: _level0.csv


100%|██████████| 20/20 [02:00<00:00,  6.02s/it]

880 880 880
880 880 880
26400 26400 26400 26400





In [39]:
df_correlation.to_csv('./correlation_values.csv', index=False)
df_demap.to_csv('./demap_values.csv', index=False)
df_np.to_csv('./n0p_values.csv', index=False)