In [2]:
import nptsne
from nptsne import hsne_analysis
import multiscale_phate as mp

import time
import os
import scprep
import demap
import math
import random
import numpy as np
import pandas as pd
import humap
import matplotlib.pyplot as plt

from tqdm import tqdm

from sklearn.neighbors import NearestNeighbors
from sklearn.utils import check_array
from sklearn.preprocessing import normalize, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import euclidean_distances
from sklearn.neighbors import NearestNeighbors

from scipy.stats import pearsonr, spearmanr

In [3]:
def load_fmnist():
    """Load the Fashion-MNIST training table from './../data/fashion-train.csv'.

    Column index 1 of the CSV is used as the integer class label and every
    column from index 2 onward is used as a feature.

    Returns:
        (X, y): X is the feature matrix after sklearn's `normalize` (default
        arguments), validated by `check_array` as a float32, C-ordered array;
        y is the label vector cast to int.
    """
    table = pd.read_csv('./../data/fashion-train.csv').values

    features = table[:, 2:]
    labels = table[:, 1].astype(int)

    # check_array enforces dtype and memory layout on the normalized features.
    features = check_array(
        normalize(features),
        dtype=np.float32,
        accept_sparse='csr',
        order='C',
    )
    return features, labels


def load_mammals():
    """Load the mammals dataset from the two text files under './../data/'.

    Returns:
        (X, y): X is the feature matrix read with `np.loadtxt` and passed
        through sklearn's `normalize` (default arguments); y is the class
        vector exactly as `np.loadtxt` returns it (no dtype cast is applied).
    """
    raw_features = np.loadtxt("./../data/mammals-20000_features.txt")
    classes_vector = np.loadtxt("./../data/mammals-20000_classes.txt")

    return normalize(raw_features), classes_vector

In [4]:
# --- Experiment configuration -------------------------------------------------
n_executions = 20   # total number of repetitions per dataset
levels = 3          # number of hierarchy scales in this experiment

# One result table per technique and hierarchy level (top, level 1, level 0).
# They are filled and written out once per execution.
df_humapTOP, df_humapLevel1, df_humapLevel0 = (pd.DataFrame() for _ in range(3))
df_hsneTOP, df_hsneLevel1, df_hsneLevel0 = (pd.DataFrame() for _ in range(3))

# Datasets to benchmark: 'load' is a zero-argument callable returning (X, y),
# 'name' is used for the output sub-directory.
datasets = [
    # Disabled entry, kept for reference:
    # {'load': load_mammals, 'name': 'mammals-drill'},
    {
        'load': load_fmnist,
        'name': 'fmnist-drill'
    },
]
    
    
# Number of optimisation steps run on every HSNE analysis before its
# embedding is read back.
HSNE_ITERATIONS = 500


def _store_run(frame, execution, labels, embedding, inds):
    """Write one run's results into `frame` as four new columns.

    The columns are named label<k>, x<k>, y<k> and inds<k>, where <k> is the
    execution number; x/y are columns 0 and 1 of `embedding`.
    """
    suffix = str(execution)
    frame['label' + suffix] = labels
    frame['x' + suffix] = embedding[:, 0]
    frame['y' + suffix] = embedding[:, 1]
    frame['inds' + suffix] = inds


for dataset in datasets:
    print("Loading %s dataset..." % (dataset['name']))
    X, y = dataset['load']()
    print("Done.")

    # makedirs creates "comparison-drill/" and the dataset sub-directory in one
    # call and does not fail when they already exist.
    out_dir = "comparison-drill/" + dataset['name']
    os.makedirs(out_dir, exist_ok=True)

    # Resume support: restart right after the highest execution whose HUMAP
    # level-0 result file is already on disk.
    init = 0
    for i in range(n_executions):
        if os.path.exists(out_dir + '/humap_it' + str(i) + '_level0.csv'):
            init = i + 1

    print("Initing at %d" % (init))

    # Class labels kept when drilling down; they only depend on the dataset,
    # so they are chosen once instead of once per execution.
    if dataset['name'] == 'mammals-drill':
        classes = [1, 2, 3]
    else:
        classes = [0, 2, 3, 4, 6, 8]

    for execution in tqdm(range(init, n_executions)):

        # ---- Build the HSNE hierarchy (timed) --------------------------------
        hsne = nptsne.HSne(True)
        tic = time.time()
        hsne.create_hsne(X, 3)
        execution_hsne_fit = time.time() - tic

        n_level0 = hsne.get_scale(0).num_points
        n_level1 = hsne.get_scale(1).num_points
        n_level2 = hsne.get_scale(2).num_points

        # HUMAP is configured with the point-count ratios between consecutive
        # HSNE scales.
        hUmap = humap.HUMAP(np.array([n_level1 / n_level0, n_level2 / n_level1]))
        hUmap.set_influence_neighborhood(0)
        hUmap.set_fixing_term(0.01)

        # ---- HSNE embeddings -------------------------------------------------
        # NOTE: the analysis model is created with EmbedderType.GPU.
        tic = time.time()
        container = hsne_analysis.AnalysisModel(hsne, hsne_analysis.EmbedderType.GPU)
        analysis_level2 = container.top_analysis
        for _ in range(HSNE_ITERATIONS):
            analysis_level2.do_iteration()
        execution_hsneTOP = (time.time() - tic)

        y_analysis_level2 = y[analysis_level2.landmark_orig_indexes]

        # Positions (within the top analysis) of landmarks whose label is one
        # of the selected classes.
        indices = [
            position
            for position, label in enumerate(y_analysis_level2)
            if label in classes
        ]

        tic = time.time()
        analysis_cluster1 = container.add_new_analysis(analysis_level2, np.array(indices))
        for _ in range(HSNE_ITERATIONS):
            analysis_cluster1.do_iteration()
        execution_hsneLevel1 = time.time() - tic

        y_analysis_cluster1 = y[analysis_cluster1.landmark_orig_indexes]
        tic = time.time()
        # Every landmark of the level-1 analysis is selected for the next one.
        analysis_cluster0 = container.add_new_analysis(analysis_cluster1, np.arange(len(y_analysis_cluster1)))
        for _ in range(HSNE_ITERATIONS):
            analysis_cluster0.do_iteration()

        execution_hsneLevel0 = (time.time() - tic)

        _store_run(df_hsneTOP, execution,
                   y[analysis_level2.landmark_orig_indexes],
                   analysis_level2.embedding,
                   analysis_level2.landmark_orig_indexes)
        _store_run(df_hsneLevel1, execution,
                   y[analysis_cluster1.landmark_orig_indexes],
                   analysis_cluster1.embedding,
                   analysis_cluster1.landmark_orig_indexes)
        _store_run(df_hsneLevel0, execution,
                   y[analysis_cluster0.landmark_orig_indexes],
                   analysis_cluster0.embedding,
                   analysis_cluster0.landmark_orig_indexes)

        # ---- HUMAP -----------------------------------------------------------
        tic = time.time()
        hUmap.fit(X, y)
        execution_humap_fit = time.time() - tic

        tic = time.time()
        embedding2 = hUmap.transform(2)
        execution_humapTOP = time.time() - tic

        # Select the top-level points belonging to the chosen classes and
        # remember their embedded coordinates.
        indices_class = []
        labels2 = hUmap.labels(2)
        datapoints = []
        for i, label in enumerate(labels2):
            if label in classes:
                indices_class.append(i)
                datapoints.append(embedding2[i])

        tic = time.time()
        # Presumably keeps the selected points at their current coordinates
        # during the drill-down -- confirm against the HUMAP documentation.
        hUmap.fix_datapoints(datapoints)

        embedding_clusterLevel1, y_clusterLevel1, indices_cluster = hUmap.transform(2, indices=indices_class)
        orig_index = hUmap.original_indices(1)
        inds_level1 = orig_index[indices_cluster]
        execution_humapLevel1 = time.time() - tic

        # Same selection one level further down, this time over the points
        # returned by the previous drill-down.
        indices_class = []
        datapoints = []
        for i, label in enumerate(y_clusterLevel1):
            if label in classes:
                indices_class.append(indices_cluster[i])
                datapoints.append(embedding_clusterLevel1[i])

        tic = time.time()

        hUmap.fix_datapoints(datapoints)

        embedding_clusterLevel0, y_clusterLevel0, indices_cluster = hUmap.transform(1, indices=indices_class)
        orig_index = hUmap.original_indices(0)
        inds_level0 = orig_index[indices_cluster]
        execution_humapLevel0 = time.time() - tic

        _store_run(df_humapTOP, execution,
                   hUmap.labels(2), embedding2, hUmap.original_indices(2))
        _store_run(df_humapLevel1, execution,
                   y_clusterLevel1, embedding_clusterLevel1, inds_level1)
        _store_run(df_humapLevel0, execution,
                   y_clusterLevel0, embedding_clusterLevel0, inds_level0)

        # ---- Persist this execution's embeddings -----------------------------
        hsne_prefix = out_dir + '/hsne_it' + str(execution)
        df_hsneTOP.to_csv(hsne_prefix + '_TOP.csv', index=False)
        df_hsneLevel1.to_csv(hsne_prefix + '_level1.csv', index=False)
        df_hsneLevel0.to_csv(hsne_prefix + '_level0.csv', index=False)

        humap_prefix = out_dir + '/humap_it' + str(execution)
        df_humapTOP.to_csv(humap_prefix + '_TOP.csv', index=False)
        df_humapLevel1.to_csv(humap_prefix + '_level1.csv', index=False)
        df_humapLevel0.to_csv(humap_prefix + '_level0.csv', index=False)

        # Start the next execution from empty tables so each CSV holds the
        # columns of exactly one run.
        df_humapTOP = pd.DataFrame()
        df_humapLevel1 = pd.DataFrame()
        df_humapLevel0 = pd.DataFrame()

        df_hsneTOP = pd.DataFrame()
        df_hsneLevel1 = pd.DataFrame()
        df_hsneLevel0 = pd.DataFrame()

        # ---- Append timings --------------------------------------------------
        # The file is opened in a `with` block so the handle is flushed and
        # closed after every execution instead of being left open.
        timings = [
            ('HSNE', 'Fit', execution_hsne_fit),
            ('HUMAP', 'Fit', execution_humap_fit),
            ('HSNE', 'Top', execution_hsneTOP),
            ('HUMAP', 'Top', execution_humapTOP),
            ('HSNE', 'Level 1', execution_hsneLevel1),
            ('HUMAP', 'Level 1', execution_humapLevel1),
            ('HSNE', 'Level 0', execution_hsneLevel0),
            ('HUMAP', 'Level 0', execution_humapLevel0),
        ]
        with open(out_dir + '/run-time.csv', 'a') as time_file:
            for technique, stage, seconds in timings:
                time_file.write(technique + ',' + stage + ',' + str(seconds) + '\n')
    
    
    
    
    
    

Loading fmnist-drill dataset...


  0%|          | 0/2 [00:00<?, ?it/s]

Done.
Initing at 18


100%|██████████| 2/2 [01:35<00:00, 47.97s/it]
