In [9]:
import os
import demap
import math
import random
import numpy as np
import pandas as pd
import hierarchical_umap as h_umap
import matplotlib.pyplot as plt

from sklearn.neighbors import NearestNeighbors
from sklearn.utils import check_array
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA
from sklearn.metrics import euclidean_distances
from sklearn.neighbors import NearestNeighbors

from scipy.stats import pearsonr

In [2]:
def correlation(X, X_emb):
    
    dist_orig = np.square(euclidean_distances(X, X)).flatten()
    dist_emb = np.square(euclidean_distances(X_emb, X_emb)).flatten()
    
    
    coef, p = spearmanr(dist_orig, dist_emb)
    return coef

def stress(X, X_emb):
    
    DE = euclidean_distances(X_emb)
    DE = DE/np.max(DE)
    DH = euclidean_distances(X)
    DH = DH/np.max(DH)
    stress = 0.5 * np.sum((DE - DH)**2)
    
    return np.sqrt(stress/(0.5*np.sum(DH**2)))
    

def neighborhood_preservation(X, X_emb, Khigh=30):
    
    neigh_high = NearestNeighbors(n_neighbors=Khigh+1, n_jobs=-1)
    neigh_high.fit(X)
    high_dists, high_indices = neigh_high.kneighbors(X)


    neigh_emb = NearestNeighbors(n_neighbors=Khigh+1, n_jobs=-1)
    neigh_emb.fit(X_emb)
    emb_dists, emb_indices = neigh_emb.kneighbors(X_emb)

    npres = np.zeros(Khigh)
    
    for k in range(1, Khigh+1):
        for i in range(X.shape[0]):
            high_current = high_indices[i][1:k+1]
            emb_current = emb_indices[i][1:k+1]
            
            tp = len(np.intersect1d(high_current, emb_current))
            
            npres[k-1] += (tp/k)
        
        
    npres /= float(X.shape[0])
    
    return npres

In [25]:
def get_fmnist():
    fashionTrain = pd.read_csv('data/fashion-train.csv')

    fashionX = fashionTrain.values[:,2:]
    fashionY = fashionTrain.values[:, 1].astype(int)

    X = normalize(fashionX)
    y = fashionY

    X = check_array(X, dtype=np.float32, accept_sparse='csr', order='C')
    
    return X, y

def get_mnist():
    X = np.load('./data/MNIST_70000.npy')
    y = np.load('./data/MNIST_70000_label.npy').astype(int)
    X = normalize(X)
    X = check_array(X, dtype=np.float32, accept_sparse='csr', order='C')
    
    return X, y
X, y = get_fmnist()    

In [30]:
distance_similarities = [False, True]
path_increments = [False, True]
n_neighbors =  [15, 20, 40, 50, 70, 90, 100]
landmarks_nwalks = [10, 20, 30]
landmarks_walklengths = [5, 10, 20, 30, 50, 100]
influence_nwalks =  [10, 20, 30]
influence_walklengths = [10, 30, 50, 70, 80, 90, 100]
influence_neighborhoods = [0.0, 0.1, 0.2, 0.3, 0.5, 0.7, 0.9, 1]
min_dists = [0.05,0.1, 0.15]
executions = 10

n_parameters = len(distance_similarities) * len(path_increments) * len(n_neighbors) * len(landmarks_nwalks) * len(influence_neighborhoods) 
print("%d parameters." % (n_parameters))

672 parameters.


In [None]:
id_conf = 0
file_conf = open('experiments/humap_parameters/current_analysis.csv', 'w')
file_conf.write('id,distance_similarity,path_increment,n_neighbors,landmarks_nwlandmarks_wl,influence_nw,influence_wl,influence_neighborhood,n_dist\n')

for distance_similarity in distance_similarities:
    for path_increment in path_increments:
        for n_neigh in n_neighbors:
            for landmarks_nwalk in landmarks_nwalks:
                for landmarks_walklength in landmarks_walklengths:
                    for influence_nwalk in influence_nwalks:
                        for influence_walklength in influence_walklengths:
                            for influence_neighborhood in influence_neighborhoods:
                                for min_dist in min_dists:

                                    inf_neighborhood = int(influence_neighborhood*n_neigh)
                                    path = 'experiments/humap_parameters/'+str(id_conf)+'/'
                                    id_conf += 1
                                    os.mkdir(path)
                                    
                                    correlation_list = []
                                    np_list = []
                                    demap_list = []
                                    
                                    for i in range(executions):

                                        reducer = h_umap.HUMAP('precomputed', np.array([0.20, 0.19]), n_neigh, min_dist, 'NNDescent', 0.0, True)

                                        reducer.set_distance_similarity(distance_similarity)
                                        reducer.set_path_increment(path_increment)
                                        reducer.set_landmarks_nwalks(landmarks_nwalk)
                                        reducer.set_landmarks_wl(landmarks_walklength)
                                        reducer.set_influence_nwalks(influence_nwalk)
                                        reducer.set_influence_wl(influence_walklength)
                                        reducer.set_influence_neighborhood(inf_neighborhood)

                                        reducer.fit(X, y)
                                        embedding_2 = reducer.get_embedding(2)
                                        y_2 = reducer.get_labels(2)
                                        X_2 = X[reducer.get_original_indices(2), :]
                                        
                                        indices_2 = random.sample(range(0, len(y_2)), min(3000, len(y_2)))
                                        subset_emb_2 = embedding_2[indices_2]
                                        subset_X_2 = X_2[indices_2]
                                        
                                        plt.scatter(embedding_2[:, 0], embedding_2[:, 1], c=y_2, alpha=0.6, cmap='Spectral')
                                        plt.savefig(path+'/embedding_'+str(i)+'.svg')
                                        plt.clf()
                                        
                                        demap_value = demap.DEMaP(subset_X_2, subset_emb_2)
                                        demap_list.append(demap_value)
                                        corr_value = correlation(subset_X_2, subset_emb_2)
                                        correlation_list.append(corr_value)
                                        npres = neighborhood_preservation(subset_X_2, subset_emb_2)
                                        np_list = np_list + npres.tolist()
                                    
                                    df_corr = pd.DataFrame({
                                        'index': list(range(len(correlation_list))),
                                        'corr': correlation_list
                                    })
                                    
                                    df_np = pd.DataFrame({
                                        'n_neighbors': list(range(30))*executions,
                                        'n_preservation': np_list
                                    })
                                    
                                    df_demap = pd.DataFrame({
                                        'index': list(range(len(demap_list))),
                                        'demap': demap_list
                                    })
                                    
                                    df_corr.to_csv(path+'/correlation.csv', index=False)
                                    df_np.to_csv(path+'/npreservation.csv', index=False)
                                    df_demap.to_csv(path+'/demap.csv', index=False)
                                    

                                    file_conf.write(str(id_conf)+','+str(distance_similarity)+','+str(path_increment)+','+str(n_neigh)+','+str(landmarks_nwalk)+','+str(landmarks_walklength)+','+str(influence_nwalk)+','+str(influence_walklength)+','+str(influence_neighborhood)+','+str(min_dist)+'\n')
                                    
                                        
                                        
                                        
                                        
file_conf.close()

In [None]:
distance_similarity = 
path_increment = 
n_neighbors = 
min_dist = 

landmarks_nwalks = [5, 10, 20, 30, 50, 100]
landmarks_walklengths = [5, 10, 20, 30, 50, 100]
influence_nwalks = [5, 10, 20, 30, 50, 100]
influence_walklengths = [5, 10, 20, 30, 50, 60, 70, 80, 90, 100]
influence_neighborhoods = [0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]

In [None]:
for landmarks_nwalk in landmarks_nwalks:
    for landmarks_walklength in landmarks_walklengths:
        for influence_nwalk in influence_nwalks:
            for influence_walklength in influence_walklengths:
                for influence_neighborhood in influence_neighborhoods:

                    inf_neighborhood = int(influence_neighborhood*n_neigh)