In [9]:
import os
import humap

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from scipy.spatial import procrustes

In [10]:
## functions to load datasets
def load_fmnist():
    """Load the Fashion-MNIST training CSV as normalized features plus labels.

    Reads ``./../data/fashion-train.csv``. Column 1 of the table is used as the
    label and columns 2 onward as the features.

    Returns:
        tuple: ``(X, y)`` where ``X`` is the result of ``check_array`` (called
        with ``dtype=np.float32`` and ``order='C'``) on the normalized features
        and ``y`` is the label column cast to ``int``.
    """
    # NOTE(review): `normalize` and `check_array` are not imported in the visible
    # cells -- presumably scikit-learn's; confirm they are in scope.
    table = pd.read_csv('./../data/fashion-train.csv').values

    labels = table[:, 1].astype(int)
    features = normalize(table[:, 2:])
    features = check_array(features, dtype=np.float32, accept_sparse='csr', order='C')

    return features, labels

def load_mnist():
    """Load MNIST from the ``.npy`` files under ``./../data``.

    Returns:
        tuple: ``(X, y)`` where ``X`` is the result of ``check_array`` (called
        with ``dtype=np.float32`` and ``order='C'``) on the normalized features
        and ``y`` is the label array cast to ``int``.
    """
    # NOTE(review): `normalize` and `check_array` are not imported in the visible
    # cells -- presumably scikit-learn's; confirm they are in scope.
    features = np.load('./../data/MNIST_70000.npy')
    labels = np.load('./../data/MNIST_70000_label.npy').astype(int)

    features = check_array(
        normalize(features), dtype=np.float32, accept_sparse='csr', order='C'
    )

    return features, labels

def load_scRNAseq():
    """Load five 10X scRNA-seq batches and return a 50-component PCA with labels.

    Steps, in order: load each batch directory under ``./../data/scRNAseq``,
    apply two library-size filters per batch, combine the batches with one
    label per batch, filter rare genes, library-size normalize, filter cells on
    the expression of the "MT-" gene set, apply a square-root transform, encode
    the batch labels as integers and project the counts with PCA.

    Returns:
        tuple: ``(X, labels)`` where ``X`` is the result of ``check_array``
        (called with ``dtype=np.float32`` and ``order='C'``) on the PCA output
        and ``labels`` are the encoded batch labels.
    """
    # NOTE(review): `scprep`, `LabelEncoder`, `PCA` and `check_array` are not
    # imported in the visible cells -- confirm they are in scope.
    data_root = os.path.join(os.path.expanduser("./../data"), "scRNAseq")
    batch_dirs = ("T0_1A", "T2_3B", "T4_5C", "T6_7D", "T8_9E")
    day_labels = ["Day 00-03", "Day 06-09", "Day 12-15", "Day 18-21", "Day 24-27"]

    raw_batches = [
        scprep.io.load_10X(os.path.join(data_root, batch_dir), sparse=True, gene_labels='both')
        for batch_dir in batch_dirs
    ]

    # Two library-size filters per batch: percentile=20 keeping 'above',
    # then percentile=75 keeping 'below'.
    filtered_batches = [
        scprep.filter.filter_library_size(
            scprep.filter.filter_library_size(raw, percentile=20, keep_cells='above'),
            percentile=75,
            keep_cells='below',
        )
        for raw in raw_batches
    ]
    del raw_batches  # drop the unfiltered batches from memory

    counts, sample_labels = scprep.utils.combine_batches(
        filtered_batches, day_labels, append_to_cell_names=True
    )
    del filtered_batches  # removes objects from memory

    counts = scprep.filter.filter_rare_genes(counts, min_cells=10)
    counts = scprep.normalize.library_size_normalize(counts)

    # Genes whose names start with "MT-" (mitochondrial); the original author
    # notes there are 14 of them.
    mito_genes = scprep.select.get_gene_set(counts, starts_with="MT-")
    counts, sample_labels = scprep.filter.filter_gene_set_expression(
        counts, sample_labels, genes=mito_genes, percentile=90, keep_cells='below'
    )
    counts = scprep.transform.sqrt(counts)

    encoder = LabelEncoder()
    encoder.fit(sample_labels)
    labels = encoder.transform(sample_labels)

    components = PCA(n_components=50).fit_transform(counts.values)
    components = check_array(components, dtype=np.float32, accept_sparse='csr', order='C')
    return components, labels

def load_mammals():
    """Load the mammals dataset from the text files under ``./../data``.

    Returns:
        tuple: ``(X, y)`` where ``X`` is the normalized feature matrix and ``y``
        is the class array exactly as returned by ``np.loadtxt`` (no cast is
        applied here).
    """
    # NOTE(review): `normalize` is not imported in the visible cells --
    # presumably scikit-learn's; confirm it is in scope.
    features = np.loadtxt("./../data/mammals-20000_features.txt")
    classes = np.loadtxt("./../data/mammals-20000_classes.txt")

    return normalize(features), classes

In [12]:
# Datasets to process. Each entry pairs a loader function ('load') with the
# dataset name ('name'); the name is used as the sub-folder of
# 'comparison-techniques/' when the result CSVs are read.
# Other loaders defined in this notebook can be enabled by adding entries:
#   {'load': load_scRNAseq, 'name': 'scRNAseq'}
#   {'load': load_fmnist,   'name': 'fmnist'}
#   {'load': load_mnist,    'name': 'mnist'}
datasets = [
    {
        'load': load_mammals,
        'name': 'mammals',
    },
]

# Number of runs read per dataset (files 'humap_it0' .. 'humap_it19').
n_executions = 20
# Number of hierarchy levels read per run (files '_level3' down to '_level1').
levels = 3


def select_landmarks(values_level, values_next_level):
    """Collect, for each row of one level, the matching row of the next level.

    Rows are matched on their last column. For every row of ``values_level``
    whose last-column value also occurs in the last column of
    ``values_next_level``, the first matching row of ``values_next_level``
    contributes its columns ``1:-1`` to the result. Rows without a match are
    skipped, so the result can have fewer rows than ``values_level``.

    Args:
        values_level: 2-D array; only the last column of each row is read.
        values_next_level: 2-D array searched for matching last-column values.

    Returns:
        np.ndarray: the selected ``[1:-1]`` slices stacked in the order of
        ``values_level``; an empty array when nothing matches.
    """
    # NOTE(review): presumably the last column holds the identifier of the
    # point in the original dataset -- confirm against the CSV layout.
    next_ids = values_next_level[:, -1]  # loop-invariant, sliced once
    intersect = []

    for row in values_level:
        # The original compared an undefined name `index` and never used `row`;
        # the identifier is taken from the same (last) column that is matched
        # in the next level.
        index = row[-1]
        pos, = np.where(index == next_ids)
        if len(pos) > 0:
            intersect.append(values_next_level[pos[0]][1:-1])

    return np.array(intersect)


def plot_procrustes(embedding_a, embedding_b, y, disparity):
    """Scatter two embeddings side by side and print their disparity.

    Args:
        embedding_a: array whose first two columns are drawn on the left panel.
        embedding_b: array whose first two columns are drawn on the right panel.
        y: per-point values mapped to colours through the 'tab10' colormap.
        disparity: number printed with five decimals after the plots are drawn.
    """
    _, panels = plt.subplots(1, 2, figsize=(12, 4))

    for panel, embedding in zip(panels, (embedding_a, embedding_b)):
        panel.scatter(embedding[:, 0], embedding[:, 1], c=y, alpha=0.1, cmap='tab10')
        # Turn off ticks and tick labels on both axes.
        panel.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
        panel.tick_params(axis='y', which='both', left=False, right=False, labelleft=False)

    print("Disparity between embeddings: %.5f" % disparity)

In [None]:
# For every dataset and run, read the per-level HUMAP CSVs, pair each level
# with the matching rows of the following level, and plot the Procrustes
# alignment of the two.
# NOTE(review): `tqdm` is not imported in the visible cells -- confirm it is in scope.
for dataset in datasets:
    name = dataset['name']

    for execution in tqdm(range(n_executions)):

        # Files are read from level `levels` down to level 1.
        humap_values = [
            pd.read_csv('comparison-techniques/'+name+'/humap_it'+str(execution)+'_level'+str(levels-level)+'.csv').values
            for level in range(levels)
        ]

        # One landmark set per pair of consecutive levels.
        humap_landmarks = [
            select_landmarks(current, following)
            for current, following in zip(humap_values, humap_values[1:])
        ]

        for current, landmarks in zip(humap_values, humap_landmarks):
            # Columns 1:-1 are passed as the embedding, column 0 as the colours.
            mtx1, mtx2, disparity = procrustes(current[:, 1:-1], landmarks)
            plot_procrustes(mtx1, mtx2, current[:, 0], disparity)
        
            
            
        
            
        
        
        