In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict

In [2]:
!pwd

/Users/malte.luecken/helmholtz_munich/benchmarking_data_integration/Benchmarking_data_integration/notebooks/analysis


In [3]:
files = !ls ../../../Paper/202104_Revision/Supplementary\ Files/Results/*.csv
#files = !ls ../../../Paper/Supplementary\ Files/Results/*.csv

In [4]:
data = {file:pd.read_csv(file) for file in files}

# Test robustness to other score aggregation approaches (e.g., z-scores)

In [5]:
batch_scores = ['PCR batch', 'Batch ASW', 'graph iLISI', 'graph connectivity', 'kBET']
bio_scores = ['NMI cluster/label', 'ARI cluster/label', 'Cell type ASW', 'isolated label F1', 'isolated label silhouette', 'graph cLISI', 'CC conservation', 'HVG conservation']

In [6]:
def max_min_scale_cols(df):
    return((df - df.min())/(df.max() - df.min()))

def z_score(df):
    return((df - df.mean())/df.std())

## For RNA & Sims:

In [7]:
def test_ranking(df):
    from scipy.stats import spearmanr
    
    batch_scores = ['PCR batch', 'Batch ASW', 'graph iLISI', 'graph connectivity', 'kBET']
    bio_scores = ['NMI cluster/label', 'ARI cluster/label', 'Cell type ASW', 'isolated label F1', 'isolated label silhouette', 'graph cLISI', 'CC conservation', 'HVG conservation', 'trajectory conservation']
    
    batch_score_sub = [bs for bs in batch_scores if bs in df.columns]
    bio_score_sub = [bs for bs in bio_scores if bs in df.columns]
    
    df['Batch_Correction_zsc'] = z_score(df[batch_score_sub]).mean(1)
    df['Bio_conservation_zsc'] = z_score(df[bio_score_sub]).mean(1)
    df['Overall_Score_zsc'] = 0.4*df['Batch_Correction_zsc'] + 0.6*df['Bio_conservation_zsc']
    
    if 'Features' in df.columns:
        df['Method_id'] = ['_'.join(df[['Method', 'Output', 'Features', 'Scaling']].values[i]) for i in range(df.shape[0])]
    else:
        df['Method_id'] = ['_'.join(df[['Method', 'Output']].values[i]) for i in range(df.shape[0])]
    
    sorted_df = df[['Method_id', 'Overall_Score_zsc', 'Overall Score']].sort_values(by='Overall_Score_zsc', ascending=False)

    sorted_df['rank'] = [i for i in range(sorted_df.shape[0])]
    
    test_statistic = spearmanr(sorted_df['rank'].values, sorted_df.index)[0]
    
    return (test_statistic, pd.DataFrame({'Method_id':sorted_df['Method_id'], 'rank_zsc':sorted_df['rank'].values, 'rank_init':sorted_df.index}))

In [8]:
for file in data.keys():
    print(f'{file}')
    test_ranking(data[file])

../../../Paper/202104_Revision/Supplementary Files/Results/immune_cell_hum_mou_summary_scores.csv


(0.9910485933503838,
                          Method_id  rank_zsc  rank_init
 0         scGen*_gene_HVG_unscaled         0          0
 1         BBKNN_graph_HVG_unscaled         1          1
 5            MNN_gene_HVG_unscaled         2          5
 2          scVI_embed_HVG_unscaled         3          2
 3       Scanorama_embed_HVG_scaled         4          3
 ..                             ...       ...        ...
 64   Seurat v3 CCA_gene_HVG_scaled        64         64
 65  Seurat v3 CCA_gene_FULL_scaled        65         65
 66        trVAE_embed_HVG_unscaled        66         66
 67       trVAE_embed_FULL_unscaled        67         67
 68         scGen*_gene_FULL_scaled        68         68
 
 [69 rows x 3 columns])

../../../Paper/202104_Revision/Supplementary Files/Results/immune_cell_hum_summary_scores.csv


(0.9804895871392036,
                       Method_id  rank_zsc  rank_init
 0    Harmony_embed_HVG_unscaled         0          0
 1    Scanorama_embed_HVG_scaled         1          1
 3   scANVI*_embed_FULL_unscaled         2          3
 2    fastMNN_embed_HVG_unscaled         3          2
 16     scGen*_gene_HVG_unscaled         4         16
 ..                          ...       ...        ...
 63    SAUCIE_embed_HVG_unscaled        64         63
 66    SAUCIE_gene_FULL_unscaled        65         66
 65   SAUCIE_embed_FULL_unscaled        66         65
 67       scGen*_gene_HVG_scaled        67         67
 68      scGen*_gene_FULL_scaled        68         68
 
 [69 rows x 3 columns])

../../../Paper/202104_Revision/Supplementary Files/Results/lung_atlas_summary_scores.csv


(0.9670442089879432,
                            Method_id  rank_zsc  rank_init
 0           scGen*_gene_HVG_unscaled         0          0
 2          scGen*_gene_FULL_unscaled         1          2
 3   Seurat v3 RPCA_gene_HVG_unscaled         2          3
 1         scANVI*_embed_HVG_unscaled         3          1
 17          BBKNN_graph_HVG_unscaled         4         17
 ..                               ...       ...        ...
 65          SAUCIE_gene_HVG_unscaled        64         65
 64         SAUCIE_embed_HVG_unscaled        65         64
 66         LIGER_embed_FULL_unscaled        66         66
 67          LIGER_embed_HVG_unscaled        67         67
 68    Seurat v3 CCA_gene_FULL_scaled        68         68
 
 [69 rows x 3 columns])

../../../Paper/202104_Revision/Supplementary Files/Results/mouse_brain_summary_scores.csv


(0.9960180995475113,
                           Method_id  rank_zsc  rank_init
 1          BBKNN_graph_HVG_unscaled         0          1
 0        scANVI*_embed_HVG_unscaled         1          0
 7         BBKNN_graph_FULL_unscaled         2          7
 2            BBKNN_graph_HVG_scaled         3          2
 3          Harmony_embed_HVG_scaled         4          3
 5          ComBat_gene_HVG_unscaled         5          5
 6        Harmony_embed_HVG_unscaled         6          6
 4           BBKNN_graph_FULL_scaled         7          4
 8           scVI_embed_HVG_unscaled         8          8
 10      Harmony_embed_FULL_unscaled         9         10
 12         LIGER_embed_HVG_unscaled        10         12
 11        Scanorama_gene_HVG_scaled        11         11
 9        Scanorama_embed_HVG_scaled        12          9
 15           ComBat_gene_HVG_scaled        13         15
 13      scANVI*_embed_FULL_unscaled        14         13
 14      Scanorama_gene_HVG_unscaled        15     

../../../Paper/202104_Revision/Supplementary Files/Results/pancreas_summary_scores.csv


(0.9751918158567776,
                            Method_id  rank_zsc  rank_init
 8          Scanorama_gene_HVG_scaled         0          8
 1      Seurat v3 CCA_gene_HVG_scaled         1          1
 0    Seurat v3 CCA_gene_HVG_unscaled         2          0
 2   Seurat v3 RPCA_gene_HVG_unscaled         3          2
 5     Seurat v3 RPCA_gene_HVG_scaled         4          5
 ..                               ...       ...        ...
 63          SAUCIE_embed_FULL_scaled        64         63
 65           SAUCIE_gene_FULL_scaled        65         65
 67         SAUCIE_gene_FULL_unscaled        66         67
 66        SAUCIE_embed_FULL_unscaled        67         66
 68           scGen*_gene_FULL_scaled        68         68
 
 [69 rows x 3 columns])

../../../Paper/202104_Revision/Supplementary Files/Results/simulations_1_1_summary_scores.csv


(0.9453781512605043,
                            Method_id  rank_zsc  rank_init
 8          scGen*_gene_FULL_unscaled         0          8
 0        Harmony_embed_FULL_unscaled         1          0
 6   Seurat v3 CCA_gene_FULL_unscaled         2          6
 1        fastMNN_embed_FULL_unscaled         3          1
 4           scGen*_gene_HVG_unscaled         4          4
 ..                               ...       ...        ...
 57           BBKNN_graph_FULL_scaled        64         57
 61          fastMNN_embed_HVG_scaled        65         61
 67         SAUCIE_gene_FULL_unscaled        66         67
 66        SAUCIE_embed_FULL_unscaled        67         66
 68   Unintegrated_gene_FULL_unscaled        68         68
 
 [69 rows x 3 columns])

../../../Paper/202104_Revision/Supplementary Files/Results/simulations_2_summary_scores.csv


(0.9871026671538181,
                             Method_id  rank_zsc  rank_init
 0            scGen*_gene_HVG_unscaled         0          0
 5           scGen*_gene_FULL_unscaled         1          5
 2       Seurat v3 CCA_gene_HVG_scaled         2          2
 1            Harmony_embed_HVG_scaled         3          1
 4     Seurat v3 CCA_gene_HVG_unscaled         4          4
 ..                                ...       ...        ...
 64          SAUCIE_embed_HVG_unscaled        64         64
 65   Seurat v3 RPCA_gene_HVG_unscaled        65         65
 66  Seurat v3 RPCA_gene_FULL_unscaled        66         66
 67     Seurat v3 RPCA_gene_HVG_scaled        67         67
 68    Seurat v3 RPCA_gene_FULL_scaled        68         68
 
 [69 rows x 3 columns])

On RNA tasks and simulations alternative ranking via z-scores of metrics prior to aggregation has at least spearman correlation of 0.95 to the ranking via min-max scaling and aggregation that we perform. The 0.95 is on the least discriminative task, all other tasks have 0.97 and above.

## For ATAC:

In [9]:
#files = !ls ../../../Paper/Supplementary\ Files/Results/ATAC/*.csv
files = ['../../../Paper/202104_Revision/Supplementary Files/Results/ATAC/mouse_brain_atac_large_summary_scores.csv', '../../../Paper/202104_Revision/Supplementary Files/Results/ATAC/mouse_brain_atac_small_summary_scores.csv']

data_atac = {file:pd.read_csv(file) for file in files}

In [10]:
for file in data_atac.keys():
    print(f'{file}')
    test_ranking(data_atac[file])

../../../Paper/202104_Revision/Supplementary Files/Results/ATAC/mouse_brain_atac_large_summary_scores.csv


(0.9874342489772066,
               Method_id  rank_zsc  rank_init
 10          BBKNN_graph         0         10
 0         Harmony_embed         1          0
 3           ComBat_gene         2          3
 1           ComBat_gene         3          1
 2         Harmony_embed         4          2
 9           BBKNN_graph         5          9
 4           LIGER_embed         6          4
 5           LIGER_embed         7          5
 6     Unintegrated_gene         8          6
 8     Unintegrated_gene         9          8
 7         scANVI*_embed        10          7
 11          scGen*_gene        11         11
 12           scVI_embed        12         12
 26          BBKNN_graph        13         26
 15  Seurat v3 RPCA_gene        14         15
 13           DESC_embed        15         13
 14           DESC_embed        16         14
 17          ComBat_gene        17         17
 18    Unintegrated_gene        18         18
 16        Harmony_embed        19         16
 19   Seurat 

../../../Paper/202104_Revision/Supplementary Files/Results/ATAC/mouse_brain_atac_small_summary_scores.csv


(0.9717699360933595,
               Method_id  rank_zsc  rank_init
 1    Seurat v3 CCA_gene         0          1
 7   Seurat v3 RPCA_gene         1          7
 2         scANVI*_embed         2          2
 0           BBKNN_graph         3          0
 4         Harmony_embed         4          4
 12  Seurat v3 RPCA_gene         5         12
 6         Harmony_embed         6          6
 18    Unintegrated_gene         7         18
 5           LIGER_embed         8          5
 10           DESC_embed         9         10
 13          scGen*_gene        10         13
 8           LIGER_embed        11          8
 20   Seurat v3 CCA_gene        12         20
 9            scVI_embed        13          9
 21          scGen*_gene        14         21
 19          ComBat_gene        15         19
 11        scANVI*_embed        16         11
 16        fastMNN_embed        17         16
 14        scANVI*_embed        18         14
 3           BBKNN_graph        19          3
 15          

With z-scoring BBKNN (on which fewer metrics are computed, and which has more outlier scores in LISI metrics) would be ranked higher. This is likely less robust than our ranking.