In [1]:
import glob
from collections import defaultdict

import numpy as np
import pandas as pd
from scipy.stats import spearmanr

In [2]:
# Print the working directory: the relative `../../../Paper/...` paths used in
# the following cells are resolved against it.
!pwd

/Users/malte.luecken/helmholtz_munich/benchmarking_data_integration/scib-reproducibility/notebooks/analysis


In [3]:
# Collect every per-task summary-score table of the kBET-fix results.
# `glob` is used instead of capturing `!ls` output: it needs no shell and no
# backslash-escaped spaces, and a pattern without matches gives an empty list
# rather than whatever `ls` prints on failure. `sorted` keeps the task order
# deterministic.
# Alternative (older) results location: '../../../Paper/Supplementary Files/Results/*.csv'
files = sorted(glob.glob('../../../Paper/202109_kBET_fix/Supplementary Files/Results/*.csv'))

In [4]:
# One score table per task, keyed by the path of the CSV it was read from.
data = {csv_path: pd.read_csv(csv_path) for csv_path in files}

# Test robustness to other score aggregation approaches (e.g., z-scores)

In [5]:
# Column names of the batch-removal metrics.
batch_scores = [
    'PCR batch',
    'Batch ASW',
    'graph iLISI',
    'graph connectivity',
    'kBET',
]

# Column names of the bio-conservation metrics.
bio_scores = [
    'NMI cluster/label',
    'ARI cluster/label',
    'Cell type ASW',
    'isolated label F1',
    'isolated label silhouette',
    'graph cLISI',
    'CC conservation',
    'HVG conservation',
]

In [6]:
def max_min_scale_cols(df):
    """Min-max scale `df`: subtract the minimum, divide by (maximum - minimum).

    For a DataFrame, `min()`/`max()` are taken per column, so every column is
    mapped onto [0, 1]. A column whose values are all equal has a zero range
    and comes out as NaN (0/0).
    """
    lowest = df.min()
    value_range = df.max() - lowest
    return (df - lowest) / value_range

def z_score(df):
    """Standardise `df`: subtract the mean and divide by the standard deviation.

    For a DataFrame, `mean()`/`std()` are taken per column; `std()` is the
    pandas default (sample standard deviation, ddof=1).
    """
    centred = df - df.mean()
    return centred / df.std()

## For RNA and simulation tasks:

In [7]:
def test_ranking(df, batch_weight=0.4, batch_scores=None, bio_scores=None):
    """Rank methods by z-score-aggregated metrics and compare with the input order.

    Every available metric column is z-scored (via `z_score`), the batch and
    bio metrics are averaged per row, and the two averages are combined into
    `batch_weight * batch + (1 - batch_weight) * bio`. Rows are then ranked by
    this score (highest first) and the new ranks are correlated with the row
    index of `df` using Spearman's rho.

    Parameters
    ----------
    df : pandas.DataFrame
        Summary-score table with one row per method configuration. Must
        contain 'Method' and 'Output' (plus 'Scaling' when 'Features' is
        present) and any subset of the metric columns.
        NOTE: `df` is modified in place; the columns 'Batch_Correction_zsc',
        'Bio_conservation_zsc', 'Overall_Score_zsc' and 'Method_id' are
        added or overwritten.
        NOTE(review): the row index is taken as the original rank, so this
        presumably expects rows ordered by the original overall score with a
        default 0..n-1 index -- confirm against the input files.
    batch_weight : float, default 0.4
        Weight of the batch-correction average; bio conservation receives
        `1 - batch_weight`.
    batch_scores, bio_scores : list of str, optional
        Metric column names to aggregate. Default to the lists below.
        Names missing from `df` are ignored.

    Returns
    -------
    tuple
        `(rho, ranking)`: the Spearman correlation between new rank and row
        index, and a DataFrame with columns 'Method_id', 'rank_zsc' (new
        rank, 0 = best) and 'rank_init' (row index), ordered by new rank.
    """
    if batch_scores is None:
        batch_scores = ['PCR batch', 'Batch ASW', 'graph iLISI', 'graph connectivity', 'kBET']
    if bio_scores is None:
        bio_scores = ['NMI cluster/label', 'ARI cluster/label', 'Cell type ASW', 'isolated label F1', 'isolated label silhouette', 'graph cLISI', 'CC conservation', 'HVG conservation', 'trajectory conservation']

    # Only aggregate the metrics that were actually computed for this task.
    batch_score_sub = [metric for metric in batch_scores if metric in df.columns]
    bio_score_sub = [metric for metric in bio_scores if metric in df.columns]

    df['Batch_Correction_zsc'] = z_score(df[batch_score_sub]).mean(axis=1)
    df['Bio_conservation_zsc'] = z_score(df[bio_score_sub]).mean(axis=1)
    df['Overall_Score_zsc'] = (
        batch_weight * df['Batch_Correction_zsc']
        + (1 - batch_weight) * df['Bio_conservation_zsc']
    )

    # Build a readable identifier per row. The id columns are extracted once
    # (not once per row) and cast to str so that a missing/non-string entry
    # does not make the join raise.
    id_cols = ['Method', 'Output']
    if 'Features' in df.columns:
        id_cols = id_cols + ['Features', 'Scaling']
    df['Method_id'] = ['_'.join(parts) for parts in df[id_cols].astype(str).values]

    sorted_df = df[['Method_id', 'Overall_Score_zsc']].sort_values(by='Overall_Score_zsc', ascending=False)

    # After sorting, position i is the z-score rank and the index label is
    # the row's position in the input table.
    rank_zsc = np.arange(sorted_df.shape[0])
    test_statistic = spearmanr(rank_zsc, sorted_df.index)[0]

    ranking = pd.DataFrame({
        'Method_id': sorted_df['Method_id'],
        'rank_zsc': rank_zsc,
        'rank_init': sorted_df.index,
    })
    return (test_statistic, ranking)

In [8]:
# Run the z-score robustness check on every RNA / simulation task.
# The result is printed explicitly: a bare expression inside a `for` body is
# never echoed by the notebook, so without `print` only the file names would
# be shown on a fresh run.
for file, scores in data.items():
    print(file)
    print(test_ranking(scores))

../../../Paper/202109_kBET_fix/Supplementary Files/Results/immune_cell_hum_mou_summary_scores.csv


(0.9916331750091342,
                          Method_id  rank_zsc  rank_init
 0         scGen*_gene_HVG_unscaled         0          0
 4        scGen*_gene_FULL_unscaled         1          4
 1       Scanorama_embed_HVG_scaled         2          1
 5            MNN_gene_HVG_unscaled         3          5
 3       scANVI*_embed_HVG_unscaled         4          3
 ..                             ...       ...        ...
 64   Seurat v3 CCA_gene_HVG_scaled        64         64
 65  Seurat v3 CCA_gene_FULL_scaled        65         65
 66        trVAE_embed_HVG_unscaled        66         66
 67       trVAE_embed_FULL_unscaled        67         67
 68         scGen*_gene_FULL_scaled        68         68
 
 [69 rows x 3 columns])

../../../Paper/202109_kBET_fix/Supplementary Files/Results/immune_cell_hum_summary_scores.csv


(0.9822067957617829,
                       Method_id  rank_zsc  rank_init
 6      scGen*_gene_HVG_unscaled         0          6
 1    fastMNN_embed_HVG_unscaled         1          1
 0    Scanorama_embed_HVG_scaled         2          0
 9     fastMNN_gene_HVG_unscaled         3          9
 2   scANVI*_embed_FULL_unscaled         4          2
 ..                          ...       ...        ...
 63    SAUCIE_embed_HVG_unscaled        64         63
 65   SAUCIE_embed_FULL_unscaled        65         65
 66    SAUCIE_gene_FULL_unscaled        66         66
 67       scGen*_gene_HVG_scaled        67         67
 68      scGen*_gene_FULL_scaled        68         68
 
 [69 rows x 3 columns])

../../../Paper/202109_kBET_fix/Supplementary Files/Results/lung_atlas_summary_scores.csv


(0.9635367190354404,
                          Method_id  rank_zsc  rank_init
 0        scGen*_gene_FULL_unscaled         0          0
 1         scGen*_gene_HVG_unscaled         1          1
 2       scANVI*_embed_HVG_unscaled         2          2
 3      scANVI*_embed_FULL_unscaled         3          3
 8   Seurat v3 RPCA_gene_HVG_scaled         4          8
 ..                             ...       ...        ...
 65       SAUCIE_gene_FULL_unscaled        64         65
 62       SAUCIE_embed_HVG_unscaled        65         62
 66       LIGER_embed_FULL_unscaled        66         66
 67        LIGER_embed_HVG_unscaled        67         67
 68  Seurat v3 CCA_gene_FULL_scaled        68         68
 
 [69 rows x 3 columns])

../../../Paper/202109_kBET_fix/Supplementary Files/Results/mouse_brain_summary_scores.csv


(0.9952036199095022,
                           Method_id  rank_zsc  rank_init
 1          BBKNN_graph_HVG_unscaled         0          1
 0        scANVI*_embed_HVG_unscaled         1          0
 7         BBKNN_graph_FULL_unscaled         2          7
 2            BBKNN_graph_HVG_scaled         3          2
 3          Harmony_embed_HVG_scaled         4          3
 5          ComBat_gene_HVG_unscaled         5          5
 6        Harmony_embed_HVG_unscaled         6          6
 4           BBKNN_graph_FULL_scaled         7          4
 8           scVI_embed_HVG_unscaled         8          8
 11        Scanorama_gene_HVG_scaled         9         11
 10      Harmony_embed_FULL_unscaled        10         10
 9        Scanorama_embed_HVG_scaled        11          9
 13         LIGER_embed_HVG_unscaled        12         13
 15           ComBat_gene_HVG_scaled        13         15
 12      scANVI*_embed_FULL_unscaled        14         12
 14      Scanorama_gene_HVG_unscaled        15     

../../../Paper/202109_kBET_fix/Supplementary Files/Results/pancreas_summary_scores.csv


(0.9685056631348193,
                          Method_id  rank_zsc  rank_init
 1   Seurat v3 RPCA_gene_HVG_scaled         0          1
 3    Seurat v3 CCA_gene_HVG_scaled         1          3
 16       Scanorama_gene_HVG_scaled         2         16
 0       Harmony_embed_HVG_unscaled         3          0
 6           scGen*_gene_HVG_scaled         4          6
 ..                             ...       ...        ...
 64        SAUCIE_embed_FULL_scaled        64         64
 65         SAUCIE_gene_FULL_scaled        65         65
 66         scGen*_gene_FULL_scaled        66         66
 68       SAUCIE_gene_FULL_unscaled        67         68
 67      SAUCIE_embed_FULL_unscaled        68         67
 
 [69 rows x 3 columns])

../../../Paper/202109_kBET_fix/Supplementary Files/Results/simulations_1_1_summary_scores.csv


(0.9610887833394228,
                           Method_id  rank_zsc  rank_init
 4           scGen*_gene_FULL_scaled         0          4
 1       Scanorama_embed_FULL_scaled         1          1
 0         Harmony_embed_FULL_scaled         2          0
 2        scANVI*_embed_HVG_unscaled         3          2
 11             MNN_gene_FULL_scaled         4         11
 ..                              ...       ...        ...
 61         fastMNN_embed_HVG_scaled        64         61
 60          BBKNN_graph_FULL_scaled        65         60
 67        SAUCIE_gene_FULL_unscaled        66         67
 66       SAUCIE_embed_FULL_unscaled        67         66
 68  Unintegrated_gene_FULL_unscaled        68         68
 
 [69 rows x 3 columns])

../../../Paper/202109_kBET_fix/Supplementary Files/Results/simulations_2_summary_scores.csv


(0.9920716112531971,
                             Method_id  rank_zsc  rank_init
 0            Harmony_embed_HVG_scaled         0          0
 1            scGen*_gene_HVG_unscaled         1          1
 2           scGen*_gene_FULL_unscaled         2          2
 6     Seurat v3 CCA_gene_HVG_unscaled         3          6
 4           Harmony_embed_FULL_scaled         4          4
 ..                                ...       ...        ...
 64          SAUCIE_embed_HVG_unscaled        64         64
 65   Seurat v3 RPCA_gene_HVG_unscaled        65         65
 66  Seurat v3 RPCA_gene_FULL_unscaled        66         66
 67     Seurat v3 RPCA_gene_HVG_scaled        67         67
 68    Seurat v3 RPCA_gene_FULL_scaled        68         68
 
 [69 rows x 3 columns])

On the RNA and simulation tasks, the alternative ranking obtained by z-scoring the metrics prior to aggregation has a Spearman correlation of at least 0.96 with the ranking obtained via the min-max scaling and aggregation that we perform. The lowest value (0.961) is for the least discriminative task, simulation 1.

## For ATAC:

In [9]:
# The two mouse-brain ATAC summary tables are listed explicitly
# (alternative: !ls ../../../Paper/Supplementary\ Files/Results/ATAC/*.csv).
files = [
    '../../../Paper/202109_kBET_fix/Supplementary Files/Results/ATAC/mouse_brain_atac_large_summary_scores.csv',
    '../../../Paper/202109_kBET_fix/Supplementary Files/Results/ATAC/mouse_brain_atac_small_summary_scores.csv',
]

# One score table per ATAC task, keyed by the path it was read from.
data_atac = dict(zip(files, map(pd.read_csv, files)))

In [10]:
# Run the z-score robustness check on every ATAC task.
# The result is printed explicitly: a bare expression inside a `for` body is
# never echoed by the notebook, so without `print` only the file names would
# be shown on a fresh run.
for file, scores in data_atac.items():
    print(file)
    print(test_ranking(scores))

../../../Paper/202109_kBET_fix/Supplementary Files/Results/ATAC/mouse_brain_atac_large_summary_scores.csv


(0.9997077732320282,
               Method_id  rank_zsc  rank_init
 0           LIGER_embed         0          0
 2           LIGER_embed         1          2
 1           ComBat_gene         2          1
 4         scANVI*_embed         3          4
 3           ComBat_gene         4          3
 5           BBKNN_graph         5          5
 6         Harmony_embed         6          6
 9           BBKNN_graph         7          9
 7     Unintegrated_gene         8          7
 8         Harmony_embed         9          8
 10    Unintegrated_gene        10         10
 11          scGen*_gene        11         11
 12           scVI_embed        12         12
 13           DESC_embed        13         13
 14        Harmony_embed        14         14
 15           DESC_embed        15         15
 16          ComBat_gene        16         16
 17    Unintegrated_gene        17         17
 18          LIGER_embed        18         18
 19      Scanorama_embed        19         19
 20         f

../../../Paper/202109_kBET_fix/Supplementary Files/Results/ATAC/mouse_brain_atac_small_summary_scores.csv


(0.9911642122811894,
               Method_id  rank_zsc  rank_init
 0           LIGER_embed         0          0
 1         scANVI*_embed         1          1
 3           LIGER_embed         2          3
 2           BBKNN_graph         3          2
 4    Seurat v3 CCA_gene         4          4
 6            DESC_embed         5          6
 7   Seurat v3 RPCA_gene         6          7
 5           BBKNN_graph         7          5
 15  Seurat v3 RPCA_gene         8         15
 10           scVI_embed         9         10
 9         Harmony_embed        10          9
 8            scVI_embed        11          8
 14        fastMNN_embed        12         14
 11        scANVI*_embed        13         11
 19          scGen*_gene        14         19
 18          scGen*_gene        15         18
 20    Unintegrated_gene        16         20
 12        Harmony_embed        17         12
 13        scANVI*_embed        18         13
 16           scVI_embed        19         16
 21         f

Rank correlations are high for the ATAC tasks as well (both above 0.99).