In [1]:
import glob
from collections import defaultdict

import numpy as np
import pandas as pd

In [2]:
# Print the current working directory; the relative result paths used below are resolved against it.
!pwd

/Users/malte.luecken/helmholtz_munich/benchmarking_data_integration/Benchmarking_data_integration/notebooks/analysis


In [3]:
# Collect the summary-score tables (one CSV per integration task).
# Alternative results directory previously used here:
#   ../../../Paper/Supplementary Files/Results/*.csv
# glob is used instead of `!ls` so the cell does not depend on a POSIX shell and
# so that "no match" yields an empty list rather than the shell's error text.
RESULTS_PATTERN = '../../../Paper/202010_Revision/Supplementary Files/Results/*.csv'
files = sorted(glob.glob(RESULTS_PATTERN))
assert files, f'no summary score tables matched {RESULTS_PATTERN!r}'

In [4]:
# Read every summary-score table into a DataFrame, keyed by its file path.
data = dict(zip(files, map(pd.read_csv, files)))

# Test robustness to other score aggregation approaches (e.g., z-scores)

In [5]:
# Column names of the batch-correction metrics in the summary tables.
batch_scores = [
    'PCR batch',
    'Batch ASW',
    'graph iLISI',
    'graph connectivity',
    'kBET',
]

# Column names of the bio-conservation metrics in the summary tables.
bio_scores = [
    'NMI cluster/label',
    'ARI cluster/label',
    'Cell type ASW',
    'isolated label F1',
    'isolated label silhouette',
    'graph cLISI',
    'CC conservation',
    'HVG conservation',
]

In [6]:
def max_min_scale_cols(df):
    """Rescale each column: subtract its minimum, then divide by its range (max - min)."""
    col_min = df.min()
    col_range = df.max() - col_min
    shifted = df - col_min
    return shifted / col_range

def z_score(df):
    """Standardise each column: subtract its mean, then divide by its standard deviation (``df.std()``)."""
    centered = df - df.mean()
    spread = df.std()
    return centered / spread

## For RNA & simulations:

In [7]:
def test_ranking(df):
    from scipy.stats import spearmanr
    
    batch_scores = ['PCR batch', 'Batch ASW', 'graph iLISI', 'graph connectivity', 'kBET']
    bio_scores = ['NMI cluster/label', 'ARI cluster/label', 'Cell type ASW', 'isolated label F1', 'isolated label silhouette', 'graph cLISI', 'CC conservation', 'HVG conservation', 'trajectory conservation']
    
    batch_score_sub = [bs for bs in batch_scores if bs in df.columns]
    bio_score_sub = [bs for bs in bio_scores if bs in df.columns]
    
    df['Batch_Correction_zsc'] = z_score(df[batch_score_sub]).mean(1)
    df['Bio_conservation_zsc'] = z_score(df[bio_score_sub]).mean(1)
    df['Overall_Score_zsc'] = 0.4*df['Batch_Correction_zsc'] + 0.6*df['Bio_conservation_zsc']
    
    if 'Features' in df.columns:
        df['Method_id'] = ['_'.join(df[['Method', 'Output', 'Features', 'Scaling']].values[i]) for i in range(df.shape[0])]
    else:
        df['Method_id'] = ['_'.join(df[['Method', 'Output']].values[i]) for i in range(df.shape[0])]
    
    sorted_df = df[['Method_id', 'Overall_Score_zsc', 'Overall Score']].sort_values(by='Overall_Score_zsc', ascending=False)

    sorted_df['rank'] = [i for i in range(sorted_df.shape[0])]
    
    test_statistic = spearmanr(sorted_df['rank'].values, sorted_df.index)[0]
    
    return (test_statistic, pd.DataFrame({'Method_id':sorted_df['Method_id'], 'rank_zsc':sorted_df['rank'].values, 'rank_init':sorted_df.index}))

In [8]:
# Run the z-score ranking comparison on every loaded table.
# An expression inside a `for` body is not auto-displayed by the notebook,
# so the returned (correlation, rank table) tuple is printed explicitly.
for file, scores in data.items():
    print(f'{file}')
    print(test_ranking(scores))

../../../Paper/202010_Revision/Supplementary Files/Results/immune_cell_hum_mou_summary_scores.csv


(0.9879430032882719,                          Method_id  rank_zsc  rank_init
 0         scGen*_gene_HVG_unscaled         0          0
 4         BBKNN_graph_HVG_unscaled         1          4
 1       Scanorama_embed_HVG_scaled         2          1
 10       scGen*_gene_FULL_unscaled         3         10
 6            MNN_gene_HVG_unscaled         4          6
 ..                             ...       ...        ...
 64   Seurat v3 CCA_gene_HVG_scaled        64         64
 65  Seurat v3 CCA_gene_FULL_scaled        65         65
 66        trVAE_embed_HVG_unscaled        66         66
 67       trVAE_embed_FULL_unscaled        67         67
 68         scGen*_gene_FULL_scaled        68         68
 
 [69 rows x 3 columns])

../../../Paper/202010_Revision/Supplementary Files/Results/immune_cell_hum_summary_scores.csv


(0.9758860065765439,                        Method_id  rank_zsc  rank_init
 0     Scanorama_embed_HVG_scaled         0          0
 1     fastMNN_embed_HVG_unscaled         1          1
 13      scGen*_gene_HVG_unscaled         2         13
 11     fastMNN_gene_HVG_unscaled         3         11
 3   Scanorama_embed_HVG_unscaled         4          3
 ..                           ...       ...        ...
 63     SAUCIE_embed_HVG_unscaled        64         63
 66     SAUCIE_gene_FULL_unscaled        65         66
 65    SAUCIE_embed_FULL_unscaled        66         65
 67        scGen*_gene_HVG_scaled        67         67
 68       scGen*_gene_FULL_scaled        68         68
 
 [69 rows x 3 columns])

../../../Paper/202010_Revision/Supplementary Files/Results/lung_atlas_summary_scores.csv


(0.9640116916331751,                            Method_id  rank_zsc  rank_init
 0           scGen*_gene_HVG_unscaled         0          0
 1          scGen*_gene_FULL_unscaled         1          1
 2   Seurat v3 RPCA_gene_HVG_unscaled         2          2
 3         scANVI*_embed_HVG_unscaled         3          3
 16          BBKNN_graph_HVG_unscaled         4         16
 ..                               ...       ...        ...
 65          SAUCIE_gene_HVG_unscaled        64         65
 64         SAUCIE_embed_HVG_unscaled        65         64
 66          LIGER_embed_HVG_unscaled        66         66
 67         LIGER_embed_FULL_unscaled        67         67
 68    Seurat v3 CCA_gene_FULL_scaled        68         68
 
 [69 rows x 3 columns])

../../../Paper/202010_Revision/Supplementary Files/Results/mouse_brain_summary_scores.csv


(0.995475113122172,                           Method_id  rank_zsc  rank_init
 1          BBKNN_graph_HVG_unscaled         0          1
 0        scANVI*_embed_HVG_unscaled         1          0
 6         BBKNN_graph_FULL_unscaled         2          6
 2            BBKNN_graph_HVG_scaled         3          2
 4          ComBat_gene_HVG_unscaled         4          4
 3           BBKNN_graph_FULL_scaled         5          3
 5        Scanorama_embed_HVG_scaled         6          5
 8         Scanorama_gene_HVG_scaled         7          8
 7           scVI_embed_HVG_unscaled         8          7
 9          LIGER_embed_HVG_unscaled         9          9
 11      Scanorama_gene_HVG_unscaled        10         11
 14           ComBat_gene_HVG_scaled        11         14
 12         Harmony_embed_HVG_scaled        12         12
 10      scANVI*_embed_FULL_unscaled        13         10
 13         fastMNN_embed_HVG_scaled        14         13
 16          fastMNN_gene_HVG_scaled        15       

../../../Paper/202010_Revision/Supplementary Files/Results/pancreas_summary_scores.csv


(0.974643770551699,                            Method_id  rank_zsc  rank_init
 5          Scanorama_gene_HVG_scaled         0          5
 0    Seurat v3 CCA_gene_HVG_unscaled         1          0
 1      Seurat v3 CCA_gene_HVG_scaled         2          1
 2   Seurat v3 RPCA_gene_HVG_unscaled         3          2
 4     Seurat v3 RPCA_gene_HVG_scaled         4          4
 ..                               ...       ...        ...
 63          LIGER_embed_HVG_unscaled        64         63
 65         LIGER_embed_FULL_unscaled        65         65
 67         SAUCIE_gene_FULL_unscaled        66         67
 66        SAUCIE_embed_FULL_unscaled        67         66
 68           scGen*_gene_FULL_scaled        68         68
 
 [69 rows x 3 columns])

../../../Paper/202010_Revision/Supplementary Files/Results/simulations_1_1_summary_scores.csv


(0.9546949214468398,                            Method_id  rank_zsc  rank_init
 5          scGen*_gene_FULL_unscaled         0          5
 4   Seurat v3 CCA_gene_FULL_unscaled         1          4
 0        fastMNN_embed_FULL_unscaled         2          0
 6           scGen*_gene_HVG_unscaled         3          6
 8              MNN_gene_HVG_unscaled         4          8
 ..                               ...       ...        ...
 59           BBKNN_graph_FULL_scaled        64         59
 61          fastMNN_embed_HVG_scaled        65         61
 67         SAUCIE_gene_FULL_unscaled        66         67
 66        SAUCIE_embed_FULL_unscaled        67         66
 68   Unintegrated_gene_FULL_unscaled        68         68
 
 [69 rows x 3 columns])

../../../Paper/202010_Revision/Supplementary Files/Results/simulations_2_summary_scores.csv


(0.9885641213006943,                             Method_id  rank_zsc  rank_init
 0            scGen*_gene_HVG_unscaled         0          0
 5           scGen*_gene_FULL_unscaled         1          5
 3       Seurat v3 CCA_gene_HVG_scaled         2          3
 4     Seurat v3 CCA_gene_HVG_unscaled         3          4
 1          Harmony_embed_HVG_unscaled         4          1
 ..                                ...       ...        ...
 64          SAUCIE_embed_HVG_unscaled        64         64
 65   Seurat v3 RPCA_gene_HVG_unscaled        65         65
 66  Seurat v3 RPCA_gene_FULL_unscaled        66         66
 67     Seurat v3 RPCA_gene_HVG_scaled        67         67
 68    Seurat v3 RPCA_gene_FULL_scaled        68         68
 
 [69 rows x 3 columns])

In [93]:
# NOTE(review): this repeats the loop of the preceding cell. The output recorded
# for it lists files under '../../../Paper/Supplementary Files/Results', i.e. it
# was produced with a different `files` list than the one defined above, and its
# execution count is out of sequence. Consider removing this cell.
# The result is printed explicitly because an expression inside a `for` body is
# not auto-displayed by the notebook.
for file, scores in data.items():
    print(f'{file}')
    print(test_ranking(scores))

../../../Paper/Supplementary Files/Results/immune_cell_hum_mou_summary_scores.csv


(0.9426633110843636,                           Method_id  rank_zsc  rank_init
 4          BBKNN_graph_HVG_unscaled         0          4
 0        Scanorama_embed_HVG_scaled         1          0
 1       Scanorama_embed_FULL_scaled         2          1
 2           scVI_embed_HVG_unscaled         3          2
 7             MNN_gene_HVG_unscaled         4          7
 3          scVI_embed_FULL_unscaled         5          3
 8         Scanorama_gene_HVG_scaled         6          8
 5         ComBat_gene_FULL_unscaled         7          5
 6               MNN_gene_HVG_scaled         8          6
 9            ComBat_gene_HVG_scaled         9          9
 12        BBKNN_graph_FULL_unscaled        10         12
 10             MNN_gene_FULL_scaled        11         10
 11          ComBat_gene_FULL_scaled        12         11
 13     Scanorama_embed_HVG_unscaled        13         13
 15      Scanorama_gene_HVG_unscaled        14         15
 14         ComBat_gene_HVG_unscaled        15      

../../../Paper/Supplementary Files/Results/immune_cell_hum_summary_scores.csv


(0.9426633110843636,                           Method_id  rank_zsc  rank_init
 0        Scanorama_embed_HVG_scaled         0          0
 1          Conos_graph_HVG_unscaled         1          1
 2      Scanorama_embed_HVG_unscaled         2          2
 3       Scanorama_embed_FULL_scaled         3          3
 9               MNN_gene_HVG_scaled         4          9
 10            MNN_gene_HVG_unscaled         5         10
 7         Conos_graph_FULL_unscaled         6          7
 8       Scanorama_gene_HVG_unscaled         7          8
 6          BBKNN_graph_HVG_unscaled         8          6
 4     Scanorama_embed_FULL_unscaled         9          4
 13          Conos_graph_FULL_scaled        10         13
 5        Harmony_embed_HVG_unscaled        11          5
 18             MNN_gene_FULL_scaled        12         18
 16        Scanorama_gene_HVG_scaled        13         16
 11          scVI_embed_HVG_unscaled        14         11
 12        Harmony_embed_FULL_scaled        15      

../../../Paper/Supplementary Files/Results/lung_atlas_summary_scores.csv


(0.9426633110843636,                           Method_id  rank_zsc  rank_init
 1          BBKNN_graph_HVG_unscaled         0          1
 0         Conos_graph_FULL_unscaled         1          0
 3          Conos_graph_HVG_unscaled         2          3
 2        Scanorama_embed_HVG_scaled         3          2
 7         BBKNN_graph_FULL_unscaled         4          7
 4       Scanorama_embed_FULL_scaled         5          4
 9             MNN_gene_HVG_unscaled         6          9
 5      Scanorama_embed_HVG_unscaled         7          5
 10              MNN_gene_HVG_scaled         8         10
 6          scVI_embed_FULL_unscaled         9          6
 13         ComBat_gene_HVG_unscaled        10         13
 22           BBKNN_graph_HVG_scaled        11         22
 8           scVI_embed_HVG_unscaled        12          8
 12     Seurat v3_gene_FULL_unscaled        13         12
 15      Seurat v3_gene_HVG_unscaled        14         15
 18      Scanorama_gene_HVG_unscaled        15      

../../../Paper/Supplementary Files/Results/mouse_brain_summary_scores.csv


(0.9426633110843636,                           Method_id  rank_zsc  rank_init
 0          BBKNN_graph_HVG_unscaled         0          0
 1         BBKNN_graph_FULL_unscaled         1          1
 2            BBKNN_graph_HVG_scaled         2          2
 3           BBKNN_graph_FULL_scaled         3          3
 4          ComBat_gene_HVG_unscaled         4          4
 5           scVI_embed_HVG_unscaled         5          5
 6            ComBat_gene_HVG_scaled         6          6
 7        Scanorama_embed_HVG_scaled         7          7
 12      Scanorama_gene_HVG_unscaled         8         12
 8         Scanorama_gene_HVG_scaled         9          8
 10         Harmony_embed_HVG_scaled        10         10
 9          scVI_embed_FULL_unscaled        11          9
 15     Scanorama_embed_HVG_unscaled        12         15
 14       Harmony_embed_HVG_unscaled        13         14
 11         LIGER_embed_HVG_unscaled        14         11
 13         Conos_graph_HVG_unscaled        15      

../../../Paper/Supplementary Files/Results/pancreas_jointnorm_summary_scores.csv


(0.9426633110843636,                           Method_id  rank_zsc  rank_init
 2           Conos_graph_FULL_scaled         0          2
 3         Seurat v3_gene_HVG_scaled         1          3
 4            Conos_graph_HVG_scaled         2          4
 1       Seurat v3_gene_HVG_unscaled         3          1
 0          BBKNN_graph_HVG_unscaled         4          0
 5      Seurat v3_gene_FULL_unscaled         5          5
 10        Scanorama_gene_HVG_scaled         6         10
 6        Scanorama_embed_HVG_scaled         7          6
 8        Seurat v3_gene_FULL_scaled         8          8
 7         BBKNN_graph_FULL_unscaled         9          7
 12        Conos_graph_FULL_unscaled        10         12
 19           ComBat_gene_HVG_scaled        11         19
 16         Conos_graph_HVG_unscaled        12         16
 14      Scanorama_embed_FULL_scaled        13         14
 9         Harmony_embed_FULL_scaled        14          9
 11         Harmony_embed_HVG_scaled        15      

../../../Paper/Supplementary Files/Results/simulations_1_1_summary_scores.csv


(0.9426633110843636,                           Method_id  rank_zsc  rank_init
 2             MNN_gene_HVG_unscaled         0          2
 3      Seurat v3_gene_FULL_unscaled         1          3
 6       Scanorama_gene_HVG_unscaled         2          6
 4       Scanorama_embed_FULL_scaled         3          4
 0      Scanorama_embed_HVG_unscaled         4          0
 1     Scanorama_embed_FULL_unscaled         5          1
 5        Scanorama_embed_HVG_scaled         6          5
 8       Seurat v3_gene_HVG_unscaled         7          8
 7           Conos_graph_FULL_scaled         8          7
 9         Scanorama_gene_HVG_scaled         9          9
 11              MNN_gene_HVG_scaled        10         11
 14        ComBat_gene_FULL_unscaled        11         14
 13             MNN_gene_FULL_scaled        12         13
 15         ComBat_gene_HVG_unscaled        13         15
 18           ComBat_gene_HVG_scaled        14         18
 12        Seurat v3_gene_HVG_scaled        15      

../../../Paper/Supplementary Files/Results/simulations_2_summary_scores.csv


(0.9426633110843636,                           Method_id  rank_zsc  rank_init
 0         Seurat v3_gene_HVG_scaled         0          0
 1       Seurat v3_gene_HVG_unscaled         1          1
 2          Harmony_embed_HVG_scaled         2          2
 3           Conos_graph_FULL_scaled         3          3
 4        Harmony_embed_HVG_unscaled         4          4
 5         Harmony_embed_FULL_scaled         5          5
 6        Seurat v3_gene_FULL_scaled         6          6
 7      Seurat v3_gene_FULL_unscaled         7          7
 8            Conos_graph_HVG_scaled         8          8
 10       Scanorama_embed_HVG_scaled         9         10
 9          Conos_graph_HVG_unscaled        10          9
 15        Scanorama_gene_HVG_scaled        11         15
 12        BBKNN_graph_FULL_unscaled        12         12
 11         BBKNN_graph_HVG_unscaled        13         11
 18            MNN_gene_HVG_unscaled        14         18
 14          BBKNN_graph_FULL_scaled        15      

## For ATAC:

In [13]:
# ATAC summary-score tables, listed explicitly rather than by listing the ATAC results directory.
files = [
    '../../../Paper/202010_Revision/Supplementary Files/Results/ATAC/mouse_brain_atac_large_11batches_summary_scores.csv',
    '../../../Paper/202010_Revision/Supplementary Files/Results/ATAC/mouse_brain_atac_small_summary_scores.csv',
]

# Read each ATAC table into a DataFrame, keyed by its file path.
data_atac = dict(zip(files, map(pd.read_csv, files)))

In [14]:
# Run the z-score ranking comparison on every ATAC table.
# An expression inside a `for` body is not auto-displayed by the notebook,
# so the returned (correlation, rank table) tuple is printed explicitly.
for file, scores in data_atac.items():
    print(f'{file}')
    print(test_ranking(scores))

../../../Paper/202010_Revision/Supplementary Files/Results/ATAC/mouse_brain_atac_large_11batches_summary_scores.csv


(0.9882189497082526,               Method_id  rank_zsc  rank_init
 10          BBKNN_graph         0         10
 0         Harmony_embed         1          0
 2           ComBat_gene         2          2
 1           ComBat_gene         3          1
 4         Harmony_embed         4          4
 3           LIGER_embed         5          3
 9           BBKNN_graph         6          9
 6     Unintegrated_gene         7          6
 5           LIGER_embed         8          5
 8     Unintegrated_gene         9          8
 7         scANVI*_embed        10          7
 11          scGen*_gene        11         11
 12           scVI_embed        12         12
 26          BBKNN_graph        13         26
 14  Seurat v3 RPCA_gene        14         14
 13           DESC_embed        15         13
 16          ComBat_gene        16         16
 18    Unintegrated_gene        17         18
 15           DESC_embed        18         15
 19   Seurat v3 CCA_gene        19         19
 17        Har

../../../Paper/202010_Revision/Supplementary Files/Results/ATAC/mouse_brain_atac_small_summary_scores.csv


(0.9755487635454295,               Method_id  rank_zsc  rank_init
 1           LIGER_embed         0          1
 2    Seurat v3 CCA_gene         1          2
 3         scANVI*_embed         2          3
 7   Seurat v3 RPCA_gene         3          7
 5            scVI_embed         4          5
 0           BBKNN_graph         5          0
 6         scANVI*_embed         6          6
 13  Seurat v3 RPCA_gene         7         13
 19    Unintegrated_gene         8         19
 11        scANVI*_embed         9         11
 14          scGen*_gene        10         14
 12           scVI_embed        11         12
 8         Harmony_embed        12          8
 9           LIGER_embed        13          9
 10        Harmony_embed        14         10
 20   Seurat v3 CCA_gene        15         20
 18          ComBat_gene        16         18
 21          scGen*_gene        17         21
 17        fastMNN_embed        18         17
 4           BBKNN_graph        19          4
 15           

With z-scoring, BBKNN (for which fewer metrics are computed) would be ranked higher. This is potentially less robust than our ranking.