In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict

In [2]:
!pwd

/Users/malte.luecken/helmholtz_munich/benchmarking_data_integration/Benchmarking_data_integration/notebooks/analysis


In [3]:
files = !ls ../../../Paper/202104_Revision/Supplementary\ Files/Results/*.csv
#files = !ls ../../R/visualization/*.csv

In [4]:
data = {file:pd.read_csv(file) for file in files}

In [5]:
data[files[0]]

Unnamed: 0.1,Unnamed: 0,Method,Output,Features,Scaling,Overall Score,Batch Correction,PCR batch,Batch ASW,graph iLISI,...,Bio conservation,NMI cluster/label,ARI cluster/label,Cell type ASW,isolated label F1,isolated label silhouette,graph cLISI,CC conservation,HVG conservation,trajectory conservation
0,319,scGen*,gene,HVG,unscaled,0.638093,0.546431,0.593345,0.841671,0.106118,...,0.699201,0.904241,0.849627,0.675813,0.155460,0.391633,1.000000,0.491212,0.416649,0.925860
1,211,BBKNN,graph,HVG,unscaled,0.627318,0.858917,,,0.347902,...,0.472918,0.570298,0.493649,,0.248603,,0.788703,,,0.682456
2,235,scVI,embed,HVG,unscaled,0.623057,0.586432,0.870668,0.917481,0.078631,...,0.647474,0.687811,0.544036,0.518036,0.183759,0.457020,0.988365,0.569934,,0.934886
3,43,Scanorama,embed,HVG,scaled,0.616636,0.615550,0.901327,0.955880,0.107809,...,0.617360,0.637543,0.540902,0.501580,0.159606,0.438662,0.967787,0.706393,,0.888212
4,247,scANVI*,embed,HVG,unscaled,0.608263,0.541463,0.799600,0.891219,0.068181,...,0.652797,0.746849,0.612477,0.569162,0.076483,0.441384,0.996387,0.652344,,0.905722
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64,91,Seurat v3 CCA,gene,HVG,scaled,,,,,,...,,,,,,,,,,
65,92,Seurat v3 CCA,gene,FULL,scaled,,,,,,...,,,,,,,,,,
66,175,trVAE,embed,HVG,unscaled,,,,,,...,,,,,,,,,,
67,176,trVAE,embed,FULL,unscaled,,,,,,...,,,,,,,,,,


# Unscaled vs scaled

In [6]:
batch_corr = []
bio_cons = []
overall = []

for file in data:
    data[file] = data[file].loc[~np.isnan(data[file]['Overall Score']),:]
    
    data[file]['method_id'] = ['_'.join(data[file][['Method', 'Output', 'Features']].values[i]) for i in range(data[file].shape[0])]
    
    for meth in data[file]['method_id'].unique():
        tmpDat = data[file][['Scaling', 'Batch Correction', 'Bio conservation', 'Overall Score']].loc[data[file]['method_id'].isin([meth])]
        if tmpDat.shape[0] == 2:
            overall.append(tmpDat['Overall Score'].loc[tmpDat['Scaling'] == 'unscaled'].values[0] > tmpDat['Overall Score'].loc[tmpDat['Scaling'] == 'scaled'].values[0])
            bio_cons.append(tmpDat['Bio conservation'].loc[tmpDat['Scaling'] == 'unscaled'].values[0] > tmpDat['Bio conservation'].loc[tmpDat['Scaling'] == 'scaled'].values[0])
            batch_corr.append(tmpDat['Batch Correction'].loc[tmpDat['Scaling'] == 'unscaled'].values[0] > tmpDat['Batch Correction'].loc[tmpDat['Scaling'] == 'scaled'].values[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [7]:
np.mean(overall)
np.mean(bio_cons)
np.mean(batch_corr)

0.5638297872340425

0.723404255319149

0.30851063829787234

Unscaled scores tend to have a higher bio conservation score, but a lower batch removal score (72% vs 31% of cases).

In [8]:
len(overall)

188

# HVG vs Full feature

In [9]:
batch_corr = []
bio_cons = []
overall = []

for file in data:
    data[file] = data[file].loc[~np.isnan(data[file]['Overall Score']),:]
    
    data[file]['method_id'] = ['_'.join(data[file][['Method', 'Output', 'Scaling']].values[i]) for i in range(data[file].shape[0])]
    
    for meth in data[file]['method_id'].unique():
        tmpDat = data[file][['Features', 'Batch Correction', 'Bio conservation', 'Overall Score']].loc[data[file]['method_id'].isin([meth])]
        if tmpDat.shape[0] == 2:
            overall.append(tmpDat['Overall Score'].loc[tmpDat['Features'] == 'HVG'].values[0] > tmpDat['Overall Score'].loc[tmpDat['Features'] == 'FULL'].values[0])
            bio_cons.append(tmpDat['Bio conservation'].loc[tmpDat['Features'] == 'HVG'].values[0] > tmpDat['Bio conservation'].loc[tmpDat['Features'] == 'FULL'].values[0])
            batch_corr.append(tmpDat['Batch Correction'].loc[tmpDat['Features'] == 'HVG'].values[0] > tmpDat['Batch Correction'].loc[tmpDat['Features'] == 'FULL'].values[0])

In [10]:
np.mean(overall)
np.mean(bio_cons)
np.mean(batch_corr)
len(overall)

0.7203791469194313

0.6587677725118484

0.8199052132701422

211

HVG selection improves overall performance, but specifically batch removal performance.

# Proportion above unintegrated

In [11]:
prop = dict()
method_above = dict()
total_methods = dict()

for file in data:
    data[file] = data[file].loc[~np.isnan(data[file]['Overall Score']),:]
    file_name = file.split('/')[7]

    unint_score = data[file].loc[data[file]['Method'] == 'Unintegrated']['Overall Score'].values[0]
    num_better = (data[file]['Overall Score'] > unint_score).sum()
    method_num = data[file].shape[0]-1
    frac = float(num_better)/method_num

    method_above[file_name] = num_better
    total_methods[file_name] = method_num
    prop[file_name] = frac

    print(f'{file_name}: Frac above unintegrated: {frac}')

immune_cell_hum_mou_summary_scores.csv: Frac above unintegrated: 0.6229508196721312
immune_cell_hum_summary_scores.csv: Frac above unintegrated: 0.8823529411764706
lung_atlas_summary_scores.csv: Frac above unintegrated: 0.7761194029850746
mouse_brain_summary_scores.csv: Frac above unintegrated: 0.8947368421052632
pancreas_summary_scores.csv: Frac above unintegrated: 0.9411764705882353
simulations_1_1_summary_scores.csv: Frac above unintegrated: 1.0
simulations_2_summary_scores.csv: Frac above unintegrated: 0.875


In [12]:
np.sum(list(method_above.values()))
np.sum(list(total_methods.values()))
np.sum(list(method_above.values()))/np.sum(list(total_methods.values()))
np.mean(list(prop.values()))

372

434

0.8571428571428571

0.8560480680753108

On average 86% of outputs are above unintegrated, and the immune cell human mouse task is the hardest, followed by the lung task.

# Method top performer

In [13]:
topMeth = dict()
topMeth_bio = dict()
topMeth_batch = dict()

for file in data:
    data[file] = data[file].loc[~np.isnan(data[file]['Overall Score']),:]
    
    data[file]['method_id'] = ['_'.join(data[file][['Method', 'Output']].values[i]) for i in range(data[file].shape[0])]
    
    for meth in data[file]['method_id'].unique():
        tmpDat = data[file][['Features', 'Scaling', 'Batch Correction', 'Bio conservation', 'Overall Score']].loc[data[file]['method_id'].isin([meth])]

        if meth not in topMeth:
            topMeth[meth] = {'HVG_scaled':0, 'HVG_unscaled':0, 'FULL_scaled':0, 'FULL_unscaled':0}
            topMeth_bio[meth] = {'HVG_scaled':0, 'HVG_unscaled':0, 'FULL_scaled':0, 'FULL_unscaled':0}
            topMeth_batch[meth] = {'HVG_scaled':0, 'HVG_unscaled':0, 'FULL_scaled':0, 'FULL_unscaled':0}

        tmpDat['preproc'] = ['_'.join(tmpDat[['Features', 'Scaling']].values[i]) for i in range(tmpDat.shape[0])]
        
        topVal = tmpDat.sort_values(by='Overall Score', ascending=False)['preproc'].values[0]
        topMeth[meth][topVal] += 1
        
        topVal = tmpDat.sort_values(by='Bio conservation', ascending=False)['preproc'].values[0]
        topMeth_bio[meth][topVal] += 1
        
        topVal = tmpDat.sort_values(by='Batch Correction', ascending=False)['preproc'].values[0]
        topMeth_batch[meth][topVal] += 1        

In [14]:
topMeth

{'scGen*_gene': {'HVG_scaled': 0,
  'HVG_unscaled': 6,
  'FULL_scaled': 0,
  'FULL_unscaled': 0},
 'BBKNN_graph': {'HVG_scaled': 0,
  'HVG_unscaled': 5,
  'FULL_scaled': 0,
  'FULL_unscaled': 2},
 'scVI_embed': {'HVG_scaled': 0,
  'HVG_unscaled': 6,
  'FULL_scaled': 0,
  'FULL_unscaled': 1},
 'Scanorama_embed': {'HVG_scaled': 6,
  'HVG_unscaled': 0,
  'FULL_scaled': 0,
  'FULL_unscaled': 1},
 'scANVI*_embed': {'HVG_scaled': 0,
  'HVG_unscaled': 6,
  'FULL_scaled': 0,
  'FULL_unscaled': 1},
 'MNN_gene': {'HVG_scaled': 3,
  'HVG_unscaled': 3,
  'FULL_scaled': 0,
  'FULL_unscaled': 0},
 'Seurat v3 RPCA_gene': {'HVG_scaled': 0,
  'HVG_unscaled': 4,
  'FULL_scaled': 0,
  'FULL_unscaled': 1},
 'fastMNN_embed': {'HVG_scaled': 2,
  'HVG_unscaled': 4,
  'FULL_scaled': 0,
  'FULL_unscaled': 1},
 'Scanorama_gene': {'HVG_scaled': 5,
  'HVG_unscaled': 1,
  'FULL_scaled': 1,
  'FULL_unscaled': 0},
 'fastMNN_gene': {'HVG_scaled': 2,
  'HVG_unscaled': 4,
  'FULL_scaled': 0,
  'FULL_unscaled': 1},
 'Co

Overall:

clear:
- scGen - HVG unscaled
- Scanorama embed - HVG scaled (full unscaled - sim 1)
- Scanorama gene - HVG scaled (check HVG unscaled - imm hum/mou 5 ranks diff to HVG scaled, Full scaled - sim 1)
- scVI - HVG unscaled (full is nearly same as HVG on imm hum)
- scANVI - HVG unscaled (full is nearly same as HVG on imm hum)
- Seurat RPCA - HVG unscaled (check full unscaled))
- Seurat v3 CCA - HVG unscaled (check HVG scaled - sim 2, nearly same as HVG unscaled; full unscaled - sim 1)
- FastMNN embed - HVG unscaled (check HVG scaled - pancreas, very similar to unscaled; Full unscaled - sim 1)
- FastMNN gene - HVG unscaled (check HVG scaled - pancreas, very similar to unscaled; Full unscaled - sim 1)
- Harmony - HVG unscaled (HVG scaled - mouse brain, sim 2, similar to HVG unscaled score; Full unscaled - sim 1)
- trVAE - unscaled, HVG or full? (sim 1 - full; sim 2 - hvg; panc - hvg; lung - hvg; imm hum - full (similar)) => HVG
- SAUCIE embed & gene - HVG, prob scaled (check HVG unscaled - also 2, full scaled) (sim1 - unscaled, similar; sim2 - scaled; panc - unscaled, similar; lung - scaled; imm hum - scaled; imm hum mou - all fail; mouse brain - scaled) => scaled
- BBKNN - unscaled, HVG or full? (sim 1 -> full; sim2 -> HVG (looks correct); lung - HVG (good); imm hum - full (both similar scores); imm hum mou - HVG (good); mouse brain - HVG; => HVG is better!

partially clear:
- MNN - HVG, unscaled or scaled? (sim 1/2 - unscaled; pancreas - scaled, but similar; lung - scaled; imm hum - scaled; imm hum mou - unscaled) => scaling task dependent
- ComBat - prob HVG, prob unscaled (but HVG scaled most common at 3) => check all (full in sim1 & hum/mou (all look bad); scaled in sim2/panc/imm hum (only panc not similar ranks, but these are easy score); unscaled in lung, brain (but similar in lung)) -> HVG, scaling doesn't really matter (slight task dependence).
- Conos graph - prob full, prob unscaled (also 2 HVG unscaled, and 2 full scaled) (full scaled best in panc and sim 1; full unscaled best in lung; hvg unscaled best in imm hum; unscaled best in imm hum/mou; mouse brain all fail) => adapt preprocessing to task, stronger integration => scaled; mostly full feature though

Completely task dependent:
- DESC - unscaled HVG/full unclear -> task dependent (mostly, it doesn't matter as poor performance overall)
- LIGER - unscaled (default), HVG or full? (sims 1/2 - full; panc - HVG,lung - full, nearly same, failed; imm hum - full (nearly same); imm hum mou - HVG (nearly same); mouse brain - no comparison) => task dependent

# Look at trajectories

In [15]:
data[files[0]].columns

Index(['Unnamed: 0', 'Method', 'Output', 'Features', 'Scaling',
       'Overall Score', 'Batch Correction', 'PCR batch', 'Batch ASW',
       'graph iLISI', 'graph connectivity', 'kBET', 'Bio conservation',
       'NMI cluster/label', 'ARI cluster/label', 'Cell type ASW',
       'isolated label F1', 'isolated label silhouette', 'graph cLISI',
       'CC conservation', 'HVG conservation', 'trajectory conservation',
       'method_id'],
      dtype='object')

In [16]:
for file in data:
    data[file] = data[file].loc[~np.isnan(data[file]['Overall Score']),:]
    
    data[file]['method_id'] = ['_'.join(data[file][['Method', 'Output', 'Features','Scaling']].values[i]) for i in range(data[file].shape[0])]
    
    if 'trajectory conservation' not in data[file].columns:
        continue

    print(file)
    print('top performers:')
    data[file].sort_values(by='trajectory conservation', ascending=False)[['method_id', 'trajectory conservation']].reset_index().loc[:12]
    print('bottom performers:')
    data[file].sort_values(by='trajectory conservation', ascending=True)[['method_id', 'trajectory conservation']].reset_index().loc[:12]
    print('\n')

../../../Paper/202104_Revision/Supplementary Files/Results/immune_cell_hum_mou_summary_scores.csv
top performers:


Unnamed: 0,index,method_id,trajectory conservation
0,25,Scanorama_gene_FULL_unscaled,0.9778
1,9,Scanorama_gene_HVG_unscaled,0.977264
2,21,Scanorama_embed_FULL_unscaled,0.976953
3,11,Scanorama_embed_HVG_unscaled,0.972919
4,7,scVI_embed_FULL_unscaled,0.938736
5,2,scVI_embed_HVG_unscaled,0.934886
6,14,scANVI*_embed_FULL_unscaled,0.932522
7,0,scGen*_gene_HVG_unscaled,0.92586
8,20,MNN_gene_FULL_scaled,0.92407
9,13,Seurat v3 RPCA_gene_HVG_unscaled,0.923949


bottom performers:


Unnamed: 0,index,method_id,trajectory conservation
0,53,DESC_embed_HVG_unscaled,0.169114
1,29,MNN_gene_HVG_scaled,0.261818
2,28,ComBat_gene_HVG_unscaled,0.309337
3,37,Harmony_embed_HVG_scaled,0.345588
4,56,SAUCIE_embed_HVG_scaled,0.388484
5,57,SAUCIE_gene_HVG_scaled,0.389901
6,43,Conos_graph_FULL_unscaled,0.392952
7,50,DESC_embed_HVG_scaled,0.394056
8,16,Scanorama_gene_FULL_scaled,0.411472
9,58,SAUCIE_gene_HVG_unscaled,0.434621




../../../Paper/202104_Revision/Supplementary Files/Results/immune_cell_hum_summary_scores.csv
top performers:


Unnamed: 0,index,method_id,trajectory conservation
0,16,scGen*_gene_HVG_unscaled,0.882649
1,60,Unintegrated_gene_FULL_unscaled,0.880672
2,31,scGen*_gene_FULL_unscaled,0.877364
3,24,Scanorama_embed_FULL_unscaled,0.876981
4,28,fastMNN_gene_HVG_scaled,0.876832
5,17,Seurat v3 RPCA_gene_HVG_unscaled,0.875649
6,9,Conos_graph_FULL_unscaled,0.87332
7,46,MNN_gene_FULL_unscaled,0.86986
8,14,fastMNN_embed_FULL_unscaled,0.865914
9,30,fastMNN_gene_FULL_unscaled,0.86568


bottom performers:


Unnamed: 0,index,method_id,trajectory conservation
0,68,scGen*_gene_FULL_scaled,0.0
1,61,DESC_embed_HVG_scaled,0.0
2,43,Seurat v3 CCA_gene_HVG_scaled,0.254477
3,54,LIGER_embed_HVG_unscaled,0.358585
4,56,DESC_embed_HVG_unscaled,0.385665
5,66,SAUCIE_gene_FULL_unscaled,0.400054
6,67,scGen*_gene_HVG_scaled,0.452327
7,64,SAUCIE_gene_HVG_unscaled,0.47148
8,63,SAUCIE_embed_HVG_unscaled,0.473584
9,32,Scanorama_gene_FULL_scaled,0.524972






Hum-Mou:
- Scanorama performs well mainly on unscaled for human mouse
- scVI and cell-informed DL methods otherwise work well
- poor performance from scaled methods, those that perform poor integration (SAUCIE, Conos), and DESC, which assumes clustered data

Hum:
- unscaled performs best, mostly Full gene sets
- Scanorama, scGen, FastMNN generally perform well -> methods that focus more on bio conservation, but also Seurat v3 RPCA
- poor conservation from DL methods that did not converge (scGen scaled), methods that do not capture the bio signal complexity (SAUCIE, LIGER), and DESC that assumes clustered data


Overall:
- ScGen, Scanorama, FastMNN perform well, but DESC, SAUCIE do not. 
- In general unscaled data performs best, and tendency towards full gene sets especially when less batch correction is required in simpler task (human). 
- On simpler task most methods perform well.

# CC conservation

In [17]:
topVer = dict()

for file in data:
    if 'CC conservation' not in data[file].columns:
        continue
    
    data[file] = data[file].loc[~np.isnan(data[file]['Overall Score']),:]
    
    data[file]['method_id'] = ['_'.join(data[file][['Method', 'Output']].values[i]) for i in range(data[file].shape[0])]
    
    for meth in data[file]['method_id'].unique():
        tmpDat = data[file][['Features', 'Scaling', 'CC conservation']].loc[data[file]['method_id'].isin([meth])]

        if tmpDat.shape[0] < 2:
            continue
        
        if meth not in topVer:
            topVer[meth] = {'HVG_scaled':0, 'HVG_unscaled':0, 'FULL_scaled':0, 'FULL_unscaled':0}

        tmpDat['preproc'] = ['_'.join(tmpDat[['Features', 'Scaling']].values[i]) for i in range(tmpDat.shape[0])]
        
        topVal = tmpDat.sort_values(by='CC conservation', ascending=False)['preproc'].values[0]
        topVer[meth][topVal] += 1 

In [18]:
topVer

{'scGen*_gene': {'HVG_scaled': 0,
  'HVG_unscaled': 1,
  'FULL_scaled': 1,
  'FULL_unscaled': 2},
 'BBKNN_graph': {'HVG_scaled': 0,
  'HVG_unscaled': 4,
  'FULL_scaled': 0,
  'FULL_unscaled': 1},
 'scVI_embed': {'HVG_scaled': 0,
  'HVG_unscaled': 3,
  'FULL_scaled': 0,
  'FULL_unscaled': 2},
 'Scanorama_embed': {'HVG_scaled': 2,
  'HVG_unscaled': 1,
  'FULL_scaled': 0,
  'FULL_unscaled': 2},
 'scANVI*_embed': {'HVG_scaled': 0,
  'HVG_unscaled': 3,
  'FULL_scaled': 0,
  'FULL_unscaled': 2},
 'MNN_gene': {'HVG_scaled': 0,
  'HVG_unscaled': 0,
  'FULL_scaled': 1,
  'FULL_unscaled': 3},
 'Seurat v3 RPCA_gene': {'HVG_scaled': 0,
  'HVG_unscaled': 1,
  'FULL_scaled': 2,
  'FULL_unscaled': 1},
 'fastMNN_embed': {'HVG_scaled': 1,
  'HVG_unscaled': 2,
  'FULL_scaled': 0,
  'FULL_unscaled': 2},
 'Scanorama_gene': {'HVG_scaled': 1,
  'HVG_unscaled': 0,
  'FULL_scaled': 1,
  'FULL_unscaled': 3},
 'fastMNN_gene': {'HVG_scaled': 1,
  'HVG_unscaled': 2,
  'FULL_scaled': 0,
  'FULL_unscaled': 2},
 'Co

In [19]:
top8Meth = defaultdict(int)
bot8Meth = defaultdict(int)

for file in data:
    data[file] = data[file].loc[~np.isnan(data[file]['Overall Score']),:]
    
    data[file]['method_id'] = ['_'.join(data[file][['Method', 'Output', 'Features','Scaling']].values[i]) for i in range(data[file].shape[0])]
    
    if 'CC conservation' not in data[file].columns:
        continue

    for meth in data[file].sort_values(by='CC conservation', ascending=False)['method_id'].values[:8]:
        top8Meth[meth] += 1

    for meth in data[file].sort_values(by='CC conservation', ascending=True)['method_id'].values[:8]:
        bot8Meth[meth] += 1

In [20]:
top8Meth
bot8Meth

defaultdict(int,
            {'Unintegrated_gene_FULL_unscaled': 4,
             'MNN_gene_FULL_unscaled': 3,
             'ComBat_gene_FULL_unscaled': 4,
             'Scanorama_gene_FULL_unscaled': 3,
             'MNN_gene_FULL_scaled': 2,
             'MNN_gene_HVG_unscaled': 2,
             'ComBat_gene_FULL_scaled': 1,
             'MNN_gene_HVG_scaled': 1,
             'Harmony_embed_FULL_scaled': 1,
             'Harmony_embed_HVG_unscaled': 1,
             'DESC_embed_FULL_unscaled': 2,
             'Scanorama_gene_HVG_unscaled': 1,
             'Seurat v3 RPCA_gene_HVG_unscaled': 1,
             'SAUCIE_gene_HVG_scaled': 2,
             'SAUCIE_embed_HVG_scaled': 2,
             'Conos_graph_HVG_unscaled': 1,
             'SAUCIE_gene_HVG_unscaled': 1,
             'SAUCIE_embed_HVG_unscaled': 1,
             'DESC_embed_HVG_unscaled': 1,
             'DESC_embed_HVG_scaled': 1,
             'fastMNN_gene_FULL_unscaled': 1,
             'fastMNN_embed_FULL_unscaled': 1,
     

defaultdict(int,
            {'scGen*_gene_HVG_scaled': 2,
             'SAUCIE_embed_FULL_scaled': 2,
             'SAUCIE_gene_FULL_scaled': 2,
             'scGen*_gene_FULL_unscaled': 2,
             'SAUCIE_embed_HVG_scaled': 1,
             'SAUCIE_gene_HVG_scaled': 1,
             'LIGER_embed_FULL_unscaled': 2,
             'LIGER_embed_HVG_unscaled': 3,
             'trVAE_embed_HVG_unscaled': 1,
             'fastMNN_embed_HVG_scaled': 1,
             'fastMNN_gene_HVG_scaled': 1,
             'SAUCIE_gene_HVG_unscaled': 2,
             'SAUCIE_embed_HVG_unscaled': 2,
             'SAUCIE_embed_FULL_unscaled': 1,
             'fastMNN_gene_FULL_scaled': 2,
             'fastMNN_embed_FULL_scaled': 2,
             'ComBat_gene_HVG_scaled': 1,
             'BBKNN_graph_HVG_scaled': 1,
             'Scanorama_gene_FULL_unscaled': 1,
             'Scanorama_embed_FULL_unscaled': 1,
             'BBKNN_graph_FULL_scaled': 1,
             'scANVI*_embed_FULL_unscaled': 1,
         

Full, and especially unscaled data typically conserve more CC variance.

Scanorama (gene), Combat, and MNN perform consistently well

LIGER, SAUCIE, but also ScGen with sub-optimal preprocessing perform poorly

# HVG conservation

In [21]:
topVer = dict()

for file in data:
    if 'HVG conservation' not in data[file].columns:
        continue
    
    tmp = data[file].loc[~np.isnan(data[file]['HVG conservation']),:].copy()
    
    tmp['method_id'] = ['_'.join(tmp[['Method', 'Output']].values[i]) for i in range(tmp.shape[0])]
    
    for meth in tmp['method_id'].unique():
        tmpDat = tmp[['Features', 'Scaling', 'HVG conservation']].loc[tmp['method_id'].isin([meth])]

        if tmpDat.shape[0] < 2:
            continue
        
        if meth not in topVer:
            topVer[meth] = {'HVG_scaled':0, 'HVG_unscaled':0, 'FULL_scaled':0, 'FULL_unscaled':0}

        tmpDat['preproc'] = ['_'.join(tmpDat[['Features', 'Scaling']].values[i]) for i in range(tmpDat.shape[0])]
        
        topVal = tmpDat.sort_values(by='HVG conservation', ascending=False)['preproc'].values[0]
        topVer[meth][topVal] += 1 

In [22]:
topVer

{'scGen*_gene': {'HVG_scaled': 0,
  'HVG_unscaled': 6,
  'FULL_scaled': 0,
  'FULL_unscaled': 0},
 'MNN_gene': {'HVG_scaled': 0,
  'HVG_unscaled': 6,
  'FULL_scaled': 0,
  'FULL_unscaled': 0},
 'Seurat v3 RPCA_gene': {'HVG_scaled': 0,
  'HVG_unscaled': 5,
  'FULL_scaled': 0,
  'FULL_unscaled': 0},
 'Scanorama_gene': {'HVG_scaled': 0,
  'HVG_unscaled': 6,
  'FULL_scaled': 0,
  'FULL_unscaled': 1},
 'fastMNN_gene': {'HVG_scaled': 1,
  'HVG_unscaled': 6,
  'FULL_scaled': 0,
  'FULL_unscaled': 0},
 'ComBat_gene': {'HVG_scaled': 0,
  'HVG_unscaled': 7,
  'FULL_scaled': 0,
  'FULL_unscaled': 0},
 'SAUCIE_gene': {'HVG_scaled': 1,
  'HVG_unscaled': 6,
  'FULL_scaled': 0,
  'FULL_unscaled': 0},
 'Seurat v3 CCA_gene': {'HVG_scaled': 0,
  'HVG_unscaled': 5,
  'FULL_scaled': 0,
  'FULL_unscaled': 0}}

In [23]:
top5Meth = defaultdict(int)
bot5Meth = defaultdict(int)

for file in data:
    tmp = data[file].loc[~np.isnan(data[file]['HVG conservation']),:].copy()
    
    tmp['method_id'] = ['_'.join(tmp[['Method', 'Output', 'Features','Scaling']].values[i]) for i in range(tmp.shape[0])]
    
    if 'HVG conservation' not in tmp.columns:
        continue

    for meth in tmp.sort_values(by='HVG conservation', ascending=False)['method_id'].values[:5]:
        top5Meth[meth] += 1

    for meth in tmp.sort_values(by='HVG conservation', ascending=True)['method_id'].values[:5]:
        bot5Meth[meth] += 1

In [24]:
top5Meth
bot5Meth

defaultdict(int,
            {'Unintegrated_gene_FULL_unscaled': 6,
             'scGen*_gene_HVG_unscaled': 3,
             'Seurat v3 RPCA_gene_HVG_unscaled': 4,
             'Scanorama_gene_HVG_unscaled': 3,
             'ComBat_gene_HVG_unscaled': 5,
             'Seurat v3 CCA_gene_HVG_unscaled': 4,
             'SAUCIE_gene_HVG_unscaled': 1,
             'Seurat v3 CCA_gene_FULL_unscaled': 2,
             'ComBat_gene_FULL_unscaled': 1,
             'scGen*_gene_FULL_unscaled': 1,
             'Scanorama_gene_HVG_scaled': 1,
             'SAUCIE_gene_HVG_scaled': 1,
             'fastMNN_gene_HVG_scaled': 1,
             'MNN_gene_HVG_unscaled': 1,
             'Scanorama_gene_FULL_unscaled': 1})

defaultdict(int,
            {'ComBat_gene_FULL_scaled': 6,
             'MNN_gene_FULL_scaled': 6,
             'SAUCIE_gene_FULL_scaled': 3,
             'fastMNN_gene_FULL_scaled': 1,
             'Scanorama_gene_FULL_scaled': 3,
             'Seurat v3 CCA_gene_FULL_scaled': 3,
             'Seurat v3 RPCA_gene_FULL_scaled': 4,
             'fastMNN_gene_HVG_unscaled': 1,
             'ComBat_gene_HVG_scaled': 1,
             'SAUCIE_gene_HVG_unscaled': 1,
             'fastMNN_gene_HVG_scaled': 1,
             'SAUCIE_gene_HVG_scaled': 1,
             'Scanorama_gene_FULL_unscaled': 1,
             'scGen*_gene_FULL_scaled': 2,
             'SAUCIE_gene_FULL_unscaled': 1})

Generally performs best on HVG unscaled, worst on full, scaled data

Scanorama, ComBat, Seurat v3 (both CCA and RPCA) perform best

worst performance on full scaled preprocessing

In [25]:
files

['../../../Paper/202104_Revision/Supplementary Files/Results/immune_cell_hum_mou_summary_scores.csv',
 '../../../Paper/202104_Revision/Supplementary Files/Results/immune_cell_hum_summary_scores.csv',
 '../../../Paper/202104_Revision/Supplementary Files/Results/lung_atlas_summary_scores.csv',
 '../../../Paper/202104_Revision/Supplementary Files/Results/mouse_brain_summary_scores.csv',
 '../../../Paper/202104_Revision/Supplementary Files/Results/pancreas_summary_scores.csv',
 '../../../Paper/202104_Revision/Supplementary Files/Results/simulations_1_1_summary_scores.csv',
 '../../../Paper/202104_Revision/Supplementary Files/Results/simulations_2_summary_scores.csv']

In [26]:
data['../../../Paper/202104_Revision/Supplementary Files/Results/pancreas_summary_scores.csv'].sort_values(by='Overall Score', ascending=False)[['Method', 'Output', 'Features','Scaling', 'Bio conservation']]

Unnamed: 0,Method,Output,Features,Scaling,Bio conservation
0,Seurat v3 CCA,gene,HVG,unscaled,0.655820
1,Seurat v3 CCA,gene,HVG,scaled,0.681818
2,Seurat v3 RPCA,gene,HVG,unscaled,0.647876
3,scGen*,gene,HVG,unscaled,0.677264
4,Harmony,embed,HVG,unscaled,0.700046
...,...,...,...,...,...
64,Unintegrated,gene,FULL,unscaled,0.535144
65,SAUCIE,gene,FULL,scaled,0.327244
66,SAUCIE,embed,FULL,unscaled,0.305845
67,SAUCIE,gene,FULL,unscaled,0.286527
