In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os

In [2]:
# Root of the project tree. It already ends with '/', and the f-strings below
# add another '/', so the resulting paths contain '//'.
basedir = '/u/home/t/terencew/project-cluo/igvf/pilot/multiome/'

# Donor and sample names, one entry per line of the text files.
donors = list(np.loadtxt(f'{basedir}/txt/donors.txt', dtype=str))
samples = list(np.loadtxt(f'{basedir}/txt/samples.txt', dtype=str))
# Only the first sample is processed by the cells below.
s = samples[0]

# Method names per modality; each name is also the stem of the
# '<method>.csv' result file that the merge cells read.
gex_methods = pd.Index(['demuxlet', 'freemuxlet',  'vireo', 'vireo_nogenos',
    'souporcell', 'souporcell_nogenos', 'scsplit', 'scsplit_nogenos', 'demuxalot'])
atac_methods = pd.Index(['demuxlet', 'freemuxlet',  'vireo', 'vireo_nogenos',
    'souporcell', 'souporcell_nogenos', 'scsplit', 'scsplit_nogenos', 'scavengers'])

# Methods whose 'donor_id' labels are passed through map_target_to_anchor by
# the merge cells (presumably because they emit anonymous cluster ids rather
# than donor names -- TODO confirm).
free_methods = pd.Index(['freemuxlet', 'vireo_nogenos', 'souporcell',
                'souporcell_nogenos', 'scsplit', 'scsplit_nogenos', 'scavengers'])

# Experiment names iterated by the merge cells.
# NOTE(review): these lists are read from cov_doub_test/ and cov_mux_test/,
# while the merge cells read their data from prop_doub/ and mux_test/ --
# confirm the experiment names are the same in both trees.
doub_experiments = np.loadtxt(f'{basedir}/ambient/ambisim/cov_doub_test/txt/experiments.txt', dtype=str)
mux_experiments = np.loadtxt(f'{basedir}/ambient/ambisim/cov_mux_test/txt/experiments.txt', dtype=str)

In [3]:
def map_target_to_anchor(target, anchor, donors, clusters):
    """Rename anonymous cluster ids in ``target`` to donor names taken from ``anchor``.

    For every cluster id ``'0'`` .. ``str(clusters - 1)`` the barcodes carrying
    that id in ``target`` are looked up in ``anchor``. The most frequent anchor
    label that is a member of ``donors`` becomes the cluster's new name (ties
    resolve to the first value returned by ``Series.mode``, i.e. the smallest).
    A cluster whose barcodes carry no anchor label from ``donors`` (for example
    only doublet/unassigned labels) is renamed ``'unassigned'``. Cluster ids
    that do not occur in ``target`` are skipped, and values of ``target`` that
    are not one of the cluster ids are left as they are.

    Parameters
    ----------
    target : pandas.Series
        Cluster labels stored as strings, indexed by barcode. It is modified
        in place.
    anchor : pandas.Series
        Reference labels; every index label of ``target`` must be present.
    donors : list or array of str
        Labels of ``anchor`` that count as real donors.
    clusters : int
        Number of cluster ids to consider, starting at 0.

    Returns
    -------
    pandas.Series
        The same ``target`` object, relabelled.
    """
    # Decide every cluster's new name from the ORIGINAL labels before writing
    # anything back. Renaming one cluster at a time would re-match a donor
    # name that looks like a later cluster id (e.g. donor '1') and merge
    # clusters by accident.
    assignments = []
    for i in range(clusters):
        members = target == str(i)
        if not members.any():
            continue
        anchor_labels = anchor.loc[target.index[members]]
        ### get best assigned donor, no unassigned/doublet
        known = anchor_labels[anchor_labels.isin(donors)]
        if known.empty:
            maj_donor = 'unassigned'
        else:
            maj_donor = known.mode().values[0]
        assignments.append((members, f'{maj_donor}'))

    # The masks were computed up front and are disjoint, so the write order
    # does not matter.
    for members, maj_donor in assignments:
        target[members] = maj_donor
    return target

In [4]:
def get_truth_df(indir, crarc_dir):
    """Load the simulation truth table for one sample.

    Reads ``{indir}/drop_data_rand.txt`` (tab-separated, first column used as
    the index), appends ``-1`` to every index label, keeps only the rows whose
    label is listed in
    ``{crarc_dir}/outs/filtered_feature_bc_matrix/barcodes.tsv.gz`` and sorts
    by index.

    Parameters
    ----------
    indir : str
        Directory containing ``drop_data_rand.txt``.
    crarc_dir : str
        Directory containing ``outs/filtered_feature_bc_matrix/barcodes.tsv.gz``.

    Returns
    -------
    pandas.DataFrame
        Indexed by barcode, with columns

        * ``donor_id`` -- the ``sam`` column; any value containing a comma is
          replaced by ``'doublet'``, missing values are left missing,
        * ``ambient_RNA`` -- ``rna_nr_a / (rna_nr_c + rna_nr_a)``,
        * ``ambient_DNA`` -- ``atac_nr_a / (atac_nr_c + atac_nr_a)``,
        * ``ambient_DNA_peaks`` -- ``atac_nr_ap / (atac_nr_cp + atac_nr_ap)``,
        * ``method`` -- the constant ``'truth'``.
    """
    truth = pd.read_csv(f'{indir}/drop_data_rand.txt', sep='\t', header=0, index_col=0, low_memory=False)
    barcodes = pd.read_csv(f'{crarc_dir}/outs/filtered_feature_bc_matrix/barcodes.tsv.gz',
                           sep='\t', header=None, index_col=None)

    # Assigning a plain list (rather than transforming the Index) also drops
    # the index name, which keeps the index header empty when written to CSV.
    truth.index = [f'{x}-1' for x in truth.index]
    # sort_index() returns a fresh frame, so the writes below do not act on a
    # slice of the unfiltered table.
    truth = truth[truth.index.isin(barcodes[0])].sort_index()

    ### a comma-separated sample entry is collapsed to 'doublet'
    # astype(str) / na=False keep a missing or non-string entry from raising;
    # such rows are simply not treated as doublets.
    doub_mask = truth['sam'].astype(str).str.contains(',', regex=False, na=False)
    truth.loc[doub_mask, 'sam'] = 'doublet'

    truth['ambient_RNA'] = truth['rna_nr_a'] / (truth['rna_nr_c'] + truth['rna_nr_a'])
    truth['ambient_DNA'] = truth['atac_nr_a'] / (truth['atac_nr_c'] + truth['atac_nr_a'])
    truth['ambient_DNA_peaks'] = truth['atac_nr_ap'] / (truth['atac_nr_cp'] + truth['atac_nr_ap'])
    truth['method'] = 'truth'

    keep = ['sam', 'ambient_RNA', 'ambient_DNA', 'ambient_DNA_peaks', 'method']
    return truth[keep].rename(columns={'sam': 'donor_id'})

In [5]:
# gex_to_drop = ['scsplit', 'scsplit_nogenos', 'demuxalot']
# atac_to_drop = ['scsplit', 'scsplit_nogenos', 'scavengers']

# tmp_gex_methods = gex_methods.drop(gex_to_drop)
# tmp_atac_methods = atac_methods.drop(atac_to_drop)

In [7]:
# Load the truth table for the single experiment '2_0' of the cov_mux_test tree.
# NOTE(review): the next cell repeats every statement of this cell, so this
# cell looks redundant.
projdir = f'{basedir}/ambient/ambisim/cov_mux_test/'
exp = '2_0'
# The experiment name is '<n_donors>_<something>'; only the first field is used.
n_donors = int(exp.split('_')[0])
# Overwrites the global donor list loaded in the setup cell.
donors = np.loadtxt(f'{projdir}/txt/{n_donors}_donors.txt', dtype=str)

# `s` is the first sample name, set in the setup cell.
crarc_dir = f'{projdir}/{exp}/cr_arc/{s}'
truth_dir = f'{projdir}/{exp}/{s}'
truth = get_truth_df(truth_dir, crarc_dir)

In [8]:
### Single-experiment check: merge the truth table and every GEX method's
### calls for experiment '2_0' of cov_mux_test into one long table.
projdir = f'{basedir}/ambient/ambisim/cov_mux_test/'
s = samples[0]
mod = 'gex'

###
exp = '2_0'
n_donors = int(exp.split('_')[0])
donors = np.loadtxt(f'{projdir}/txt/{n_donors}_donors.txt', dtype=str)

crarc_dir = f'{projdir}/{exp}/cr_arc/{s}'
truth_dir = f'{projdir}/{exp}/{s}'
truth = get_truth_df(truth_dir, crarc_dir)

### collect one frame per method and concatenate once at the end
mux_gex_frames = [truth]

indir = f'{projdir}/{exp}/demux/merged/{mod}/{s}'
for method in gex_methods:
    print(method)
    demux = pd.read_csv(f'{indir}/{method}.csv', sep='\t', header=0, index_col=0).sort_index()
    # Every method must report exactly the truth barcodes, in the same order.
    assert demux.index.equals(truth.index), f'{method}: barcodes differ from truth'

    if method in free_methods:
        # Normalise labels to strings (dropping a trailing '.0' from float-coded
        # ids), as the multi-experiment cells do; map_target_to_anchor compares
        # against '0', '1', ... and would never match integer ids.
        raw_labels = [str(x) for x in demux['donor_id']]
        tmp_free = pd.Series(index=demux.index,
                             data=[x[:-2] if x.endswith('.0') else x for x in raw_labels])
        truth_id = pd.Series(index=truth.index, data=[str(x) for x in truth['donor_id']])
        # scsplit is given one cluster more than there are donors.
        n_clusters = n_donors + 1 if 'scsplit' in method else n_donors
        demux['donor_id'] = map_target_to_anchor(tmp_free, truth_id, donors, n_clusters)

    ambient = truth.reindex(demux.index)
    # .copy() so the column assignments below do not write into a slice.
    demux = demux[['donor_id']].copy()
    for ambient_col in ['ambient_RNA', 'ambient_DNA', 'ambient_DNA_peaks']:
        demux[ambient_col] = ambient[ambient_col]
    demux['method'] = method
    mux_gex_frames.append(demux)

mux_gex = pd.concat(mux_gex_frames)

mux_gex.shape

demuxlet
freemuxlet
vireo
vireo_nogenos
souporcell
souporcell_nogenos
scsplit
scsplit_nogenos
demuxalot


(89990, 5)

In [9]:
# mux_gex[mux_gex['method'] == 'scsplit']['donor_id'].value_counts()

### Methods with no output file are filled with zeros (placeholder `donor_id` = 0)

In [10]:
### prop_doub experiments, GEX: stack the truth table and every method's
### calls for each experiment into one long table (`doub_gex`).
projdir = f'{basedir}/ambient/ambisim/prop_doub/'
s = samples[0]
mod = 'gex'

n_donors = 4
donors = list(np.loadtxt(f'{basedir}/txt/donors.txt', dtype=str))

# One frame per experiment, concatenated once after the loop instead of
# growing a DataFrame with pd.concat on every iteration.
doub_gex_frames = []

for exp in doub_experiments:
    print(exp)

    ###
    crarc_dir = f'{projdir}/{exp}/cr_arc/{s}'
    truth_dir = f'{projdir}/{exp}/{s}'
    truth = get_truth_df(truth_dir, crarc_dir)

    ### placeholder used when a method produced no output: donor_id = 0 for every barcode
    zero_df = pd.DataFrame(columns=['donor_id'], index=truth.index, data=0)

    exp_frames = [truth]

    ###
    indir = f'{projdir}/{exp}/demux/merged/{mod}/{s}'
    for method in gex_methods:
        try:
            demux = pd.read_csv(f'{indir}/{method}.csv', sep='\t', header=0, index_col=0).sort_index()
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # Only a missing or empty result file is replaced by the
            # placeholder; any other error is allowed to surface.
            print(f'{method} not found')
            demux = zero_df.copy()

        demux = demux.reindex(truth.index)
        assert demux.index.equals(truth.index)

        # Skip remapping when all barcodes share one label (this is the case
        # for the zero placeholder) or when there is no label at all.
        # .iloc[0] is the largest count; a plain [0] would be a label lookup
        # for integer labels and a positional one for string labels.
        label_counts = demux['donor_id'].value_counts()
        single_label = label_counts.empty or label_counts.iloc[0] == demux.shape[0]
        if method in free_methods and not single_label:
            # reindex() can turn integer ids into floats; drop only a trailing '.0'.
            raw_labels = [str(x) for x in demux['donor_id']]
            tmp_free = pd.Series(index=demux.index,
                                 data=[x[:-2] if x.endswith('.0') else x for x in raw_labels])
            truth_id = pd.Series(index=truth.index, data=[str(x) for x in truth['donor_id']])
            # scsplit is given one cluster more than there are donors.
            n_clusters = n_donors + 1 if 'scsplit' in method else n_donors
            demux['donor_id'] = map_target_to_anchor(tmp_free, truth_id, donors, n_clusters)

        ambient = truth.reindex(demux.index)
        # .copy() so the column assignments below do not write into a slice.
        demux = demux[['donor_id']].copy()
        for ambient_col in ['ambient_RNA', 'ambient_DNA', 'ambient_DNA_peaks']:
            demux[ambient_col] = ambient[ambient_col]
        demux['method'] = method
        exp_frames.append(demux)

    tmp_doub_gex = pd.concat(exp_frames)
    tmp_doub_gex['exp'] = exp
    doub_gex_frames.append(tmp_doub_gex)

doub_gex = pd.concat(doub_gex_frames) if doub_gex_frames else pd.DataFrame()

0_0
0_10
0_20
0_30
10_0
10_10
10_20
10_30
20_0
20_10
20_20
20_30
30_0
30_10
30_20
30_30


In [11]:
### prop_doub experiments, ATAC: stack the truth table and every method's
### calls for each experiment into one long table (`doub_atac`).
projdir = f'{basedir}/ambient/ambisim/prop_doub/'
s = samples[0]
mod = 'atac'

n_donors = 4
donors = list(np.loadtxt(f'{basedir}/txt/donors.txt', dtype=str))

# One frame per experiment, concatenated once after the loop instead of
# growing a DataFrame with pd.concat on every iteration.
doub_atac_frames = []

for exp in doub_experiments:
    print(exp)

    ###
    crarc_dir = f'{projdir}/{exp}/cr_arc/{s}'
    truth_dir = f'{projdir}/{exp}/{s}'
    truth = get_truth_df(truth_dir, crarc_dir)

    ### placeholder used when a method produced no output: donor_id = 0 for every barcode
    zero_df = pd.DataFrame(columns=['donor_id'], index=truth.index, data=0)

    exp_frames = [truth]

    ###
    indir = f'{projdir}/{exp}/demux/merged/{mod}/{s}'
    for method in atac_methods:
        try:
            demux = pd.read_csv(f'{indir}/{method}.csv', sep='\t', header=0, index_col=0).sort_index()
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # Only a missing or empty result file is replaced by the
            # placeholder; any other error is allowed to surface.
            print(f'{method} not found')
            demux = zero_df.copy()

        demux = demux.reindex(truth.index)
        assert demux.index.equals(truth.index)

        # Skip remapping when all barcodes share one label (this is the case
        # for the zero placeholder) or when there is no label at all.
        # .iloc[0] is the largest count; a plain [0] would be a label lookup
        # for integer labels and a positional one for string labels.
        label_counts = demux['donor_id'].value_counts()
        single_label = label_counts.empty or label_counts.iloc[0] == demux.shape[0]
        if method in free_methods and not single_label:
            # reindex() can turn integer ids into floats; drop only a trailing '.0'.
            raw_labels = [str(x) for x in demux['donor_id']]
            tmp_free = pd.Series(index=demux.index,
                                 data=[x[:-2] if x.endswith('.0') else x for x in raw_labels])
            truth_id = pd.Series(index=truth.index, data=[str(x) for x in truth['donor_id']])
            # scsplit is given one cluster more than there are donors.
            n_clusters = n_donors + 1 if 'scsplit' in method else n_donors
            demux['donor_id'] = map_target_to_anchor(tmp_free, truth_id, donors, n_clusters)

        ambient = truth.reindex(demux.index)
        # .copy() so the column assignments below do not write into a slice.
        demux = demux[['donor_id']].copy()
        for ambient_col in ['ambient_RNA', 'ambient_DNA', 'ambient_DNA_peaks']:
            demux[ambient_col] = ambient[ambient_col]
        demux['method'] = method
        exp_frames.append(demux)

    tmp_doub_atac = pd.concat(exp_frames)
    tmp_doub_atac['exp'] = exp
    doub_atac_frames.append(tmp_doub_atac)

doub_atac = pd.concat(doub_atac_frames) if doub_atac_frames else pd.DataFrame()

0_0
0_10
0_20
0_30
10_0
10_10
10_20
10_30
20_0
20_10
20_20
20_30
30_0
30_10
30_20
30_30


In [12]:
### mux_test experiments, GEX: stack the truth table and every method's
### calls for each experiment into one long table (`mux_gex`).
projdir = f'{basedir}/ambient/ambisim/mux_test/'
# The donor list depends on the experiment and is loaded inside the loop.
# (An earlier pre-loop load used `n_donors` left over from the previous cell
# and was overwritten on the first iteration, so it has been dropped.)

s = samples[0]
mod = 'gex'

# One frame per experiment, concatenated once after the loop instead of
# growing a DataFrame with pd.concat on every iteration.
mux_gex_frames = []

for exp in mux_experiments:
    print(exp)
    # The experiment name is '<n_donors>_<something>'; only the first field is used.
    n_donors = int(exp.split('_')[0])
    donors = np.loadtxt(f'{projdir}/txt/{n_donors}_donors.txt', dtype=str)

    crarc_dir = f'{projdir}/{exp}/cr_arc/{s}'
    truth_dir = f'{projdir}/{exp}/{s}'
    truth = get_truth_df(truth_dir, crarc_dir)

    ### placeholder used when a method produced no output: donor_id = 0 for every barcode
    zero_df = pd.DataFrame(columns=['donor_id'], index=truth.index, data=0)

    exp_frames = [truth]

    indir = f'{projdir}/{exp}/demux/merged/{mod}/{s}'
    for method in gex_methods:
        try:
            demux = pd.read_csv(f'{indir}/{method}.csv', sep='\t', header=0, index_col=0).sort_index()
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # Only a missing or empty result file is replaced by the
            # placeholder; any other error is allowed to surface.
            print(f'{method} not found')
            demux = zero_df.copy()

        demux = demux.reindex(truth.index)
        assert demux.index.equals(truth.index)

        # Skip remapping when all barcodes share one label (this is the case
        # for the zero placeholder) or when there is no label at all.
        # .iloc[0] is the largest count; a plain [0] would be a label lookup
        # for integer labels and a positional one for string labels.
        label_counts = demux['donor_id'].value_counts()
        single_label = label_counts.empty or label_counts.iloc[0] == demux.shape[0]
        if method in free_methods and not single_label:
            # reindex() can turn integer ids into floats; drop only a trailing '.0'.
            raw_labels = [str(x) for x in demux['donor_id']]
            tmp_free = pd.Series(index=demux.index,
                                 data=[x[:-2] if x.endswith('.0') else x for x in raw_labels])
            truth_id = pd.Series(index=truth.index, data=[str(x) for x in truth['donor_id']])
            # scsplit is given one cluster more than there are donors.
            n_clusters = n_donors + 1 if 'scsplit' in method else n_donors
            demux['donor_id'] = map_target_to_anchor(tmp_free, truth_id, donors, n_clusters)

        ambient = truth.reindex(demux.index)
        # .copy() so the column assignments below do not write into a slice.
        demux = demux[['donor_id']].copy()
        for ambient_col in ['ambient_RNA', 'ambient_DNA', 'ambient_DNA_peaks']:
            demux[ambient_col] = ambient[ambient_col]
        demux['method'] = method
        exp_frames.append(demux)

    tmp_mux_gex = pd.concat(exp_frames)
    tmp_mux_gex['exp'] = exp
    mux_gex_frames.append(tmp_mux_gex)

mux_gex = pd.concat(mux_gex_frames) if mux_gex_frames else pd.DataFrame()

2_0
2_10
2_20
2_30
6_0
6_10
6_20
6_30
8_0
8_10
8_20
8_30
10_0
10_10
10_20
10_30
12_0
12_10
12_20
12_30
14_0
14_10
14_20
14_30
16_0
16_10
16_20
16_30


In [13]:
### mux_test experiments, ATAC: stack the truth table and every method's
### calls for each experiment into one long table (`mux_atac`).
projdir = f'{basedir}/ambient/ambisim/mux_test/'
s = samples[0]
mod = 'atac'

# One frame per experiment, concatenated once after the loop instead of
# growing a DataFrame with pd.concat on every iteration.
mux_atac_frames = []

for exp in mux_experiments:
    print(exp)
    # The experiment name is '<n_donors>_<something>'; only the first field is used.
    n_donors = int(exp.split('_')[0])
    donors = np.loadtxt(f'{projdir}/txt/{n_donors}_donors.txt', dtype=str)

    crarc_dir = f'{projdir}/{exp}/cr_arc/{s}'
    truth_dir = f'{projdir}/{exp}/{s}'
    truth = get_truth_df(truth_dir, crarc_dir)

    ### placeholder used when a method produced no output: donor_id = 0 for every barcode
    zero_df = pd.DataFrame(columns=['donor_id'], index=truth.index, data=0)

    exp_frames = [truth]

    indir = f'{projdir}/{exp}/demux/merged/{mod}/{s}'
    for method in atac_methods:
        try:
            demux = pd.read_csv(f'{indir}/{method}.csv', sep='\t', header=0, index_col=0).sort_index()
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # Only a missing or empty result file is replaced by the
            # placeholder; any other error is allowed to surface.
            print(f'{method} not found')
            demux = zero_df.copy()

        demux = demux.reindex(truth.index)
        assert demux.index.equals(truth.index)

        # Skip remapping when all barcodes share one label (this is the case
        # for the zero placeholder) or when there is no label at all.
        # .iloc[0] is the largest count; a plain [0] would be a label lookup
        # for integer labels and a positional one for string labels.
        label_counts = demux['donor_id'].value_counts()
        single_label = label_counts.empty or label_counts.iloc[0] == demux.shape[0]
        if method in free_methods and not single_label:
            # reindex() can turn integer ids into floats; drop only a trailing '.0'.
            raw_labels = [str(x) for x in demux['donor_id']]
            tmp_free = pd.Series(index=demux.index,
                                 data=[x[:-2] if x.endswith('.0') else x for x in raw_labels])
            truth_id = pd.Series(index=truth.index, data=[str(x) for x in truth['donor_id']])
            # scsplit is given one cluster more than there are donors.
            n_clusters = n_donors + 1 if 'scsplit' in method else n_donors
            demux['donor_id'] = map_target_to_anchor(tmp_free, truth_id, donors, n_clusters)

        ambient = truth.reindex(demux.index)
        # .copy() so the column assignments below do not write into a slice.
        demux = demux[['donor_id']].copy()
        for ambient_col in ['ambient_RNA', 'ambient_DNA', 'ambient_DNA_peaks']:
            demux[ambient_col] = ambient[ambient_col]
        demux['method'] = method
        exp_frames.append(demux)

    tmp_mux_atac = pd.concat(exp_frames)
    tmp_mux_atac['exp'] = exp
    mux_atac_frames.append(tmp_mux_atac)

mux_atac = pd.concat(mux_atac_frames) if mux_atac_frames else pd.DataFrame()

2_0
2_10
2_20
2_30
6_0
6_10
6_20
6_30
8_0
8_10
8_20
8_30
10_0
10_10
10_20
10_30
12_0
12_10
12_20
12_30
14_0
14_10
14_20
14_30
16_0
16_10
16_20
16_30


In [14]:
# Sanity check: display the (rows, columns) shape of each of the four merged tables.
doub_gex.shape, doub_atac.shape, mux_gex.shape, mux_atac.shape

((1361090, 6), (1361090, 6), (2386320, 6), (2386320, 6))

In [15]:
# Write each merged table to its gzip-compressed, tab-separated file
# (index and header included), in the order listed.
ambisim_outputs = [
    (doub_gex, f'{basedir}/csv/ambisim/prop_doub/ambisim_gex.csv.gz'),
    (doub_atac, f'{basedir}/csv/ambisim/prop_doub/ambisim_atac.csv.gz'),
    (mux_gex, f'{basedir}/csv/ambisim/mux_test/ambisim_gex.csv.gz'),
    (mux_atac, f'{basedir}/csv/ambisim/mux_test/ambisim_atac.csv.gz'),
]
for merged_table, out_path in ambisim_outputs:
    merged_table.to_csv(out_path, sep='\t', header=True, index=True)