In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
projdir = '/u/home/t/terencew/project-cluo/igvf/pilot/multiome/'

donors = list(np.loadtxt(f'{projdir}/txt/donors.txt', dtype=str))
samples = list(np.loadtxt(f'{projdir}/txt/samples.txt', dtype=str))
s = samples[0]

In [3]:
doub_experiments = np.loadtxt(f'{projdir}/ambient/ambisim/cov_doub_test/txt/experiments.txt', dtype=str)
mux_experiments = np.loadtxt(f'{projdir}/ambient/ambisim/cov_mux_test/txt/experiments.txt', dtype=str)

In [4]:
def get_truth_df(indir, crarc_dir):
    truth = pd.read_csv(f'{indir}/drop_data_rand.txt', sep='\t', header=0, index_col=0, low_memory=False)
    barcodes = pd.read_csv(f'{crarc_dir}/outs/filtered_feature_bc_matrix/barcodes.tsv.gz',
                       sep='\t', header=None, index_col=None)

    truth.index = [f'{x}-1' for x in truth.index]
    truth = truth[truth.index.isin(barcodes[0])]
    truth.sort_index(inplace=True)
   
    ###
    doub_mask = [',' in x for x in truth['sam']]
    truth.loc[doub_mask, 'sam'] = 'doublet'
        
    truth['ambient_RNA'] = truth['rna_nr_a'] / (truth['rna_nr_c']  + truth['rna_nr_a'])
    truth['ambient_DNA'] = truth['atac_nr_a'] / (truth['atac_nr_c']  + truth['atac_nr_a'])
    truth['ambient_DNA_peaks'] = truth['atac_nr_ap'] / (truth['atac_nr_cp']  + truth['atac_nr_ap'])
    truth['method'] = 'truth'
    
    truth = truth[['sam', 'ambient_RNA', 'ambient_DNA', 'ambient_DNA_peaks', 'method']]
    truth.rename(columns={'sam' : 'donor_id'}, inplace=True)

    return truth

In [5]:
indir = f'{projdir}/ambient/ambisim/cov_doub_test/'
doub_summary = pd.DataFrame()
for exp in doub_experiments:
    tmp_summary = pd.read_csv(f'{indir}/{exp}/cr_arc/{s}/outs/summary.csv', sep=',')
    tmp_summary.index = [exp]
    doub_summary = pd.concat([doub_summary, tmp_summary])

In [6]:
indir = f'{projdir}/ambient/ambisim/cov_mux_test/'
mux_summary = pd.DataFrame()
for exp in mux_experiments:
    tmp_summary = pd.read_csv(f'{indir}/{exp}/cr_arc/{s}/outs/summary.csv', sep=',')
    tmp_summary.index = [exp]
    mux_summary = pd.concat([mux_summary, tmp_summary])

In [7]:
doub_summary['type'] = 'Vary_doublet'
mux_summary['type'] = 'Vary_num_muxed'

In [18]:
doub_summary['amb_rate'] = [x.split('_')[0] for x in doub_summary.index]
doub_summary['doublet_rate'] = [x.split('_')[1] for x in doub_summary.index]
doub_summary['num_donors'] = [4 for x in doub_summary.index]

In [19]:
mux_summary['amb_rate'] = [x.split('_')[1] for x in mux_summary.index]
mux_summary['doublet_rate'] = [10 for x in mux_summary.index]
mux_summary['num_donors'] = [x.split('_')[0] for x in mux_summary.index]

In [20]:
all_summary = pd.concat([doub_summary, mux_summary])

In [21]:
all_summary.head()

Unnamed: 0,Sample ID,Genome,Pipeline version,Estimated number of cells,Feature linkages detected,Linked genes,Linked peaks,ATAC Confidently mapped read pairs,ATAC Fraction of genome in peaks,ATAC Fraction of high-quality fragments in cells,...,GEX Reads mapped to genome,GEX Reads with TSO,GEX Sequenced read pairs,GEX Total genes detected,GEX Valid UMIs,GEX Valid barcodes,type,amb_rate,doublet_rate,num_donors
0_0,20220928-IGVF-D0,GRCh38,cellranger-arc-2.0.1,8631,15570,5270,11303,0.8748,0.0424,0.9335,...,0.9981,0.0,60314841,29202,1.0,1.0,Vary_doublet,0,0,4
0_10,20220928-IGVF-D0,GRCh38,cellranger-arc-2.0.1,8584,13139,4687,10368,0.8744,0.0424,0.9264,...,0.998,0.0,60131721,29242,1.0,1.0,Vary_doublet,0,10,4
0_20,20220928-IGVF-D0,GRCh38,cellranger-arc-2.0.1,8571,11718,4469,9926,0.8749,0.0425,0.9246,...,0.9981,0.0,59914589,29164,1.0,1.0,Vary_doublet,0,20,4
0_30,20220928-IGVF-D0,GRCh38,cellranger-arc-2.0.1,8572,10058,3964,9226,0.8748,0.0427,0.9224,...,0.9981,0.0,60091431,29190,1.0,1.0,Vary_doublet,0,30,4
10_0,20220928-IGVF-D0,GRCh38,cellranger-arc-2.0.1,8594,11004,4241,8788,0.871,0.0401,0.9264,...,0.9981,0.0,60013084,29121,1.0,1.0,Vary_doublet,10,0,4


In [22]:
all_summary.to_csv(f'{projdir}/csv/final_figures/tables/table_s3_sims_crarc_qc.csv',
                   sep=',', header=True, index=True)

### lowcov

In [23]:
indir = f'{projdir}/ambient/ambisim/prop_doub/'
doub_summary = pd.DataFrame()
for exp in doub_experiments:
    tmp_summary = pd.read_csv(f'{indir}/{exp}/cr_arc/{s}/outs/summary.csv', sep=',')
    tmp_summary.index = [exp]
    doub_summary = pd.concat([doub_summary, tmp_summary])

In [24]:
indir = f'{projdir}/ambient/ambisim/mux_test/'
mux_summary = pd.DataFrame()
for exp in mux_experiments:
    tmp_summary = pd.read_csv(f'{indir}/{exp}/cr_arc/{s}/outs/summary.csv', sep=',')
    tmp_summary.index = [exp]
    mux_summary = pd.concat([mux_summary, tmp_summary])

In [25]:
doub_summary['amb_rate'] = [x.split('_')[0] for x in doub_summary.index]
doub_summary['doublet_rate'] = [x.split('_')[1] for x in doub_summary.index]
doub_summary['num_donors'] = [4 for x in doub_summary.index]
mux_summary['amb_rate'] = [x.split('_')[1] for x in mux_summary.index]
mux_summary['doublet_rate'] = [10 for x in mux_summary.index]
mux_summary['num_donors'] = [x.split('_')[0] for x in mux_summary.index]

In [26]:
all_summary.head()

Unnamed: 0,Sample ID,Genome,Pipeline version,Estimated number of cells,Feature linkages detected,Linked genes,Linked peaks,ATAC Confidently mapped read pairs,ATAC Fraction of genome in peaks,ATAC Fraction of high-quality fragments in cells,...,GEX Reads mapped to genome,GEX Reads with TSO,GEX Sequenced read pairs,GEX Total genes detected,GEX Valid UMIs,GEX Valid barcodes,type,amb_rate,doublet_rate,num_donors
0_0,20220928-IGVF-D0,GRCh38,cellranger-arc-2.0.1,8631,15570,5270,11303,0.8748,0.0424,0.9335,...,0.9981,0.0,60314841,29202,1.0,1.0,Vary_doublet,0,0,4
0_10,20220928-IGVF-D0,GRCh38,cellranger-arc-2.0.1,8584,13139,4687,10368,0.8744,0.0424,0.9264,...,0.998,0.0,60131721,29242,1.0,1.0,Vary_doublet,0,10,4
0_20,20220928-IGVF-D0,GRCh38,cellranger-arc-2.0.1,8571,11718,4469,9926,0.8749,0.0425,0.9246,...,0.9981,0.0,59914589,29164,1.0,1.0,Vary_doublet,0,20,4
0_30,20220928-IGVF-D0,GRCh38,cellranger-arc-2.0.1,8572,10058,3964,9226,0.8748,0.0427,0.9224,...,0.9981,0.0,60091431,29190,1.0,1.0,Vary_doublet,0,30,4
10_0,20220928-IGVF-D0,GRCh38,cellranger-arc-2.0.1,8594,11004,4241,8788,0.871,0.0401,0.9264,...,0.9981,0.0,60013084,29121,1.0,1.0,Vary_doublet,10,0,4


In [27]:
all_summary.to_csv(f'{projdir}/csv/final_figures/tables/table_s4_sims_crarc_qc_lowcov.csv',
                   sep=',', header=True, index=True)