In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.io import mmread, mmwrite

In [3]:
projdir = '/u/home/t/terencew/project-cluo/igvf/pilot/multiome/'
donors = list(np.loadtxt(f'{projdir}/txt/donors.txt', dtype=str))
samples = list(np.loadtxt(f'{projdir}/txt/samples.txt', dtype=str))
s = samples[0]

gex_methods = ['demuxlet', 'freemuxlet', 'vireo', 'vireo_nogenos',
     'scsplit', 'scsplit_nogenos', 'souporcell', 'souporcell_nogenos', 'demuxalot']
atac_methods = ['demuxlet', 'freemuxlet',  'vireo', 'vireo_nogenos',
    'scsplit', 'scsplit_nogenos', 'souporcell', 'souporcell_nogenos', 'scavengers']

con_cols = ['C1', 'C2', 'I1', 'I2']
final_con_cols = ['C1', 'C2', 'I1', 'I2', 'donor']

pd_experiments = np.loadtxt(f'{projdir}/ambient/ambisim/prop_doub/txt/experiments.txt', dtype=str)
mux_experiments = np.loadtxt(f'{projdir}/ambient/ambisim/mux_test/txt/experiments.txt', dtype=str)

In [4]:
pd_exps = pd_experiments[['_20' in x for x in pd_experiments]]
pd_exps

array(['0_20', '10_20', '20_20', '30_20', '40_20', '50_20'], dtype='<U5')

In [5]:
mux_exps = mux_experiments[['_20' in x for x in mux_experiments]]
mux_exps

array(['2_20', '6_20', '8_20', '10_20', '12_20', '14_20', '16_20'],
      dtype='<U5')

In [6]:
# indir = f'{projdir}/ambient/ambisim/prop_doub/'
# exp = '0_20'
# gex_indiv = pd.read_csv(f'{indir}/{exp}/demux/merged/gex/r2_test/{s}/individual_gex.csv', sep='\t',
#                   header=0, index_col=0)
# atac_indiv = pd.read_csv(f'{indir}/{exp}/demux/merged/atac/r2_test/{s}/individual_atac.csv', sep='\t',
#                    header=0, index_col=0)
# gex_indiv.shape

In [7]:
def read_varcon_steps(indir, steps, cov_thresh):
    step = steps[0]
    con_path = f'{indir}/batches/{step}_cov{cov_thresh}_con_counts.csv'
    varcon = pd.read_csv(con_path, sep='\t', header=0, index_col=0)
    for step in steps[1:]:
        con_path = f'{indir}/batches/{step}_cov{cov_thresh}_con_counts.csv'
        tmp_con = pd.read_csv(con_path, sep='\t', header=0, index_col=0)
        varcon = pd.concat([varcon, tmp_con])
    return varcon

In [8]:
def process_varcon(demux, varcon, donors, method):
    cols = ['C1', 'C2', 'I1', 'I2', 'donor']
    counts = pd.DataFrame(columns=cols)
    for donor in donors:
        bc_mask = demux[method] == donor
        tmp_assign = demux[bc_mask]
        tmp_con = varcon[bc_mask]
        col_mask = [donor in x for x in tmp_con.columns]
        select_cols = tmp_con.columns[col_mask]
        tmp_con = tmp_con[select_cols]
        tmp_con.columns = ['C1', 'C2', 'I1', 'I2']
        tmp_con['donor'] = donor
        counts = pd.concat([counts, tmp_con])
    return counts

In [9]:
def process_con(con):
    con['DP'] = np.sum(con.iloc[:,:4], axis=1)
    con['Consistent rate'] = (con['C1'] + con['C2']) / con['DP']
    con['Inconsistent rate'] = 1 - con['Consistent rate']
    con['C1 rate'] = con['C1'] / con['DP']
    con['C2 rate'] = con['C2'] / con['DP']
    con['I1 rate'] = con['I1'] / con['DP']
    con['I2 rate'] = con['I2'] / con['DP']
    return con

In [10]:
!ls {indir}/{exp}/csv/demux/

ls: cannot access {indir}/{exp}/csv/demux/: No such file or directory


In [11]:
indir = f'{projdir}/ambient/ambisim/prop_doub/'
exp = pd_exps[0]

In [12]:
gex = pd.read_csv(f'{indir}/{exp}/csv/demux/individual_gex.csv', sep='\t',
              header=0, index_col=0)
atac = pd.read_csv(f'{indir}/{exp}/csv/demux/individual_atac.csv', sep='\t',
              header=0, index_col=0)

In [13]:
# n_drops = sum(gex['method'] == 'demuxlet')
n_drops = gex.shape[0]
interval = 200
steps = list(range(0, n_drops, interval))
cov_thresh = 20

In [14]:
pd_exps = ['0_20', '10_20', '20_20', '30_20']
exp = pd_exps[0]

vardir = f'{projdir}/ambient/ambisim/prop_doub/{exp}/demux/var_consistency/gex/{s}/'
gex_con = read_varcon_steps(vardir, steps, cov_thresh)
gex_con.index = [f'{x}_D0' for x in gex_con.index]
gex_con.shape

(8571, 16)

In [15]:
vardir = f'{projdir}/ambient/ambisim/prop_doub/{exp}/demux/var_consistency/atac/{s}/'

atac_con = read_varcon_steps(vardir, steps, cov_thresh)
atac_con.index = [f'{x}_D0' for x in atac_con.index]
atac_con.shape

(8571, 16)

In [16]:
gex

Unnamed: 0,demuxlet,freemuxlet,vireo,vireo_nogenos,souporcell,souporcell_nogenos,scsplit,scsplit_nogenos,demuxalot,truth
AAACAGCCAAACAACA-1_D0,27_C4,27_C4,27_C4,27_C4,27_C4,27_C4,27_C4,27_C4,27_C4,27_C4
AAACAGCCAAACATAG-1_D0,26_A10,26_A10,26_A10,26_A10,26_A10,26_A10,26_A10,26_A10,26_A10,26_A10
AAACAGCCAAACCCTA-1_D0,27_C4,27_C4,27_C4,27_C4,27_C4,27_C4,27_C4,27_C4,27_C4,27_C4
AAACAGCCAAACCTAT-1_D0,26_A10,26_A10,26_A10,26_A10,26_A10,26_A10,26_A10,26_A10,26_A10,26_A10
AAACAGCCAAACCTTG-1_D0,48_V3,48_V3,48_V3,48_V3,48_V3,48_V3,48_V3,48_V3,48_V3,48_V3
...,...,...,...,...,...,...,...,...,...,...
AACAAGCCACTGACTA-1_D0,48_V3,doublet,doublet,doublet,doublet,doublet,26_A10,26_A10,doublet,doublet
AACAAGCCACTGGCCA-1_D0,27_C4,doublet,doublet,doublet,27_C4,doublet,doublet,doublet,doublet,doublet
AACAAGCCACTGGCTG-1_D0,doublet,25_A4,doublet,doublet,doublet,doublet,doublet,doublet,doublet,doublet
AACAAGCCACTTAACG-1_D0,48_V3,48_V3,doublet,doublet,unassigned,doublet,doublet,unassigned,doublet,doublet


In [17]:
gex_truth_con = process_varcon(gex, gex_con, donors, 'truth')
atac_truth_con = process_varcon(atac, atac_con, donors, 'truth')

In [18]:
tmp_gex = pd.DataFrame(np.sum(gex_truth_con.drop(columns=['donor']), axis=0)).transpose()
tmp_atac = pd.DataFrame(np.sum(atac_truth_con.drop(columns=['donor']), axis=0)).transpose()

In [19]:
process_con(tmp_gex)
process_con(tmp_atac).columns

Index(['C1', 'C2', 'I1', 'I2', 'DP', 'Consistent rate', 'Inconsistent rate',
       'C1 rate', 'C2 rate', 'I1 rate', 'I2 rate'],
      dtype='object')

In [20]:
atac_truth_con

Unnamed: 0,C1,C2,I1,I2,donor
AAACAGCCAAACGCGA-1_D0,8,90,2,0,25_A4
AAACAGCCAAACTAAG-1_D0,2,29,0,0,25_A4
AAACAGCCAAAGCTCC-1_D0,16,198,2,0,25_A4
AAACAGCCAAATACCT-1_D0,1,21,0,0,25_A4
AAACAGCCAAATATCC-1_D0,2,48,0,0,25_A4
...,...,...,...,...,...
AAAGGTTAGTAACCAC-1_D0,8,102,0,0,48_V3
AAAGGTTAGTAACTCA-1_D0,8,110,0,0,48_V3
AAAGGTTAGTAAGAAC-1_D0,1,100,0,0,48_V3
AAAGGTTAGTAGCCAT-1_D0,11,220,1,0,48_V3


In [21]:
process_con(tmp_gex).values

array([[189857, 2842008, 9586, 230, 3041681.0, 0.9967728371252607,
        0.003227162874739342, 0.06241844558979064, 0.9343543915354701,
        0.0031515467927110043, 7.561608202832578e-05]], dtype=object)

In [22]:
final_con_cols = ['C1', 'C2', 'I1', 'I2', 'DP', 'Consistent rate', 'Inconsistent rate',
       'C1 rate', 'C2 rate', 'I1 rate', 'I2 rate']
cov_thresh = [0, 10, 20]
interval = 200
s = samples[0]

indir = f'{projdir}/ambient/ambisim/prop_doub/'
donors = list(np.loadtxt(f'{projdir}/txt/donors.txt', dtype=str))

pd_gex = pd.DataFrame(index=pd_exps, columns=final_con_cols, data=0)
pd_atac = pd.DataFrame(index=pd_exps, columns=final_con_cols, data=0)

# for exp in pd_exps:
for exp in pd_experiments:
    print(exp)
    gex_indiv = pd.read_csv(f'{indir}/{exp}/csv/demux/individual_gex.csv', sep='\t',
                  header=0, index_col=0)
    atac_indiv = pd.read_csv(f'{indir}/{exp}/csv/demux/individual_atac.csv', sep='\t',
                  header=0, index_col=0)
    n_drops = gex_indiv.shape[0]
    steps = list(range(0, n_drops, interval))
    for cov in cov_thresh:
        vardir = f'{projdir}/ambient/ambisim/prop_doub/{exp}/demux/var_consistency/gex/{s}/'
        gex_con = read_varcon_steps(vardir, steps, cov)
        gex_con.index = [f'{x}_D0' for x in gex_con.index]

        gex_truth_con = process_varcon(gex_indiv, gex_con, donors, 'truth')
        
        outdir = f'{projdir}/ambient/ambisim/prop_doub/{exp}/csv/var_consistency/gex/{s}/'
        gex_con.to_csv(f'{outdir}/{exp}_cov{cov}_all.csv', sep='\t', header=True, index=True)
        gex_truth_con.to_csv(f'{outdir}/{exp}_cov{cov}_truth.csv', sep='\t', header=True, index=True)
        
        for method in gex_methods:
            tmp_gex_con = process_varcon(gex_indiv, gex_con, donors, method)
            tmp_gex_con.to_csv(f'{outdir}/{exp}_cov{cov}_{method}.csv', sep='\t', header=True, index=True)
        
        vardir = f'{projdir}/ambient/ambisim/prop_doub/{exp}/demux/var_consistency/atac/{s}/'
        atac_con = read_varcon_steps(vardir, steps, cov)
        atac_con.index = [f'{x}_D0' for x in atac_con.index]
        atac_truth_con = process_varcon(atac_indiv, atac_con, donors, 'truth')
        
        outdir = f'{projdir}/ambient/ambisim/prop_doub/{exp}/csv/var_consistency/atac/{s}/'
        atac_con.to_csv(f'{outdir}/{exp}_cov{cov}_all.csv', sep='\t', header=True, index=True)
        atac_truth_con.to_csv(f'{outdir}/{exp}_cov{cov}_truth.csv', sep='\t', header=True, index=True)
        
        for method in atac_methods:
            tmp_atac_con = process_varcon(atac_indiv, atac_con, donors, method)
            tmp_atac_con.to_csv(f'{outdir}/{exp}_cov{cov}_{method}.csv', sep='\t', header=True, index=True)
        
        tmp_gex = pd.DataFrame(np.sum(gex_truth_con.drop(columns=['donor']), axis=0)).transpose()
        tmp_atac = pd.DataFrame(np.sum(atac_truth_con.drop(columns=['donor']), axis=0)).transpose()

        pd_gex.loc[exp] = process_con(tmp_gex).values[0]
        pd_atac.loc[exp] = process_con(tmp_atac).values[0]

0_0
0_10
0_20
0_30
0_40
0_50
10_0
10_10
10_20
10_30
10_40
10_50
20_0
20_10
20_20
20_30
20_40
20_50
30_0
30_10
30_20
30_30


FileNotFoundError: [Errno 2] No such file or directory: '/u/home/t/terencew/project-cluo/igvf/pilot/multiome//ambient/ambisim/prop_doub/30_30/demux/var_consistency/gex/20220928-IGVF-D0//batches/2600_cov0_con_counts.csv'

In [None]:
pd_gex

In [None]:
gex_indiv

In [None]:
!ls {indir}/ambient/ambisim/mux_test/txt/

In [None]:
final_con_cols = ['C1', 'C2', 'I1', 'I2', 'DP', 'Consistent rate', 'Inconsistent rate',
       'C1 rate', 'C2 rate', 'I1 rate', 'I2 rate']
cov_thresh = [0, 10, 20]
interval = 200
s = samples[0]

indir = f'{projdir}/ambient/ambisim/mux_test/'
donors = list(np.loadtxt(f'{projdir}/txt/donors.txt', dtype=str))

mux_gex = pd.DataFrame(index=mux_exps, columns=final_con_cols, data=0)
mux_atac = pd.DataFrame(index=mux_exps, columns=final_con_cols, data=0)

for exp in mux_exps:
# for exp in mux_experiments:
    print(exp)
    n = exp.split('_')[0]
    donors = list(np.loadtxt(f'{indir}/txt/{n}_donors.txt', dtype=str))
    gex_indiv = pd.read_csv(f'{indir}/{exp}/csv/demux/individual_gex.csv', sep='\t',
                  header=0, index_col=0)
    atac_indiv = pd.read_csv(f'{indir}/{exp}/csv/demux/individual_atac.csv', sep='\t',
                  header=0, index_col=0)
    n_drops = gex_indiv.shape[0]
    steps = list(range(0, n_drops, interval))
    for cov in cov_thresh:
        vardir = f'{projdir}/ambient/ambisim/mux_test/{exp}/demux/var_consistency/gex/{s}/'
        gex_con = read_varcon_steps(vardir, steps, cov)
        gex_con.index = [f'{x}_D0' for x in gex_con.index]

        gex_truth_con = process_varcon(gex_indiv, gex_con, donors, 'truth')
        
        outdir = f'{projdir}/ambient/ambisim/mux_test/{exp}/csv/var_consistency/gex/{s}/'
        gex_con.to_csv(f'{outdir}/{exp}_cov{cov}_all.csv', sep='\t', header=True, index=True)
        gex_truth_con.to_csv(f'{outdir}/{exp}_cov{cov}_truth.csv', sep='\t', header=True, index=True)
        
        for method in gex_methods:
            tmp_gex_con = process_varcon(gex_indiv, gex_con, donors, method)
            tmp_gex_con.to_csv(f'{outdir}/{exp}_cov{cov}_{method}.csv', sep='\t', header=True, index=True)
        
        vardir = f'{projdir}/ambient/ambisim/mux_test/{exp}/demux/var_consistency/atac/{s}/'
        atac_con = read_varcon_steps(vardir, steps, cov)
        atac_con.index = [f'{x}_D0' for x in atac_con.index]
        atac_truth_con = process_varcon(atac_indiv, atac_con, donors, 'truth')
        
        outdir = f'{projdir}/ambient/ambisim/mux_test/{exp}/csv/var_consistency/atac/{s}/'
        atac_con.to_csv(f'{outdir}/{exp}_cov{cov}_all.csv', sep='\t', header=True, index=True)
        atac_truth_con.to_csv(f'{outdir}/{exp}_cov{cov}_truth.csv', sep='\t', header=True, index=True)
        
        for method in atac_methods:
            tmp_atac_con = process_varcon(atac_indiv, atac_con, donors, method)
            tmp_atac_con.to_csv(f'{outdir}/{exp}_cov{cov}_{method}.csv', sep='\t', header=True, index=True)
        
        tmp_gex = pd.DataFrame(np.sum(gex_truth_con.drop(columns=['donor']), axis=0)).transpose()
        tmp_atac = pd.DataFrame(np.sum(atac_truth_con.drop(columns=['donor']), axis=0)).transpose()

        mux_gex.loc[exp] = process_con(tmp_gex).values[0]
        mux_atac.loc[exp] = process_con(tmp_atac).values[0]