In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.io import mmread, mmwrite

In [21]:
# Project root; every input and output path in this notebook is built from it.
# The value already ends in '/', and the f-strings below add their own '/',
# so the resulting paths contain a doubled '//'.
projdir = '/u/home/t/terencew/project-cluo/igvf/pilot/multiome/'
# Donor and sample identifiers, read as strings from plain-text files.
donors = list(np.loadtxt(f'{projdir}/txt/donors.txt', dtype=str))
# Only the first two entries of samples.txt are kept.
samples = list(np.loadtxt(f'{projdir}/txt/samples.txt', dtype=str))[:2]
# Default sample; later cells rebind `s` as a loop variable.
s = samples[0]

# Method names iterated when writing the merged GEX results. Each name is used
# as a column of the `gex` table (process_varcon indexes demux[method]).
gex_methods = ['demuxlet', 'freemuxlet', 'vireo', 'vireo_nogenos',
               'scsplit', 'scsplit_nogenos', 'souporcell', 'souporcell_nogenos', 'demuxalot']
# Same role for the `atac` table; differs from gex_methods only in its last entry.
atac_methods = ['demuxlet', 'freemuxlet',  'vireo', 'vireo_nogenos',
               'scsplit', 'scsplit_nogenos', 'souporcell', 'souporcell_nogenos', 'scavengers']
    
# Methods iterated by the per-sample export cells at the end of the notebook.
merged_methods = ['demuxlet', 'freemuxlet',  'vireo', 'vireo_nogenos',
               'souporcell', 'souporcell_nogenos', 'ambimux', 'ambimux_joint']

# Leftover from an earlier version of one of the lists above (kept for reference):
#                'scsplit', 'scsplit_nogenos', 'souporcell', 'souporcell_nogenos', 'scavengers']
# Names of the four count columns. NOTE(review): not referenced by any code
# visible in this notebook; process_varcon hard-codes the same four names.
con_cols = ['C1', 'C2', 'I1', 'I2']

In [4]:
# Read the GEX and ATAC demultiplexing tables with identical parser options:
# tab-separated, header in the first row, first column used as the index.
gex, atac = [
    pd.read_csv(table_path, sep='\t', header=0, index_col=0)
    for table_path in (
        f'{projdir}/csv/demux/inter_gex.csv',
        f'{projdir}/csv/demux/inter_atac.csv',
    )
]
gex.shape, atac.shape

((30497, 9), (30497, 9))

In [5]:
# Read the three ambimux result tables (GEX-only, ATAC-only, joint) with the
# same parser options: tab-separated, header row, first column as the index.
ambimux_gex, ambimux_atac, ambimux_joint = [
    pd.read_csv(table_path, sep='\t', header=0, index_col=0)
    for table_path in (
        f'{projdir}/csv/demux/gex/ambimux.csv',
        f'{projdir}/csv/demux/atac/ambimux.csv',
        f'{projdir}/csv/demux/ambimux.csv',
    )
]
ambimux_gex.shape, ambimux_atac.shape, ambimux_joint.shape

((67899, 30), (67899, 30), (67899, 30))

In [6]:
# Order each ambimux table by its index labels (rebinding the name rather than
# sorting in place).
ambimux_gex = ambimux_gex.sort_index()
ambimux_atac = ambimux_atac.sort_index()
ambimux_joint = ambimux_joint.sort_index()

In [7]:
# Attach the ambimux calls as two extra method columns on each table: the
# single-modality 'assignment' first, then the joint 'assignment'.
# Assigning a Series to a column aligns on index labels.
for demux_table, modality_calls in ((gex, ambimux_gex), (atac, ambimux_atac)):
    demux_table['ambimux'] = modality_calls['assignment']
    demux_table['ambimux_joint'] = ambimux_joint['assignment']

In [8]:
# Columns this notebook derives from the per-method calls. They are left out of
# the agreement / mode computation so that re-running this cell (once
# 'majority', 'inter' or 'sample' already exist on the tables) gives the same
# result as the first run instead of comparing the calls against them as well.
derived_cols = ['majority', 'inter', 'sample']
gex_calls = gex.drop(columns=derived_cols, errors='ignore')
atac_calls = atac.drop(columns=derived_cols, errors='ignore')

tmp_gex = gex.copy()
tmp_atac = atac.copy()
# 'inter': True only on rows where every method column equals the first one.
tmp_gex['inter'] = gex_calls.eq(gex_calls.iloc[:, 0], axis=0).all(axis=1)
tmp_atac['inter'] = atac_calls.eq(atac_calls.iloc[:, 0], axis=0).all(axis=1)
# 'majority': first column of the row-wise mode across the method columns.
tmp_gex['majority'] = gex_calls.mode(axis=1)[0]
tmp_atac['majority'] = atac_calls.mode(axis=1)[0]

# Copy the two derived columns back onto the working tables
# ('majority' first, then 'inter', as before).
gex['majority'] = tmp_gex['majority']
gex['inter'] = tmp_gex['inter']
atac['majority'] = tmp_atac['majority']
atac['inter'] = tmp_atac['inter']

In [9]:
# The sample name is the second '_'-separated field of each index label.
gex['sample'] = gex.index.map(lambda label: label.split('_')[1])
atac['sample'] = atac.index.map(lambda label: label.split('_')[1])

In [10]:
def read_varcon_steps(indir, steps, cov_thresh):
    """Load and stack the per-batch consistency-count tables of one directory.

    For every entry of ``steps`` the tab-separated file
    ``{indir}/batches/{step}_cov{cov_thresh}_con_counts.csv`` is read (header
    in the first row, first column as the index). The tables are stacked
    row-wise in the order given, keeping each file's own index labels.

    Parameters
    ----------
    indir : str
        Directory that contains the ``batches`` sub-directory.
    steps : sequence
        Batch identifiers used as the file-name prefix, in the desired order.
    cov_thresh : int or str
        Value interpolated after ``_cov`` in the file name.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all batch tables.

    Raises
    ------
    ValueError
        If ``steps`` is empty.
    """
    if len(steps) == 0:
        raise ValueError('read_varcon_steps needs at least one step to read')

    # Read everything first and concatenate once; concatenating inside the
    # loop re-copies all previously loaded rows on every iteration.
    batches = [
        pd.read_csv(f'{indir}/batches/{step}_cov{cov_thresh}_con_counts.csv',
                    sep='\t', header=0, index_col=0)
        for step in steps
    ]
    return pd.concat(batches)

In [11]:
# Batch start offsets: one every `interval` rows across all rows of gex.
interval = 100
steps = list(range(0, len(gex), interval))
steps[:5]

[0, 100, 200, 300, 400]

In [12]:
# !ls {projdir}/demux/regular/var_consistency/gex/{s}/batches

In [13]:
interval = 100
cov_thresh = 20

# Load the batch files of every sample in one loop (the first sample used to be
# handled by a copy of the loop body) and concatenate once at the end.
sample_tables = []
for s in samples:
    print(s)
    indir = f'{projdir}/demux/regular/var_consistency/gex/{s}/'
    # One batch file per `interval` rows of this sample in gex.
    n_drops = int((gex['sample'] == s).sum())
    steps = list(range(0, n_drops, interval))
    tmp_gex_con = read_varcon_steps(indir, steps, cov_thresh)
    # Suffix the labels with the sample name so they match the '<x>_<sample>'
    # form that the 'sample' column of gex is parsed from.
    tmp_gex_con.index = [f'{x}_{s}' for x in tmp_gex_con.index]
    sample_tables.append(tmp_gex_con)

gex_con = pd.concat(sample_tables)

# NOTE(review): the recorded output of this cell has one more row than gex;
# duplicated index labels are only dropped in the coverage loop further down.
gex_con.shape

20220928-IGVF-D3


(30498, 16)

In [14]:
def process_varcon(demux, varcon, donors, method):
    """Collect, per barcode, the count columns of the donor it was assigned to.

    For each donor, the rows of ``varcon`` whose ``demux[method]`` value equals
    that donor are selected, the columns whose name contains the donor id are
    kept and renamed to ``C1, C2, I1, I2``, and a ``donor`` column is added.
    The per-donor pieces are stacked in the order of ``donors``.

    Parameters
    ----------
    demux : pandas.DataFrame
        Table with one column per method; ``demux[method]`` holds donor ids.
    varcon : pandas.DataFrame
        Count table. Rows are selected with a boolean Series built from
        ``demux``, which pandas matches to ``varcon`` by index label.
    donors : iterable of str
        Donor ids to collect.
    method : str
        Column of ``demux`` to read the assignments from.

    Returns
    -------
    pandas.DataFrame
        Columns ``C1, C2, I1, I2, donor``; rows assigned to none of
        ``donors`` are absent. ``varcon`` itself is not modified.
    """
    cols = ['C1', 'C2', 'I1', 'I2', 'donor']
    per_donor = []
    for donor in donors:
        bc_mask = demux[method] == donor
        donor_rows = varcon[bc_mask]
        # Substring test: a donor id contained in another id (e.g. 'D1' in
        # 'D10') would select extra columns and make the rename below fail.
        donor_cols = [col for col in donor_rows.columns if donor in col]
        # Copy so the rename and the new column never touch a slice of varcon.
        donor_counts = donor_rows[donor_cols].copy()
        donor_counts.columns = cols[:-1]
        donor_counts['donor'] = donor
        per_donor.append(donor_counts)

    if not per_donor:
        # No donors requested: same empty, correctly-labelled frame as before.
        return pd.DataFrame(columns=cols)
    # Single concat instead of growing a frame (seeded with an empty one)
    # inside the loop.
    return pd.concat(per_donor)

In [15]:
# method = 'vireo'
# process_varcon(gex, gex_con, donors, method)

In [16]:
# cov_thresh = [0, 10, 20]
# outdir = f'{projdir}/csv/var_consistency/real/{s}'
# # indir = f'{projdir}/ambient/var_consistency/gex/{s}/'
# indir = f'{projdir}/demux/regular/var_consistency/gex/{s}/'

# for method in gex_methods:
#     print(method)
#     for cov in cov_thresh:
# #         indir = f'{projdir}/ambient/var_consistency/gex/{s}/'

#         indir = f'{projdir}/demux/regular/var_consistency/gex/{s}/'
#         n_drops = sum(gex['sample'] == s)
#         steps = list(range(0, n_drops, interval))

#         gex_con = read_varcon_steps(indir, steps, cov)
#         gex_con.index = [f'{x}_{s}' for x in gex_con.index]
#         gex_con.shape

#         for s in samples[1:]:
#             indir = f'{projdir}/demux/regular/var_consistency/gex/{s}/'
#             n_drops = sum(gex['sample'] == s)
#             steps = list(range(0, n_drops, interval))
#             tmp_gex_con = read_varcon_steps(indir, steps, cov)
#             tmp_gex_con.index = [f'{x}_{s}' for x in tmp_gex_con.index]
#             gex_con = pd.concat([gex_con, tmp_gex_con])

# #         gex_con = read_varcon_steps(indir, steps, cov)
# #         gex_con.index = [f'{x}_{s}' for x in gex_con.index]
        
#         outdir = f'{projdir}/csv/var_consistency/real/gex/{s}'
#         consistency = process_varcon(gex, gex_con, donors, method)
#         consistency.to_csv(f'{outdir}/{method}_cov{cov}_varcon.csv', sep='\t', header=True, index=True)

In [18]:
cov_thresh = [0, 10, 20]
outdir = f'{projdir}/csv/var_consistency/real/gex/merged'

# For every coverage threshold: load the GEX consistency counts of all samples,
# drop repeated index labels, then write one '<method>_cov<cov>_varcon.csv' per
# method into `outdir`.
for cov in cov_thresh:
    print(cov)

    # One loop over all samples and a single concat (the first sample used to
    # be loaded by a copy of the loop body).
    sample_tables = []
    for s in samples:
        indir = f'{projdir}/demux/regular/var_consistency/gex/{s}/'
        n_drops = int((gex['sample'] == s).sum())
        steps = list(range(0, n_drops, interval))
        tmp_gex_con = read_varcon_steps(indir, steps, cov)
        tmp_gex_con.index = [f'{x}_{s}' for x in tmp_gex_con.index]
        sample_tables.append(tmp_gex_con)
    gex_con = pd.concat(sample_tables)

    # Keep only the first row of any repeated index label, before any method
    # is processed.
    gex_con = gex_con[~gex_con.index.duplicated()]
    print(gex.shape, gex_con.shape)

    for method in gex_methods + ['majority']:
        consistency = process_varcon(gex, gex_con, donors, method)
        consistency.to_csv(f'{outdir}/{method}_cov{cov}_varcon.csv', sep='\t', header=True, index=True)

    # 'inter': restrict both tables to the rows flagged in gex['inter'] and use
    # the first column of gex (renamed to 'inter') as the assignment.
    method = 'inter'
    mask = gex['inter']
    tmp_gex = pd.DataFrame(gex[mask].iloc[:, 0])
    tmp_gex_con = gex_con[mask]
    tmp_gex.columns = ['inter']
    consistency = process_varcon(tmp_gex, tmp_gex_con, donors, method)
    consistency.to_csv(f'{outdir}/{method}_cov{cov}_varcon.csv', sep='\t', header=True, index=True)

    for method in ['ambimux', 'ambimux_joint']:
        consistency = process_varcon(gex, gex_con, donors, method)
        consistency.to_csv(f'{outdir}/{method}_cov{cov}_varcon.csv', sep='\t', header=True, index=True)
    

0
(30497, 14) (30497, 16)
10
(30497, 14) (30497, 16)
20
(30497, 14) (30497, 16)


In [19]:
cov_thresh = [0, 10, 20]
# Every file of this cell goes to the merged ATAC directory. (An earlier
# assignment built from a stale `s` was overwritten before any write.)
outdir = f'{projdir}/csv/var_consistency/real/atac/merged/'

# For every coverage threshold: load the ATAC consistency counts of all
# samples, drop repeated index labels, then write one
# '<method>_cov<cov>_varcon.csv' per method into `outdir`.
for cov in cov_thresh:
    print(cov)

    # One loop over all samples and a single concat (the first sample used to
    # be loaded by a copy of the loop body).
    sample_tables = []
    for s in samples:
        indir = f'{projdir}/demux/regular/var_consistency/atac/{s}/'
        n_drops = int((atac['sample'] == s).sum())
        steps = list(range(0, n_drops, interval))
        tmp_atac_con = read_varcon_steps(indir, steps, cov)
        tmp_atac_con.index = [f'{x}_{s}' for x in tmp_atac_con.index]
        sample_tables.append(tmp_atac_con)
    atac_con = pd.concat(sample_tables)

    # Drop repeated index labels BEFORE processing any method. Previously the
    # atac_methods files were written first, so they were computed from a
    # table that still contained the repeated rows, unlike every other file.
    atac_con = atac_con[~atac_con.index.duplicated()]
    print(atac.shape, atac_con.shape)

    for method in atac_methods + ['majority']:
        consistency = process_varcon(atac, atac_con, donors, method)
        consistency.to_csv(f'{outdir}/{method}_cov{cov}_varcon.csv', sep='\t', header=True, index=True)

    # 'inter': restrict both tables to the rows flagged in atac['inter'] and
    # use the first column of atac (renamed to 'inter') as the assignment.
    method = 'inter'
    mask = atac['inter']
    tmp_atac = pd.DataFrame(atac[mask].iloc[:, 0])
    tmp_atac_con = atac_con[mask]
    tmp_atac.columns = ['inter']
    consistency = process_varcon(tmp_atac, tmp_atac_con, donors, method)
    consistency.to_csv(f'{outdir}/{method}_cov{cov}_varcon.csv', sep='\t', header=True, index=True)

    for method in ['ambimux', 'ambimux_joint']:
        consistency = process_varcon(atac, atac_con, donors, method)
        consistency.to_csv(f'{outdir}/{method}_cov{cov}_varcon.csv', sep='\t', header=True, index=True)

0


  import sys


(30497, 14) (30497, 16)
10
(30497, 14) (30497, 16)
20
(30497, 14) (30497, 16)


In [33]:
 # Write one GEX consistency table per (method, sample) pair.
 # NOTE(review): neither `cov` nor `gex_con` is set in this cell. Both are
 # whatever the last executed coverage loop left behind, so the files carry
 # that single threshold in their name; confirm the two still match.
 for method in merged_methods:
    print(method)
    consistency = process_varcon(gex, gex_con, donors, method)
    # The sample name is the second '_'-separated field of each index label.
    consistency['sample'] = [x.split('_')[1] for x in consistency.index]
    for s in samples:
        # One output directory per sample; it is not created here.
        outdir = f'{projdir}/csv/var_consistency/real/gex/{s}/'
        tmp_con = consistency[consistency['sample'] == s]
        tmp_con.to_csv(f'{outdir}/{method}_cov{cov}_varcon.csv',sep='\t', header=True, index=True)

demuxlet
freemuxlet
vireo
vireo_nogenos
souporcell
souporcell_nogenos
ambimux
ambimux_joint


In [34]:
 # Write one ATAC consistency table per (method, sample) pair.
 # NOTE(review): neither `cov` nor `atac_con` is set in this cell. Both are
 # whatever the last executed coverage loop left behind, so the files carry
 # that single threshold in their name; confirm the two still match.
 for method in merged_methods:
    print(method)
    consistency = process_varcon(atac, atac_con, donors, method)
    # The sample name is the second '_'-separated field of each index label.
    consistency['sample'] = [x.split('_')[1] for x in consistency.index]
    for s in samples:
        # One output directory per sample; it is not created here.
        outdir = f'{projdir}/csv/var_consistency/real/atac/{s}/'
        tmp_con = consistency[consistency['sample'] == s]
        tmp_con.to_csv(f'{outdir}/{method}_cov{cov}_varcon.csv',sep='\t', header=True, index=True)

demuxlet
freemuxlet
vireo
vireo_nogenos
souporcell
souporcell_nogenos
ambimux
ambimux_joint
