In [58]:
import scanpy as sc
import os,sys
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import matplotlib.ticker as mticker

In [59]:
sc.logging.print_header()

scanpy==1.8.0 anndata==0.7.6 umap==0.5.1 numpy==1.20.1 scipy==1.6.2 pandas==1.2.4 scikit-learn==0.24.1 statsmodels==0.12.2 python-igraph==0.9.8 louvain==0.7.1 pynndescent==0.5.5


In [60]:
# add the utility functions folder to PATH (from Rapolas Zilionis, taken from
# https://github.com/rapolaszilionis/utility_functions)

sys.path.append(os.path.abspath("utility_functions_190403_12h24/"))

from rz_import_statements import *
import rz_functions as rz
import rz_fig_params # this adjust mpl.rcParams, almost nothing to import, import after scanpy to overwrite rc.Params
import rz_utility_spring as srz

In [61]:
#load raw adata
adata = sc.read_h5ad('backups_JZ_2022/concatenated_raw_4866048x33538_220120_12h51.h5ad')

In [62]:
adata

AnnData object with n_obs × n_vars = 4866048 × 33538
    obs: 'library'

## Filtering on total counts and mito fraction

In [65]:
#get total counts
adata.obs['total_counts']= adata.X.sum(axis=1)

In [76]:
# remove barcodes with veeery few counts, e.g. 10 to ease up calculations further

cmask = adata.obs['total_counts'].values>10 
print(adata.shape)
adata = adata[cmask]
print(adata.shape)

(4866048, 33538)
(1718924, 33538)


In [77]:
len(adata.obs['library'].unique())

33

In [79]:
# define mitochondrial genes 

mitogenes = [i for i in adata.var_names if i.startswith('MT-')]
print (len(mitogenes))
mitogenes
                                                        

13


['MT-ND1',
 'MT-ND2',
 'MT-CO1',
 'MT-CO2',
 'MT-ATP8',
 'MT-ATP6',
 'MT-CO3',
 'MT-ND3',
 'MT-ND4L',
 'MT-ND4',
 'MT-ND5',
 'MT-ND6',
 'MT-CYB']

In [80]:
#calculate mitochondrial percentage and add to obs

mitomask = np.in1d(adata.var_names,mitogenes)

adata.obs['pct_counts_mito'] = np.array(adata.X[:,mitomask].sum(axis=1)).T[0]/adata.obs['total_counts'].values*100.
adata.obs['pct_counts_mito']


Trying to set attribute `.obs` of view, copying.


0           9.523809
1          11.764706
2           0.668151
3           1.818182
6           3.571429
             ...    
4866043    21.428572
4866044     7.692308
4866045    46.153847
4866046    22.500000
4866047     6.220096
Name: pct_counts_mito, Length: 1718924, dtype: float32

In [82]:
#tidying up the library names
for i in sorted(adata.obs['library'].unique()):
    print('"%s":"",'%i)

"0621Normal":"",
"0621Tumor":"",
"0704Normal":"",
"0818T_S1":"",
"0903T_2_S3":"",
"0903T_S2":"",
"0914T_2_S5":"",
"0914T_S4":"",
"0923T_2_S7":"",
"0923T_S6":"",
"1116N_S11":"",
"1116T1_S8":"",
"1116T2_S9":"",
"1116T3_1_S10":"",
"Healthy0228":"",
"Healthy0314":"",
"N093_S4":"",
"N14":"",
"N21":"",
"N28":"",
"N818_S2":"",
"N914_S6":"",
"N923_S8":"",
"T04":"",
"T14":"",
"T21":"",
"T28":"",
"T818_S1":"",
"T903_S3":"",
"T914_S5":"",
"T923_S7":"",
"Tumor0228":"",
"Tumor0314":"",


In [83]:
renamer = {"0621Normal":"N3_old",
"0621Tumor":"T3_old",
"0704Normal":"N1_old",
           
"0818T_S1":"T5_1",
"0903T_2_S3":"T6_1",
"0903T_S2":"T6_2",
"0914T_2_S5":"T7_2",
"0914T_S4":"T7_1",
"0923T_2_S7":"T8_2",
"0923T_S6":"T8_1",
"1116N_S11":"N9_1",
"1116T1_S8":"T9_1",
"1116T2_S9":"T9_2",
"1116T3_1_S10":"T9_3",
           
"Healthy0228":"N4_old",
"Healthy0314":"N2_old",
           
"N093_S4":"N6_1",
"N14":"T2_1", #names were accidentally swapped between tumor and normal while running StarSolo
"N21":"N3_1",
"N28":"N4_1",
           
"N818_S2":"N5_1",
"N914_S6":"N7_1",
"N923_S8":"N8_1",
           
"T04":"T1_1",
"T14":"N2_1",#names were accidentally swapped between tumor and normal while running StarSolo
"T21":"T3_1",
"T28":"T4_1",
           
"T818_S1":"T5_2",
"T903_S3":"T6_3",
"T914_S5":"T7_3",
"T923_S7":"T8_3",
           
"Tumor0228":"T4_old",
"Tumor0314":"T2_old"
}

In [84]:
adata.obs['library2'] = [renamer[i] for i in adata.obs['library']]
adata.obs.head()

Unnamed: 0,library,total_counts,pct_counts_mito,library2
0,N14,21.0,9.523809,T2_1
1,N14,17.0,11.764706,T2_1
2,N14,449.0,0.668151,T2_1
3,N14,55.0,1.818182,T2_1
6,N14,28.0,3.571429,T2_1


In [85]:
libcol = 'library2' # column of obs to use

# just a hack to faster fill-in thresholds used, they are set by eye
libs = adata.obs[libcol].unique()

#for entering threshold, copy/paste the output of this cell below
print("ts_dict = {") #for "threshold dictionary"
for lib in libs:
    print('"%s":(400,20), #(min_nr_counts,max_pct_mito)'%lib)
print('}')

ts_dict = {
"T2_1":(400,20), #(min_nr_counts,max_pct_mito)
"N3_1":(400,20), #(min_nr_counts,max_pct_mito)
"N4_1":(400,20), #(min_nr_counts,max_pct_mito)
"T1_1":(400,20), #(min_nr_counts,max_pct_mito)
"N2_1":(400,20), #(min_nr_counts,max_pct_mito)
"T3_1":(400,20), #(min_nr_counts,max_pct_mito)
"T4_1":(400,20), #(min_nr_counts,max_pct_mito)
"N6_1":(400,20), #(min_nr_counts,max_pct_mito)
"N5_1":(400,20), #(min_nr_counts,max_pct_mito)
"N7_1":(400,20), #(min_nr_counts,max_pct_mito)
"N8_1":(400,20), #(min_nr_counts,max_pct_mito)
"T5_2":(400,20), #(min_nr_counts,max_pct_mito)
"T6_3":(400,20), #(min_nr_counts,max_pct_mito)
"T7_3":(400,20), #(min_nr_counts,max_pct_mito)
"T8_3":(400,20), #(min_nr_counts,max_pct_mito)
"T5_1":(400,20), #(min_nr_counts,max_pct_mito)
"T6_1":(400,20), #(min_nr_counts,max_pct_mito)
"T6_2":(400,20), #(min_nr_counts,max_pct_mito)
"T7_2":(400,20), #(min_nr_counts,max_pct_mito)
"T7_1":(400,20), #(min_nr_counts,max_pct_mito)
"T8_2":(400,20), #(min_nr_counts,max_pct_mito)
"

In [86]:
# copy and edit output of previous cell
# first run without any thresholds, then adjust based on the distribution of barcodes

# library T4_1 will have 1000 UMI treshold due to extreme contamination of counts
# Note:  later on this library T4_1 was removed along with T1_1

ts_dict = {
"N2_1":(400,20), #(min_nr_genes,max_pct_mito)
"N3_1":(400,20), #(min_nr_genes,max_pct_mito)
"N4_1":(400,20), #(min_nr_genes,max_pct_mito)
"T1_1":(400,20), #(min_nr_genes,max_pct_mito)
"T2_1":(400,20), #(min_nr_genes,max_pct_mito)
"T3_1":(300,20), #(min_nr_genes,max_pct_mito)
"T4_1":(1000,20), #(min_nr_genes,max_pct_mito)
"N6_1":(400,20), #(min_nr_genes,max_pct_mito)
"N5_1":(400,20), #(min_nr_genes,max_pct_mito)
"N7_1":(400,20), #(min_nr_genes,max_pct_mito)
"N8_1":(400,20), #(min_nr_genes,max_pct_mito)
"T5_2":(400,20), #(min_nr_genes,max_pct_mito)
"T6_3":(400,20), #(min_nr_genes,max_pct_mito)
"T7_3":(400,20), #(min_nr_genes,max_pct_mito)
"T8_3":(400,20), #(min_nr_genes,max_pct_mito)
"N1_old":(400,20), #(min_nr_genes,max_pct_mito)
"N2_old":(400,20), #(min_nr_genes,max_pct_mito)
"N3_old":(300,20), #(min_nr_genes,max_pct_mito)
"N4_old":(300,20), #(min_nr_genes,max_pct_mito)
"T2_old":(300,20), #(min_nr_genes,max_pct_mito)
"T3_old":(400,20), #(min_nr_genes,max_pct_mito)
"T4_old":(400,20), #(min_nr_genes,max_pct_mito)
"T5_1":(400,20), #(min_nr_genes,max_pct_mito)
"T6_1":(400,20), #(min_nr_genes,max_pct_mito)
"T6_2":(400,20), #(min_nr_genes,max_pct_mito)
"T7_2":(400,20), #(min_nr_genes,max_pct_mito)
"T7_1":(400,20), #(min_nr_genes,max_pct_mito)
"T8_2":(400,20), #(min_nr_genes,max_pct_mito)
"T8_1":(400,20), #(min_nr_genes,max_pct_mito)
"N9_1":(400,20), #(min_nr_genes,max_pct_mito)
"T9_1":(300,20), #(min_nr_genes,max_pct_mito)
"T9_2":(400,20), #(min_nr_genes,max_pct_mito)
"T9_3":(400,20), #(min_nr_genes,max_pct_mito)
}

In [None]:
!mkdir -p outputs_JZ_2022

plot_title='kidney'
pass_filters = []

#plot mito histograms
nr_rows = math.ceil(len(libs)/5.)

fig,gs = rz.startfig(w=25,h=nr_rows*10,rows=nr_rows*2,columns=5,return_first_ax=False)
#startfig - a custom function by Rapo

# get all the coordinates of the my subplots
coords = np.argwhere(np.zeros([nr_rows,5]) == 0)

# control total count range to expect
xmin = 10
xmax = 10**4.2


for c,lib in zip(coords,libs):
    
    a = fig.add_subplot(gs[c[0]*2,c[1]])
    a1 = fig.add_subplot(gs[c[0]*2+1,c[1]]) #c in coords

    mask = adata.obs[libcol] == lib
    x = adata.obs.loc[mask,'total_counts'] 
    y = adata.obs.loc[mask,'pct_counts_mito']
    
    #scatter
    a.scatter(x,y,lw=0,s=5,alpha=0.1,rasterized=True)
    a.set_xscale('log')
    a.set_xlim(xmin,xmax)
    a.set_ylim(0,100)
    a.set_ylabel('Mitochondrial counts, %')
    a.set_title(lib)
    
    #plot threshold in scatter
    ts = ts_dict[lib]
    a.plot((ts[0],ts[0]),(a.get_ylim()[0],ts[1]),lw=1,color='r')
    a.plot((ts[0],a.get_xlim()[1]),(ts[1],ts[1]),lw=1,color='r')
    pass_ts = (adata.obs.loc[mask,'total_counts']>=ts[0])&(adata.obs.loc[mask,'pct_counts_mito']<ts[1]) 
    cells_pass = pass_ts.sum()
    cells_all = len(pass_ts)
    a.text(a.get_xlim()[1]*0.8,a.get_ylim()[1]*0.8,
           "%d out of %d\nbarcodes pass filter"%(cells_pass,cells_all),
           ha='right')
    
    pass_filters+=(list(pass_ts.values))
    
    # histogram
    bins=np.logspace(np.log10(xmin),np.log10(xmax),51)
    hs, bins,patches = plt.hist(x,bins=bins)
    
    #plot barchart
    lefts = bins[:-1]
    rights = bins[1:]
    a1.bar(x = lefts,width = rights-lefts,height = hs*rights,
          align='edge',
          lw=0.,color = 'c')
    a1.set_xscale('log');
    a1.set_xlim(xmin,xmax)
    #a1.ticklabel_format(axis = 'y', style = 'sci', scilimits = (1,2))
    #a1.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.2e'))
    f = mticker.ScalarFormatter(useOffset=False, useMathText=True)
    g = lambda x,pos : "${}$".format(f._formatSciNotation('%1.10e' % x))
    plt.gca().yaxis.set_major_formatter(mticker.FuncFormatter(g))

    # threshold:
    a1.plot((ts[0],ts[0]),(a1.get_ylim()[0],a1.set_ylim()[1]),lw=1,color='r')
    a1.set_xlabel('Total counts')
    
    a1.set_ylabel('# reads from bin')
           

print(sum(pass_filters))
gs.tight_layout(fig, pad = 0.3)

plt.savefig('outputs_JZ_2022/mito_pct_vs_total_counts_%s.pdf'%(plot_title), dpi=600)
plt.savefig('outputs_JZ_2022/mito_pct_vs_total_counts_%s.png'%(plot_title), dpi=600)

## Removing some low quality libraries

In [89]:
#removing sample T1_1 as it only has 10 barcodes above treshold
#sample T4_1 is known to contain a huge cluster of low quality cells

no_T1_1 = ~adata.obs['library2'].isin(['T1_1']).values
no_T4_1 = ~adata.obs['library2'].isin(['T4_1']).values
cmask = no_T1_1&no_T4_1


In [92]:
adata

AnnData object with n_obs × n_vars = 1718924 × 33538
    obs: 'library', 'total_counts', 'pct_counts_mito', 'library2'

In [100]:
adata = adata[cmask] #applying the mask

In [101]:
adata.obs['library2'].unique()

array(['T2_1', 'N3_1', 'N4_1', 'N2_1', 'T3_1', 'N6_1', 'N5_1', 'N7_1',
       'N8_1', 'T5_2', 'T6_3', 'T7_3', 'T8_3', 'T5_1', 'T6_1', 'T6_2',
       'T7_2', 'T7_1', 'T8_2', 'T8_1', 'N9_1', 'T9_1', 'T9_2', 'T9_3',
       'N1_old', 'N2_old', 'N3_old', 'N4_old', 'T2_old', 'T3_old',
       'T4_old'], dtype=object)

In [102]:
# adding sample information, i.e. T2_1 and T2_old are emulsion aliqouts coming from the same sample, just
# sequenced in different batches

for i in sorted(adata.obs['library2'].unique()):
    print('"%s":"",'%i)

"N1_old":"",
"N2_1":"",
"N2_old":"",
"N3_1":"",
"N3_old":"",
"N4_1":"",
"N4_old":"",
"N5_1":"",
"N6_1":"",
"N7_1":"",
"N8_1":"",
"N9_1":"",
"T2_1":"",
"T2_old":"",
"T3_1":"",
"T3_old":"",
"T4_old":"",
"T5_1":"",
"T5_2":"",
"T6_1":"",
"T6_2":"",
"T6_3":"",
"T7_1":"",
"T7_2":"",
"T7_3":"",
"T8_1":"",
"T8_2":"",
"T8_3":"",
"T9_1":"",
"T9_2":"",
"T9_3":"",


In [103]:
renamer = {
"N1_old":"N1",
"N2_1":"N2",
"N2_old":"N2",
"N3_1":"N3",
"N3_old":"N3",
"N4_1":"N4",
"N4_old":"N4",
"N5_1":"N5",
"N6_1":"N6",
"N7_1":"N7",
"N8_1":"N8",
"N9_1":"N9",
"T2_1":"T2",
"T2_old":"T2",
"T3_1":"T3",
"T3_old":"T3",
"T4_old":"T4",
"T5_1":"T5",
"T5_2":"T5",
"T6_1":"T6",
"T6_2":"T6",
"T6_3":"T6",
"T7_1":"T7",
"T7_2":"T7",
"T7_3":"T7",
"T8_1":"T8",
"T8_2":"T8",
"T8_3":"T8",
"T9_1":"T9",
"T9_2":"T9",
"T9_3":"T9"
}
adata.obs['sample'] = [renamer[i] for i in adata.obs['library2']]
adata.obs.head()

Trying to set attribute `.obs` of view, copying.


Unnamed: 0,library,total_counts,pct_counts_mito,library2,sample
0,N14,21.0,9.523809,T2_1,T2
1,N14,17.0,11.764706,T2_1,T2
2,N14,449.0,0.668151,T2_1,T2
3,N14,55.0,1.818182,T2_1,T2
6,N14,28.0,3.571429,T2_1,T2


In [104]:
#apply the tresholds to data

cmask = np.repeat(True,adata.shape[0])
print (cmask.sum())
for key,value in ts_dict.items():
    m1 = (adata.obs[libcol] == key).values
    tmp = adata.obs[m1]
    m2 = (tmp['total_counts']>=value[0]).values
    m3 = (tmp['pct_counts_mito']<value[1]).values
    cmask[m1] = m2&m3
    
print(cmask.sum(),cmask.shape)

1587329
51196 (1587329,)


In [105]:
print( adata.shape)
adata = adata[cmask]
print (adata.shape)

(1587329, 33538)
(51196, 33538)


In [106]:
#checking cell count for each sample
for i in adata.obs['sample'].unique():
    print (i, sum(adata.obs['sample'] == i))

T2 6004
N3 1063
N4 2031
N2 2321
T3 3195
N6 802
N5 573
N7 640
N8 1243
T5 4751
T6 4550
T7 7285
T8 5322
N9 589
T9 6047
N1 2970
T4 1810


In [107]:

# write the entire adata object
fname1 = 'backups_JZ_2022/no_T4_lib_mito20_umi400_filt_raw_%dx%d_%s_%s.h5ad'%(cdata.shape[0],cdata.shape[1],rz.now(),plot_title)
print(fname1)
adata.write(fname1)



Trying to set attribute `.obs` of view, copying.


backups_JZ_2022/no_T4_lib_mito20_umi400_filt_raw_147456x33538_220120_14h39_kidney.h5ad


... storing 'library2' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'sample' as categorical


In [112]:
# save the pandas dataframe under adata.obs 
meta = adata.obs
fname_obs1 = 'backups_JZ_2022/no_T4_1_obs_info_%dx%d_%s_%s'%(meta.shape[0],meta.shape[1],rz.now(),plot_title)

print(fname_obs1)
rz.save_df(meta,fname_obs1)


backups_JZ_2022/no_T4_1_obs_info_51196x5_220120_14h47_kidney
