# 3. Preprocessing and filtering

This notebook checks the quality of the data, applies filters to cells and then to genes, and exports the exploratory statistics together with an AnnData object annotated with the filter results.

In [None]:
# Load libraries

import sys, os
import scanpy as sc
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

# Input

In [None]:
# Placeholder locations: where the preprocessed dataset lives and where results are written.
fpath = 'path_to_preprocessed_dataset'
rpath = 'output_path'

# The two sample labels compared throughout this notebook.
sample_list = ['RIBOmap', 'STARmap']

# Load the raw AnnData file and keep a handle on the index of adata.var.
adata = sc.read_h5ad(
    os.path.join(fpath, 'output/2022-02-21-Hu-FUCCI-raw.h5ad')
)
genes = adata.var.index

# QC

## cells

In [None]:
# Display the per-cell annotation table (adata.obs).
adata.obs

In [None]:
# Volume (the 'area' column) vs. total counts per cell, colored by sample
sns.jointplot(data=adata.obs, x='area', y='total_counts', hue='sample', alpha=0.7)
# sns.violinplot(data=adata.obs, x='sample', y='area')
# sns.violinplot(data=adata.obs, x='sample', y='total_counts')

In [None]:
# Per-sample violin plot of the 'log1p_n_genes_by_counts' column of adata.obs
sns.violinplot(data=adata.obs, x='sample', y='log1p_n_genes_by_counts')

In [None]:
# Per-cell density: total counts divided by area, stored as a new obs column
adata.obs['density'] = adata.obs['total_counts'] / adata.obs['area']

## genes
Add sample-specific attributes (mean counts, max counts, number of expressing cells) to each gene.

In [None]:
# Per-sample gene statistics written to adata.var:
# mean count, maximum count, and number of cells with a nonzero count.
sample_list = ['RIBOmap', 'STARmap']
for s in sample_list:
    # Rows of the expression matrix that belong to this sample (subset once, reuse three times)
    X_sample = adata.X[adata.obs['sample'] == s]
    per_gene_stats = (
        ('mean_counts_', np.mean(X_sample, axis=0)),
        ('max_counts_', np.amax(X_sample, axis=0)),
        ('n_cells_by_counts_', np.count_nonzero(X_sample, axis=0)),
    )
    # Column names are the statistic prefix followed by the sample label
    for prefix, values in per_gene_stats:
        adata.var[prefix + s] = values
adata.var

In [None]:
# Histogram of per-gene mean counts, one step curve per sample (log-scaled y axis).
# The columns are looked up from sample_list so the plotted data always matches
# the legend labels, instead of hard-coding the sample names a second time.
plt.hist([adata.var['mean_counts_' + s] for s in sample_list], bins=50, log=True, label=sample_list, histtype='step', rwidth=1)
plt.xlabel('mean expression')
# Trailing semicolon suppresses the repr of the returned Legend object
plt.legend();

In [None]:
# Histogram of the number of cells with a nonzero count per gene, one step curve per sample.
# The columns are looked up from sample_list so the plotted data always matches
# the legend labels, instead of hard-coding the sample names a second time.
plt.hist([adata.var['n_cells_by_counts_' + s] for s in sample_list], bins=50, label=sample_list, histtype='step')
plt.xlabel('n_cells_by_counts_')
# Trailing semicolon suppresses the repr of the returned Legend object
plt.legend();

# Filtering

### filter cells
Cells are filtered on three criteria: volume, total reads, and density.

In [None]:
# Volume filter: a cell passes when its area lies strictly between the two bounds.
thres_vol_lower = 0.5e6
thres_vol_higher = 2e6

# Area distribution per sample, with both bounds marked
sns.histplot(data=adata.obs, hue='sample', x='area', linewidth=0)
for bound in (thres_vol_lower, thres_vol_higher):
    plt.axvline(x=bound, c='slategrey')

# 1 = inside the bounds, 0 = outside (or not comparable, e.g. NaN)
cell_area = adata.obs['area']
inside_bounds = (cell_area > thres_vol_lower) & (cell_area < thres_vol_higher)
pass_vol = inside_bounds.astype('int64').tolist()
adata.obs['pass_volume_filter'] = pass_vol

In [None]:
# Total-counts filter with sample-specific bounds: a cell passes when its
# total_counts lies strictly between the lower and higher threshold of its own sample.
thres_tr_lower_RIBO = 300
thres_tr_higher_RIBO = 3500
thres_tr_lower_STAR = 500
thres_tr_higher_STAR = 6000

# Total-counts distribution per sample, with each sample's bounds marked in its own color
sns.histplot(data=adata.obs, hue='sample', x='total_counts', linewidth=0)
plt.axvline(x=thres_tr_lower_RIBO, c='lightskyblue')
plt.axvline(x=thres_tr_higher_RIBO, c='lightskyblue')
plt.axvline(x=thres_tr_lower_STAR, c='orange')
plt.axvline(x=thres_tr_higher_STAR, c='orange')

# Evaluate the filter row by row on index-aligned Series. The previous version built one
# list per sample and concatenated them, which is only correct when every RIBOmap row
# precedes every STARmap row in adata.obs; any other row order mislabels cells.
total_counts = adata.obs['total_counts']
sample_label = adata.obs['sample']
in_range_RIBO = (total_counts > thres_tr_lower_RIBO) & (total_counts < thres_tr_higher_RIBO)
in_range_STAR = (total_counts > thres_tr_lower_STAR) & (total_counts < thres_tr_higher_STAR)

# Cells whose sample label is neither 'RIBOmap' nor 'STARmap' have no thresholds and get 0.
pass_tr = ((sample_label == 'RIBOmap') & in_range_RIBO) | ((sample_label == 'STARmap') & in_range_STAR)
adata.obs['pass_counts_filter'] = pass_tr.astype('int64')

In [None]:
# A cell passes only when it passed both the volume filter and the counts filter
adata.obs['pass_two_filters'] = np.logical_and(adata.obs['pass_volume_filter'], adata.obs['pass_counts_filter'])

In [None]:
# Density distribution per sample, restricted to cells that passed the volume and counts filters
passed_two = adata.obs[adata.obs['pass_two_filters']]
sns.histplot(data=passed_two, x='sample', y='density', hue='sample')

# Lower bound on density, drawn as a horizontal line
star_dens_thres = 0.00055
plt.axhline(y=star_dens_thres, c='slategray')

# 1 = density above the bound, 0 = otherwise; evaluated for every cell of every sample.
# NOTE(review): the threshold name suggests it was chosen for STARmap, yet it is applied
# to all cells -- confirm this is intended.
pass_dens = (adata.obs['density'] > star_dens_thres).astype('int64').tolist()
adata.obs['pass_density_filter'] = pass_dens

In [None]:
# A cell passes all filters when it passed volume + counts (pass_two_filters) and the density filter
adata.obs['pass_all_filters'] = np.logical_and(adata.obs['pass_two_filters'], adata.obs['pass_density_filter'])

In [None]:
# sns.violinplot(data=adata.obs[adata.obs['pass_all_filters']], x='sample', y='total_counts', legend=False).set_title('after filtering')
# sns.despine()
# plt.savefig(rpath+'/figures/expl_stats/total_counts.pdf')

### filter genes
Genes are filtered on two criteria: the percentage of cells expressing the gene and the maximum count in any single cell.

In [None]:
# Low-abundance genes: distribution of per-gene mean counts after cell filtering,
# restricted to the 0-8 range, one step curve per sample (log-scaled y axis).
# Means are built from sample_list so the curves always match the legend labels,
# instead of repeating the mask expression with hard-coded sample names.
passed_cells = adata.obs['pass_all_filters']
mean_after_cell_filter = [
    np.mean(adata.X[np.logical_and(adata.obs['sample'] == s, passed_cells)], axis=0)
    for s in sample_list
]
plt.hist(mean_after_cell_filter, range=(0, 8), bins=50, log=True, label=sample_list, histtype='step', rwidth=1)
plt.xlabel('mean expression (lower end, after cell filtering)')
# Trailing semicolon suppresses the repr of the returned Legend object
plt.legend();

In [None]:
# For each sample, report how many genes have a mean count below each candidate threshold.
# Means are taken over the cells of that sample that passed all cell filters.
sample_list = ['RIBOmap', 'STARmap']
test_thres_expr = [0.5, 1]
for s in sample_list:
    print(s, ': ')
    keep_cells = np.logical_and(adata.obs['sample'] == s, adata.obs['pass_all_filters'])
    X_s = adata.X[keep_cells]
    mean_ct = np.mean(X_s, axis=0)
    # Number of genes under each threshold, in the same order as test_thres_expr
    n_below = [np.count_nonzero(mean_ct < thres) for thres in test_thres_expr]
    for thres, below in zip(test_thres_expr, n_below):
        print('n_genes mean_counts < {0}: {1}'.format(thres, below))

In [None]:
# Gene filter, evaluated per sample on the cells that passed all cell filters.
# A gene is kept for a sample when BOTH conditions hold:
#   filter-1: it has a nonzero count in more than pct_cell of that sample's filtered cells
#             (the comparison is strict, although the printed message says "at least")
#   filter-2: its maximum count in any single cell is greater than ct_thres
# Results are stored in adata.var['filter_<sample>']; adata.var['filter'] is True only
# for genes kept in both samples.
sample_list = ['RIBOmap', 'STARmap']
pct_cell = [0.1, 0.1]   # fraction of cells, one entry per sample
ct_thres = [2, 4]       # max-count threshold, one entry per sample
for i, s in enumerate(sample_list):
    X_s = adata.X[np.logical_and(adata.obs['sample']==s, adata.obs['pass_all_filters'])] # filter based on filtered cells
    f1 = np.count_nonzero(X_s, axis=0) > pct_cell[i]*X_s.shape[0]  # filter-1
    f2 = np.amax(X_s, axis=0) > ct_thres[i] # filter-2
    f = np.logical_and(f1, f2)
    n_removed = adata.n_vars - np.count_nonzero(f)
    # str.format does not treat '%' specially, so a single '%' is the correct literal
    print('{0} threshold: expressed in at least {1}% cells AND max expression in a cell greater than {2}'.format(s, pct_cell[i]*100, ct_thres[i]))
    # Scale the fraction by 100 so the number matches the '%' sign that follows it
    print('filtered out: {0} genes -- {1: .2f}%'.format(n_removed, 100*n_removed/adata.n_vars))
    adata.var['filter_'+s] = f
adata.var['filter'] = np.logical_and(adata.var['filter_RIBOmap'], adata.var['filter_STARmap'])
n_removed_all = adata.n_vars - np.count_nonzero(adata.var['filter'])
print('All together filtered out: {0} genes -- {1: .2f}%'.format(n_removed_all, 100*n_removed_all/adata.n_vars))

In [None]:
# adata.write_h5ad(rpath+'/adata/FUCCI_cell_&_gene_filter_02-22-22.h5ad')

# After filtering

In [None]:
# Load the AnnData file stored under rpath/adata; this replaces the in-memory `adata`.
# NOTE(review): the only visible statement that writes to this path is commented out,
# so this read relies on the file already existing -- confirm before a fresh "Run All".
adata = sc.read_h5ad(rpath+'/adata/FUCCI_cell_&_gene_filter_02-22-22.h5ad')
adata

In [None]:
# cell volume, counts
sns.jointplot(data=adata.obs, x='area', y='total_counts', hue='sample', alpha=0.7)

In [None]:
# Histogram of per-gene mean counts for the reloaded data, one step curve per sample
# (log-scaled y axis). The columns are looked up from sample_list so the plotted data
# always matches the legend labels, instead of hard-coding the sample names a second time.
plt.hist([adata.var['mean_counts_' + s] for s in sample_list], bins=50, log=True, label=sample_list, histtype='step', rwidth=1)
plt.xlabel('mean expression')
# Trailing semicolon suppresses the repr of the returned Legend object
plt.legend();

In [None]:
# Histogram of the number of cells with a nonzero count per gene for the reloaded data,
# one step curve per sample. The columns are looked up from sample_list so the plotted
# data always matches the legend labels, instead of hard-coding the sample names a second time.
plt.hist([adata.var['n_cells_by_counts_' + s] for s in sample_list], bins=50, label=sample_list, histtype='step')
plt.xlabel('n_cells_by_counts_')
# Trailing semicolon suppresses the repr of the returned Legend object
plt.legend();