# Quality Assessment & H5AD Input Generation

2021-11-12

In [None]:
# Import Packages

%load_ext autoreload
%autoreload 2

import os
import warnings 
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt
from anndata import AnnData

# Customized packages
import starmap.sc_util as su

# test()

## RIBOmap

### Input

In [None]:
# Set path: project root plus the output and figure folders beneath it
base_path = 'Z:/Data/Analyzed/2021-11-23-Hu-MouseBrain/'
out_path = os.path.join(base_path, 'output')
fig_path = os.path.join(base_path, 'figures')

# Create each folder only when it is missing
for folder in (out_path, fig_path):
    if not os.path.exists(folder):
        os.mkdir(folder)

# Point scanpy's figure directory at the figures folder
sc.settings.figdir = fig_path

In [None]:
# Disabled alternative input path, kept for reference: builds the AnnData
# object from CSV files under `expr/` instead of reading an .h5ad file.
# expr_path = os.path.join(out_path, 'expr/cell_barcode_count.csv')
# var_path = os.path.join(out_path, 'expr/cell_barcode_names.csv')
# obs_path = os.path.join(out_path, 'expr/meta.csv')

# # add expression data to the AnnData object 
# expr_x = np.loadtxt(expr_path, delimiter=',')
# var = pd.read_csv(var_path, header=None)
# var = pd.DataFrame(index=var.iloc[:,2].to_list())
# obs = pd.read_csv(obs_path, index_col=0)

# rdata = AnnData(X=expr_x, var=var, obs=obs)

In [None]:
# Read the raw RIBOmap AnnData file from the project folder
raw_h5ad_path = os.path.join(base_path, 'RIBOmap', '2022-03-11-RIBOmap-raw.h5ad')
rdata = sc.read_h5ad(raw_h5ad_path)

# Tag every observation with its protocol name (used later as a plot color key)
rdata.obs['protocol'] = 'RIBOmap'

In [None]:
# Show the AnnData summary as the cell output
rdata

### QC

In [None]:
# Plot the 20 most highly expressed genes (before any QC filtering)
sc.pl.highest_expr_genes(rdata, n_top=20)

In [None]:
# Calculate QC metrics; inplace=True stores them on rdata.obs / rdata.var
sc.pp.calculate_qc_metrics(rdata, percent_top=None, inplace=True)

# Calculate max count for each gene (column-wise maximum of X).
# When X is a SciPy sparse matrix, .max(axis=0) returns a sparse 1 x n_genes
# result that does not assign cleanly to a DataFrame column, so densify and
# flatten it first. For a dense ndarray X this leaves the values unchanged.
gene_max = rdata.X.max(axis=0)
if hasattr(gene_max, 'toarray'):
    gene_max = gene_max.toarray()
rdata.var['max_counts'] = np.asarray(gene_max).ravel()

In [None]:
# Summary statistics of total counts per observation (rdata.obs)
rdata.obs['total_counts'].describe()

In [None]:
# Summary statistics of the log1p total counts per observation (rdata.obs)
rdata.obs['log1p_total_counts'].describe()

In [None]:
from scipy import stats

# Number of MADs on either side of the median that defines the accepted range
n = 4

# `stats.median_absolute_deviation` was deprecated in SciPy 1.5 and removed in
# SciPy 1.9; `median_abs_deviation` is its replacement. With scale=1 both
# return the unscaled MAD, so the resulting thresholds are unchanged. Fall back
# to the old name only on SciPy versions that predate the replacement.
_mad_func = getattr(stats, 'median_abs_deviation', None)
if _mad_func is None:
    _mad_func = stats.median_absolute_deviation

log_counts = rdata.obs['log1p_total_counts']
log_counts_median = log_counts.median()
mad = _mad_func(log_counts, scale=1)
lower_bd = log_counts_median - n*mad
upper_bd = log_counts_median + n*mad

# Bounds on the log1p scale, then converted back to raw counts with expm1
print(lower_bd)
print(upper_bd)
print(np.expm1(lower_bd))
print(np.expm1(upper_bd))

In [None]:
# Histogram of log1p total counts with both MAD cut-offs marked in red
sns.histplot(rdata.obs['log1p_total_counts'])
for bound in (lower_bd, upper_bd):
    plt.axvline(bound, c='r')
plt.show()

In [None]:
# Summary statistics of total counts per variable/gene (rdata.var)
rdata.var['total_counts'].describe()

In [None]:
# Summary statistics of the per-gene maximum count (rdata.var['max_counts'])
rdata.var['max_counts'].describe()

In [None]:
# Per-cell stats plot (before filtration), using the starmap.sc_util helper.
# NOTE(review): save=False presumably skips writing the figure to disk; confirm in the helper.

su.plot_stats_per_cell(rdata, color='protocol', save=False)

In [None]:
# Disabled: scanpy violin plots of selected per-cell columns on a log scale
# sc.pl.violin(rdata, ['total_counts', 'n_genes_by_counts', 'area'],
#              jitter=0.4, multi_panel=True, log=True)

In [None]:
# Get quantiles of reads via the starmap.sc_util helper
# NOTE(review): which quantiles are reported is defined in the helper, not visible here.
su.show_reads_quantile(rdata)

### Filtration

In [None]:
# Flag genes by max counts: a gene is marked when its maximum count exceeds 2.
# Nothing is removed here; the same mask is stored under both column names.
detected_mask = rdata.var['max_counts'] > 2
rdata.var['detected'] = detected_mask
rdata.var['highly_variable'] = detected_mask

# Number of flagged genes
print(rdata.var['detected'].sum())

In [None]:
# Filtration
# Basic thresholds first: cells need at least 10 genes, genes at least 10 cells
sc.pp.filter_cells(rdata, min_genes=10)
sc.pp.filter_genes(rdata, min_cells=10)

# # Filter gene by max counts 
# adata = adata[:, adata.var['max_counts'] > 2]

# Keep cells whose total counts lie within the MAD-derived bounds; the bounds
# were computed on the log1p scale, so convert them back with expm1
sc.pp.filter_cells(rdata, min_counts=np.expm1(lower_bd))
sc.pp.filter_cells(rdata, max_counts=np.expm1(upper_bd))

# Store a copy of the counts, not a second reference to X: otherwise any later
# in-place transform of X would also overwrite the 'raw' layer
rdata.layers['raw'] = rdata.X.copy()

# Shape after filtering (cell output)
rdata.X.shape

In [None]:
# Per-cell stats plot (after filtration), using the starmap.sc_util helper.
# NOTE(review): save=False presumably skips writing the figure to disk; confirm in the helper.

su.plot_stats_per_cell(rdata, color='protocol', save=False)

### Output

In [None]:
from datetime import datetime

# Date stamp (YYYY-MM-DD) used as the output file prefix
date = datetime.today().strftime('%Y-%m-%d')

# The file name records the MAD multiplier `n` used for the count thresholds
filtered_h5ad_path = f"{out_path}/{date}-Hu-TissueRIBOmap-{n}mad-filtered.h5ad"
rdata.write_h5ad(filtered_h5ad_path)