In [None]:
%matplotlib inline
import os

import anndata as an
import bbknn
import numpy as np
import pandas as pd
import scanpy as sp

In [None]:
# Directory used later in the notebook as the destination for the per-tissue .h5ad files.
folder = '../pb_results/'
# File name of the input AnnData file (presumably raw counts, judging by the name — confirm).
# NOTE(review): the path is resolved relative to the current working directory;
# `folder` is NOT prepended when reading — confirm this is intended.
raw_data = 'parabiosis_tms_raw.h5ad'
# Read the .h5ad file into an AnnData object; `D` is the working dataset for the cells below.
D = an.read_h5ad(raw_data)

### Calculate QC metrics

In [None]:
# Flag mitochondrial ('mt'), ribosomal ('rb') and ERCC spike-in ('ercc') genes by
# gene-name prefix, then let scanpy compute the QC metrics for each flagged group.
#
# Gene names are lower-cased first so that the prefix checks are case-insensitive.
# Note that the lower-cased names also become the new index of `D.var`.
#
# The annotation table is built on a local DataFrame and assigned back to `D.var`
# once, so AnnData never sees the temporary integer index produced by reset_index().
var_df = D.var.reset_index()

# Vectorised lower-casing (replaces a row-wise DataFrame.apply over every gene).
gene = var_df['index'].str.lower()
var_df['index'] = gene

# Ribosomal genes: cytosolic (rpl*/rps*) and mitochondrial (mrpl*/mrps*) prefixes.
# Series.str.match anchors at the start of the string, so this is a prefix test.
ribo_prefixes = ('rpl', 'rps', 'mrpl', 'mrps')

var_df['mt'] = gene.str.startswith('mt-')
var_df['rb'] = gene.str.match('|'.join(ribo_prefixes))
var_df['ercc'] = gene.str.startswith('ercc')
# Union of the three QC gene groups.
var_df['qc'] = var_df['ercc'] | var_df['rb'] | var_df['mt']

D.var = var_df.set_index('index')

# inplace=True: the metrics are added as columns of D.obs / D.var
# (e.g. total_counts, n_genes_by_counts, pct_counts_mt, which the QC filter uses).
sp.pp.calculate_qc_metrics(D, qc_vars=['mt','rb','ercc'], percent_top=None, log1p=False, inplace=True)

## Apply QC

In [None]:
# Remove genes that are not detected in at least one cell.
sp.pp.filter_genes(D, min_cells=1)

# Cell-level QC: a cell is kept only if it passes every threshold below.
cell_qc = D.obs
passes_qc = (
    (cell_qc['total_counts'] >= 5000)
    & (cell_qc['n_genes_by_counts'] >= 500)
    & (cell_qc['pct_counts_mt'] < 10)
    & (cell_qc['pct_counts_rb'] < 10)
    & (cell_qc['pct_counts_ercc'] < 30)
)
D = D[passes_qc, :]

# Snapshot of the filtered, un-normalised data.
D_raw = D.copy()

# Scale every cell to 1e6 total counts, then apply log10(x + 1).
sp.pp.normalize_total(D, target_sum=1e6)
sp.pp.log1p(D, base=10)

# Snapshot of the normalised, log-transformed data.
D_cpm = D.copy()

### Create tissue-specific datasets

In [None]:
# Split the filtered data by tissue and build, for every tissue:
#   tissue_data_raw[tissue] - un-normalised data (subset of D_raw)
#   tissue_data_cpm[tissue] - normalised/log data (subset of D_cpm), annotated with
#                             highly-variable-gene statistics
#   tissue_data_red[tissue] - tissue_data_cpm restricted to the highly variable genes,
#                             with a 64-component PCA
# tissue_data_redo is only initialised here; it is filled by a later cell.
tissues = list(D.obs['tissue'].unique())
tissue_data_red = {}
tissue_data_redo = {}
tissue_data_cpm = {}
tissue_data_raw = {}
for tissue in tissues:
    # D_raw and D_cpm are copies of the same filtered object, so one mask serves both.
    in_tissue = (D_raw.obs['tissue'] == tissue)

    # Take explicit copies: the subsets are modified below, and modifying an AnnData
    # view would otherwise rely on its implicit view-to-actual conversion.
    raw_t = D_raw[in_tissue, :].copy()
    cpm_t = D_cpm[in_tissue, :].copy()

    # Move the cell identifiers from the obs index into a regular column.
    raw_t.obs = raw_t.obs.reset_index()
    cpm_t.obs = cpm_t.obs.reset_index()

    # Annotates cpm_t.var in place (adds the 'highly_variable' column used below).
    # NOTE(review): per the scanpy documentation the mean/dispersion cut-offs are
    # ignored whenever n_top_genes is given, so max_mean=6 presumably has no effect
    # here — confirm which selection rule was intended.
    sp.pp.highly_variable_genes(cpm_t, max_mean=6, n_top_genes=5000)

    # Copy only the highly variable genes (instead of copying the whole matrix and
    # subsetting afterwards), then run PCA on that reduced object.
    red_t = cpm_t[:, cpm_t.var['highly_variable']].copy()
    sp.tl.pca(red_t, svd_solver='arpack', n_comps=64)

    tissue_data_raw[tissue] = raw_t
    tissue_data_cpm[tissue] = cpm_t
    tissue_data_red[tissue] = red_t

### Apply downstream analyses required for cell annotation including batch correction
- BBKNN batch correction
- UMAP for visualization
- Leiden algorithm for clustering
- A KNN graph is also calculated on the vanilla data without batch correction, to later assess batch correction quality

In [None]:
def _embed_and_cluster(adata):
    """Run UMAP and then Leiden clustering on the neighbour graph already stored in ``adata``."""
    sp.tl.umap(adata)
    sp.tl.leiden(adata)


for tissue in tissues:
    print(tissue, end=' ')
    reduced = tissue_data_red[tissue]

    # Two independent copies of the PCA-reduced data: one gets a batch-balanced
    # (BBKNN) neighbour graph, the other a plain KNN graph for comparison.
    corrected = reduced.copy()
    uncorrected = reduced.copy()

    # Batch-corrected branch: the graph is balanced across 'mouse_id' batches.
    bbknn.bbknn(corrected, batch_key='mouse_id', n_pcs=20, neighbors_within_batch=1)
    _embed_and_cluster(corrected)

    # Uncorrected branch: standard KNN graph on the same number of PCs.
    sp.pp.neighbors(uncorrected, n_neighbors=15, n_pcs=20)
    _embed_and_cluster(uncorrected)

    tissue_data_red[tissue] = corrected
    tissue_data_redo[tissue] = uncorrected

### Saving the tissue-specific datasets that are ready for cell type annotation

In [None]:
# Write the four per-tissue AnnData objects to `folder` as
# '<prefix><tissue>.h5ad', one file per (dataset kind, tissue) pair.
folder = '../pb_results/'

# Create the output directory up front so the writes below do not fail with
# FileNotFoundError after all the expensive steps have already run.
os.makedirs(folder, exist_ok=True)

# File-name prefix -> dictionary of per-tissue AnnData objects to save under it.
datasets_by_prefix = {
    'tissue_data_raw_': tissue_data_raw,
    'tissue_data_cpm_': tissue_data_cpm,
    'tissue_data_red_': tissue_data_red,
    'tissue_data_redo_': tissue_data_redo,
}

for tissue in tissues:
    print(tissue, end=' ')
    for prefix, datasets in datasets_by_prefix.items():
        datasets[tissue].write(os.path.join(folder, prefix + tissue + '.h5ad'))