In [None]:
import os

import pandas as pd
import scanpy as sc

In [None]:
# Session setup: verbosity level 3, figure defaults (white background, 80 dpi),
# then print the scanpy header for the record.
sc.settings.verbosity = 3
sc.settings.set_figure_params(facecolor='white', dpi=80)
sc.logging.print_header()

In [None]:
# Directory that holds the endoderm input file read below (note the trailing '/').
PATH = './data/endoderm/'

In [None]:
# Load the full endoderm dataset from PATH.
# os.path.join is used instead of string concatenation: PATH already ends with
# '/', so prepending another '/' to the file name produced a doubled separator.
adata = sc.read_h5ad(os.path.join(PATH, 'sc_endoderm_all_cells.h5ad'))

In [None]:
# Discard everything except the main matrix: the stored .raw snapshot and
# any extra layers that came with the file are not used by this pipeline.
del adata.raw
del adata.layers

In [None]:
# Primary filtering: first drop cells with fewer than 200 detected genes,
# then drop genes detected in fewer than 3 of the remaining cells.
sc.pp.filter_cells(
    adata,
    min_genes=200,
)
sc.pp.filter_genes(
    adata,
    min_cells=3,
)

In [None]:
# Mark genes whose name starts with 'mt-' in a boolean 'mt' column of .var,
# then compute QC metrics in place with that column as the only QC variable
# (no top-N percentages, no log1p-transformed copies of the metrics).
adata.var['mt'] = adata.var_names.str.startswith('mt-')
sc.pp.calculate_qc_metrics(
    adata, qc_vars=['mt'], inplace=True, log1p=False, percent_top=None
)

In [None]:
# Total-count normalize: rescale every cell to the same target sum (1e4),
# so that cells become comparable regardless of sequencing depth.
sc.pp.normalize_total(adata, target_sum=1e4)

In [None]:
# Log-transform the normalized matrix in place with scanpy's log1p.
sc.pp.log1p(adata)

In [None]:
# Flag highly-variable genes using mean and dispersion cut-offs; the call
# only annotates the object, no genes are removed here.
sc.pp.highly_variable_genes(
    adata,
    min_disp=0.5,
    min_mean=0.0125,
    max_mean=3,
)

# Store the current state (all genes, before any subsetting) in .raw.
adata.raw = adata

In [None]:
# Keep only HVGs.
# Materialise the subset with .copy(): plain slicing returns a view of the
# parent AnnData, and the steps that follow (regress_out, scale) modify the
# object in place, which would otherwise force an implicit view-to-copy
# conversion.
adata = adata[:, adata.var.highly_variable].copy()

In [None]:
# Regress out per-cell technical covariates: the total counts and the
# mitochondrial percentage from the QC metrics computed earlier.
# These are not batch labels, so this step does not correct batch effects.
sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])

In [None]:
# Scale the data in place; max_value=10 caps the scaled values at 10.
sc.pp.scale(adata, max_value=10)

In [None]:
# Keep only cell types with more than 500 cells.
# The value_counts() Series is filtered directly. Wrapping it in a DataFrame
# and selecting a 'CellType' column depends on how pandas names the counts
# column; since pandas 2.0 that column is called 'count', so the old lookup
# raises KeyError.
counts = adata.obs['CellType'].value_counts()
cell_types_to_keep = list(counts[counts > 500].index)
adata = adata[adata.obs['CellType'].isin(cell_types_to_keep), :]

In [None]:
# Filter to at most 1000 cells per class.
target_cells = 1000
cluster_key = 'CellType'

# The per-class subsets live in their own list rather than being assigned to
# `adata`, so `adata` stays an AnnData even if this cell stops partway through
# and has to be re-run.
per_class = []
for clust in adata.obs[cluster_key].cat.categories:
    subset = adata[adata.obs[cluster_key] == clust]
    if subset.n_obs > target_cells:
        # copy=True returns a new, subsampled object instead of modifying the
        # view in place; the random draw itself is unchanged (default seed).
        subset = sc.pp.subsample(subset, n_obs=target_cells, copy=True)
    per_class.append(subset)

# NOTE(review): AnnData.concatenate is kept so the output (suffixed obs names,
# added 'batch' column) stays the same as before; newer anndata releases
# document it as deprecated in favour of anndata.concat - confirm before
# upgrading.
adata = per_class[0].concatenate(*per_class[1:])

In [None]:
# Persist the processed object as an .h5ad file in the working directory.
adata.write_h5ad("endoderm_post_process.h5ad")