# Pancreas dataset

* Reference: [Scanpy tutorial – Integrating data using ingest and BBKNN](https://scanpy-tutorials.readthedocs.io/en/latest/integrating-data-using-ingest.html)

Before running this notebook, install BBKNN with `pip install bbknn`.

In [None]:
import scanpy as sc
import pandas as pd
import seaborn as sns

In [None]:
# Logging level: 0 = errors, 1 = warnings, 2 = info, 3 = hints.
sc.settings.verbosity = 1
sc.logging.print_header()
# Global figure defaults used by every plot in this notebook.
sc.settings.set_figure_params(
    dpi=100,
    frameon=False,
    figsize=(4, 3),
    facecolor='white',
)

## Load/download dataset

In [None]:
# Read the dataset from the local path; `backup_url` is where scanpy fetches
# the file from when it is not present on disk.
adata_all = sc.read(
    'data/pancreas.h5ad',
    backup_url='https://www.dropbox.com/s/qj1jlm9w10wmt0u/pancreas.h5ad?dl=1',
)

In [None]:
# Display the dimensions of the loaded AnnData object (observations x variables).
adata_all.shape

In [None]:
# Number of cells per annotated cell type (value_counts sorts by descending count).
counts = adata_all.obs['celltype'].value_counts()
counts

In [None]:
# Drop the rarest cell types. `counts` comes from `value_counts()`, which is
# sorted by descending abundance, so the last entries are the minority classes.
n_minority = 5
minority_classes = counts.index[-n_minority:].tolist()

# Subset and materialize with `.copy()`: without it `adata_all` would be a view,
# and the later in-place steps (category reordering, PCA, neighbors, UMAP)
# would write into a view and trigger implicit copies/warnings.
adata_all = adata_all[~adata_all.obs.celltype.isin(minority_classes)].copy()

# Reorder the categories according to abundance. The result is assigned back
# because `inplace=True` was deprecated in pandas 1.3 and removed in pandas 2.0.
adata_all.obs['celltype'] = adata_all.obs.celltype.cat.reorder_categories(
    counts.index[:-n_minority].tolist())

## Batch effect

In [None]:
# Standard embedding pipeline on the un-integrated data, used below to
# visualize the batch effect. The order matters: the neighbor graph is built
# after PCA, and UMAP is computed from the neighbor graph.
sc.pp.pca(adata_all)
sc.pp.neighbors(adata_all)
sc.tl.umap(adata_all)

In [None]:
# Two UMAP panels: one colored by batch, one by cell type.
sc.pl.umap(
    adata_all,
    color=['batch', 'celltype'],
    palette=sc.pl.palettes.vega_20_scanpy,
)

## Data integration with BBKNN

In [None]:
# Build a batch-balanced neighbor graph (BBKNN), using the `batch` column of
# `.obs` to identify the batches.
sc.external.pp.bbknn(adata_all, batch_key='batch')

In [None]:
# Recompute the UMAP embedding after the BBKNN step above.
sc.tl.umap(adata_all)

In [None]:
# Same two panels as before (batch, cell type), now on the recomputed embedding.
sc.pl.umap(adata_all, color=['batch', 'celltype'])

## Mapping onto a reference batch using ingest

### Set batch 0 as reference batch

In [None]:
# Use batch '0' as the reference. `.copy()` turns the subset into an
# independent AnnData object: the following cells run PCA/neighbors/UMAP on
# `adata_ref` in place, which should not happen on a view of `adata_all`.
adata_ref = adata_all[adata_all.obs.batch == '0'].copy()

In [None]:
# Compute PCA, neighbor graph and UMAP on the reference batch only; the ingest
# step below maps the query batches onto this reference.
# The order matters: neighbors follows PCA, and UMAP follows neighbors.
sc.pp.pca(adata_ref)
sc.pp.neighbors(adata_ref)
sc.tl.umap(adata_ref)

In [None]:
# UMAP of the reference batch, colored by annotated cell type.
sc.pl.umap(adata_ref, color='celltype')

In [None]:
# One independent AnnData copy per query batch ('1', '2', '3').
adatas = [
    adata_all[adata_all.obs['batch'] == batch_id].copy()
    for batch_id in ('1', '2', '3')
]

In [None]:
# Raise the log level to "info" for the integration step.
sc.settings.verbosity = 2
for iadata, adata in enumerate(adatas):
    print(f'... integrating batch {iadata+1}')
    # Keep the original annotation in a separate column before ingest maps
    # `celltype` from the reference.
    adata.obs['celltype_orig'] = adata.obs['celltype']
    sc.tl.ingest(adata, adata_ref, obs='celltype')

In [None]:
# Stack the reference and the ingested query batches into one AnnData object.
# NOTE(review): with default arguments, `concatenate` writes a `batch` column
# numbered by position ('0' for `adata_ref`, then '1', '2', '3'), which here
# coincides with the original batch labels — confirm if the batch order changes.
# NOTE(review): `AnnData.concatenate` is deprecated in recent anndata releases
# in favor of `anndata.concat`; migrate once the merge semantics are verified.
adata_concat = adata_ref.concatenate(adatas)

In [None]:
# Make `celltype` categorical and give it the same category order as the
# reference, so that categories line up with the reference colors below.
# The reordered Series is assigned back because `inplace=True` was deprecated
# in pandas 1.3 and removed in pandas 2.0.
adata_concat.obs['celltype'] = (
    adata_concat.obs['celltype']
    .astype('category')
    .cat.reorder_categories(adata_ref.obs['celltype'].cat.categories)
)
# Reuse the reference's color assignment for the cell types.
adata_concat.uns['celltype_colors'] = adata_ref.uns['celltype_colors']

In [None]:
# UMAP of reference plus ingested batches, colored by batch and by cell type.
sc.pl.umap(adata_concat, color=['batch', 'celltype'])

## Evaluating consistency

In [None]:
# Keep only the query batches, i.e. everything except reference batch '0'.
adata_query = adata_concat[
    adata_concat.obs['batch'].isin(['1', '2', '3'])
]

In [None]:
# Compare the labels mapped by ingest (`celltype`) with the original
# annotation (`celltype_orig`) on the query batches; `wspace` widens the gap
# between the panels.
sc.pl.umap(
    adata_query, color=['batch', 'celltype', 'celltype_orig'], wspace=0.4)

## Cell types conserved across batches

In [None]:
obs_query = adata_query.obs

# Cell types that occur both among the mapped labels and the original labels.
conserved_categories = obs_query.celltype.cat.categories.intersection(
    obs_query.celltype_orig.cat.categories)

# Keep only cells whose mapped AND original label are conserved. The explicit
# `.copy()` makes this an independent DataFrame, so the column assignments
# below do not write into a slice of `adata_query.obs`.
is_conserved = (
    obs_query.celltype.isin(conserved_categories)
    & obs_query.celltype_orig.isin(conserved_categories)
)
obs_query_conserved = obs_query.loc[is_conserved].copy()

# Remove unused categories and align the category order of both columns.
# Results are assigned back because the `inplace=True` variants of these
# categorical methods were deprecated in pandas 1.3 and removed in pandas 2.0.
obs_query_conserved['celltype'] = (
    obs_query_conserved['celltype'].cat.remove_unused_categories())
obs_query_conserved['celltype_orig'] = (
    obs_query_conserved['celltype_orig']
    .cat.remove_unused_categories()
    .cat.reorder_categories(obs_query_conserved['celltype'].cat.categories)
)

In [None]:
# Confusion table for the conserved cell types: mapped label (rows) versus
# original label (columns).
pd.crosstab(
    index=obs_query_conserved['celltype'],
    columns=obs_query_conserved['celltype_orig'],
)

In [None]:
# Same confusion table over all query cells, including non-conserved cell types.
pd.crosstab(
    index=adata_query.obs['celltype'],
    columns=adata_query.obs['celltype_orig'],
)

## Visualizing distributions across batches

In [None]:
# Estimate the density of cells in the embedding separately for each batch.
sc.tl.embedding_density(adata_concat, groupby='batch')

In [None]:
# Plot the per-batch densities computed in the previous cell.
sc.pl.embedding_density(adata_concat, groupby='batch')

In [None]:
# One UMAP per query batch: `groups=[batch]` restricts the coloring to the
# cells of that single batch.
for batch in ['1', '2', '3']:
    sc.pl.umap(adata_concat, color='batch', groups=[batch])