# Single-Cell Report: BBKNN Batch Correction

In [None]:
# Import packages
import scanpy as sc
import matplotlib.pyplot as plt
import random

#### Plotting settings and functions

In [None]:
# Global figure defaults applied through scanpy for every plot in this report:
# one resolution for rendered figures, a higher one for saved figures.
figure_defaults = {'dpi': 150, 'fontsize': 10, 'dpi_save': 600}
sc.set_figure_params(**figure_defaults)

In [None]:
def batchBarPlot(obs, axis1, axis2, title):
    """Draw two bar charts relating batches to Louvain clusters.

    ``axis1`` receives a grouped bar chart: for every cluster, the percentage
    of each batch's cells that fall in that cluster. ``axis2`` receives a
    stacked bar chart: for every cluster, the percentage of its cells that
    come from each batch.

    Parameters
    ----------
    obs
        Table with ``'louvain'`` and ``'batch'`` columns (called in this
        notebook with ``AnnData.obs``). Every Louvain label must be
        convertible with ``int()``; it is displayed as ``int(label) + 1``.
    axis1
        Axes that receives the "percentage of batch" chart.
    axis2
        Axes that receives the "percent of cluster" chart.
    title
        Title set on both axes.

    Returns
    -------
    None
        The function only draws on the supplied axes.
    """
    # Both legends share the same placement just outside the right edge.
    legend_style = dict(
        fancybox=True, framealpha=0.5, loc='right', bbox_to_anchor=(1.15, 0.5)
    )

    # Cells per (cluster, batch): rows are clusters, columns are batches.
    # observed=False is spelled out so categorical keys keep every category
    # regardless of the pandas default; absent pairs count as 0, not NaN.
    counts = (
        obs.groupby(by=['louvain', 'batch'], observed=False)
        .size()
        .unstack(fill_value=0)
    )

    # Legend labels carry the total number of cells in each batch. The
    # vectorised Series.sum() replaces a Python-level loop over every cell.
    counts.columns = [
        f"{c} (n={(obs['batch'] == c).sum()})" for c in counts.columns
    ]
    counts.index = [int(n) + 1 for n in counts.index]

    # Normalise each column (batch) so that it sums to 100 %.
    percent_of_batch = (counts / counts.sum()) * 100
    percent_of_batch.plot(
        kind='bar', stacked=False, fontsize=8, width=.7, grid=False, ax=axis1
    )
    axis1.set_ylabel('Percentage of Batch (%)')
    axis1.set_title(title)
    axis1.legend(**legend_style)

    # Normalise each row (cluster) so that it sums to 100 %.
    percent_of_cluster = counts.divide(counts.sum(axis=1), axis=0) * 100
    percent_of_cluster.plot(
        kind='bar', stacked=True, fontsize=8, width=.75, grid=False, ax=axis2
    )
    axis2.set_ylabel('Percent of Cluster (%)')
    axis2.set_title(title)
    axis2.legend(**legend_style)

#### Read Data

In [None]:
# Load the two datasets compared in this report; the plots below label adata1
# as pre-batch correction and adata2 as post-batch correction.
# NOTE(review): FILE1 / FILE2 are not defined in the visible cells; presumably
# they are injected before the notebook is executed. Confirm.
adata1 = sc.read_h5ad(FILE1)
adata2 = sc.read_h5ad(FILE2)

In [None]:
# Randomise the cell order of both datasets so that, when points are drawn in
# matrix order, no single batch is systematically plotted on top of the others.
def _shuffle_cells(adata):
    """Return ``(ids, view)``: the shuffled cell IDs and ``adata`` in that order."""
    ids = list(adata.obs_names)
    random.shuffle(ids)  # in-place shuffle using the global ``random`` state
    return ids, adata[ids]


cellID1, adata1 = _shuffle_cells(adata1)
cellID2, adata2 = _shuffle_cells(adata2)

---
## Batch effect correction

In [None]:
a = 0.6  # point transparency shared by all four UMAP panels
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2, ncols=2, figsize=(8, 8), dpi=150)

# Top row: UMAPs coloured by batch (adata1 on the left, adata2 on the right).
for panel, data, panel_title in (
    (ax1, adata1, 'Pre-batch correction (batch)'),
    (ax2, adata2, 'Post-batch correction (batch)'),
):
    sc.pl.umap(data, color='batch', alpha=a, ax=panel, show=False)
    panel.set_title(panel_title)

# Bottom row: the same embeddings coloured by Louvain cluster.
# NOTE(review): `palettes.default_64` may not exist in newer scanpy releases;
# confirm against the scanpy version this report is pinned to.
for panel, data, panel_title in (
    (ax3, adata1, 'Pre-batch correction (Louvain)'),
    (ax4, adata2, 'Post-batch correction (Louvain)'),
):
    sc.pl.umap(data, color='louvain', alpha=a,
               palette=sc.pl.palettes.default_64, ax=panel, show=False)
    panel.set_title(panel_title)

plt.tight_layout()

#### Cluster membership by batch

The following plots show how the batches are distributed across the predicted Louvain clusters. The top row (percentage of batch) shows the proportion of cells from each batch that belongs to a particular cluster, pre- and post-batch correction. The bottom row (percent of cluster) shows the batch composition of each cluster, pre- and post-batch correction. In the pre-batch correction plots, clusters tend to be based on batch, while post-batch correction, there is a more even distribution of batches in each cluster.

In [None]:
# 2x2 grid: left column is the pre-correction data, right column the
# post-correction data. batchBarPlot draws "percentage of batch" on the top
# axis and "percent of cluster" on the bottom axis of each column.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2, ncols=2, figsize=(15, 8), dpi=150)
for obs_table, top_axis, bottom_axis, label in (
    (adata1.obs, ax1, ax3, "Pre-batch correction"),
    (adata2.obs, ax2, ax4, "Post-batch correction"),
):
    batchBarPlot(obs_table, axis1=top_axis, axis2=bottom_axis, title=label)
plt.tight_layout()