# Preprocessing - SCT & Scran Normalization

In [None]:
# Widen the notebook container to 85% of the browser window.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:85% !important; }</style>"))

In [None]:
# General
import logging
import os
import pickle
import time
from itertools import chain

import numpy as np
import pandas as pd
import scipy as sci

#R
import rpy2
import rpy2.rinterface_lib.callbacks
from rpy2.robjects import pandas2ri
import anndata2ri

# Plotting
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib import rcParams
import seaborn as sb

# Analysis
import scanpy as sc
import scvelo as scv

# Base directory for every file this notebook reads or writes; the relative
# path is resolved against the kernel's current working directory.
data_dir = "../.."

In [None]:
# Settings
# Configures logging for scanpy / scvelo / rpy2 and enables the Python <-> R bridge.

## Scanpy settings
sc.settings.verbosity = 3
# Print the versions of scanpy and its dependencies for reproducibility
sc.logging.print_versions()

## ScVelo settings
scv.settings.verbosity = 3
# Print the versions of scvelo and its dependencies for reproducibility
scv.logging.print_versions()

## R settings
### Ignore R warning messages
#### Note: this can be commented out to get more verbose R output
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

### Automatically convert rpy2 outputs to pandas dataframes
pandas2ri.activate()
# NOTE(review): presumably enables AnnData <-> R object conversion for the
# `-i` / `-o` arguments of the %%R cells below -- confirm against anndata2ri docs.
anndata2ri.activate()
# Load the IPython extension that provides the %%R cell magic used in this notebook
%load_ext rpy2.ipython

# Load Data

In [None]:
# Resolve the input files below data_dir, then read them.
filtered_h5ad = os.path.join(data_dir, 'E14-E15_adata_filtered_rmDoublets.h5ad')
wsnn_h5ad = os.path.join(data_dir, 'E14-E15_adata_filtered_rmDoublets_sctNormalized_wsnn.h5ad')

# Filtered, doublet-removed data set that is normalized in this notebook
adata = sc.read(filtered_h5ad)
# SCT-normalized object stored under the "wsnn" file name
adata_wsnn = sc.read(wsnn_h5ad)


In [None]:
# SCT-normalized object stored under the "wknn" file name
wknn_h5ad = os.path.join(data_dir, 'E14-E15_adata_filtered_rmDoublets_sctNormalized_wknn.h5ad')
adata_wknn = sc.read(wknn_h5ad)

# Normalization with Scran

In [None]:
# Coarse clustering whose labels are later passed to scran as `clusters`.
# Everything here runs on a copy, so `adata` itself is left untouched.
adata_pp = adata.copy()

# Scale every cell to a total of 1e6, then log-transform
sc.pp.normalize_total(adata_pp, target_sum=1e6)
sc.pp.log1p(adata_pp)

# PCA -> neighbourhood graph -> Leiden clusters stored in adata_pp.obs['groups']
sc.pp.pca(adata_pp)
sc.pp.neighbors(adata_pp)
sc.tl.leiden(adata_pp, resolution=0.5, key_added='groups')

In [None]:
# Variables handed to the R cell that computes the scran size factors.
# The expression matrix is transposed before export
# (presumably to genes x cells, the orientation scran works on -- verify).
data_mat = adata.X.transpose()
# Cluster label of every cell from the coarse Leiden clustering
input_groups = adata_pp.obs['groups']

In [None]:
%%R -i data_mat -i input_groups -o size_factors

# Compute pooled size factors with scran, pooling cells within the supplied clusters.
# scran is distributed through Bioconductor (not CRAN), so install it through
# BiocManager, and only when it is missing, instead of reinstalling on every run.
if (!requireNamespace("scran", quietly = TRUE)) {
    if (!requireNamespace("BiocManager", quietly = TRUE)) {
        install.packages("BiocManager", repos = "https://cloud.r-project.org")
    }
    BiocManager::install("scran", ask = FALSE, update = FALSE)
}
library(scran)
size_factors = calculateSumFactors(data_mat, clusters=input_groups, min.mean=0.1)

In [None]:
# Attach the scran size factors to the cells and inspect them
adata.obs['size_factors'] = size_factors

# Size factors against each per-cell QC covariate
for qc_covariate in ('n_counts', 'n_genes'):
    sc.pl.scatter(adata, 'size_factors', qc_covariate)

# Distribution of the size factors
sb.distplot(size_factors, kde=True, bins=100)

In [None]:
#Keep the count data in a counts layer
adata.layers['raw_counts'] = adata.X.copy()

#Logarithmize raw counts
# copy=True is essential: when given a bare matrix, sc.pp.log1p transforms it
# in place by default, which would overwrite the 'raw_counts' layer with
# log-transformed values (and, in older scanpy releases, return None).
adata.layers['log_raw_counts'] = sc.pp.log1p(adata.layers['raw_counts'], copy=True)

In [None]:
# Scran normalization: divide every cell (row of X) by its size factor,
# then log-transform X.
# Both steps modify adata.X in place, so re-running this cell applies them twice.
cell_size_factors = adata.obs['size_factors'].values[:, None]
adata.X /= cell_size_factors
sc.pp.log1p(adata)

# Keep a copy of X in the 'scran_counts' layer.
# Note: it is taken after log1p, so the layer holds log-transformed normalized values.
adata.layers['scran_counts'] = adata.X.copy()

# Add Normalization from Seurat SCT 

In [None]:
%%R -i data_dir -o adata_sct

# Read the SCT-normalized object saved from R and export it to Python as adata_sct.
# data_dir has to be passed in with -i (Python variables are not visible in R
# otherwise), and the path is assembled with file.path(), base R's path-joining function.
adata_sct <- readRDS(file.path(data_dir, 'E14-E15_adata_filtered_rmDoublets_sctNormalized_sce.rds'))

In [None]:
# Store a copy of the SCT object's X matrix in a named layer so it can be
# transferred to `adata` together with the other SCT layers.
adata_sct.layers['sct_counts'] = adata_sct.X.copy()

In [None]:
# Harmonize features
# Restrict adata to the genes present in adata_sct, in adata_sct's gene order,
# so that the SCT layers can be assigned column-for-column.
# NOTE(review): only genes are aligned here; the cells (rows) of both objects
# are assumed to already be in the same order -- confirm.
adata = adata[:,adata_sct.var_names].copy()

In [None]:
# Transfer the SCT results from adata_sct onto adata.

# Expression layers
for layer_name in ('sct_counts', 'sct_logcounts', 'sct_scale_data'):
    adata.layers[layer_name] = adata_sct.layers[layer_name]

# Per-gene SCT statistics
sct_var_columns = [
    'sct.detection_rate',
    'sct.gmean',
    'sct.variance',
    'sct.residual_variance',
    'sct.variable',
]
adata.var[sct_var_columns] = adata_sct.var[sct_var_columns]

In [None]:
# Put X in a layer to keep it after merging
adata_sct.layers['sct_counts'] = adata_sct.X.copy()

# Back up the current QC metrics under a '_raw' suffix, since the same
# columns are recomputed on the SCT counts further down the notebook.
for qc_key in ('mt_frac', 'rp_frac', 'n_genes', 'log_genes', 'n_counts', 'log_counts'):
    adata.obs[qc_key + '_raw'] = adata.obs[qc_key]

# Add SCT data: expression layers first, then the per-gene statistics
for sct_layer in ('sct_counts', 'sct_logcounts', 'sct_scale_data'):
    adata.layers[sct_layer] = adata_sct.layers[sct_layer]

sct_gene_stats = [
    'sct.detection_rate',
    'sct.gmean',
    'sct.variance',
    'sct.residual_variance',
    'sct.variable',
]
adata.var[sct_gene_stats] = adata_sct.var[sct_gene_stats]

# Use the SCT counts as X (converted to a dense array) for the QC metrics
adata.X = adata.layers['sct_counts']
adata.X = adata.X.toarray()

# Drop genes detected in fewer than 20 cells (this also removes 0-count genes)
sc.pp.filter_genes(adata, min_cells=20)
print('Number of genes after cell filter: {:d}'.format(adata.n_vars))

In [None]:
# Quality control - recompute the per-cell QC covariates on the current X

# Gene masks: mitochondrial genes ('mt-' prefix) and ribosomal protein genes ('Rps'/'Rpl' prefix)
mt_gene_mask = [name.startswith('mt-') for name in adata.var_names]
rp_gene_mask = [name.startswith(('Rps', 'Rpl')) for name in adata.var_names]

# Total counts and number of detected genes per cell, plus their natural logs
adata.obs['n_counts'] = adata.X.sum(axis=1)
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
adata.obs['n_genes'] = (adata.X > 0).sum(axis=1)
adata.obs['log_genes'] = np.log(adata.obs['n_genes'])

# Fraction of each cell's counts that falls on the masked genes
adata.obs['mt_frac'] = adata.X[:, mt_gene_mask].sum(axis=1) / adata.obs['n_counts']
adata.obs['rp_frac'] = adata.X[:, rp_gene_mask].sum(axis=1) / adata.X.sum(axis=1)

In [None]:
# Use the SCT log-normalized counts as X, as a dense array
adata.X = adata.layers['sct_logcounts'].toarray()

# Snapshot the full data set in `raw` (log-normalized values) for statistical testing
adata.raw = adata

In [None]:
# Compute PCA, the neighbourhood graph, Leiden clusters and a UMAP embedding
# using the genes that SCT flagged as variable.

# Flag the SCT variable genes as highly variable.
# The column is written in a single assignment: the chained form
# `adata.var['highly_variable'][hvgs] = True` triggers SettingWithCopyWarning
# and silently has no effect once pandas Copy-on-Write is enabled.
adata.var['highly_variable'] = adata.var['sct.variable'] > 0
# Names of the selected genes, kept as a Series for later use
hvgs = pd.Series(adata.var.index[adata.var['highly_variable']])

sc.pp.pca(adata, svd_solver='arpack', use_highly_variable=True)
sc.pp.neighbors(adata, metric='correlation', n_neighbors=20, n_pcs=30)
sc.tl.leiden(adata, resolution=0.5)

sc.tl.umap(adata)

In [None]:
# Save the annotated object (layers, QC metrics, clustering and UMAP) below data_dir
adata.write(os.path.join(data_dir, "E14-E15_adata_filtered_rmDoublets_sctNormalized_WNN.h5ad"))