# Task 0b - Normalization
This notebook normalizes the data. It is the second step of the workflow and requires the AnnData object (`adata`) produced by the first step, Task 0a - QC.

This notebook is based on the single-cell tutorial (https://github.com/theislab/single-cell-tutorial). For further explanation of the applied methods, please refer to that source.

In [None]:
import scanpy as sc
import numpy as np
import scipy as sp
import pandas as pd
import os
import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib import colors
import seaborn as sb
# from gprofiler import GProfiler
from os import listdir
from os.path import isfile, join

import rpy2.rinterface_lib.callbacks
import logging

from rpy2.robjects import pandas2ri
import anndata2ri

In [None]:
# Only let rpy2's R console callbacks log messages at ERROR level or above
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

# Automatically convert rpy2 outputs to pandas dataframes
pandas2ri.activate()
# NOTE(review): presumably enables AnnData / matrix conversion between Python and R
# (needed for the `-i data_mat` transfer later in this notebook) -- confirm against anndata2ri docs
anndata2ri.activate()
# Load the rpy2 IPython extension; the %%R cell magic used below comes from it
%load_ext rpy2.ipython

plt.rcParams['figure.figsize']=(8,8) #rescale figures
# scanpy logging verbosity level 3
sc.settings.verbosity = 3
# Print the versions of the packages in use (helps reproducing this run)
sc.logging.print_versions()

In [None]:
%%R
# Environment-specific workaround: re-order the R library search paths so that
# the 3rd entry comes first, then the 2nd, then the 1st.
# NOTE(review): assumes at least three library paths are configured; with fewer,
# the index c(3,2,1) yields NA entries -- adapt or remove for your own setup.
.libPaths(.libPaths()[c(3,2,1)])

# Load all the R libraries we will be using in the notebook
# (only scran is loaded; it is used for calculateSumFactors below)
library(scran)
# The following libraries are not loaded in this notebook (kept commented out):
# library(RColorBrewer)
# library(slingshot)
# library(monocle)
# library(gam)
# library(clusterExperiment)
# library(ggplot2)
# library(plyr)
# library(MAST)

### IMPORT ADATA OF QC

In [None]:
# Path of the AnnData file expected from the QC step (Task 0a).
# Note: despite its name, `adata_dir` points to a file, not a directory.
adata_dir = '../data/adata/rev_QC_adata.h5ad'

# Fail early, and say which path was looked up, instead of failing later
# inside the reader. `isfile` also rejects a directory at that location.
if not os.path.isfile(adata_dir):
    raise ValueError(f'Adata does not exist: {os.path.abspath(adata_dir)}')

In [None]:
# Load the QC'ed AnnData object from the path that was checked in the previous cell
# (re-using `adata_dir` keeps the existence check and the read pointing at the same file).
adata = sc.read_h5ad(adata_dir)

## Normalization

In [None]:
# Coarse clustering on a throw-away copy; the resulting 'groups' labels are
# passed to scran as `clusters` so that size factors are estimated within clusters.
adata_pp = adata.copy()

# Library-size normalization to 1e6 counts per cell.
# `normalize_total` replaces the deprecated `normalize_per_cell`; unlike the latter
# it never drops cells, so the cluster labels stay aligned with the cells of `adata`.
sc.pp.normalize_total(adata_pp, target_sum=1e6)
sc.pp.log1p(adata_pp)

# Low-dimensional embedding -> neighbor graph -> Louvain clusters
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

In [None]:
# Inputs that the next (%%R) cell pulls in via `-i`:
#   data_mat     - the expression matrix of `adata`, transposed
#                  (assumes scran wants genes as rows -- TODO confirm)
#   input_groups - the per-cell 'groups' labels computed on `adata_pp`
data_mat = adata.X.T
input_groups = adata_pp.obs['groups']

In [None]:
%%R -i data_mat -i input_groups -o size_factors

# Estimate per-cell size factors with scran, pooling within the supplied clusters.
# `-i` imports data_mat / input_groups from Python, `-o` hands size_factors back.
size_factors <- calculateSumFactors(
    data_mat,
    clusters = input_groups,
    min.mean = 0.1
)

In [None]:
# Delete adata_pp: the temporary copy was only needed for the cluster labels,
# which have already been extracted into `input_groups`
del adata_pp

In [None]:
# Attach the estimated size factors to the cells and visualize them
adata.obs['size_factors'] = size_factors

# Size factors against the QC metrics stored in adata.obs
sc.pl.scatter(adata, 'size_factors', 'total_counts')
sc.pl.scatter(adata, 'size_factors', 'n_genes_by_counts')

# Distribution of the size factors.
# Plain matplotlib histogram instead of the deprecated `sb.distplot`.
fig, ax = plt.subplots()
ax.hist(np.ravel(size_factors), bins=50)
ax.set(xlabel='size factor', ylabel='number of cells',
       title='Distribution of scran size factors')
plt.show()

In [None]:
# Keep the count data in a counts layer.
# The copy is taken before adata.X is modified by the normalization cell below,
# so the layer keeps the un-normalized values.
adata.layers["counts"] = adata.X.copy()

In [None]:
# Normalize adata: divide every cell (row) by its size factor, then log1p-transform.
cell_size_factors = np.asarray(adata.obs['size_factors'], dtype=np.float64)

# Zero, negative or NaN size factors would silently turn into inf/NaN
# after the division and log1p, so stop here with a clear message instead.
if not np.all(np.isfinite(cell_size_factors) & (cell_size_factors > 0)):
    raise ValueError('Size factors must be finite and strictly positive; '
                     'inspect the scran output before normalizing.')

# Always start from the stored raw counts, so that re-running this cell
# does not apply the normalization and the log transform a second time.
raw_counts = adata.layers["counts"]

if sp.sparse.issparse(raw_counts):
    # Row scaling as (diagonal matrix) @ (counts): keeps the matrix sparse
    # instead of densifying it as a sparse / dense-array division would.
    normalized = sp.sparse.diags(1.0 / cell_size_factors) @ raw_counts
else:
    # Out-of-place division also works for integer count matrices.
    normalized = np.asarray(raw_counts) / cell_size_factors[:, None]

adata.X = sp.sparse.csr_matrix(normalized)
sc.pp.log1p(adata)

In [None]:
# Store the full data set in 'raw' as log-normalised data for statistical testing.
# At this point adata.X has already been divided by the size factors and
# log1p-transformed (previous cell), so that is the state kept in .raw.
adata.raw = adata

In [None]:
# Write the normalized AnnData object (including the 'counts' layer and .raw) to disk
adata.write_h5ad('../data/adata/norm_adata.h5ad')