# 2. Preprocessing and Integration

2023-05-05

In [None]:
# Import Packages

%load_ext autoreload
%autoreload 2

import os
import warnings 
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import anndata as ad
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from anndata import AnnData
from natsort import natsorted

# Customized packages
import starmap.sc_util as su
# test()

## Set path

In [None]:
# Set path
# Root folder of the dataset; replace the placeholder with the real location.
base_path = 'path/to/dataset/folder'

# Input folder path (only built here, never created).
input_path = os.path.join(base_path, 'input')

# Output and figure folders are created on demand. makedirs(exist_ok=True)
# replaces the exists()-then-mkdir() check: it does not fail if the folder
# already exists (or appears between check and creation), and it also creates
# missing parent folders.
out_path = os.path.join(base_path, 'output')
os.makedirs(out_path, exist_ok=True)

fig_path = os.path.join(base_path, 'figures')
os.makedirs(fig_path, exist_ok=True)

# Point scanpy's figure directory at fig_path.
sc.settings.figdir = fig_path

In [None]:
# Load the combined file from the output folder
# (RIBOmap-only alternative: 'Brain-RIBOmap-combined-3mad-filtered.h5ad')
combined_file = os.path.join(out_path, 'Brain-combined-3mad-filtered.h5ad')
adata = sc.read_h5ad(combined_file)

# Show the AnnData summary as the cell output
adata

## Filtering genes

In [None]:
# A gene passes when its max-count column exceeds this threshold in every section checked
thres = 2

# One boolean mask per section
rep1_ribo_ok = adata.var['max_counts_rep1_RIBOmap'] > thres
rep2_ribo_ok = adata.var['max_counts_rep2_RIBOmap'] > thres
rep2_star_ok = adata.var['max_counts_rep2_STARmap'] > thres

# All three sections; for the two-RIBOmap-only dataset use
# `rep1_ribo_ok & rep2_ribo_ok` instead
passed = rep1_ribo_ok & rep2_ribo_ok & rep2_star_ok

# Number of genes that passed
print(len(adata.var[passed]))

# Store the same mask under both flags; the PCA cell selects genes through
# 'highly_variable'
for flag in ('detected', 'highly_variable'):
    adata.var[flag] = passed

## Normalization & scaling

In [None]:
# Per-cell Stats Plot
# Delegates to the project helper, passing 'protocol-replicate' as the color
# key (the exact panels drawn are defined by su.plot_stats_per_cell).
su.plot_stats_per_cell(adata, color='protocol-replicate')

In [None]:
# Get quantiles of reads
# Delegates to the project helper; presumably reports quantiles of per-cell
# read counts - confirm against su.show_reads_quantile.
su.show_reads_quantile(adata)

In [None]:
# Normalization scaling
# Normalize total counts per cell (no target_sum is given, so scanpy's default
# target is used), then log-transform with log1p.
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)

# Keep the normalized, log-transformed values in .raw before the following
# cells (scale / regress_out / combat) modify adata.X.
adata.raw = adata

In [None]:
# Scale data to unit variance and zero mean
# (no max_value is passed, so no clipping is requested)
sc.pp.scale(adata)
# Snapshot the scaled matrix in a layer before later steps modify adata.X
adata.layers['scaled'] = adata.X.copy()

In [None]:
# Regress out unwanted variance
# Regress the 'total_counts' covariate out of adata.X.
# NOTE(review): this runs after sc.pp.scale, so the residuals are presumably
# no longer exactly unit-variance - confirm this order is intended.
sc.pp.regress_out(adata, ['total_counts'])
# Snapshot the regressed matrix in a layer
adata.layers['corrected'] = adata.X.copy()

In [None]:
# Combat (only for the dataset with all three sections)
# Batch-correct adata.X with ComBat, using 'protocol-replicate' as the batch key.
sc.pp.combat(adata, 'protocol-replicate')
# Snapshot the ComBat-corrected matrix in a layer
adata.layers['combat'] = adata.X.copy()

## Dimensionality reduction

In [None]:
# Run PCA
# use_highly_variable=True restricts PCA to genes flagged in
# adata.var['highly_variable'], which the gene-filtering cell sets to the
# detection mask.
# NOTE(review): newer scanpy releases appear to deprecate use_highly_variable
# in favor of mask_var - confirm against the installed version.
sc.tl.pca(adata, svd_solver='full', use_highly_variable=True)

# Plot explained variance (linear scale, log=False)
sc.pl.pca_variance_ratio(adata, log=False)

In [None]:
# Plot PCA
# One separate PCA scatter per annotation, drawn in this order
for obs_key in ('total_counts', 'n_genes', 'protocol-replicate'):
    sc.pl.pca(adata, color=obs_key)

In [None]:
# Joint plot of the first two PCA components, grouped by 'protocol-replicate'
pca_coords = adata.obsm['X_pca']
g = sns.jointplot(
    x=pca_coords[:, 0],
    y=pca_coords[:, 1],
    hue=adata.obs['protocol-replicate'],
    s=1,
)
g.set_axis_labels('PC1', 'PC2')
plt.show()

## Integration with Harmony

In [None]:
%%time
# NOTE: %%time is a cell magic and must stay on the first line of the cell.
# Integrate the sections with Harmony, using 'protocol-replicate' as the
# batch key. The following cells read the result from
# adata.obsm['X_pca_harmony'].

import scanpy.external as sce

sce.pp.harmony_integrate(adata, 'protocol-replicate')

In [None]:
# Joint plot of the first two Harmony-adjusted components, grouped by
# 'protocol-replicate'
harmony_coords = adata.obsm['X_pca_harmony']
g = sns.jointplot(
    x=harmony_coords[:, 0],
    y=harmony_coords[:, 1],
    hue=adata.obs['protocol-replicate'],
    s=1,
)
g.set_axis_labels('PC1', 'PC2')
plt.show()

In [None]:
%%time
# Computing the neighborhood graph
# The graph is built on the Harmony-adjusted embedding (use_rep='X_pca_harmony').
# NOTE(review): with use_rep set, n_pcs presumably selects the first n_pcs
# columns of that embedding - confirm against the scanpy documentation.
n_neighbors = 50
n_pcs = 50

sc.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=n_pcs, use_rep='X_pca_harmony')

# Run UMAP on the neighborhood graph computed above
sc.tl.umap(adata, min_dist=.0001, spread=5)

In [None]:
# One separate UMAP scatter per annotation, drawn in this order
for obs_key in ('total_counts', 'n_genes', 'protocol-replicate', 'protocol'):
    sc.pl.umap(adata, color=obs_key)

In [None]:
# Joint plot of the two UMAP coordinates, grouped by 'protocol-replicate'
umap_coords = adata.obsm['X_umap']
g = sns.jointplot(
    x=umap_coords[:, 0],
    y=umap_coords[:, 1],
    hue=adata.obs['protocol-replicate'],
    s=1,
    alpha=.5,
)
g.set_axis_labels('UMAP1', 'UMAP2')
plt.show()

In [None]:
# Save the integrated AnnData, prefixing the file name with today's date
from datetime import datetime

date = datetime.today().strftime('%Y-%m-%d')

# RIBOmap-only alternative: f"{out_path}/{date}-Brain-RIBOmap-harmony.h5ad"
integrated_file = f"{out_path}/{date}-Brain-combined-harmony.h5ad"
adata.write_h5ad(integrated_file)