# FCA multiomics - doublets

In [1]:
import scrublet as scr
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import os
import sys
import scipy


def MovePlots(plotpattern, subplotdir):
    os.system('mkdir -p '+str(sc.settings.figdir)+'/'+subplotdir)
    os.system('mv '+str(sc.settings.figdir)+'/*'+plotpattern+'** '+str(sc.settings.figdir)+'/'+subplotdir)


sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.figdir = './figures-sn/preprocessing/'
sc.logging.print_versions()
sc.settings.set_figure_params(dpi=80)  # low dpi (dots per inch) yields small inline figures

sys.executable



-----
anndata     0.7.5
scanpy      1.7.1
sinfo       0.3.1
-----
PIL                 8.1.2
anndata             0.7.5
anyio               NA
attr                20.3.0
babel               2.9.0
backcall            0.2.0
brotli              NA
cairo               1.20.0
certifi             2020.12.05
cffi                1.14.5
chardet             4.0.0
cloudpickle         1.6.0
colorama            0.4.4
cycler              0.10.0
cython_runtime      NA
cytoolz             0.11.0
dask                2021.03.1
dateutil            2.8.1
decorator           4.4.2
fsspec              0.8.7
get_version         2.1
google              NA
h5py                3.1.0
idna                2.10
igraph              0.8.3
ipykernel           5.5.0
ipython_genutils    0.2.0
jedi                0.18.0
jinja2              2.11.3
joblib              1.0.1
json5               NA
jsonschema          3.2.0
jupyter_server      1.4.1
jupyterlab_server   2.3.0
kiwisolver          1.3.1
legacy_api_wrap     0.0.0


'/opt/conda/bin/python'

In [2]:
# Benjamini-Hochberg and Bonferroni FDR helper functions.

def bh(pvalues):
    """
    Computes the Benjamini-Hochberg FDR correction.
    
    Input:
        * pvals - vector of p-values to correct
    """
    pvalues = np.array(pvalues)
    n = int(pvalues.shape[0])
    new_pvalues = np.empty(n)
    values = [ (pvalue, i) for i, pvalue in enumerate(pvalues) ]
    values.sort()
    values.reverse()
    new_values = []
    for i, vals in enumerate(values):
        rank = n - i
        pvalue, index = vals
        new_values.append((n/rank) * pvalue)
    for i in range(0, int(n)-1):
        if new_values[i] < new_values[i+1]:
            new_values[i+1] = new_values[i]
    for i, vals in enumerate(values):
        pvalue, index = vals
        new_pvalues[index] = new_values[i]
    return new_pvalues

def bonf(pvalues):
    """
    Computes the Bonferroni FDR correction.
    
    Input:
        * pvals - vector of p-values to correct
    """
    new_pvalues = np.array(pvalues) * len(pvalues)
    new_pvalues[new_pvalues>1] = 1
    return new_pvalues

## Scrumblet

(Courtesy of K Polansky)

Two-step doublet score processing, mirroring the approach from Popescu et al. https://www.nature.com/articles/s41586-019-1652-y which was closely based on Pijuan-Sala et al. https://www.nature.com/articles/s41586-019-0933-9

The first step starts with some sort of doublet score, e.g. Scrublet, and ends up with a per-cell p-value (with significant values marking doublets). For each sample individually:

 - run Scrublet to obtain each cell's score
- overcluster the manifold - run a basic Scanpy pipeline up to clustering, then additionally cluster each cluster separately
- compute per-cluster Scrublet scores as the median of the observed values, and use those going forward
- identify p-values:
  - compute normal distribution parameters: centered at the median of the scores, with a MAD-derived standard deviation
  - the score distribution is zero-truncated, so as per the paper I only use above-median values to compute the MAD
  - K deviates from the paper a bit, at least the exact wording captured within it, and multiply the MAD by 1.4826 to obtain a literature-derived normal distribution standard deviation estimate
  - FDR-correct the p-values via Benjamini-Hochberg
- write out all this doublet info into CSVs for later use

NOTE: The second step is performed later, in a multi-sample space

In [3]:
data_dir = '/nfs/users/nfs_l/lg18/team292/lg18/gonads/data/dual/'
meta = pd.read_csv(data_dir+'/meta.txt',index_col=0, sep='\t')
plotmeta = list(meta.columns)
plotmeta.append('sample')
print('Number of samples: ', meta.index.size)

Number of samples:  14


In [4]:
#there's loads of clustering going on, so set verbosity low unless you enjoy walls of text
sc.settings.verbosity = 0  # verbosity: errors (0), warnings (1), info (2), hints (3)

scorenames = ['scrublet_score','scrublet_cluster_score','zscore','bh_pval','bonf_pval']
if not os.path.exists('scrublet-scores'):
    os.makedirs('scrublet-scores')
    #loop over the subfolders of the rawdata folder

samples = meta.index.to_list()

for sample in list(reversed(samples)):
    print(sample)
    #import data
    adata_sample = sc.read_10x_mtx(data_dir+'/'+sample+'/filtered_feature_bc_matrix/',cache=True)
    adata_sample.var_names_make_unique()
    #rename cells to SAMPLE_BARCODE
    adata_sample.obs_names = [sample+'_'+i for i in adata_sample.obs_names]
    #do some early filtering to retain meaningful cells for doublet inspection
    sc.pp.filter_cells(adata_sample, min_genes=200)
    sc.pp.filter_genes(adata_sample, min_cells=3)
    #convert to lower to be species agnostic: human mito start with MT-, mouse with mt-
    mito_genes = [name for name in adata_sample.var_names if name.lower().startswith('mt-')]
    # for each cell compute fraction of counts in mito genes vs. all genes
    # the `.A1` is only necessary as X is sparse (to transform to a dense array after summing)
    adata_sample.obs['percent_mito'] = np.sum(
        adata_sample[:, mito_genes].X, axis=1).A1 / np.sum(adata_sample.X, axis=1).A1
    adata_sample = adata_sample[adata_sample.obs['percent_mito'] < 0.2, :]

    #set up and run Scrublet, seeding for replicability
    np.random.seed(0)
    scrub = scr.Scrublet(adata_sample.X)
    doublet_scores, predicted_doublets = scrub.scrub_doublets(verbose=False)
    adata_sample.obs['scrublet_score'] = doublet_scores

    #overcluster prep. run turbo basic scanpy pipeline
    sc.pp.normalize_per_cell(adata_sample, counts_per_cell_after=1e4)
    sc.pp.log1p(adata_sample)
    sc.pp.highly_variable_genes(adata_sample, min_mean=0.0125, max_mean=3, min_disp=0.5)
    adata_sample = adata_sample[:, adata_sample.var['highly_variable']]
    sc.pp.scale(adata_sample, max_value=10)
    sc.tl.pca(adata_sample, svd_solver='arpack')
    sc.pp.neighbors(adata_sample)
    #overclustering proper - do basic clustering first, then cluster each cluster
    sc.tl.leiden(adata_sample)
    adata_sample.obs['leiden'] = [str(i) for i in adata_sample.obs['leiden']]
    for clus in np.unique(adata_sample.obs['leiden']):
        adata_sub = adata_sample[adata_sample.obs['leiden']==clus].copy()
        sc.tl.leiden(adata_sub)
        adata_sub.obs['leiden'] = [clus+','+i for i in adata_sub.obs['leiden']]
        adata_sample.obs.loc[adata_sub.obs_names,'leiden'] = adata_sub.obs['leiden']

    #compute the cluster scores - the median of Scrublet scores per overclustered cluster
    for clus in np.unique(adata_sample.obs['leiden']):
        adata_sample.obs.loc[adata_sample.obs['leiden']==clus, 'scrublet_cluster_score'] = \
            np.median(adata_sample.obs.loc[adata_sample.obs['leiden']==clus, 'scrublet_score'])
    #now compute doublet p-values. figure out the median and mad (from above-median values) for the distribution
    med = np.median(adata_sample.obs['scrublet_cluster_score'])
    mask = adata_sample.obs['scrublet_cluster_score']>med
    mad = np.median(adata_sample.obs['scrublet_cluster_score'][mask]-med)
    #let's do a one-sided test. the Bertie write-up does not address this but it makes sense
    zscores = (adata_sample.obs['scrublet_cluster_score'].values - med) / (1.4826 * mad)
    adata_sample.obs['zscore'] = zscores
    pvals = 1-scipy.stats.norm.cdf(zscores)
    adata_sample.obs['bh_pval'] = bh(pvals)
    adata_sample.obs['bonf_pval'] = bonf(pvals)

    #create results data frame for single sample and copy stuff over from the adata object
    scrublet_sample = pd.DataFrame(0, index=adata_sample.obs_names, columns=scorenames)
    for score in scorenames:
        scrublet_sample[score] = adata_sample.obs[score]
    #write out complete sample scores
    scrublet_sample.to_csv('scrublet-scores/'+sample+'.csv')

HCA_F_GON10535496


Trying to set attribute `.obs` of view, copying.
  view_to_actual(adata)


HCA_F_GON10535495


Trying to set attribute `.obs` of view, copying.
  view_to_actual(adata)


HCA_F_GON10713286


Trying to set attribute `.obs` of view, copying.
  view_to_actual(adata)


HCA_F_GON10713285


Trying to set attribute `.obs` of view, copying.
  view_to_actual(adata)


HCA_F_GON10713284


Trying to set attribute `.obs` of view, copying.
  view_to_actual(adata)


FCA_GND10287604


Trying to set attribute `.obs` of view, copying.
  view_to_actual(adata)


FCA_GND10287603


Trying to set attribute `.obs` of view, copying.
  view_to_actual(adata)


FCA_GND10287602


Trying to set attribute `.obs` of view, copying.
  view_to_actual(adata)


FCA_GND10287601


Trying to set attribute `.obs` of view, copying.
  view_to_actual(adata)


FCA_GND10287600


Trying to set attribute `.obs` of view, copying.
  view_to_actual(adata)


HD_F_GON9525422


Trying to set attribute `.obs` of view, copying.
  view_to_actual(adata)


HD_F_GON9525421


Trying to set attribute `.obs` of view, copying.
  view_to_actual(adata)


HD_F_GON9525420


Trying to set attribute `.obs` of view, copying.
  view_to_actual(adata)


HD_F_GON9525419


Trying to set attribute `.obs` of view, copying.
  view_to_actual(adata)
