# Estimator validation

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import scanpy.api as sc
import scipy as sp
import itertools
import numpy as np
import scipy.stats as stats
from scipy.integrate import dblquad
import seaborn as sns
from statsmodels.stats.multitest import fdrcorrection
import imp
pd.options.display.max_rows = 999
pd.set_option('display.max_colwidth', -1)
import pickle as pkl
import time


In a future version of Scanpy, `scanpy.api` will be removed.
Simply use `import scanpy as sc` and `import scanpy.external as sce` instead.



In [2]:
import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42

import matplotlib.pylab as pylab
params = {'legend.fontsize': 'medium',
         'axes.labelsize': 'medium',
         'axes.titlesize':'medium',
         'figure.titlesize':'medium',
         'xtick.labelsize':'small',
         'ytick.labelsize':'small'}
pylab.rcParams.update(params)


In [3]:
import sys
sys.path.append('/home/ssm-user/Github/scrna-parameter-estimation/scmemo')
import estimator, simulate, scmemo, bootstrap, util

In [4]:
import sys
sys.path.append('/home/ssm-user/Github/single_cell_eb/')
sys.path.append('/home/ssm-user/Github/single_cell_eb/sceb')
import scdd

### Check 1D estimates of `sceb` with `scmemo`

Using the Poisson model. The outputs should be identical, this is for checking the implementation. 

In [20]:
data = sp.sparse.csr_matrix(simulate.simulate_transcriptomes(100, 20))
adata = sc.AnnData(data)
size_factors = scdd.dd_size_factor(adata)

In [21]:
_, M_dd = scdd.dd_1d_moment(adata, size_factor=size_factors)
var_scdd = scdd.M_to_var(M_dd)
print(var_scdd)

#time start: 0.0s
n_cell=100, n_gene=20
#total: 0.00s
[4.05811359e+00 8.65557033e+01 3.38042239e+00 8.67055877e+01
 5.87806859e-02 3.99841925e+01 8.48237731e+00 1.28562336e+01
 4.02345703e+00 2.43614751e+00 9.80252612e-02 1.62111844e+01
 1.05115668e+01 9.53143947e-01 1.36284024e+02 2.32362990e+01
 8.75613957e-01 9.43882965e+01 6.77484907e+01 5.34462511e-01]


In [22]:
imp.reload(estimator)
mean_scmemo, var_scmemo = estimator._poisson_1d(data, data.shape[0], size_factors)
print(var_scmemo)

[4.05811419e+00 8.65558825e+01 3.38042193e+00 8.67056078e+01
 5.87806855e-02 3.99841965e+01 8.48237715e+00 1.28562306e+01
 4.02345666e+00 2.43614724e+00 9.80252592e-02 1.62111701e+01
 1.05115720e+01 9.53144150e-01 1.36289603e+02 2.32362900e+01
 8.75614006e-01 9.43883035e+01 6.77483004e+01 5.34462544e-01]


In [23]:
df = pd.DataFrame()
df['size_factor'] = size_factors
df['inv_size_factor'] = 1/size_factors
df['inv_size_factor_sq'] = 1/size_factors**2
df['expr'] = data[:, 0].todense().A1
precomputed_size_factors = df.groupby('expr')['inv_size_factor'].mean(), df.groupby('expr')['inv_size_factor_sq'].mean()

In [24]:
imp.reload(estimator)
expr, count = np.unique(data[:, 0].todense().A1, return_counts=True)
print(estimator._poisson_1d((expr, count), data.shape[0], precomputed_size_factors))

[0.9063726711273193, 4.058114381881536]


### Check 2D estimates of `sceb` and `scmemo`

Using the Poisson model. The outputs should be identical, this is for checking the implementation. 

In [25]:
data = sp.sparse.csr_matrix(simulate.simulate_transcriptomes(1000, 4))
adata = sc.AnnData(data)
size_factors = scdd.dd_size_factor(adata)

In [26]:
mean_scdd, cov_scdd, corr_scdd = scdd.dd_covariance(adata, size_factors)
print(cov_scdd)

[[ 4.09247186e-01 -7.00641019e-03  1.11981475e-02 -5.42475049e-01]
 [-7.00641019e-03  3.07633212e-01 -6.79097739e-03 -3.44895561e-01]
 [ 1.11981475e-02 -6.79097739e-03  5.31315636e-01 -6.31933899e-01]
 [-5.42475049e-01 -3.44895561e-01 -6.31933899e-01  1.00000000e-12]]


In [27]:
imp.reload(estimator)
cov_scmemo = estimator._poisson_cov(data, data.shape[0], size_factors, idx1=[0, 1, 2], idx2=[1, 2, 3])
print(cov_scmemo)

[[-0.00700641  0.01119815 -0.54247505]
 [ 0.30763321 -0.00679098 -0.34489556]
 [-0.00679098  0.53131564 -0.6319339 ]]


In [28]:
expr, count = np.unique(data[:, :2].toarray(), return_counts=True, axis=0)

df = pd.DataFrame()
df['size_factor'] = size_factors
df['inv_size_factor'] = 1/size_factors
df['inv_size_factor_sq'] = 1/size_factors**2
df['expr1'] = data[:, 0].todense().A1
df['expr2'] = data[:, 1].todense().A1

precomputed_size_factors = df.groupby(['expr1', 'expr2'])['inv_size_factor'].mean(), df.groupby(['expr1', 'expr2'])['inv_size_factor_sq'].mean()

In [30]:
cov_scmemo = estimator._poisson_cov((expr[:, 0], expr[:, 1], count), data.shape[0], size_factor=precomputed_size_factors)
print(cov_scmemo)

-0.007006410053126586


### Variance/correlation estimation while varying q_sq

In [34]:
q = 0.1
q_sq_list = [0.011]#[0.1**2+1e-6, 0.0102, 0.0106, 0.012]

In [35]:
import warnings
warnings.filterwarnings("ignore")


In [67]:
plt.figure(figsize=(7, 5))
plt.subplots_adjust(hspace=1.1, wspace=0.3)
num_subplot_col = len(q_sq_list)
num_subplot_row = 3

for col_num, q_sq in enumerate(q_sq_list):
    
    true_data = simulate.simulate_transcriptomes(5000, 1000, correlated=True)
    qs, captured_data = simulate.capture_sampling(true_data, q, q_sq)
    adata = sc.AnnData(sp.sparse.csr_matrix(captured_data))

    
    x_true, y_true = np.log(true_data.mean(axis=0)), np.log(true_data.var(axis=0))
    x_obs, y_obs = np.log(captured_data.mean(axis=0)), np.log(captured_data.var(axis=0))
    
    _, M_dd = scdd.dd_1d_moment(adata, size_factor=scdd.dd_size_factor(adata), verbose=False)
    var_scdd = scdd.M_to_var(M_dd)/q**2
    
    mean_scdd, _, corr_scdd = scdd.dd_covariance(adata, size_factor=scdd.dd_size_factor(adata))
    corr_scdd[corr_scdd == 0] = np.nan
    
    condition = np.log(var_scdd) > -20
    
    true_corrs = np.corrcoef(true_data, rowvar=False).ravel()
    estimated_corrs = corr_scdd.ravel()
    corr_cond = np.isfinite(estimated_corrs)
    true_corrs = true_corrs[corr_cond]
    estimated_corrs = estimated_corrs[corr_cond]
    
    break
    
    plt.subplot(num_subplot_row, num_subplot_col, col_num+1)
    sns.distplot(qs)
    plt.xlabel('q, Var(q)={:.1e}'.format(q_sq-q**2))
    plt.xlim(0, 0.5)
    if col_num == 0:
        plt.ylabel('Density')
        
    plt.subplot(num_subplot_row, num_subplot_col, col_num+1+num_subplot_col)
    plt.plot(y_true, y_true, color='m', lw=1)
    plt.scatter(y_true[condition], np.log(var_scdd)[condition], s=1)
    plt.xlabel('True variance');
    if col_num == 0:
        plt.ylabel('Estimated\nvariance')
    plt.title('R={:.3f}'.format(stats.pearsonr(np.log(var_scdd)[condition], y_true[condition])[0]))
    
    sample_idxs = np.random.choice(estimated_corrs.shape[0], 5000, replace=False)
    plt.subplot(num_subplot_row, num_subplot_col, col_num+1+num_subplot_col*2)
    plt.plot(true_corrs, true_corrs, color='m', lw=1)
    plt.scatter(true_corrs, estimated_corrs, s=.01, alpha=0.01)
    plt.xlabel('True correlation');
    if col_num == 0:
        plt.ylabel('Estimated\ncorrelation')
    plt.title('R={:.3f}'.format(stats.pearsonr(true_corrs, estimated_corrs)[0]))
    

<Figure size 504x360 with 0 Axes>