# 1D Post stimulation testing

In [1]:
import scanpy as sc
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
from pybedtools import BedTool
import pickle as pkl
%matplotlib inline

In [2]:
import sys
sys.path.append('/home/ssm-user/Github/scrna-parameter-estimation/dist/memento-0.0.6-py3.8.egg')
sys.path.append('/home/ssm-user/Github/misc-seq/miscseq/')
import encode
import memento

In [3]:
data_path = '/data_volume/memento/hbec/'

### Read the processed RNA data

Focus on the club and bc/club cells and type I interferons for now.

Encode the timestamps to integers.

In [4]:
adata = sc.read(data_path + 'HBEC_type_I_filtered_counts_deep.h5ad')

In [5]:
adata = adata[:, ~adata.var.index.str.startswith('MT-')].copy()
adata.obs['cell_type'] = adata.obs['cell_type'].apply(lambda x: x if x != 'basal/club' else 'bc')
adata.obs['cell_type'] = adata.obs['cell_type'].apply(lambda x: x if x != 'ionocyte/tuft' else 'ion-tuft')

  res = method(*args, **kwargs)


In [6]:
adata.shape

(69958, 36588)

### Get results from ctrl -> 3 hr

In [7]:
cts = [['ciliated']]
tps = ['3']
stims = ['alpha', 'beta', 'gamma', 'lambda']

In [8]:
def read_result(ct):
    
    ht_dict = {}
    moments_dict = {}
    
    for stim in stims:
        ht_dict[stim] = {}
        moments_dict[stim] = {}
        for tp in tps: 
            adata_subset = sc.read(data_path + 'binary_test_deep/{}_{}_{}.h5ad'.format('-'.join(ct), stim, tp))
            ht_dict[stim][tp] = memento.get_1d_ht_result(adata_subset)
            ht_dict[stim][tp]['de_fdr'] = memento.util._fdrcorrect(ht_dict[stim][tp]['de_pval'])
            ht_dict[stim][tp]['dv_fdr'] = memento.util._fdrcorrect(ht_dict[stim][tp]['dv_pval'])
            moments_dict[stim][tp] = memento.get_1d_moments(adata_subset, groupby='time_step')
    return ht_dict,moments_dict

In [9]:
all_result = {}
for ct in cts:
    
    key = '-'.join(ct)
    all_result[key] = {}
    
    all_result[key]['ht'], all_result[key]['moments'] = read_result(ct)

### Setup memento

In [10]:
def assign_q(batch):
    
    if batch == 0:
        return 0.387*0.25
    elif batch == 1:
        return 0.392*0.25
    elif batch == 2:
        return 0.436*0.25
    else:
        return 0.417*0.25

In [11]:
adata.obs['q'] = adata.obs['batch'].apply(assign_q)

In [12]:
memento.setup_memento(adata, q_column='q')

### Run memento for each stimulation against all time - beta

In [13]:
ct = ['ciliated']
stim = 'beta'

adata_stim = adata[
    adata.obs.cell_type.isin(ct) & \
    (adata.obs.stim==stim)].copy().copy()
time_converter = {3:0, 6:1, 9:1, 24:2, 48:2}
adata_stim.obs['time_step'] = adata_stim.obs['time'].astype(int).apply(lambda x: time_converter[x])

memento.create_groups(adata_stim, label_columns=['time_step', 'donor'])
memento.compute_1d_moments(adata_stim, min_perc_group=.9)

memento.ht_1d_moments(
    adata_stim, 
    formula_like='1 + time_step',
    cov_column='time_step', 
    num_boot=10000, 
    verbose=1,
    num_cpus=70)

del adata_stim.uns['memento']['mv_regressor']

adata_stim.write(data_path + 'post_stim/{}_{}.h5ad'.format('-'.join(ct), stim))

[Parallel(n_jobs=70)]: Using backend LokyBackend with 70 concurrent workers.
[Parallel(n_jobs=70)]: Done  60 tasks      | elapsed:    4.2s
[Parallel(n_jobs=70)]: Done 310 tasks      | elapsed:    7.0s
[Parallel(n_jobs=70)]: Done 660 tasks      | elapsed:   10.9s
[Parallel(n_jobs=70)]: Done 1110 tasks      | elapsed:   15.4s
[Parallel(n_jobs=70)]: Done 1660 tasks      | elapsed:   21.0s
[Parallel(n_jobs=70)]: Done 2310 tasks      | elapsed:   27.4s
[Parallel(n_jobs=70)]: Done 3060 tasks      | elapsed:   34.8s
[Parallel(n_jobs=70)]: Done 3910 tasks      | elapsed:   43.5s
[Parallel(n_jobs=70)]: Done 4860 tasks      | elapsed:   52.7s
[Parallel(n_jobs=70)]: Done 5910 tasks      | elapsed:  1.1min
[Parallel(n_jobs=70)]: Done 7060 tasks      | elapsed:  1.2min
[Parallel(n_jobs=70)]: Done 8310 tasks      | elapsed:  1.5min
[Parallel(n_jobs=70)]: Done 9660 tasks      | elapsed:  1.7min
[Parallel(n_jobs=70)]: Done 10072 out of 10072 | elapsed:  1.8min finished
... storing 'memento_group' as c

In [36]:
# Get DV genes from tp 3
initial_dv_genes = all_result['ciliated']['ht']['beta']['3'].query('dv_fdr < 0.1 & dv_coef < 0').gene.tolist()

In [37]:
['time_step_' + str(i) for i in range(5)]

['time_step_0', 'time_step_1', 'time_step_2', 'time_step_3', 'time_step_4']

In [38]:
result = memento.get_1d_ht_result(adata_stim)
moments = memento.get_1d_moments(adata_stim, groupby='time_step')
means_df = moments[0][['gene'] + ['time_step_' + str(i) for i in range(3)]]
vars_df = moments[1][['gene'] + ['time_step_' + str(i) for i in range(3)]]

In [39]:
filtered_result = result.query('gene in @initial_dv_genes').copy()

filtered_result['dv_fdr'] = memento.util._fdrcorrect(filtered_result['dv_pval'])
filtered_result['de_fdr'] = memento.util._fdrcorrect(filtered_result['de_pval'])

In [40]:
len(initial_dv_genes)

52

In [41]:
vars_df.merge(filtered_result[['gene', 'dv_coef', 'dv_pval', 'dv_fdr']], on='gene').query('dv_fdr < 0.1')

Unnamed: 0,gene,time_step_0,time_step_1,time_step_2,dv_coef,dv_pval,dv_fdr
1,IFI6,-0.299882,0.672666,0.672666,0.770877,5.2e-05,0.000464
7,STAT1,-0.704221,0.115411,0.115411,0.659232,0.000162,0.001052
16,CD74,-0.816358,0.092769,0.092769,0.887657,4e-06,0.000221
17,DUSP1,0.206393,0.418957,0.418957,0.239038,0.017098,0.055569
18,HLA-B,-0.948606,0.09683,0.09683,1.01974,1.3e-05,0.000351
19,HLA-DRB1,-0.274799,0.697002,0.697002,0.823732,4e-05,0.000464
20,HLA-DMA,-0.434836,0.489893,0.489893,0.861638,0.000475,0.002472
21,HLA-DPA1,0.012088,1.017631,1.017631,0.970503,5.4e-05,0.000464
31,IFITM1,0.372703,0.90592,0.90592,0.53851,0.000332,0.00192
32,IFITM3,-0.048456,0.374957,0.374957,0.44526,0.000142,0.001052


### Run memento for each stimulation against all time - gamma

In [28]:
ct = ['ciliated']
stim = 'gamma'

adata_stim = adata[
    adata.obs.cell_type.isin(ct) & \
    (adata.obs.stim==stim)].copy().copy()
time_converter = {3:0, 6:1, 9:1, 24:2, 48:2}
adata_stim.obs['time_step'] = adata_stim.obs['time'].astype(int).apply(lambda x: time_converter[x])

memento.create_groups(adata_stim, label_columns=['time_step', 'donor'])
memento.compute_1d_moments(adata_stim, min_perc_group=.9)

memento.ht_1d_moments(
    adata_stim, 
    formula_like='1 + time_step',
    cov_column='time_step', 
    num_boot=10000, 
    verbose=1,
    num_cpus=70)

del adata_stim.uns['memento']['mv_regressor']

adata_stim.write(data_path + 'post_stim/{}_{}.h5ad'.format('-'.join(ct), stim))

  res = method(*args, **kwargs)
[Parallel(n_jobs=70)]: Using backend LokyBackend with 70 concurrent workers.
[Parallel(n_jobs=70)]: Done  60 tasks      | elapsed:    6.6s
[Parallel(n_jobs=70)]: Done 310 tasks      | elapsed:    9.5s
[Parallel(n_jobs=70)]: Done 660 tasks      | elapsed:   13.6s
[Parallel(n_jobs=70)]: Done 1110 tasks      | elapsed:   18.5s
[Parallel(n_jobs=70)]: Done 1660 tasks      | elapsed:   24.1s
[Parallel(n_jobs=70)]: Done 2310 tasks      | elapsed:   30.7s
[Parallel(n_jobs=70)]: Done 3060 tasks      | elapsed:   38.5s
[Parallel(n_jobs=70)]: Done 3910 tasks      | elapsed:   47.7s
[Parallel(n_jobs=70)]: Done 4860 tasks      | elapsed:   57.3s
[Parallel(n_jobs=70)]: Done 5910 tasks      | elapsed:  1.1min
[Parallel(n_jobs=70)]: Done 7060 tasks      | elapsed:  1.3min
[Parallel(n_jobs=70)]: Done 8310 tasks      | elapsed:  1.6min
[Parallel(n_jobs=70)]: Done 9660 tasks      | elapsed:  1.8min
[Parallel(n_jobs=70)]: Done 9822 out of 9822 | elapsed:  1.8min finished
..

In [64]:
# Get DV genes from tp 3
initial_dv_genes = all_result['ciliated']['ht']['beta']['3'].query('de_coef > 0 & de_fdr < 0.1 & dv_fdr < 0.1 & dv_coef > 0').gene.tolist()

In [65]:
print(len(initial_dv_genes))

44


In [66]:
result = memento.get_1d_ht_result(adata_stim)
moments = memento.get_1d_moments(adata_stim, groupby='time_step')
means_df = moments[0][['gene'] + ['time_step_' + str(i) for i in range(3)]]
vars_df = moments[1][['gene'] + ['time_step_' + str(i) for i in range(3)]]

In [67]:
filtered_result = result.query('gene in @initial_dv_genes').copy()

filtered_result['dv_fdr'] = memento.util._fdrcorrect(filtered_result['dv_pval'])
filtered_result['de_fdr'] = memento.util._fdrcorrect(filtered_result['de_pval'])

In [70]:
means_df.merge(filtered_result[['gene', 'dv_coef', 'dv_pval', 'dv_fdr']], on='gene')#.query('dv_fdr < 0.15')

Unnamed: 0,gene,time_step_0,time_step_1,time_step_2,dv_coef,dv_pval,dv_fdr
0,TACSTD2,2.249322,2.258143,2.258143,0.2237,0.024998,0.153556
1,PSMA5,0.863947,0.799512,0.799512,-0.076016,0.671833,0.940042
2,S100A16,-0.202163,-0.084257,-0.084257,0.677986,0.000655,0.01638
3,ATP1B1,1.548521,1.496719,1.496719,0.109724,0.432457,0.885507
4,MTHFD2,-0.66276,-1.377934,-1.377934,0.192772,0.513849,0.89861
5,TMSB10,3.250234,3.047912,3.047912,0.147518,0.064094,0.275602
6,CIR1,1.16336,1.093314,1.093314,-0.057985,0.719928,0.940042
7,S100P,1.854977,1.717059,1.717059,0.191933,0.215278,0.578561
8,CXCL10,0.384769,0.724104,0.724104,0.327069,0.171883,0.538879
9,CXCL11,-1.519264,-1.25432,-1.25432,-0.072199,0.866713,0.972103
