# PBMC comparison

In [3]:
import scanpy as sc
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
from pybedtools import BedTool
import pickle as pkl
%matplotlib inline
import itertools

In [4]:
import sys
# Make locally built / checked-out packages importable by appending the
# memento egg and the misc-seq `miscseq` directory to the module search path.
sys.path.append('/home/ssm-user/Github/scrna-parameter-estimation/dist/memento-0.0.5-py3.8.egg')
sys.path.append('/home/ssm-user/Github/misc-seq/miscseq')
# NOTE(review): `encode` is presumably resolved from the miscseq directory
# added above rather than from an installed package -- confirm.
import encode
import memento

In [5]:
# Root directory of the HBEC interferon data; the count matrix is read from here.
data_path = '/data_volume/ifn_hbec/'
# Destination directory for figures.
# NOTE(review): not referenced by any of the cells visible in this section.
fig_path = '/data/home/Github/scrna-parameter-estimation/figures/fig6/'

In [6]:
import matplotlib

# Use font type 42 for both the PDF and PostScript backends.
# NOTE(review): per the matplotlib docs, 42 selects TrueType embedding, which
# keeps text editable in vector-graphics editors -- confirm that is the intent.
matplotlib.rcParams.update({'pdf.fonttype': 42, 'ps.fonttype': 42})

In [7]:
import matplotlib.pylab as pylab

# Relative font sizes: legend / axis labels / titles are 'medium',
# tick labels are 'small'. Key order matches the original literal.
params = dict.fromkeys(
    ['legend.fontsize', 'axes.labelsize', 'axes.titlesize', 'figure.titlesize'],
    'medium',
)
params.update(dict.fromkeys(['xtick.labelsize', 'ytick.labelsize'], 'small'))
pylab.rcParams.update(params)

### Read the processed RNA data

Focus on the club and bc/club cells and type I interferons for now.

Encode the timestamps to integers.

In [8]:
# adata_processed = sc.read(data_path + 'HBEC_type_I_processed_deep.h5ad')

In [9]:
# Load the HBEC count matrix (the "filtered counts, deep" file, per its name)
# from `data_path`.
adata = sc.read(data_path + 'HBEC_type_I_filtered_counts_deep.h5ad')

In [10]:
# Drop every variable whose name starts with 'MT-' and materialise the result
# as an independent copy instead of a view.
is_mt_gene = adata.var.index.str.startswith('MT-')
adata = adata[:, ~is_mt_gene].copy()

# Shorten two compound cell-type labels; all other labels pass through as-is.
# The renames are applied one after another, in this order.
for old_label, new_label in (('basal/club', 'bc'), ('ionocyte/tuft', 'ion-tuft')):
    adata.obs['cell_type'] = adata.obs['cell_type'].apply(
        # Bind the pair as defaults so each lambda uses its own labels.
        lambda x, old=old_label, new=new_label: new if x == old else x
    )

  res = method(*args, **kwargs)


In [11]:
# Dimensions of the matrix after dropping the 'MT-' variables.
# NOTE(review): AnnData convention is (n_obs, n_vars), i.e. cells x genes -- confirm.
adata.shape

(69958, 36588)

### Read HBEC result

In [12]:
# Cell-type groups to load. Each inner list is joined with '-' to build both
# the result-file prefix and the key in `all_result` (e.g. 'bc-basal').
cts = [['ciliated'], ['bc', 'basal']]
# Time points, kept as strings because they are formatted into file names and
# used as dict keys. NOTE(review): presumably hours post-stimulation -- confirm.
tps = ['3', '6', '9', '24', '48']
# Stimulation conditions, also formatted into the result file names.
# NOTE(review): presumably interferon types -- confirm.
stims = ['alpha', 'beta', 'gamma', 'lambda']

In [13]:
def read_result(ct):
    """Load saved memento 1D test output for one cell-type group.

    For every stimulation in the module-level ``stims`` and every time point
    in the module-level ``tps``, reads the file
    ``<'-'.join(ct)>_<stim>_<tp>.h5ad`` from the ``binary_test_deep``
    directory and collects its hypothesis-test table and its moments.

    Parameters
    ----------
    ct : list of str
        Cell-type labels; joined with '-' to form the file-name prefix.

    Returns
    -------
    tuple of (dict, dict)
        ``(ht_dict, moments_dict)``, both nested as ``[stim][tp]``.
        ``ht_dict`` holds the result of ``memento.get_1d_ht_result`` with
        ``de_fdr`` and ``dv_fdr`` columns added (computed from ``de_pval`` and
        ``dv_pval`` via ``memento.util._fdrcorrect``). ``moments_dict`` holds
        the result of ``memento.get_1d_moments`` grouped by ``'time_step'``.
    """
    path_template = '/data_volume/ifn_hbec/binary_test_deep/{}_{}_{}.h5ad'
    ct_prefix = '-'.join(ct)

    ht_dict = {}
    moments_dict = {}

    for stim in stims:
        ht_by_tp = {}
        moments_by_tp = {}

        for tp in tps:
            subset = sc.read(path_template.format(ct_prefix, stim, tp))

            # Test table first, then FDR columns in the order de -> dv.
            ht_df = memento.get_1d_ht_result(subset)
            ht_df['de_fdr'] = memento.util._fdrcorrect(ht_df['de_pval'])
            ht_df['dv_fdr'] = memento.util._fdrcorrect(ht_df['dv_pval'])
            ht_by_tp[tp] = ht_df

            moments_by_tp[tp] = memento.get_1d_moments(subset, groupby='time_step')

        ht_dict[stim] = ht_by_tp
        moments_dict[stim] = moments_by_tp

    return ht_dict, moments_dict

In [14]:
# Layout: all_result[<cell types joined by '-'>]['ht' | 'moments'][stim][tp]
all_result = {}
for ct in cts:
    key = '-'.join(ct)
    # Register the (initially empty) entry before loading, then fill it.
    entry = all_result[key] = {}
    entry['ht'], entry['moments'] = read_result(ct)

### Read PBMC result

In [110]:
# Load the saved PBMC result and pull out its 1D hypothesis-test table.
# NOTE(review): 'cM' in the file name presumably denotes the cell population
# that was tested -- confirm.
cm_adata = sc.read('/data_volume/parameter_estimation/' + 'result_1d/ifn/cM_20210104.h5ad')
cm_ht_df = memento.get_1d_ht_result(cm_adata)

# Add FDR-corrected columns next to the raw p-values (dv first, then de).
for pval_col, fdr_col in (('dv_pval', 'dv_fdr'), ('de_pval', 'de_fdr')):
    cm_ht_df[fdr_col] = memento.util._fdrcorrect(cm_ht_df[pval_col])

In [111]:
# Genes present in both result tables: the PBMC table and the HBEC
# ciliated / beta / tp '6' table. Only these genes are compared below.
pbmc_tested_genes = set(cm_ht_df.gene)
hbec_tested_genes = set(all_result['ciliated']['ht']['beta']['6'].gene)
avail_genes = list(pbmc_tested_genes & hbec_tested_genes)

### Compare DV genes

In [95]:
# HBEC table used for the comparison: ciliated cells, beta stimulation, tp '6'.
hbec_ht_df = all_result['ciliated']['ht']['beta']['6']

# Significant DV genes (dv_fdr < 0.05), restricted to genes tested in both data sets.
dv_query = 'dv_fdr < 0.05 & gene in @avail_genes'
hbec_dv_genes = hbec_ht_df.query(dv_query).gene.tolist()
cm_dv_genes = cm_ht_df.query(dv_query).gene.tolist()

# Every shared gene, in the row order of the HBEC table.
all_genes = hbec_ht_df.query('gene in @avail_genes').gene.tolist()

In [100]:
# Split the shared genes into four disjoint groups by DV significance.
hbec_sig = set(hbec_dv_genes)
pbmc_sig = set(cm_dv_genes)

overlap = hbec_sig & pbmc_sig        # significant in both
only_hbec = hbec_sig - pbmc_sig      # significant in HBEC only
only_pbmc = pbmc_sig - hbec_sig      # significant in PBMC only
neither = set(avail_genes).difference(overlap, only_hbec, only_pbmc)

In [101]:
# Sanity check: the four groups partition the shared genes, so their sizes
# should add up to the number of distinct genes in `avail_genes`.
sum(len(group) for group in (neither, overlap, only_hbec, only_pbmc))

1337

In [105]:
# 2x2 contingency table of gene counts (float, like np.zeros would give).
# Rows: significant in PBMC (no, yes); columns: significant in HBEC (no, yes).
tab = np.array(
    [
        [len(neither), len(only_hbec)],
        [len(only_pbmc), len(overlap)],
    ],
    dtype=float,
)

In [106]:
# Show the DV contingency table.
# Rows: significant in PBMC (no, yes); columns: significant in HBEC (no, yes).
tab

array([[959., 274.],
       [ 42.,  62.]])

In [107]:
# Chi-square test of independence between DV status in PBMC and in HBEC.
# The result is displayed as (statistic, p-value, degrees of freedom,
# expected counts).
# NOTE(review): per the SciPy docs, the default `correction=True` applies
# Yates' continuity correction when dof == 1, as for this 2x2 table -- confirm
# that is intended.
stats.chi2_contingency(tab)

(69.30173836046919,
 8.449508446902029e-17,
 1,
 array([[923.13612565, 309.86387435],
        [ 77.86387435,  26.13612565]]))

### Compare DE genes

In [112]:
# HBEC table used for the comparison: ciliated cells, beta stimulation, tp '6'.
hbec_ht_df = all_result['ciliated']['ht']['beta']['6']

# Significant DE genes (de_fdr < 0.05), restricted to genes tested in both data sets.
# NOTE: the variable names still say `dv`, but with the `de_fdr` filter they
# hold DE genes here. The names are kept because the next cell reads them.
de_query = 'de_fdr < 0.05 & gene in @avail_genes'
hbec_dv_genes = hbec_ht_df.query(de_query).gene.tolist()
cm_dv_genes = cm_ht_df.query(de_query).gene.tolist()

# Every shared gene, in the row order of the HBEC table.
all_genes = hbec_ht_df.query('gene in @avail_genes').gene.tolist()

In [113]:
# Split the shared genes into four disjoint groups by DE significance.
# (`hbec_dv_genes` / `cm_dv_genes` hold the DE gene lists in this section.)
hbec_sig = set(hbec_dv_genes)
pbmc_sig = set(cm_dv_genes)

overlap = hbec_sig & pbmc_sig        # significant in both
only_hbec = hbec_sig - pbmc_sig      # significant in HBEC only
only_pbmc = pbmc_sig - hbec_sig      # significant in PBMC only
neither = set(avail_genes).difference(overlap, only_hbec, only_pbmc)

In [114]:
# Sanity check: the four groups partition the shared genes, so their sizes
# should add up to the number of distinct genes in `avail_genes`.
sum(len(group) for group in (neither, overlap, only_hbec, only_pbmc))

1337

In [115]:
# 2x2 contingency table of gene counts (float, like np.zeros would give).
# Rows: significant in PBMC (no, yes); columns: significant in HBEC (no, yes).
tab = np.array(
    [
        [len(neither), len(only_hbec)],
        [len(only_pbmc), len(overlap)],
    ],
    dtype=float,
)

In [116]:
# Show the DE contingency table.
# Rows: significant in PBMC (no, yes); columns: significant in HBEC (no, yes).
tab

array([[118., 143.],
       [291., 785.]])

In [117]:
# Chi-square test of independence between DE status in PBMC and in HBEC.
# The result is displayed as (statistic, p-value, degrees of freedom,
# expected counts).
# NOTE(review): per the SciPy docs, the default `correction=True` applies
# Yates' continuity correction when dof == 1, as for this 2x2 table -- confirm
# that is intended.
stats.chi2_contingency(tab)

(31.79658050499693,
 1.7119387731464417e-08,
 1,
 array([[ 79.84218399, 181.15781601],
        [329.15781601, 746.84218399]]))