# Identifying feedback ISGs

In [34]:
import pandas as pd
import matplotlib.pyplot as plt
import scanpy as sc
import scipy as sp
import itertools
import numpy as np
import scipy.stats as stats
from scipy.integrate import dblquad
import seaborn as sns
from statsmodels.stats.multitest import fdrcorrection
import imp
# Show up to 999 rows when displaying frames.
pd.options.display.max_rows = 999
# None disables column-width truncation. The former -1 sentinel is deprecated
# (it triggers the FutureWarning echoed below) and newer pandas rejects it.
pd.set_option('display.max_colwidth', None)
import pickle as pkl
import time

  pd.set_option('display.max_colwidth', -1)


In [35]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [36]:
import matplotlib.pylab as pylab

# Relative font sizes: 'medium' for legend/axis-label/title text, 'small' for tick labels.
params = dict.fromkeys(
    ['legend.fontsize', 'axes.labelsize', 'axes.titlesize', 'figure.titlesize'],
    'medium',
)
params.update(dict.fromkeys(['xtick.labelsize', 'ytick.labelsize'], 'small'))
pylab.rcParams.update(params)


In [37]:
import matplotlib

# Use font type 42 for both the PDF and PostScript backends.
matplotlib.rcParams.update({'pdf.fonttype': 42, 'ps.fonttype': 42})

In [38]:
import warnings
# warnings.filterwarnings('ignore')

In [39]:
import sys
# Put a built memento egg and the misc-seq package directory on the import path.
# NOTE(review): both are absolute paths on one machine; `import encode` and
# `import memento` will fail elsewhere unless these are edited or the packages
# are installed. Presumably `encode` is provided by the miscseq directory — confirm.
sys.path.append('/home/ssm-user/Github/scrna-parameter-estimation/dist/memento-0.0.5-py3.8.egg')
sys.path.append('/home/ssm-user/Github/misc-seq/miscseq/')
import encode
import memento

In [40]:
# Directory prefix for input files; string-concatenated with file names, so the
# trailing slash matters. NOTE(review): absolute, machine-specific path.
data_path = '/data_volume/memento/pbmc/'

### Load the data

In [41]:
# Full cell-type label -> short code. Insertion order defines the order of `cts`.
label_converter = {
    'CD4 T cells': 'Th',
    'CD14+ Monocytes': 'cM',
    'FCGR3A+ Monocytes': 'ncM',
    'NK cells': 'NK',
    'CD8 T cells': 'Tc',
    'B cells': 'B',
}
cts = list(label_converter)

In [42]:
def simplify_name(name):
    """Return the part of ``name`` before its first '.'.

    A name without a '.' is returned unchanged.
    """
    head, _, _ = name.partition('.')
    return head

In [43]:
# Read `interferon_filtered.h5ad` from `data_path` with scanpy.
adata = sc.read(data_path + 'interferon_filtered.h5ad')

### Setup and run 1D memento to identify major ISGs

In [187]:
# Store a constant 0.1 in obs['q'] for every cell and pass that column to memento
# as `q_column`. NOTE(review): presumably the assumed per-cell capture/sampling
# rate — confirm against the memento documentation. This mutates `adata` in place.
adata.obs['q'] = 0.1
memento.setup_memento(adata, q_column='q', filter_mean_thresh=0.04)

In [188]:
# Restrict the analysis to one cell type. `.copy()` makes `adata_filtered` an
# independent object, so later column assignments do not touch `adata`.
ct = 'CD14+ Monocytes'
adata_filtered = adata[adata.obs['cell'] == ct].copy()

In [189]:
# Group cells by the `stim` label, then compute memento's 1D moments on those groups.
# NOTE(review): `min_perc_group=.9` presumably keeps only genes that pass the
# expression filter in at least 90% of groups — confirm against memento.
memento.create_groups(adata_filtered, label_columns=['stim'])
memento.compute_1d_moments(adata_filtered, min_perc_group=.9)

In [190]:
# Run memento's 1D hypothesis test with design `1 + stim`, testing the `stim`
# covariate. Uses num_boot=10000 resamples and 13 parallel workers; the recorded
# run took about 1.2 minutes for 2676 tasks.
# NOTE(review): no random seed is set here; check whether memento exposes one
# if exact reproducibility of the p-values is needed.
memento.ht_1d_moments(
    adata_filtered, 
    formula_like='1 + stim',
    cov_column='stim', 
    num_boot=10000, 
    verbose=1,
    num_cpus=13)

[Parallel(n_jobs=13)]: Using backend LokyBackend with 13 concurrent workers.
[Parallel(n_jobs=13)]: Done  26 tasks      | elapsed:    0.9s
[Parallel(n_jobs=13)]: Done 322 tasks      | elapsed:    9.2s
[Parallel(n_jobs=13)]: Done 822 tasks      | elapsed:   21.9s
[Parallel(n_jobs=13)]: Done 1522 tasks      | elapsed:   41.2s
[Parallel(n_jobs=13)]: Done 2069 tasks      | elapsed:   56.6s
[Parallel(n_jobs=13)]: Done 2619 tasks      | elapsed:  1.2min
[Parallel(n_jobs=13)]: Done 2676 out of 2676 | elapsed:  1.2min finished


In [191]:
# Pull the 1D test results into a table; its columns include gene, de_coef,
# de_se, de_pval, dv_coef, dv_se and dv_pval (see the DDX58 row displayed later).
result_1d = memento.get_1d_ht_result(adata_filtered)

In [192]:
# Derive `de_fdr` from the DE p-values.
# NOTE(review): `_fdrcorrect` is a private memento helper; statsmodels'
# `fdrcorrection` is already imported at the top — confirm the two agree before swapping.
result_1d['de_fdr'] = memento.util._fdrcorrect(result_1d['de_pval'])

In [193]:
# "Major ISGs": genes with de_fdr < 0.01 and a positive DE coefficient for `stim`.
isg_list = result_1d.query('de_fdr < 0.01 & de_coef > 0').gene.tolist()

### Create total activation score (just sum of major ISG counts)

In [197]:
# Per-cell IFN score: row sum of `.X` over the ISG columns. `.A1` flattens the
# summed result to 1-D (assumes `.X` is a scipy sparse matrix, whose `.sum`
# returns an np.matrix — TODO confirm).
adata_filtered.obs['ifn_score'] = adata_filtered[:, isg_list].X.sum(axis=1).A1

### Get IFNAR genes

In [198]:
# Collect every gene whose symbol contains one of the receptor tags.
ifnr_genes = [
    g for g in adata_filtered.var.index
    if any(tag in g for tag in ('IFNAR', 'IFNBR'))
]

In [199]:
# Per-cell IFNAR score: row sum of `.X` over the receptor genes collected in
# `ifnr_genes`. `.A1` flattens the summed result to 1-D (assumes sparse `.X` — TODO confirm).
adata_filtered.obs['ifnar_score'] = adata_filtered[:, ifnr_genes].X.sum(axis=1).A1

In [200]:
# Among control cells only: IFNAR score value -> number of cells with that score.
unique_ifnar_counts = adata_filtered[adata_filtered.obs.stim=='ctrl', :].obs.ifnar_score.value_counts()

In [201]:
# Print each IFNAR score value (i) with its control-cell count (j).
for i, j in unique_ifnar_counts.items():
    print(i,j)

0.0 2328
1.0 375
2.0 52
3.0 6
9.0 1
10.0 1


### Calculate each ISG's association with the IFN score, stratified by IFNAR count

In [202]:
import statsmodels.api as sm

In [203]:
import warnings
# Suppress every warning from this point on.
# NOTE(review): this also hides any warnings raised by the GLM fits that follow.
warnings.simplefilter("ignore")

In [204]:

# For every ISG, estimate its association with the total IFN score in control
# cells while holding the IFNAR score fixed: fit one negative-binomial GLM
# (gene count ~ const + ifn_score) per IFNAR-score stratum, then average the
# ifn_score slopes weighted by the number of cells in each stratum.
# NOTE(review): some strata contain a single cell (see the counts printed
# above); their fits still enter the average with weight 1.

# Number of control cells (not used below; kept in case other cells read it).
cell_count = adata_filtered[adata_filtered.obs.stim=='ctrl', :].shape[0]

# The stratum subsets and their design matrices do not depend on the gene, so
# build them once instead of once per gene.
ifnar_strata = []
for ifnar_value, count in unique_ifnar_counts.items():
    temp = adata_filtered[(adata_filtered.obs['ifnar_score']==ifnar_value) & (adata_filtered.obs['stim'] == 'ctrl'), :]
    X = sm.add_constant(temp.obs['ifn_score'])
    ifnar_strata.append((ifnar_value, count, temp, X))

adjusted_coefs = []  # (gene, weighted-average slope); NaN when no stratum could be fitted
failed_fits = []     # (gene, ifnar_value, error text) for every stratum fit that raised

for gene in isg_list:

    coef = 0
    actual_count = 0

    for ifnar_value, count, temp, X in ifnar_strata:

        # Plain 1-D ndarray response instead of the np.matrix that .todense() returns.
        Y = np.asarray(temp[:, gene].X.todense()).ravel()

        nb_model = sm.GLM(Y, X, family=sm.families.NegativeBinomial())

        try:
            nb_result = nb_model.fit()
            coef += nb_result.params['ifn_score']*count
            actual_count += count
        except Exception as err:
            # Best effort: skip strata that cannot be fitted, but keep a record.
            # `Exception` rather than a bare `except` so Ctrl-C still stops the loop.
            failed_fits.append((gene, ifnar_value, repr(err)))
            continue

    # Guard against every stratum failing, which would otherwise divide by zero.
    coef = coef / actual_count if actual_count > 0 else np.nan
    adjusted_coefs.append((gene, coef))

In [205]:
# Spot-check the 1D test result for a single gene.
result_1d.query('gene == "DDX58"')

Unnamed: 0,gene,de_coef,de_se,de_pval,dv_coef,dv_se,dv_pval,de_fdr
1328,DDX58,3.335185,0.118385,2e-06,-0.754449,0.466545,0.107189,1.7e-05


In [214]:
# Probe for TMEM173 among the retained genes. The recorded run raised KeyError,
# i.e. the gene is not present in `adata_filtered`.
adata_filtered[:, 'TMEM173']

KeyError: 'TMEM173'

In [211]:
# Tabulate the (gene, coef) pairs, largest coefficient first, and show the table's shape.
ranked = (
    pd.DataFrame(adjusted_coefs, columns=['gene', 'coef'])
    .sort_values(by='coef', ascending=False)
)
ranked.shape

(761, 2)

In [212]:
# 0-based position in the current row order (rows were sorted by descending coef).
ranked['rank'] = np.arange(ranked.shape[0])

In [213]:
# Display the ranked table.
ranked

Unnamed: 0,gene,coef,rank
744,APOBEC3B,0.00506,0
89,RSAD2,0.004656,1
636,CCL2,0.00458,2
643,CCR7,0.004203,3
641,CCL4,0.004202,4
445,IFIT1,0.004149,5
64,SLAMF7,0.004136,6
407,CTSL,0.004133,7
574,IFI27,0.004006,8
191,PLAC8,0.003992,9


In [138]:
# Slope on ifn_score from whichever fit `nb_result` currently holds.
# NOTE(review): depends on kernel state left behind by another cell.
nb_result.params['ifn_score']

0.0009539119673533953

In [128]:
# Worked example: a single negative-binomial GLM of one gene's counts on the
# IFN score, within one IFNAR stratum of the control cells.
gene = 'IRF1'

# Select the stratum explicitly rather than reusing whatever `temp` another cell
# left in the kernel. The recorded summary reports 2328 observations, which is
# the size of the IFNAR-score == 0 control stratum.
temp = adata_filtered[(adata_filtered.obs['ifnar_score'] == 0) & (adata_filtered.obs['stim'] == 'ctrl'), :]

X = temp.obs['ifn_score']
X = sm.add_constant(X)
Y = temp[:, gene].X.todense()

nb_model = sm.GLM(Y, X, family=sm.families.NegativeBinomial())
nb_result = nb_model.fit()

nb_result.params

  res = method(*args, **kwargs)


const       -1.766184
ifn_score    0.000954
dtype: float64

In [136]:
# Debug aid: list the attributes available on the fitted results object.
dir(nb_result)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_cache',
 '_data_attr',
 '_data_attr_model',
 '_data_in_cache',
 '_endog',
 '_freq_weights',
 '_get_robustcov_results',
 '_iweights',
 '_n_trials',
 '_use_t',
 '_var_weights',
 'aic',
 'bic',
 'bic_deviance',
 'bic_llf',
 'bse',
 'conf_int',
 'converged',
 'cov_kwds',
 'cov_params',
 'cov_type',
 'deviance',
 'df_model',
 'df_resid',
 'f_test',
 'family',
 'fit_history',
 'fittedvalues',
 'get_hat_matrix_diag',
 'get_influence',
 'get_prediction',
 'initialize',
 'k_constant',
 'llf',
 'llnull',
 'load',
 'method',
 'mle_settings',
 'model',
 'mu',
 'nobs',
 'normalized_cov_params',
 'null',
 'null_deviance',
 'params',


In [130]:
# Full regression summary for whichever fit `nb_result` currently holds.
nb_result.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,2328.0
Model:,GLM,Df Residuals:,2326.0
Model Family:,NegativeBinomial,Df Model:,1.0
Link Function:,log,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-1540.7
Date:,"Thu, 12 Aug 2021",Deviance:,1628.0
Time:,17:28:49,Pearson chi2:,2740.0
No. Iterations:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-1.7662,0.108,-16.381,0.000,-1.978,-1.555
ifn_score,0.0010,0.000,5.033,0.000,0.001,0.001
