# Transcription factor dependence analysis

In [1]:
import scanpy as sc
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import itertools
from pybedtools import BedTool
import statsmodels.formula.api as smf
import statsmodels.api as sm

import os
import pickle as pkl
%matplotlib inline

In [2]:
# Show every column of wide frames and up to 500 rows.
# The fully qualified option names are used on purpose: an abbreviated key such
# as 'max_columns' is treated as a pattern and raises OptionError when it
# matches more than one registered option (which happens on newer pandas).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 500)

In [3]:
import sys
# Extend the import path so that the two project-local modules imported below
# (`encode`, `memento`) can be found.
# NOTE(review): both entries are absolute, machine-specific paths; the notebook
# will not import on another host without editing them. The first one pins the
# memento 0.0.6 egg built for Python 3.8.
sys.path.append('/home/ssm-user/Github/scrna-parameter-estimation/dist/memento-0.0.6-py3.8.egg')
sys.path.append('/home/ssm-user/Github/misc-seq/miscseq/')
import encode  # presumably provided by the misc-seq path above; not used in the cells visible here
import memento

In [4]:
import matplotlib.pylab as pylab
# Global matplotlib text sizes applied to every figure in this notebook.
# Each key appears exactly once: a repeated key in a dict literal silently
# keeps only the last value, so 'axes.labelsize' is listed a single time with
# the value that actually takes effect ('large').
params = {
    'legend.fontsize': 'medium',
    'axes.titlesize': 'medium',
    'figure.titlesize': 'medium',
    'xtick.labelsize': 'medium',
    'ytick.labelsize': 'medium',
    'axes.labelsize': 'large',
}
pylab.rcParams.update(params)

In [5]:
# Root directory for every input (.h5ad files) and output ('2d_tf/', '2d/')
# used below. Must end with '/' because paths are built by string concatenation.
# NOTE(review): absolute, machine-specific path.
data_path = '/data_volume/memento/eccite/'

### Read the guide-labeled perturb-seq data

From perturbseq paper

In [6]:
# Load the guide-labelled dataset, discard every cell from replicate 'rep4',
# and clear the name of the gene (var) index.
adata = sc.read(data_path + 'eccite.h5ad')
adata = adata[adata.obs['replicate'].ne('rep4')].copy()
adata.var.index.name = None

  res = method(*args, **kwargs)


In [7]:
# Peek at the per-cell metadata; `replicate`, `treatment`, `guide_ID` and
# `gene` are the columns the rest of the notebook relies on.
adata.obs.head(2)

Unnamed: 0,lane,n_genes_by_counts,total_counts,total_counts_mt,pct_counts_mt,orig.ident,nCount_HTO,nFeature_HTO,MULTI_ID,MULTI_classification,replicate,treatment,guide_ID,second_percent,gene
GACGGCTGTTTGACTG-8,8,1755,5233.0,279.0,5.33155,SeuratProject,265,5,rep2-tx-TCGATAATGCGA,rep2-tx-TCGATAATGCGA,rep2,tx,IRF7g1-CGTGTAGCGCAGCTGCTTC,0.006061,IRF7
GCAGCCATCCGTACAA-6,6,2219,7110.0,525.0,7.383966,SeuratProject,104,5,rep1-tx-AGGACCATCCAA,rep1-tx-AGGACCATCCAA,rep1,tx,IRF7g1-CGTGTAGCGCAGCTGCTTC,0.076923,IRF7


In [8]:
# Number of cells per treatment label ('tx' vs 'ctrl').
adata.obs['treatment'].value_counts()

tx      19822
ctrl     2046
Name: treatment, dtype: int64

### Read pre-defined ISGs

In [9]:
# Load the previously saved ctrl-vs-tx object and extract memento's 1D
# hypothesis-test table from it.
# NOTE(review): `subset` is reused as a scratch name by many later cells, so
# its contents depend on which cell ran last.
subset = sc.read(data_path + 'ctrl_vs_tx.h5ad')
stim_ht_df = memento.get_1d_ht_result(subset)
# Derive an FDR column from the DE p-values. `_fdrcorrect` is a private memento
# helper — presumably Benjamini-Hochberg; TODO confirm.
stim_ht_df['de_fdr'] = memento.util._fdrcorrect(stim_ht_df['de_pval'])

In [10]:
# Genes treated as ISGs below: FDR under 0.05 and a DE coefficient above 1.
thp1_isgs = stim_ht_df.loc[
    lambda df: (df['de_fdr'] < 0.05) & (df['de_coef'] > 1),
    'gene',
].tolist()

### Read TFs

In [15]:
# Read the tab-separated TF table, drop rows whose Family is "Others", and
# collect the gene symbols; CIITA and NLRC5 are appended by hand.
tf_df = pd.read_csv('human_tf.txt', sep='\t')
tf_df = tf_df[tf_df['Family'] != 'Others']
tf_list = tf_df['Symbol'].tolist() + ['CIITA', 'NLRC5']

### Setup memento

In [11]:
# Constant per-cell `q` column; memento.setup_memento reads it via q_column='q'.
# NOTE(review): 0.15 is a magic number — presumably the assumed capture
# efficiency; confirm its source and consider moving it to a config cell.
adata.obs['q'] = 0.15

In [12]:
# Initialise memento on the full dataset using the `q` column in adata.obs.
# NOTE(review): filter_mean_thresh=0.07 and trim_percent=0.05 are unexplained
# constants; document how they were chosen.
memento.setup_memento(adata, q_column='q', filter_mean_thresh=0.07 ,trim_percent=0.05)

Version 0.0.6




In [13]:
# Distinct values of the `gene` column in order of first appearance (the 'NT'
# label is included if present; later loops skip it explicitly).
ko_genes = adata.obs.gene.drop_duplicates().tolist()

### Filter TFs

In [40]:
# Cells carrying an 'NT' guide or a guide against one of the four pathway
# genes. One .copy() is enough to detach the slice from `adata`; no throwaway
# deep copy of the whole dataset is made beforehand.
subset = adata[adata.obs['gene'].isin(['NT', 'STAT1', 'JAK2', 'IFNGR1', 'CUL3'])].copy()
# Group by guide and replicate, then compute 1D moments with min_perc_group=.9;
# `subset.var.index` is used right after this to decide which TFs are kept.
memento.create_groups(subset, label_columns=['guide_ID', 'replicate'])
memento.compute_1d_moments(subset, min_perc_group=.9)

In [41]:
# TFs that are still present in `subset.var.index`.
# sorted() gives a stable order: iterating a raw set of strings changes between
# kernel sessions, and this list drives the order of the long-running 2D loop.
filtered_tfs = sorted(set(tf_list) & set(subset.var.index))
print(len(filtered_tfs))

145


In [42]:
# Sanity check: is NFE2L2 among the TFs that passed the filter?
'NFE2L2' in filtered_tfs

True

### Get JAK/STAT dependent TFs

In [79]:
g_ko = 'STAT1'
# Stimulated ('tx') cells with either an 'NT' guide or a guide against g_ko.
# A single .copy() detaches the slice; the former full-dataset double copy was
# overwritten immediately and only cost memory and time.
subset = adata[adata.obs['gene'].isin(['NT', g_ko]) & (adata.obs['treatment']=='tx')].copy()
# 1 when the guide ID contains the knocked-out gene's name, else 0.
subset.obs['is_ko'] = subset.obs.guide_ID.str.contains(g_ko).values.astype(int)

memento.create_groups(subset, label_columns=['is_ko', 'replicate', 'guide_ID'])

memento.compute_1d_moments(subset, min_perc_group=.7)

# Genes that are still in `subset.var.index` after compute_1d_moments.
measured_genes = set(subset.var.index)
# ISGs still measured here; sorted so the pair order is reproducible.
available_genes = sorted(measured_genes & set(thp1_isgs))
# `filtered_tfs` was built with a different subset and threshold, so keep only
# the TFs that are measured here (same guard the per-TF loop applies).
measured_tfs = [tf for tf in filtered_tfs if tf in measured_genes]
memento.compute_2d_moments(subset, list(itertools.product(measured_tfs, available_genes)))
# Grouped by `is_ko`, the result exposes 'is_ko_0' and 'is_ko_1' columns.
moments = memento.get_2d_moments(subset, groupby='is_ko')

  res = method(*args, **kwargs)


In [80]:
# Per gene pair: value in knockout cells minus value in NT cells.
moments['diff'] = moments['is_ko_1'].sub(moments['is_ko_0'])
# Average that difference over all partner genes of each TF (`gene_1`) and
# order the TFs from largest to smallest mean change.
ranking = (
    moments
    .groupby('gene_1')['diff']
    .mean()
    .sort_values(ascending=False)
)

In [81]:
# TFs ordered by mean `diff` (is_ko_1 minus is_ko_0), largest first.
ranking

gene_1
BAZ2A       0.115563
IRF3        0.091754
ETV6        0.091668
NLRC5       0.087784
TSC22D1     0.079191
IRF2        0.076080
REL         0.072580
FOXJ3       0.065531
IRF8        0.064049
MLXIP       0.060018
ATF3        0.059424
IRF7        0.055840
HOXA10      0.054371
HOXA9       0.054264
SP110       0.054110
KLF3        0.051919
NSD2        0.051159
IRF1        0.050900
YBX3        0.049660
STAT1       0.048799
DMRTA2      0.047937
ATF4        0.047298
SP100       0.045610
CEBPE       0.045096
ETV7        0.043718
ZNF684      0.042989
CEBPB       0.041654
PPARG       0.041559
ARID4B      0.040879
FOXN2       0.040490
ZNF385A     0.039403
SP9         0.038719
PRDM1       0.037936
FOXP2       0.036930
HIVEP3      0.036851
DDIT3       0.036829
HLX         0.036388
SATB1       0.035718
MYBL2       0.034374
TGIF1       0.034372
MITF        0.033356
SSRP1       0.033144
SMAD7       0.033095
NKX2-4      0.033078
GLMP        0.032965
MXD1        0.032679
CEBPD       0.032500
KLF6  

In [77]:
# Mean of the 'is_ko_1' and 'is_ko_0' columns per TF (`gene_1`).
# NOTE(review): execution count 77 is lower than that of the cell that builds
# `moments` (79), so this output comes from an earlier run.
moments.groupby('gene_1')[['is_ko_1', 'is_ko_0']].mean()

Unnamed: 0_level_0,is_ko_1,is_ko_0
gene_1,Unnamed: 1_level_1,Unnamed: 2_level_1
AR,0.055258,0.067339
ARID3A,0.106901,0.105963
ARID4B,0.245901,0.201491
ARID5A,0.182418,0.168897
ATF3,0.15813,0.163294
ATF4,0.14231,0.10495
ATF5,0.187735,0.191477
BARX1,0.103311,0.104063
BATF,0.056553,0.07712
BAZ2A,0.185833,0.173608


In [70]:
# NOTE(review): stale duplicate — execution count 70 predates the cell that
# currently defines `ranking` (80), and the values shown differ from that run.
ranking

gene_1
BAZ2A       0.112058
NLRC5       0.085033
IRF3        0.071706
IRF1        0.059652
SP110       0.059183
IRF7        0.058923
PPARG       0.056005
STAT1       0.054455
FOXN2       0.050685
CEBPE       0.049281
TSC22D1     0.048706
CEBPB       0.047427
ATF3        0.045113
ETV7        0.044419
FOXJ3       0.040726
HOXA10      0.040406
HLX         0.039032
HOXA9       0.038111
SP100       0.037624
KLF6        0.035555
TGIF1       0.035479
MTF1        0.034824
DMRTA2      0.034271
STAT3       0.033223
TRPS1       0.032182
DDIT3       0.031538
ATF4        0.031355
KLF4        0.031304
ZNF267      0.030811
JUN         0.029706
AR          0.029341
ETV6        0.027511
LITAF       0.027431
IRF2        0.027005
YBX3        0.026839
RUNX2       0.026790
NKX2-4      0.026002
TFEC        0.025698
SHOX2       0.025674
NSD2        0.024573
CBFB        0.022890
ZNF281      0.022430
CAMTA1      0.021667
XBP1        0.021378
CEBPG       0.019805
ARID4B      0.019386
CARHSP1     0.018157
MAFG  

### Run 2D memento for transcription factors

In [54]:
# For every filtered TF and every pathway knockout, test whether the TF's
# relationship to the ISGs differs between knockout and NT cells, and cache the
# result as one .h5ad file per (TF, knockout) pair so the loop can be resumed.
out_dir = data_path + '2d_tf/'
os.makedirs(out_dir, exist_ok=True)  # a fresh checkout has no cache directory yet
done_files = set(os.listdir(out_dir))
for g in filtered_tfs:

    for g_ko in ['NT', 'STAT1', 'JAK2', 'IFNGR1', 'CUL3']:

        # Skip self-pairs (a TF is never tested against its own knockout) and
        # the 'NT' pseudo-gene, then skip pairs that are already on disk.
        if g == g_ko or g == 'NT' or g_ko == 'NT': continue
        fname = '{}_with_{}_KO_isg.h5ad'.format(g, g_ko)
        if fname in done_files: continue
        print(g, g_ko)

        # Stimulated cells with an NT guide or a guide against g_ko. A single
        # .copy() is sufficient; copying all of `adata` first was dead work
        # repeated on every iteration.
        subset = adata[adata.obs['gene'].isin(['NT', g_ko]) & (adata.obs['treatment']=='tx')].copy()

        # 1 when the guide ID contains the knocked-out gene's name, else 0.
        subset.obs['is_ko'] = subset.obs.guide_ID.str.contains(g_ko).values.astype(int)

        memento.create_groups(subset, label_columns=['is_ko', 'replicate'])

        memento.compute_1d_moments(subset, min_perc_group=.7)

        # The TF must still be in `subset.var.index` after compute_1d_moments.
        if g not in subset.var.index: continue
        # sorted() keeps the pair order reproducible across kernel sessions.
        available_genes = sorted(set(subset.var.index) & set(thp1_isgs))
        memento.compute_2d_moments(subset, list(itertools.product([g], available_genes)))
        # A disabled earlier variant restricted the ISGs to pairs whose
        # is_ko_0 value was above 0.2 or below -0.2 before testing.

        print(g, g_ko, len(available_genes))
        memento.ht_2d_moments(
            subset,
            formula_like='1 + is_ko + replicate',
            treatment_col='is_ko',
            num_boot=10000,
            verbose=1,
            num_cpus=94,  # NOTE(review): sized for the original host
            resampling='permutation',
            approx=True)

        subset.write(out_dir + fname)

In [59]:
# Sanity check: is STAT1 itself among the filtered TFs?
'STAT1' in filtered_tfs

True

In [60]:
# NOTE(review): this read cannot succeed as written. The 2D TF loop skips every
# pair where the TF equals the knocked-out gene (`if g == g_ko ... continue`),
# so 'STAT1_with_STAT1_KO_isg.h5ad' is never written. Choose a TF different
# from g_ko, or remove this cell.
tf = 'STAT1'
g_ko = 'STAT1'
subset = sc.read(data_path + '2d_tf/{}_with_{}_KO_isg.h5ad'.format(tf, g_ko))

FileNotFoundError: [Errno 2] Unable to open file (unable to open file: name = '/data_volume/memento/eccite/2d_tf/STAT1_with_STAT1_KO_isg.h5ad', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

### Check TF's correlation to other genes following KO

In [123]:
# For each knocked-out gene, test how that gene's relationship to every other
# measured gene changes between knockout and NT cells, and save the result.
# NOTE(review): unlike the TF cells, this one does not restrict to 'tx' cells.
# subset_ko_genes = ['STAT1', 'STAT2', 'JAK2', 'CUL3', 'IFNGR1', 'IFNGR2', 'MYC', 'SMAD4']
done_files = os.listdir(data_path + '/2d/')
for g in ['JAK2']:#ko_genes:

    if g == 'NT': continue
    fname = '{}_with_{}_KO_guide_rep.h5ad'.format(g, g)
    # Resume check is currently disabled, so listed genes are always recomputed:
    # if fname in done_files: continue

    # A single .copy() detaches the slice; the former full-dataset double copy
    # was overwritten immediately.
    subset = adata[adata.obs['gene'].isin(['NT', g])].copy()

    # Cast to int so `is_ko` has the same 0/1 encoding as in the other 2D cells
    # that use the same formula and treatment_col.
    subset.obs['is_ko'] = subset.obs.guide_ID.str.contains(g).values.astype(int)

    memento.create_groups(subset, label_columns=['is_ko', 'replicate', 'guide_ID'])

    memento.compute_1d_moments(subset, min_perc_group=.7)

    # The knocked-out gene must still be in `subset.var.index` after
    # compute_1d_moments.
    if g not in subset.var.index: continue

    # Same genes as before, but in index order instead of arbitrary set order.
    available_genes = subset.var.index.tolist()

    if len(available_genes) < 10: continue
    memento.compute_2d_moments(subset, list(itertools.product([g], available_genes)))

    memento.ht_2d_moments(
        subset,
        formula_like='1 + is_ko + replicate',
        treatment_col='is_ko',
        num_boot=10000,
        verbose=1,
        num_cpus=14,
        resampling='bootstrap',
        approx=False)

    subset.write(data_path + '2d/' + fname)

  res = method(*args, **kwargs)
  rv[cond] = np.exp(np.log(var[cond]) - f(np.log(mean[cond])))


In [124]:
# Called without `groupby`, the result is unpacked into two objects here,
# unlike the earlier grouped call, which is assigned to a single name.
# This operates on whatever `subset` the last executed cell left behind.
moments, counts = memento.get_2d_moments(subset)

In [131]:
# Row for IFNGR1 itself in the IFNGR1 result table.
# NOTE(review): `results` is not defined in any cell visible in this notebook;
# it relies on hidden kernel state or a cell outside this view.
results['IFNGR1'].query('gene == "IFNGR1"')

Unnamed: 0,gene,de_coef,de_se,de_pval,dv_coef,dv_se,dv_pval,de_fdr,dv_fdr
3183,IFNGR1,-0.150763,0.045791,0.001701,0.199772,0.201333,0.314369,0.006115,0.849193


In [127]:
# Genes from the JAK2 result table with dv_fdr < 0.1 and a positive dv_coef.
# NOTE(review): `results` is not defined in any cell visible in this notebook.
dv_genes = results['JAK2'].query('dv_fdr < 0.1 & dv_coef > 0').gene.tolist()

In [120]:
# Copy the STAT1 and GBP4 columns of the sparse matrix X into `obs` as flat
# 1-D arrays so they sit next to the cell metadata.
subset.obs['STAT1'] = subset[:, 'STAT1'].X.toarray().ravel()
subset.obs['GBP4'] = subset[:, 'GBP4'].X.toarray().ravel()

  res = method(*args, **kwargs)


In [70]:
# Same filter as the JAK2 cell, applied to the STAT1 result table; this
# overwrites `dv_genes`.
# NOTE(review): `results` is not defined in any cell visible in this notebook,
# and execution count 70 means this cell ran before the JAK2 variant (127).
dv_genes = results['STAT1'].query('dv_fdr < 0.1 & dv_coef > 0').gene.tolist()

### Debugging
