# 00__motifs

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import sys

from scipy.stats import spearmanr

# import utils
sys.path.append("../../../utils")
from plotting_utils import *
from misc_utils import *
from norm_utils import *

# draw figures inside the notebook and emit them as SVG
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
# turn matplotlib's figure.autolayout setting off
mpl.rcParams['figure.autolayout'] = False

In [2]:
# PAPER_FONTSIZE / PAPER_PRESET are not defined in this notebook; they come in
# through the star-imports from the utils directory
fontsize = PAPER_FONTSIZE

# apply the shared preset as keyword arguments to seaborn
sns.set(**PAPER_PRESET)

In [3]:
# seed numpy's global RNG so the random draws further down (e.g. DataFrame.sample) repeat
np.random.seed(seed=2019)

## variables

In [30]:
# per-species "max tile" TSS value tables, both under the same activity directory
data_dir = "../../../data/02__mpra/02__activs"
human_max_f = "%s/human_TSS_vals.max_tile.txt" % (data_dir,)
mouse_max_f = "%s/mouse_TSS_vals.max_tile.txt" % (data_dir,)

In [12]:
# results table (native cis/trans effects) written to the MPRA results directory
results_dir = "../../../data/02__mpra/03__results"
results_f = "%s/native_cis_trans_effects_data.txt" % (results_dir,)

In [20]:
# gzipped FIMO output, one file per genome build (hg19 and mm9)
motif_dir = "../../../data/04__mapped_motifs"
human_motifs_f = "%s/hg19_human_curated_tfs_out/fimo.txt.gz" % (motif_dir,)
mouse_motifs_f = "%s/mm9_human_curated_tfs_out/fimo.txt.gz" % (motif_dir,)

In [14]:
# TF expression tables: the orthologous-TF table plus one table per cell line
expr_dir = "../../../data/03__rna_seq/04__TF_expr"
orth_expr_f = "%s/orth_TF_expression.txt" % (expr_dir,)
human_expr_f = "%s/hESC_TF_expression.txt" % (expr_dir,)
mouse_expr_f = "%s/mESC_TF_expression.txt" % (expr_dir,)

## 1. import data

In [21]:
# tab-separated results table; preview the first rows
results = pd.read_csv(results_f, sep="\t")
results.head()

Unnamed: 0,chr_tss_hg19,start_tss_hg19,end_tss_hg19,strand_tss_hg19,cage_id_hg19,biotype_hg19,name_peak_hg19,seq_orth,cage_orth,chr_tss_mm9,...,trans_status_detail_human,fdr_trans_mouse,logFC_trans_mouse,abs_logFC_trans_mouse,trans_status_mouse,trans_status_detail_mouse,trans_status_one,trans_status_detail_one,logFC_trans_max,abs_logFC_trans_max
0,chr1,2985430,2985431,-,"chr1:2985420..2985438,-",div_lnc,ENSG00000177133.6,True,True,chr4,...,no trans effect,0.585918,-0.384559,0.384559,no trans effect,no trans effect,no trans effect,no trans effect,-0.384559,0.384559
1,chr1,8086552,8086553,+,"chr1:8086546..8086571,+",div_lnc,ENSG00000238290.1,True,True,chr4,...,no trans effect,0.492102,0.500227,0.500227,no trans effect,no trans effect,no trans effect,no trans effect,1.300086,1.300086
2,chr1,26498322,26498323,-,"chr1:26498321..26498327,-",antisense_upep,ENSG00000236782.1,True,True,chr4,...,no trans effect,0.902393,-0.088485,0.088485,no trans effect,no trans effect,no trans effect,no trans effect,0.443366,0.443366
3,chr1,65533428,65533429,-,"chr1:65533390..65533443,-",intergenic,ENSG00000231485.1,True,True,chr4,...,no trans effect,0.311051,0.460589,0.460589,no trans effect,no trans effect,no trans effect,no trans effect,0.460589,0.460589
4,chr1,65533462,65533463,-,"chr1:65533457..65533465,-",intergenic,ENSG00000231485.1,True,True,chr4,...,no trans effect,0.375934,0.447387,0.447387,no trans effect,no trans effect,no trans effect,no trans effect,0.447387,0.447387


In [31]:
# load the human and mouse max-tile tables the same way (tab-separated)
human_max, mouse_max = (
    pd.read_csv(max_tile_path, sep="\t") for max_tile_path in (human_max_f, mouse_max_f)
)
human_max.head()

Unnamed: 0,element,tss_id,biotype_hg19,tss_tile_num,HUES64,HUES64_log,HUES64_padj,HUES64_sig,cleaner_biotype
0,GCCCTCTCGCCCCGCCCTTGCCCAGGCAGCCCCCGGTCGCGACGGC...,h.999,div_pc,tile2,0.588526,-0.230235,0.01848145,sig,mRNA
1,GGGCGGGACGGAGACTCTGGGCTCAAGGCTCCTGGAAATGGGCGGG...,h.998,div_pc,tile2,0.885958,-0.052587,1.044316e-06,sig,mRNA
2,AAAAGGCAGTGCTTGATTCAATTCAACATTCACTGCGCCACTTACC...,h.997,div_pc,tile2,0.423129,-0.373527,0.2996315,not sig,mRNA
3,CGGAGGGGCGGGGCAAGAGTGGGAGGAGACCCTGCGCGCGGCCGCC...,h.996,div_pc,tile2,2.452145,0.389546,5.019212e-74,sig,mRNA
4,AGGGTGGTGCGTGGTCTACGGCGAGCGGAGTGGGGCGGGGTCGCGC...,h.995,div_pc,tile1,0.994045,-0.002594,6.400385e-09,sig,mRNA


In [22]:
# FIMO hits for the human sequences (tab-separated; the path ends in .gz)
human_motifs = pd.read_csv(human_motifs_f, sep="\t")
human_motifs.head()

Unnamed: 0,#pattern name,sequence name,start,stop,strand,score,p-value,q-value,matched sequence
0,ZNF212,HUMAN_EVO_TSS__h.3519__tile2;WT::chr7:47680526...,5,34,+,55.8485,7.5e-19,2.85e-13,gagagagagagagagagagagagagagaga
1,ZNF212,HUMAN_EVO_TSS__h.3519__tile2;WT::chr7:47680526...,7,36,+,55.8485,7.5e-19,2.85e-13,gagagagagagagagagagagagagagaga
2,ZNF212,HUMAN_EVO_TSS__h.3519__tile2;WT::chr7:47680526...,9,38,+,55.8485,7.5e-19,2.85e-13,gagagagagagagagagagagagagagaga
3,ZNF212,HUMAN_EVO_TSS__h.3519__tile2;WT::chr7:47680526...,11,40,+,55.8485,7.5e-19,2.85e-13,gagagagagagagagagagagagagagaga
4,ZNF212,HUMAN_EVO_TSS__h.357__tile2;WT::chr8:65285637-...,43,72,+,52.5051,5.31e-18,1.15e-12,gagagagagagagagagagaaagagagaga


In [23]:
# FIMO hits for the mouse sequences (tab-separated; the path ends in .gz)
mouse_motifs = pd.read_csv(mouse_motifs_f, sep="\t")
mouse_motifs.head()

Unnamed: 0,#pattern name,sequence name,start,stop,strand,score,p-value,q-value,matched sequence
0,ZNF212,MOUSE_EVO_TSS__m.2482__tile2;WT::chr4:9214553-...,1,30,+,55.8485,7.5e-19,7.97e-15,gagagagagagagagagagagagagagaga
1,ZNF212,MOUSE_EVO_TSS__m.2482__tile2;WT::chr4:9214553-...,3,32,+,55.8485,7.5e-19,7.97e-15,gagagagagagagagagagagagagagaga
2,ZNF212,MOUSE_EVO_TSS__m.2482__tile2;WT::chr4:9214553-...,5,34,+,55.8485,7.5e-19,7.97e-15,gagagagagagagagagagagagagagaga
3,ZNF212,MOUSE_EVO_TSS__m.315__tile1;WT::chr18:7003691-...,9,38,+,55.8485,7.5e-19,7.97e-15,gagagagagagagagagagagagagagaga
4,ZNF212,MOUSE_EVO_TSS__m.315__tile1;WT::chr18:7003691-...,11,40,+,55.8485,7.5e-19,7.97e-15,gagagagagagagagagagagagagagaga


In [24]:
# expression table for orthologous TFs (tab-separated)
orth_expr = pd.read_csv(orth_expr_f, sep="\t")
orth_expr.head()

Unnamed: 0,gene_id_human,gene_name_human,mean_tpm_human,gene_id_mouse,gene_name_mouse,mean_tpm_mouse,baseMean,log2FoldChange,lfcSE,padj,sig
0,ENSG00000197921,HES5,101.620874,ENSMUSG00000048001,Hes5,0.0,42.463063,9.220488,1.288307,1.707594e-12,sig
1,ENSG00000069812,HES2,30.322266,ENSMUSG00000028940,Hes2,11.737072,19.474793,1.49364,0.690207,0.03923633,not sig
2,ENSG00000009709,PAX7,240.239581,ENSMUSG00000028736,Pax7,8.372211,105.367706,4.965287,0.503225,1.561741e-22,sig
3,ENSG00000007968,E2F2,582.324386,ENSMUSG00000018983,E2f2,737.478304,671.003203,-0.234384,0.12364,0.0727451,not sig
4,ENSG00000020633,RUNX3,32.967646,ENSMUSG00000070691,Runx3,41.452263,37.799922,-0.224945,0.457367,0.662658,not sig


In [25]:
# hESC TF expression table (tab-separated)
human_expr = pd.read_csv(human_expr_f, sep="\t")
human_expr.head()

Unnamed: 0,index,gene_name,mean_tpm
0,ENSG00000197921,HES5,101.620874
1,ENSG00000069812,HES2,30.322266
2,ENSG00000074800,ENO1,90568.747186
3,ENSG00000009709,PAX7,240.239581
4,ENSG00000007968,E2F2,582.324386


In [26]:
# mESC TF expression table (tab-separated)
mouse_expr = pd.read_csv(mouse_expr_f, sep="\t")
mouse_expr.head()

Unnamed: 0,gene_id_human,gene_name_human,gene_id_mouse,gene_name_mouse,mean_tpm_mouse
0,ENSG00000197921,HES5,ENSMUSG00000048001,Hes5,0.0
1,ENSG00000069812,HES2,ENSMUSG00000028940,Hes2,11.737072
2,ENSG00000074800,ENO1,ENSMUSG00000059040,Eno1b,1312.160341
3,ENSG00000074800,ENO1,ENSMUSG00000063524,Eno1,73990.521463
4,ENSG00000009709,PAX7,ENSMUSG00000028736,Pax7,8.372211


## 2. parse motif files

In [28]:
# split each FIMO sequence name into its parts; the previewed names look like
# "HUMAN_EVO_TSS__h.3519__tile2;WT::chr7:..." (i.e. "__"-separated fields before the first ";")
_human_names = human_motifs["sequence name"]
_human_prefix = _human_names.str.split(";", expand=True)[0]

# 2nd "__" field of the full name -> TSS id
human_motifs["hg19_id"] = _human_names.str.split("__", expand=True)[1]
# 3rd "__" field of the text before the first ";" -> tile number
human_motifs["tile_num"] = _human_prefix.str.split("__", expand=True)[2]
# NOTE(review): the strand is read from the second-to-last character of the name; the
# end of the name is truncated in the preview, so confirm the format against the FASTA headers
human_motifs["tss_strand"] = _human_names.str[-2]
human_motifs.head()

Unnamed: 0,#pattern name,sequence name,start,stop,strand,score,p-value,q-value,matched sequence,hg19_id,tile_num,tss_strand
0,ZNF212,HUMAN_EVO_TSS__h.3519__tile2;WT::chr7:47680526...,5,34,+,55.8485,7.5e-19,2.85e-13,gagagagagagagagagagagagagagaga,h.3519,tile2,+
1,ZNF212,HUMAN_EVO_TSS__h.3519__tile2;WT::chr7:47680526...,7,36,+,55.8485,7.5e-19,2.85e-13,gagagagagagagagagagagagagagaga,h.3519,tile2,+
2,ZNF212,HUMAN_EVO_TSS__h.3519__tile2;WT::chr7:47680526...,9,38,+,55.8485,7.5e-19,2.85e-13,gagagagagagagagagagagagagagaga,h.3519,tile2,+
3,ZNF212,HUMAN_EVO_TSS__h.3519__tile2;WT::chr7:47680526...,11,40,+,55.8485,7.5e-19,2.85e-13,gagagagagagagagagagagagagagaga,h.3519,tile2,+
4,ZNF212,HUMAN_EVO_TSS__h.357__tile2;WT::chr8:65285637-...,43,72,+,52.5051,5.31e-18,1.15e-12,gagagagagagagagagagaaagagagaga,h.357,tile2,+


In [29]:
# split each FIMO sequence name into its parts; the previewed names look like
# "MOUSE_EVO_TSS__m.2482__tile2;WT::chr4:..." (i.e. "__"-separated fields before the first ";")
_mouse_names = mouse_motifs["sequence name"]
_mouse_prefix = _mouse_names.str.split(";", expand=True)[0]

# 2nd "__" field of the full name -> TSS id
mouse_motifs["mm9_id"] = _mouse_names.str.split("__", expand=True)[1]
# NOTE(review): the strand is read from the second-to-last character of the name; the
# end of the name is truncated in the preview, so confirm the format against the FASTA headers
mouse_motifs["tss_strand"] = _mouse_names.str[-2]
# 3rd "__" field of the text before the first ";" -> tile number
mouse_motifs["tile_num"] = _mouse_prefix.str.split("__", expand=True)[2]
mouse_motifs.head()

Unnamed: 0,#pattern name,sequence name,start,stop,strand,score,p-value,q-value,matched sequence,mm9_id,tss_strand,tile_num
0,ZNF212,MOUSE_EVO_TSS__m.2482__tile2;WT::chr4:9214553-...,1,30,+,55.8485,7.5e-19,7.97e-15,gagagagagagagagagagagagagagaga,m.2482,-,tile2
1,ZNF212,MOUSE_EVO_TSS__m.2482__tile2;WT::chr4:9214553-...,3,32,+,55.8485,7.5e-19,7.97e-15,gagagagagagagagagagagagagagaga,m.2482,-,tile2
2,ZNF212,MOUSE_EVO_TSS__m.2482__tile2;WT::chr4:9214553-...,5,34,+,55.8485,7.5e-19,7.97e-15,gagagagagagagagagagagagagagaga,m.2482,-,tile2
3,ZNF212,MOUSE_EVO_TSS__m.315__tile1;WT::chr18:7003691-...,9,38,+,55.8485,7.5e-19,7.97e-15,gagagagagagagagagagagagagagaga,m.315,+,tile1
4,ZNF212,MOUSE_EVO_TSS__m.315__tile1;WT::chr18:7003691-...,11,40,+,55.8485,7.5e-19,7.97e-15,gagagagagagagagagagagagagagaga,m.315,+,tile1


In [32]:
# motifs were mapped in both tiles of every TSS; joining on (TSS id, tile number) keeps
# only the hits that fall in each TSS's max tile
# NOTE(review): this is a left join and "element" comes from the left table, so the
# null filter below cannot remove max tiles that have no motif hit -- confirm that
# keeping those rows (with NaN motif columns) is intended
human_max_motifs = pd.merge(
    human_max,
    human_motifs,
    how="left",
    left_on=["tss_id", "tss_tile_num"],
    right_on=["hg19_id", "tile_num"],
)
# reset_index() without drop=True also stores the row number in an "index" column
human_max_motifs = human_max_motifs.reset_index()
human_max_motifs = human_max_motifs[human_max_motifs["element"].notnull()]
human_max_motifs.head()

Unnamed: 0,index,element,tss_id,biotype_hg19,tss_tile_num,HUES64,HUES64_log,HUES64_padj,HUES64_sig,cleaner_biotype,...,start,stop,strand,score,p-value,q-value,matched sequence,hg19_id,tile_num,tss_strand
0,0,GCCCTCTCGCCCCGCCCTTGCCCAGGCAGCCCCCGGTCGCGACGGC...,h.999,div_pc,tile2,0.588526,-0.230235,0.018481,sig,mRNA,...,4.0,23.0,-,19.0367,9.94e-08,9.8e-05,GGGCAAGGGCGGGGCGAGAG,h.999,tile2,+
1,1,GCCCTCTCGCCCCGCCCTTGCCCAGGCAGCCCCCGGTCGCGACGGC...,h.999,div_pc,tile2,0.588526,-0.230235,0.018481,sig,mRNA,...,70.0,84.0,-,19.0367,2.41e-07,0.00052,CCAGGGCCGCCGCCC,h.999,tile2,+
2,2,GCCCTCTCGCCCCGCCCTTGCCCAGGCAGCCCCCGGTCGCGACGGC...,h.999,div_pc,tile2,0.588526,-0.230235,0.018481,sig,mRNA,...,9.0,23.0,-,18.2857,2.46e-07,0.000586,GGGCAAGGGCGGGGC,h.999,tile2,+
3,3,GCCCTCTCGCCCCGCCCTTGCCCAGGCAGCCCCCGGTCGCGACGGC...,h.999,div_pc,tile2,0.588526,-0.230235,0.018481,sig,mRNA,...,108.0,128.0,+,17.6429,2.86e-07,0.000805,GGGGCAGGCGCGAGGAGCGGC,h.999,tile2,+
4,4,GCCCTCTCGCCCCGCCCTTGCCCAGGCAGCCCCCGGTCGCGACGGC...,h.999,div_pc,tile2,0.588526,-0.230235,0.018481,sig,mRNA,...,3.0,23.0,-,18.1414,3.48e-07,0.000311,GGGCAAGGGCGGGGCGAGAGG,h.999,tile2,+


In [33]:
# limit motif tiles to those that are max tiles (since we mapped motifs in both tiles)
mouse_max_motifs = mouse_max.merge(mouse_motifs, left_on=["tss_id", "tss_tile_num"],
                                   right_on=["mm9_id", "tile_num"], how="left").reset_index()
mouse_max_motifs = mouse_max_motifs[~pd.isnull(mouse_max_motifs["element"])]
mouse_max_motifs.head()

Unnamed: 0,index,element,tss_id,biotype_mm9,tss_tile_num,mESC,mESC_log,mESC_padj,mESC_sig,cleaner_biotype,...,start,stop,strand,score,p-value,q-value,matched sequence,mm9_id,tss_strand,tile_num
0,0,GGGGGTTGGTGGCGGCGAGGTGGAGATGCTAGAAAAGGGCGGGACC...,m.999,div_pc,tile1,2.43217,0.385994,5.320842e-37,sig,mRNA,...,109.0,123.0,+,22.5918,3.22e-09,0.00136,CGCCAAGATGGCGGC,m.999,-,tile1
1,1,GGGGGTTGGTGGCGGCGAGGTGGAGATGCTAGAAAAGGGCGGGACC...,m.999,div_pc,tile1,2.43217,0.385994,5.320842e-37,sig,mRNA,...,112.0,124.0,+,21.9388,1.71e-08,0.00208,CAAGATGGCGGCG,m.999,-,tile1
2,2,GGGGGTTGGTGGCGGCGAGGTGGAGATGCTAGAAAAGGGCGGGACC...,m.999,div_pc,tile1,2.43217,0.385994,5.320842e-37,sig,mRNA,...,112.0,126.0,+,21.2347,2.04e-08,0.00156,CAAGATGGCGGCGCT,m.999,-,tile1
3,3,GGGGGTTGGTGGCGGCGAGGTGGAGATGCTAGAAAAGGGCGGGACC...,m.999,div_pc,tile1,2.43217,0.385994,5.320842e-37,sig,mRNA,...,112.0,126.0,+,21.5204,2.18e-08,0.00109,CAAGATGGCGGCGCT,m.999,-,tile1
4,4,GGGGGTTGGTGGCGGCGAGGTGGAGATGCTAGAAAAGGGCGGGACC...,m.999,div_pc,tile1,2.43217,0.385994,5.320842e-37,sig,mRNA,...,112.0,126.0,+,21.051,2.34e-08,0.00168,CAAGATGGCGGCGCT,m.999,-,tile1


## 3. find motifs enriched in trans effects

In [35]:
# distinct motif names among the human max-tile hits
# NOTE(review): the merge above is a left join, so this array may contain NaN for
# tiles without any hit -- check before treating the count as a number of motifs
uniq_human_tfs = pd.unique(human_max_motifs["#pattern name"])
len(uniq_human_tfs)

5523

In [38]:
# peek at one randomly drawn merged row, shown as a Series (field -> value)
human_max_motifs.sample(n=1).iloc[0]

index                                                          215102
element             GCAGTTTCGGCCGCTCAGTTGCAGGCCCTCGCCACGGAGGCCACGC...
tss_id                                                          h.498
biotype_hg19                                           protein_coding
tss_tile_num                                                    tile2
HUES64                                                       0.326717
HUES64_log                                                  -0.485828
HUES64_padj                                                   0.63133
HUES64_sig                                                    not sig
cleaner_biotype                                                  mRNA
#pattern name                                                M00636_1
sequence name       HUMAN_EVO_TSS__h.498__tile2;WT::chr6:151815027...
start                                                              70
stop                                                               79
strand              

In [40]:
# spot-check five randomly drawn rows of the parsed human motif table
human_motifs.sample(n=5)

Unnamed: 0,#pattern name,sequence name,start,stop,strand,score,p-value,q-value,matched sequence,hg19_id,tile_num,tss_strand
2918883,M04338_1,HUMAN_EVO_TSS__h.21__tile2;WT::chr1:185286371-...,48,61,+,9.90816,9.6e-05,0.482,tcaccacgtgcgct,h.21,tile2,+
1625360,M05277_1,HUMAN_EVO_TSS__h.2261__tile2;WT::chr5:17971972...,49,63,-,11.3232,4.3e-05,0.3,AGGGCGCGGTGGCTC,h.2261,tile2,-
2330669,M05433_1,HUMAN_EVO_TSS__h.197__tile1;WT::chr2:105469616...,65,84,-,8.2449,7.1e-05,0.028,CCCAGAGGAGGGAGGAGGGA,h.197,tile1,-
1612245,M10334_1,HUMAN_EVO_TSS__h.1702__tile1;WT::chr10:1727265...,43,57,+,11.3367,4.2e-05,0.00815,TGGGTGGGGTCAGGC,h.1702,tile1,-
464446,M05646_1,HUMAN_EVO_TSS__h.2111__tile1;WT::chr8:14384437...,77,92,-,11.5714,7e-06,0.146,CCCCGCCCCCCCACCT,h.2111,tile1,+
