# Introduction

This analysis to quantify the cooperative binding of Tead4 on Tead double motifs genome-wide using trined ESC model (single task). We  with measure the fold increase in binding to the intact double motif as compared to the sum of the binding to each motif alone.

# Computational setup

## Environment

In [1]:
import warnings
warnings.filterwarnings("ignore")
from tensorflow.python.util import deprecation
deprecation._PRINT_DEPRECATION_WARNINGS = False

#Packages
import os
import sys
import pandas as pd
import numpy as np
import plotnine
#!pip install logomaker
import seaborn as sns
from pybedtools import BedTool
from tqdm import tqdm
from bpnet.utils import create_tf_session
from bpnet.BPNet import BPNetSeqModel
from bpnet.simulate import motif_coords
from bpnet.preproc import resize_interval
from concise.preprocessing import encodeDNA
from bpnet.extractors import extract_seq
from plotnine import *
from Bio.Seq import Seq

# Settings
os.chdir(f'/n/projects/kd2200/publication/bpnet/analysis/')
pd.set_option('display.max_columns', 100)
create_tf_session('0',1)

# Custom functions
sys.path.insert(0, f'/n/projects/kd2200/publication/bpnet/analysis/scripts/bpnet/scripts')
from perturb_functions import insert_motif, one_hot_decode_sequence, one_hot_encode_sequences, one_hot_encode_sequence
from motif_functions import remove_palindromic_motif_duplicates
from data_format_functions import tidy_bpnet_predictions_nexus,tidy_bpnet_contributions
sys.path.insert(0, f'/n/projects/kd2200/publication/bpnet/analysis/scripts/py')
from motifs import extract_seqs_from_df

Using TensorFlow backend.






2024-01-15 13:31:48,986 [INFO] Note: detected 80 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
2024-01-15 13:31:48,988 [INFO] Note: NumExpr detected 80 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2024-01-15 13:31:48,989 [INFO] NumExpr defaulting to 8 threads.












2024-01-15 13:31:52,151 [INFO] Failed to extract font properties from /usr/share/fonts/google-noto-emoji/NotoColorEmoji.ttf: In FT2Font: Can not load face.  Unknown file format.


## Variables

In [2]:
%matplotlib inline

# Pre-existing variables
fasta_file = f'/n/projects/kd2200/publication/bpnet/fasta/mm10.fa'
model_dir = f'/n//projects/kd2200/publication/bpnet/bpnet_single_tead4_esc/tead4_esc/model/dataspec.yaml_default/'

# Independent variables
#tead4_motifs_path = f'tsv/tead4_double_escsingletask_motifs_noerv_no_promoter_test_cwm.tsv.gz'
tead4_motifs_path = f'/n//projects/kd2200/publication/bpnet/bpnet_single_tead4_esc/tead4_esc/modisco/profile/dataspec.yaml_default/tead4_esc/motif-instances-all-regions.tsv.gz'


## Load BPNet model

In [3]:
model = BPNetSeqModel.from_mdir(model_dir)





































The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.















# Analysis code

## Import Tead4 double motifs

In [4]:
all_motifs_df = pd.read_csv(tead4_motifs_path, sep = '\t')

motif_df = all_motifs_df[all_motifs_df['pattern']=='metacluster_0/pattern_2']  
motif_df

Unnamed: 0,example_chrom,pattern_start_abs,pattern_end_abs,pattern,contrib_weighted_p,strand,match_weighted_p,example_idx,pattern_start,pattern_end,pattern_center,pattern_len,match_weighted,match_weighted_cat,match_max,match_max_task,contrib_weighted,contrib_weighted_cat,contrib_max,contrib_max_task,seq_match,seq_match_p,seq_match_cat,match/tead4_esc,contrib/tead4_esc,pattern_short,example_start,example_end,example_strand,example_interval_from_task
14221,chr1,155438830,155438848,metacluster_0/pattern_2,0.929825,-,0.727273,47,529,547,538,18,0.570771,high,0.570771,tead4_esc,2.918528,high,2.918528,tead4_esc,13.782475,0.732057,high,0.570771,2.918528,m0_p2,155438301,155439301,.,tead4_esc
14222,chr1,164004494,164004512,metacluster_0/pattern_2,0.149920,-,0.272727,69,473,491,482,18,0.452638,low,0.452638,tead4_esc,0.656639,low,0.656639,tead4_esc,8.362925,0.098884,low,0.452638,0.656639,m0_p2,164004021,164005021,.,tead4_esc
14223,chr1,73938886,73938904,metacluster_0/pattern_2,0.397129,-,0.556619,75,464,482,473,18,0.526379,medium,0.526379,tead4_esc,1.145823,medium,1.145823,tead4_esc,12.745531,0.580542,medium,0.526379,1.145823,m0_p2,73938422,73939422,.,tead4_esc
14224,chr1,30937831,30937849,metacluster_0/pattern_2,0.794258,+,0.221691,76,533,551,542,18,0.437897,low,0.437897,tead4_esc,2.113826,high,2.113826,tead4_esc,8.372957,0.100478,low,0.437897,2.113826,m0_p2,30937298,30938298,.,tead4_esc
14225,chr1,30937834,30937852,metacluster_0/pattern_2,0.830941,-,0.588517,76,536,554,545,18,0.534469,medium,0.534469,tead4_esc,2.263070,high,2.263070,tead4_esc,15.926104,0.939394,high,0.534469,2.263070,m0_p2,30937298,30938298,.,tead4_esc
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15435,chrX,36211345,36211363,metacluster_0/pattern_2,0.620415,+,0.732057,14164,481,499,490,18,0.571804,high,0.571804,tead4_esc,1.674750,medium,1.674750,tead4_esc,13.710750,0.722488,high,0.571804,1.674750,m0_p2,36210864,36211864,.,tead4_esc
15436,chrX,36211348,36211366,metacluster_0/pattern_2,0.637959,-,0.891547,14164,484,502,493,18,0.632386,high,0.632386,tead4_esc,1.714056,medium,1.714056,tead4_esc,15.543586,0.923445,high,0.632386,1.714056,m0_p2,36210864,36211864,.,tead4_esc
15437,chrX,79671592,79671610,metacluster_0/pattern_2,0.307815,+,0.331738,14168,493,511,502,18,0.469482,medium,0.469482,tead4_esc,0.989046,low,0.989046,tead4_esc,9.950651,0.208931,low,0.469482,0.989046,m0_p2,79671099,79672099,.,tead4_esc
15438,chrX,42050823,42050841,metacluster_0/pattern_2,0.617225,+,0.759171,14191,406,424,415,18,0.579459,high,0.579459,tead4_esc,1.667640,medium,1.667640,tead4_esc,14.311833,0.802233,high,0.579459,1.667640,m0_p2,42050417,42051417,.,tead4_esc


In [5]:
# make grnages using pyrange
import pyranges as pr

motif_df['row_idx'] = np.arange(len(motif_df))
motif_df['Chromosome'] = motif_df.example_chrom
motif_df['Start'] = motif_df.pattern_start_abs
motif_df['End'] = motif_df.pattern_end_abs
motif_df['Strand'] = motif_df.strand

cwm_scan_regions_pr = pr.PyRanges(motif_df)
len(cwm_scan_regions_pr)

1219

In [None]:
# Remove ERVs

In [6]:
#functions
def read_repeat_masker(file_path):
    dfrm = pd.read_table(file_path, delim_whitespace=True, header=[1])

    dfrm.columns = [x.replace("\n", "_") for x in dfrm.columns]

    dfrm['name'] = dfrm['repeat'] + "//" + dfrm['class/family']

    dfrm = dfrm[['ins.', 'sequence', 'begin', 'name']]
    dfrm.columns = ['chrom', 'start', 'end', 'name']
    return dfrm


def intersect_repeat_masker(pattern_name, seqlets: BedTool, repeat_masker: BedTool, f=1.0):
    """Intersect the seqlets bed file with 
    """
    try:
        dfint = seqlets.intersect(repeat_masker, wa=True, wb=True, f=f).to_dataframe()
    except Exception:
        return None
    t = dfint.blockCount.str.split("//", expand=True)
    dfint['pattern_name'] = pattern_name
    dfint['repeat_name'] = t[0]
    dfint['repeat_family'] = t[1]
    dfint['n_pattern'] = seqlets.to_dataframe()[['chrom', 'start', 'end']].drop_duplicates().shape[0]
    dfint['interval'] = dfint['chrom'] + ":" + dfint['start'].astype(str) + "-" + dfint['end'].astype(str)
    return dfint[['chrom', 'start', 'end', 'interval', 'pattern_name', 'n_pattern', 'repeat_name', 'repeat_family']]

In [7]:
# load repeat masker file and remove ervs
rm_filepath = f'/n/projects/kd2200/publication/bpnet/analysis/data/repeatmasker.mm10.fa.out.gz'
dfrm = read_repeat_masker(rm_filepath)
dfrm_erv = dfrm[dfrm.name.str.contains("ERV")]
dfrm_erv['Start'] = dfrm_erv['start']
dfrm_erv['End'] = dfrm_erv['end']
dfrm_erv['Chromosome'] = dfrm_erv['chrom']
dfrm_erv = pr.PyRanges(dfrm_erv)

# exclude erv's
exclude = cwm_scan_regions_pr.overlap(dfrm_erv).df.row_idx.unique()
exclude_rows = motif_df.row_idx.isin(exclude)
print(exclude_rows.value_counts())
print(f"Excluding: {len(exclude)} / {len(motif_df)} rows")
motif_df['is_erv'] = exclude_rows #.astype(str)

#check motif counts without ERVs
dfi = motif_df[(motif_df['is_erv'].astype(str)=='False')]
dfi.is_erv.value_counts()

False    979
True     240
Name: row_idx, dtype: int64
Excluding: 240 / 1219 rows


False    979
Name: is_erv, dtype: int64

In [8]:
motifs_df =dfi
motifs_df

Unnamed: 0,example_chrom,pattern_start_abs,pattern_end_abs,pattern,contrib_weighted_p,strand,match_weighted_p,example_idx,pattern_start,pattern_end,pattern_center,pattern_len,match_weighted,match_weighted_cat,match_max,match_max_task,contrib_weighted,contrib_weighted_cat,contrib_max,contrib_max_task,seq_match,seq_match_p,seq_match_cat,match/tead4_esc,contrib/tead4_esc,pattern_short,example_start,example_end,example_strand,example_interval_from_task,row_idx,Chromosome,Start,End,Strand,is_erv
14221,chr1,155438830,155438848,metacluster_0/pattern_2,0.929825,-,0.727273,47,529,547,538,18,0.570771,high,0.570771,tead4_esc,2.918528,high,2.918528,tead4_esc,13.782475,0.732057,high,0.570771,2.918528,m0_p2,155438301,155439301,.,tead4_esc,0,chr1,155438830,155438848,-,False
14222,chr1,164004494,164004512,metacluster_0/pattern_2,0.149920,-,0.272727,69,473,491,482,18,0.452638,low,0.452638,tead4_esc,0.656639,low,0.656639,tead4_esc,8.362925,0.098884,low,0.452638,0.656639,m0_p2,164004021,164005021,.,tead4_esc,1,chr1,164004494,164004512,-,False
14223,chr1,73938886,73938904,metacluster_0/pattern_2,0.397129,-,0.556619,75,464,482,473,18,0.526379,medium,0.526379,tead4_esc,1.145823,medium,1.145823,tead4_esc,12.745531,0.580542,medium,0.526379,1.145823,m0_p2,73938422,73939422,.,tead4_esc,2,chr1,73938886,73938904,-,False
14224,chr1,30937831,30937849,metacluster_0/pattern_2,0.794258,+,0.221691,76,533,551,542,18,0.437897,low,0.437897,tead4_esc,2.113826,high,2.113826,tead4_esc,8.372957,0.100478,low,0.437897,2.113826,m0_p2,30937298,30938298,.,tead4_esc,3,chr1,30937831,30937849,+,False
14225,chr1,30937834,30937852,metacluster_0/pattern_2,0.830941,-,0.588517,76,536,554,545,18,0.534469,medium,0.534469,tead4_esc,2.263070,high,2.263070,tead4_esc,15.926104,0.939394,high,0.534469,2.263070,m0_p2,30937298,30938298,.,tead4_esc,4,chr1,30937834,30937852,-,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15428,chrX,130557449,130557467,metacluster_0/pattern_2,0.354067,-,0.649123,14077,474,492,483,18,0.548904,medium,0.548904,tead4_esc,1.080100,medium,1.080100,tead4_esc,12.617257,0.559809,medium,0.548904,1.080100,m0_p2,130556975,130557975,.,tead4_esc,1207,chrX,130557449,130557467,-,False
15429,chrX,52442025,52442043,metacluster_0/pattern_2,0.773525,-,0.787879,14095,469,487,478,18,0.591039,high,0.591039,tead4_esc,2.036481,high,2.036481,tead4_esc,14.261723,0.797448,high,0.591039,2.036481,m0_p2,52441556,52442556,.,tead4_esc,1208,chrX,52442025,52442043,-,False
15432,chrX,12942634,12942652,metacluster_0/pattern_2,0.437002,-,0.259968,14146,470,488,479,18,0.449455,low,0.449455,tead4_esc,1.251646,medium,1.251646,tead4_esc,11.521040,0.376396,medium,0.449455,1.251646,m0_p2,12942164,12943164,.,tead4_esc,1211,chrX,12942634,12942652,-,False
15433,chrX,52277941,52277959,metacluster_0/pattern_2,0.373206,-,0.318979,14150,597,615,606,18,0.465604,low,0.465604,tead4_esc,1.101125,medium,1.101125,tead4_esc,9.926002,0.207337,low,0.465604,1.101125,m0_p2,52277344,52278344,.,tead4_esc,1212,chrX,52277941,52277959,-,False


In [9]:
#Get sequences on the forward strand
motifs_df['fwd_seq'] = extract_seqs_from_df(coords_df = motifs_df, fasta_path = fasta_file, chrom_column = 'Chromosome', start_column = 'Start', end_column = 'End')

In [10]:
# Reorient the negative strand sequences
motifs_oriented_df = pd.DataFrame()
for i,row in tqdm(motifs_df.iterrows()):
    if row.strand=='-':
        fwd_seq = Seq(row.fwd_seq)
        rev_seq = str(fwd_seq.reverse_complement())
        row['seq'] = rev_seq
    else:
        row['seq'] = row.fwd_seq
    motifs_oriented_df = motifs_oriented_df.append(row)

979it [00:12, 78.11it/s]


In [11]:
motifs_oriented_df

Unnamed: 0,Chromosome,End,Start,Strand,contrib/tead4_esc,contrib_max,contrib_max_task,contrib_weighted,contrib_weighted_cat,contrib_weighted_p,example_chrom,example_end,example_idx,example_interval_from_task,example_start,example_strand,fwd_seq,is_erv,match/tead4_esc,match_max,match_max_task,match_weighted,match_weighted_cat,match_weighted_p,pattern,pattern_center,pattern_end,pattern_end_abs,pattern_len,pattern_short,pattern_start,pattern_start_abs,row_idx,seq,seq_match,seq_match_cat,seq_match_p,strand
14221,chr1,155438848.0,155438830.0,-,2.918528,2.918528,tead4_esc,2.918528,high,0.929825,chr1,155439301.0,47.0,tead4_esc,155438301.0,.,GCATTCTTAGCATTCCAA,0.0,0.570771,0.570771,tead4_esc,0.570771,high,0.727273,metacluster_0/pattern_2,538.0,547.0,155438848.0,18.0,m0_p2,529.0,155438830.0,0.0,TTGGAATGCTAAGAATGC,13.782475,high,0.732057,-
14222,chr1,164004512.0,164004494.0,-,0.656639,0.656639,tead4_esc,0.656639,low,0.149920,chr1,164005021.0,69.0,tead4_esc,164004021.0,.,GCATACCTTGCATAGCAC,0.0,0.452638,0.452638,tead4_esc,0.452638,low,0.272727,metacluster_0/pattern_2,482.0,491.0,164004512.0,18.0,m0_p2,473.0,164004494.0,1.0,GTGCTATGCAAGGTATGC,8.362925,low,0.098884,-
14223,chr1,73938904.0,73938886.0,-,1.145823,1.145823,tead4_esc,1.145823,medium,0.397129,chr1,73939422.0,75.0,tead4_esc,73938422.0,.,ACATTCCTTGCATTTCTC,0.0,0.526379,0.526379,tead4_esc,0.526379,medium,0.556619,metacluster_0/pattern_2,473.0,482.0,73938904.0,18.0,m0_p2,464.0,73938886.0,2.0,GAGAAATGCAAGGAATGT,12.745531,medium,0.580542,-
14224,chr1,30937849.0,30937831.0,+,2.113826,2.113826,tead4_esc,2.113826,high,0.794258,chr1,30938298.0,76.0,tead4_esc,30937298.0,.,ATCACATGCCTGGAATTC,0.0,0.437897,0.437897,tead4_esc,0.437897,low,0.221691,metacluster_0/pattern_2,542.0,551.0,30937849.0,18.0,m0_p2,533.0,30937831.0,3.0,ATCACATGCCTGGAATTC,8.372957,low,0.100478,+
14225,chr1,30937852.0,30937834.0,-,2.263070,2.263070,tead4_esc,2.263070,high,0.830941,chr1,30938298.0,76.0,tead4_esc,30937298.0,.,ACATGCCTGGAATTCCAG,0.0,0.534469,0.534469,tead4_esc,0.534469,medium,0.588517,metacluster_0/pattern_2,545.0,554.0,30937852.0,18.0,m0_p2,536.0,30937834.0,4.0,CTGGAATTCCAGGCATGT,15.926104,high,0.939394,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15428,chrX,130557467.0,130557449.0,-,1.080100,1.080100,tead4_esc,1.080100,medium,0.354067,chrX,130557975.0,14077.0,tead4_esc,130556975.0,.,GCATACCTTGAATGCCAG,0.0,0.548904,0.548904,tead4_esc,0.548904,medium,0.649123,metacluster_0/pattern_2,483.0,492.0,130557467.0,18.0,m0_p2,474.0,130557449.0,1207.0,CTGGCATTCAAGGTATGC,12.617257,medium,0.559809,-
15429,chrX,52442043.0,52442025.0,-,2.036481,2.036481,tead4_esc,2.036481,high,0.773525,chrX,52442556.0,14095.0,tead4_esc,52441556.0,.,ACATTCCACAAATTCCAG,0.0,0.591039,0.591039,tead4_esc,0.591039,high,0.787879,metacluster_0/pattern_2,478.0,487.0,52442043.0,18.0,m0_p2,469.0,52442025.0,1208.0,CTGGAATTTGTGGAATGT,14.261723,high,0.797448,-
15432,chrX,12942652.0,12942634.0,-,1.251646,1.251646,tead4_esc,1.251646,medium,0.437002,chrX,12943164.0,14146.0,tead4_esc,12942164.0,.,GGATTCCAGGGATGCCTC,0.0,0.449455,0.449455,tead4_esc,0.449455,low,0.259968,metacluster_0/pattern_2,479.0,488.0,12942652.0,18.0,m0_p2,470.0,12942634.0,1211.0,GAGGCATCCCTGGAATCC,11.521040,medium,0.376396,-
15433,chrX,52277959.0,52277941.0,-,1.101125,1.101125,tead4_esc,1.101125,medium,0.373206,chrX,52278344.0,14150.0,tead4_esc,52277344.0,.,AGATTCCAGGCCTTCCCA,0.0,0.465604,0.465604,tead4_esc,0.465604,low,0.318979,metacluster_0/pattern_2,606.0,615.0,52277959.0,18.0,m0_p2,597.0,52277941.0,1212.0,TGGGAAGGCCTGGAATCT,9.926002,low,0.207337,-


In [23]:
motifs_oriented_df

Unnamed: 0,Chromosome,End,Start,Strand,contrib.tead4_esc,contrib_max,contrib_max_task,contrib_weighted,contrib_weighted_cat,contrib_weighted_p,end,example_end,example_idx,example_interval_from_task,example_start,example_strand,fwd_seq,match.tead4_esc,match_max,match_max_task,match_weighted,match_weighted_cat,match_weighted_p,modisco_task,motif_id,motif_type,pattern,pattern_center,pattern_end,pattern_len,pattern_name,pattern_short,pattern_start,region_end_1based,region_id,region_start_1based,row_idx,seq,seq_match,seq_match_cat,seq_match_p,seqnames,start,strand,width
0,chr1,13846735.0,13846716.0,+,0.849098,0.849098,tead4_esc,0.849098,low,0.245614,13846735.0,13847176.0,755.0,tead4_esc,13846176.0,.,CTGGCAATGTCAGGTATTT,0.434705,0.434705,tead4_esc,0.434705,low,0.216906,tead4_esc,4207.0,short,metacluster_0/pattern_2,550.0,559.0,18.0,tead4_double_2,m0_p2,541.0,13847225.0,17.0,13846226.0,2.0,CTGGCAATGTCAGGTATTT,8.835777,low,0.118022,chr1,13846717.0,+,19.0
1,chr1,36030648.0,36030629.0,+,1.762870,1.762870,tead4_esc,1.762870,medium,0.658692,36030648.0,36031146.0,450.0,tead4_esc,36030146.0,.,GCTGGCTTCCCAGGAATGC,0.430655,0.430655,tead4_esc,0.430655,low,0.202552,tead4_esc,4209.0,short,metacluster_0/pattern_2,493.0,502.0,18.0,tead4_double_2,m0_p2,484.0,36031140.0,41.0,36030141.0,4.0,GCTGGCTTCCCAGGAATGC,10.763284,low,0.301435,chr1,36030630.0,+,19.0
2,chr1,36987347.0,36987328.0,+,0.779971,0.779971,tead4_esc,0.779971,low,0.215311,36987347.0,36987721.0,104.0,tead4_esc,36986721.0,.,CCAGAAATGCTTTGCATAC,0.428606,0.428606,tead4_esc,0.428606,low,0.200957,tead4_esc,4210.0,short,metacluster_0/pattern_2,617.0,626.0,18.0,tead4_double_2,m0_p2,608.0,36987837.0,44.0,36986838.0,5.0,CCAGAAATGCTTTGCATAC,4.268427,low,0.020734,chr1,36987329.0,+,19.0
3,chr1,43139689.0,43139670.0,+,1.174237,1.174237,tead4_esc,1.174237,medium,0.406699,43139689.0,43140205.0,187.0,tead4_esc,43139205.0,.,CCAGGCTGGCCTGGAATGC,0.435208,0.435208,tead4_esc,0.435208,low,0.218501,tead4_esc,4211.0,short,metacluster_0/pattern_2,475.0,484.0,18.0,tead4_double_2,m0_p2,466.0,43140181.0,52.0,43139182.0,6.0,CCAGGCTGGCCTGGAATGC,7.941748,low,0.081340,chr1,43139671.0,+,19.0
4,chr1,63200444.0,63200425.0,+,0.899451,0.899451,tead4_esc,0.899451,low,0.267943,63200444.0,63200796.0,198.0,tead4_esc,63199796.0,.,CCTGGAATCCCCGGTATTC,0.610551,0.610551,tead4_esc,0.610551,high,0.842105,tead4_esc,4216.0,short,metacluster_0/pattern_2,639.0,648.0,18.0,tead4_double_2,m0_p2,630.0,63200932.0,75.0,63199933.0,10.0,CCTGGAATCCCCGGTATTC,11.136514,low,0.328549,chr1,63200426.0,+,19.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
770,chrX,73566462.0,73566443.0,-,1.951741,1.951741,tead4_esc,1.951741,high,0.730463,73566461.0,73566917.0,14067.0,tead4_esc,73565917.0,.,GCATACCTGTCATTCCAGC,0.599519,0.599519,tead4_esc,0.599519,high,0.811802,tead4_esc,5128.0,short,metacluster_0/pattern_2,535.0,544.0,18.0,tead4_double_2,m0_p2,526.0,73566951.0,4309.0,73565952.0,769.0,GCTGGAATGACAGGTATGC,13.686196,high,0.719298,chrX,73566443.0,-,19.0
771,chrX,99058276.0,99058257.0,-,0.774073,0.774073,tead4_esc,0.774073,low,0.215311,99058275.0,99058637.0,14076.0,tead4_esc,99057637.0,.,ACATTCTCTGGATTCCTGC,0.523486,0.523486,tead4_esc,0.523486,medium,0.545455,tead4_esc,5131.0,short,metacluster_0/pattern_2,629.0,638.0,18.0,tead4_double_2,m0_p2,620.0,99058767.0,4319.0,99057768.0,771.0,GCAGGAATCCAGAGAATGT,11.206011,medium,0.339713,chrX,99058257.0,-,19.0
772,chrX,128576734.0,128576715.0,-,0.853421,0.853421,tead4_esc,0.853421,low,0.248804,128576733.0,128577201.0,14045.0,tead4_esc,128576201.0,.,GCATTCCATTCATTCTTCT,0.441512,0.441512,tead4_esc,0.441512,low,0.236045,tead4_esc,5133.0,short,metacluster_0/pattern_2,523.0,532.0,18.0,tead4_double_2,m0_p2,514.0,128577200.0,4343.0,128576201.0,772.0,AGAAGAATGAATGGAATGC,10.932216,low,0.317384,chrX,128576715.0,-,19.0
773,chrX,130557468.0,130557449.0,-,1.080100,1.080100,tead4_esc,1.080100,medium,0.354067,130557467.0,130557975.0,14077.0,tead4_esc,130556975.0,.,GCATACCTTGAATGCCAGA,0.548904,0.548904,tead4_esc,0.548904,medium,0.649123,tead4_esc,5134.0,short,metacluster_0/pattern_2,483.0,492.0,18.0,tead4_double_2,m0_p2,474.0,130557957.0,4344.0,130556958.0,773.0,TCTGGCATTCAAGGTATGC,12.617257,medium,0.559809,chrX,130557449.0,-,19.0


## Extract first half site and second half site groups.

In [12]:
first_half_site = (0,9) 
second_half_site = (9,19) 


motifs_oriented_df['first_half_site_seq'] = [s[first_half_site[0]:first_half_site[1]] for s in motifs_oriented_df.seq.values]
motifs_oriented_df['second_half_site_seq'] = [s[second_half_site[0]:second_half_site[1]] for s in motifs_oriented_df.seq.values]

In [13]:
#Save results.
motifs_oriented_df.to_csv('tsv/tead4_double_escsingletask_motifs_noerv_no_promoter_cwm_withseq.tsv.gz', sep = '\t', index = False)

motifs_oriented_df

Unnamed: 0,Chromosome,End,Start,Strand,contrib/tead4_esc,contrib_max,contrib_max_task,contrib_weighted,contrib_weighted_cat,contrib_weighted_p,example_chrom,example_end,example_idx,example_interval_from_task,example_start,example_strand,fwd_seq,is_erv,match/tead4_esc,match_max,match_max_task,match_weighted,match_weighted_cat,match_weighted_p,pattern,pattern_center,pattern_end,pattern_end_abs,pattern_len,pattern_short,pattern_start,pattern_start_abs,row_idx,seq,seq_match,seq_match_cat,seq_match_p,strand,first_half_site_seq,second_half_site_seq
14221,chr1,155438848.0,155438830.0,-,2.918528,2.918528,tead4_esc,2.918528,high,0.929825,chr1,155439301.0,47.0,tead4_esc,155438301.0,.,GCATTCTTAGCATTCCAA,0.0,0.570771,0.570771,tead4_esc,0.570771,high,0.727273,metacluster_0/pattern_2,538.0,547.0,155438848.0,18.0,m0_p2,529.0,155438830.0,0.0,TTGGAATGCTAAGAATGC,13.782475,high,0.732057,-,TTGGAATGC,TAAGAATGC
14222,chr1,164004512.0,164004494.0,-,0.656639,0.656639,tead4_esc,0.656639,low,0.149920,chr1,164005021.0,69.0,tead4_esc,164004021.0,.,GCATACCTTGCATAGCAC,0.0,0.452638,0.452638,tead4_esc,0.452638,low,0.272727,metacluster_0/pattern_2,482.0,491.0,164004512.0,18.0,m0_p2,473.0,164004494.0,1.0,GTGCTATGCAAGGTATGC,8.362925,low,0.098884,-,GTGCTATGC,AAGGTATGC
14223,chr1,73938904.0,73938886.0,-,1.145823,1.145823,tead4_esc,1.145823,medium,0.397129,chr1,73939422.0,75.0,tead4_esc,73938422.0,.,ACATTCCTTGCATTTCTC,0.0,0.526379,0.526379,tead4_esc,0.526379,medium,0.556619,metacluster_0/pattern_2,473.0,482.0,73938904.0,18.0,m0_p2,464.0,73938886.0,2.0,GAGAAATGCAAGGAATGT,12.745531,medium,0.580542,-,GAGAAATGC,AAGGAATGT
14224,chr1,30937849.0,30937831.0,+,2.113826,2.113826,tead4_esc,2.113826,high,0.794258,chr1,30938298.0,76.0,tead4_esc,30937298.0,.,ATCACATGCCTGGAATTC,0.0,0.437897,0.437897,tead4_esc,0.437897,low,0.221691,metacluster_0/pattern_2,542.0,551.0,30937849.0,18.0,m0_p2,533.0,30937831.0,3.0,ATCACATGCCTGGAATTC,8.372957,low,0.100478,+,ATCACATGC,CTGGAATTC
14225,chr1,30937852.0,30937834.0,-,2.263070,2.263070,tead4_esc,2.263070,high,0.830941,chr1,30938298.0,76.0,tead4_esc,30937298.0,.,ACATGCCTGGAATTCCAG,0.0,0.534469,0.534469,tead4_esc,0.534469,medium,0.588517,metacluster_0/pattern_2,545.0,554.0,30937852.0,18.0,m0_p2,536.0,30937834.0,4.0,CTGGAATTCCAGGCATGT,15.926104,high,0.939394,-,CTGGAATTC,CAGGCATGT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15428,chrX,130557467.0,130557449.0,-,1.080100,1.080100,tead4_esc,1.080100,medium,0.354067,chrX,130557975.0,14077.0,tead4_esc,130556975.0,.,GCATACCTTGAATGCCAG,0.0,0.548904,0.548904,tead4_esc,0.548904,medium,0.649123,metacluster_0/pattern_2,483.0,492.0,130557467.0,18.0,m0_p2,474.0,130557449.0,1207.0,CTGGCATTCAAGGTATGC,12.617257,medium,0.559809,-,CTGGCATTC,AAGGTATGC
15429,chrX,52442043.0,52442025.0,-,2.036481,2.036481,tead4_esc,2.036481,high,0.773525,chrX,52442556.0,14095.0,tead4_esc,52441556.0,.,ACATTCCACAAATTCCAG,0.0,0.591039,0.591039,tead4_esc,0.591039,high,0.787879,metacluster_0/pattern_2,478.0,487.0,52442043.0,18.0,m0_p2,469.0,52442025.0,1208.0,CTGGAATTTGTGGAATGT,14.261723,high,0.797448,-,CTGGAATTT,GTGGAATGT
15432,chrX,12942652.0,12942634.0,-,1.251646,1.251646,tead4_esc,1.251646,medium,0.437002,chrX,12943164.0,14146.0,tead4_esc,12942164.0,.,GGATTCCAGGGATGCCTC,0.0,0.449455,0.449455,tead4_esc,0.449455,low,0.259968,metacluster_0/pattern_2,479.0,488.0,12942652.0,18.0,m0_p2,470.0,12942634.0,1211.0,GAGGCATCCCTGGAATCC,11.521040,medium,0.376396,-,GAGGCATCC,CTGGAATCC
15433,chrX,52277959.0,52277941.0,-,1.101125,1.101125,tead4_esc,1.101125,medium,0.373206,chrX,52278344.0,14150.0,tead4_esc,52277344.0,.,AGATTCCAGGCCTTCCCA,0.0,0.465604,0.465604,tead4_esc,0.465604,low,0.318979,metacluster_0/pattern_2,606.0,615.0,52277959.0,18.0,m0_p2,597.0,52277941.0,1212.0,TGGGAAGGCCTGGAATCT,9.926002,low,0.207337,-,TGGGAAGGC,CTGGAATCT


## Group results based on half site summaries.

In [14]:
summarized_tead4_sites_df = motifs_oriented_df.groupby(['first_half_site_seq', 'second_half_site_seq']).size().reset_index()
summarized_tead4_sites_df.columns = ['first_half_site_seq', 'second_half_site_seq', 'frequency']
summarized_tead4_sites_df = summarized_tead4_sites_df.sort_values('frequency', ascending = False)
summarized_tead4_sites_df = summarized_tead4_sites_df.reset_index()

In [15]:
summarized_tead4_sites_df
#summarized_tead4_sites_df = summarized_tead4_sites_df.head(n=10)

Unnamed: 0,index,first_half_site_seq,second_half_site_seq,frequency
0,555,CTGGCTGTC,CTGGAATTC,8
1,505,CTGGAATTC,CAGGCATGT,3
2,472,CTGGAATGG,CGGAAATTC,3
3,488,CTGGAATTA,CAGGCATGC,3
4,553,CTGGCTGGC,CTGGAATTC,2
...,...,...,...,...
945,324,CATGAATGC,CTAGCATTG,1
946,325,CATGAATTC,TAAGAATTC,1
947,326,CATGCATGC,CTGGAATTT,1
948,327,CCAACATGC,CTGGCATGC,1


## Get in silico contributions for each combination of half sites. 

### A. For each combination...
    1. Generate random sequences.
    2. Inject each half site at appropriate distances.
    3. Get predict_all across all trials
    4. Average across appropriate areas for contribution.

In [16]:
## functions for generating random sequences & encoding and decoding in 1he format

# seq in form of ACGT letters
def generate_random_seq(seqlen, weights = [.25, .25, .25, .25]):
    """
    Purpose: Generate a random DNA sequence of a specified length.
    """
    import random
    seq = random.choices(['A','C','G','T'], weights = weights, k=seqlen)
    return(''.join(seq))

# 1he seq
def one_hot_encode_sequences(sequences):
    """
    Purpose: Given an array of sequences, one-hot-encode into a [region x position x 4] array.
    """
    return(np.stack([one_hot_encode_sequence(s) for s in sequences]))


def one_hot_encode_sequence(sequence):
    """
    Kudos to Charles: /n/projects/cm2363/bpnet-nucleosomes/work/localimportance/allLocalImportances.py
    Purpose: Given a SINGLE sequence string, one-hot encode the data.
        + default control_profiles and control_logcounts is to be zeroed out
        + naively detects whether the sequence is one-hot-encoded.
    """
    onehot_mapping = {
    'A': [1,0,0,0],
    'C': [0,1,0,0],
    'G': [0,0,1,0],
    'T': [0,0,0,1],
    'a': [1,0,0,0],
    'c': [0,1,0,0],
    'g': [0,0,1,0],
    't': [0,0,0,1],
    'N': [0,0,0,0]
    }
    return np.array([onehot_mapping[x] for x in sequence])


def one_hot_decode_sequence(array):
    """
    Purpose: Given an array [position x 4], decode sequence to a string.
    """
    onehot_decoder = {
    0: 'A',
    1: 'C',
    2: 'G',
    3: 'T'
    }

    idxs = np.where(array)[1]
    return (''.join([onehot_decoder[i] for i in idxs]))

In [17]:
# generate 64 random seq
import random
random.seed(10)
trials = 64
random_seqs = [generate_random_seq(seqlen = 1000) for i in range(trials)]

In [18]:
# make it 1he
random_seqs_1he =one_hot_encode_sequences(random_seqs)
flanks = (random_seqs_1he.shape[1]-1000)//2
random_seqs_1he = random_seqs_1he[:, flanks:(1000+flanks)]
random_seqs_1he.shape

(64, 1000, 4)

## Collect all sequences with a pd.df that keeps indexes.

In [19]:
combination_index_df = pd.DataFrame()
combination_w_trials_list = []

for i,row in tqdm(summarized_tead4_sites_df.iterrows()):
        
    #define half sites
    first_half_site = row.first_half_site_seq
    second_half_site = row.second_half_site_seq
    
    #Create sequences
    for j,s in enumerate(random_seqs_1he):
        seq = one_hot_decode_sequence(s)
        seq_w_first = insert_motif(seq = seq, motif = first_half_site, position = 500)
        seq_w_second = insert_motif(seq = seq_w_first, motif = second_half_site, position = 509)
        combination_w_trials_list.append(seq_w_second)
        row['trial_index'] = j
        row['combo_index'] = i
        combination_index_df = combination_index_df.append(row)

#Create row index that combines trial index and ocmbo index and matches index of contrib predictions
combination_index_df['row_index'] = (combination_index_df.combo_index * 64 + combination_index_df.trial_index).astype(int)

950it [05:17,  3.00it/s]


In [20]:
len(combination_w_trials_list)

60800

##### In a single prediction step, get all contribution scores (`list(3K) -> ['contrib_score']['tead4/profile']`), save as .pkl file.

In [21]:
%%script false --no-raise-error

import pickle

#One hot encode sequences
combination_w_trials_seq_1he = one_hot_encode_sequences(combination_w_trials_list)

#Predict all sequences
combinations_w_trials_pred_dict = model.predict_all(combination_w_trials_seq_1he, contrib_method='deeplift')

with open('tsv/all_combination_of_halfsites_w_trials_esc_signelmodel.pkl', 'wb') as f:
    pickle.dump(combinations_w_trials_pred_dict, f)



















DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True


### Import pkl file and isolate and average values. 

In [None]:
import pickle
with open('tsv/all_combination_of_halfsites_w_trials_esc_signelmodel.pkl', 'rb') as f:
    combinations_w_trials_pred_dict = pickle.load(f)

In [22]:
summarized_tead4_sites_w_contrib_df = pd.DataFrame()

for i in tqdm(range(len(combinations_w_trials_pred_dict))):
     #Collect contribution scores under proper coordinates for all trials
    first_half_site_contrib = (combinations_w_trials_pred_dict[i]['contrib_score']['tead4_esc/profile'][496:505] * combinations_w_trials_pred_dict[i]['seq'][496:505]).sum()
    second_half_site_contrib = (combinations_w_trials_pred_dict[i]['contrib_score']['tead4_esc/profile'][505:514] * combinations_w_trials_pred_dict[i]['seq'][505:514]).sum()
    df = pd.DataFrame()
    df['first_half_site_contrib'] = [first_half_site_contrib]
    df['second_half_site_contrib'] = [second_half_site_contrib]
    df['row_index'] = i

    summarized_tead4_sites_w_contrib_df = summarized_tead4_sites_w_contrib_df.append(df) 

100%|██████████| 60800/60800 [02:14<00:00, 451.24it/s]


### Sanity check

In [23]:
summarized_tead4_sites_w_contrib_df.shape[0] == combination_index_df.shape[0]

True

## Reset index and average the contribution over trial

In [24]:
contrib_over_combo_over_trials_df = combination_index_df.reset_index().join(summarized_tead4_sites_w_contrib_df.reset_index(), on = ['row_index'], how = 'left', rsuffix = 'test')
contrib_over_combo_df = contrib_over_combo_over_trials_df.groupby(['combo_index', 'first_half_site_seq', 'second_half_site_seq']).agg({'first_half_site_contrib' : 'mean', 'second_half_site_contrib' : 'mean'}).reset_index()
contrib_over_combo_df

Unnamed: 0,combo_index,first_half_site_seq,second_half_site_seq,first_half_site_contrib,second_half_site_contrib
0,0.0,CTGGCTGTC,CTGGAATTC,0.170847,0.748742
1,1.0,CTGGAATTC,CAGGCATGT,1.518642,0.555458
2,2.0,CTGGAATGG,CGGAAATTC,0.273166,0.281554
3,3.0,CTGGAATTA,CAGGCATGC,0.242850,0.240610
4,4.0,CTGGCTGGC,CTGGAATTC,0.197927,0.746503
...,...,...,...,...,...
945,945.0,CATGAATGC,CTAGCATTG,0.213915,0.210198
946,946.0,CATGAATTC,TAAGAATTC,0.344827,0.422612
947,947.0,CATGCATGC,CTGGAATTT,0.360961,0.863678
948,948.0,CCAACATGC,CTGGCATGC,0.352013,0.461368


### Cut the contrib values in each half into quantiles and save

In [25]:
first_half_site_cutoffs = np.quantile(contrib_over_combo_df.first_half_site_contrib.values, [0, .25, .5, .75, 1])
second_half_site_cutoffs = np.quantile(contrib_over_combo_df.second_half_site_contrib.values, [0, .25, .5, .75, 1])

contrib_over_combo_df['first_half_site_q'] = pd.cut(contrib_over_combo_df.first_half_site_contrib, bins=first_half_site_cutoffs, labels=['q1','q2','q3','q4'], include_lowest = True)
contrib_over_combo_df['second_half_site_q'] = pd.cut(contrib_over_combo_df.second_half_site_contrib, bins=second_half_site_cutoffs, labels=['q1','q2','q3','q4'], include_lowest = True)

#save
contrib_over_combo_df.to_csv('tsv/tead4_double_motif_contrib_over_combo_completehalfs_quantiles_esc_singlemodel.tsv.gz', sep = '\t', index = False)

#check freq
df = contrib_over_combo_df.value_counts(['first_half_site_q', 'second_half_site_q']).reset_index()

## Get insilico predictions keeping the combo index from above df

1. get random seq decoded as list
2. get first_half_site & second half sites from contrib_over_combo_df, keeping the combo index.
3. inject each tead4 combo and measure total counts with and without entire motif (background).
4. append to df above and save the df for plotting in R

In [26]:
## get random seq decoded as list
seqs = []
for j,s in enumerate(random_seqs_1he):
    seqs.append(one_hot_decode_sequence(s))

In [27]:
def one_hot_decode_sequence(array):
    """
    Purpose: Given an array [position x 4], decode sequence to a string.
    """
    onehot_decoder = {
    0: 'A',
    1: 'C',
    2: 'G',
    3: 'T'
    }

    idxs = np.where(array)[1]
    return (''.join([onehot_decoder[i] for i in idxs]))

In [28]:
def collect_summarized_counts(contrib_over_combo_df, random_seqs, tasks_of_interest, second_motif_injection_site, motif_window, measurement_window):

    """
    Purpose: Given WT, dA, dB, and dAdB states of a WT genomic tead4 halfs -> randomize other halfs, how does the double motif enrichment change?
    Inputs:
    Outputs:
    """
    
    from tqdm import tqdm
    from concise.preprocessing import encodeDNA
    def generate_seq(random_seq, central_motif, side_motif=None, side_distances=[], seqlen=1000):
        from bpnet.simulate import insert_motif
        injected_seq = insert_motif(random_seq, central_motif, seqlen // 2)
        for d in side_distances:
            injected_seq = insert_motif(injected_seq, side_motif, d)
        return injected_seq
    
    summary_for_all_combos_df = pd.DataFrame()
    for i,row in tqdm(contrib_over_combo_df.iterrows()):
        
        #define half sites
        first_half_site = row.first_half_site_seq
        second_half_site = row.second_half_site_seq
        combo_index = row.combo_index

        #center_injection_site = (motif_window[1]-motif_window[0])//2 + motif_window[0]
        #Generate sequences with variant states
        WT_seqs = [generate_seq(random_seq = i, central_motif = first_half_site, side_motif = second_half_site, 
                                side_distances = [second_motif_injection_site]) for i in random_seqs]
        dAdB_seqs = [generate_seq(random_seq = i, central_motif = '', side_motif = '', 
                                  side_distances = []) for i in random_seqs]
        dA_seqs = [generate_seq(random_seq = i, central_motif = '', side_motif = second_half_site, 
                                side_distances = [second_motif_injection_site]) for i in random_seqs]
        dB_seqs = [generate_seq(random_seq = i, central_motif = first_half_site, side_motif = '', 
                                side_distances = [second_motif_injection_site]) for i in random_seqs]

        #Collect into dictionary
        seq_dict = {'WT': WT_seqs, 'dAdB': dAdB_seqs, 'dA': dA_seqs, 'dB': dB_seqs}

        #Predict each variant state({inj_state -> {task -> [64 x 1000 x 2]}))
        preds_dict = {k: model.predict(encodeDNA(v)) for k,v in seq_dict.items()}
        
        # Average across trials and measurement window({inj_state -> {task -> #}})
        preds_single_dict = {inj_state: {task: np.mean(np.sum(v1[:, measurement_window[0]:measurement_window[1], :], axis = (1,2))) 
                     for task,v1 in v.items()} 
         for inj_state,v in preds_dict.items()}

        #Summarize counts together pd.df based on predictions
        summary_df = pd.DataFrame()
        for inj_state, v in preds_single_dict.items():
            df = pd.DataFrame.from_dict(v, orient = 'index').transpose()
            df['inj_state'] = inj_state
            df = df.melt(id_vars = ['inj_state'], var_name = 'task', value_name = 'counts')
            summary_df = summary_df.append(df)
        summary_df['first_half_site'] = first_half_site
        summary_df['second_half_site'] = second_half_site        
        summary_df['combo_index'] = combo_index
        summary_for_all_combos_df = summary_for_all_combos_df.append(summary_df)
    return(summary_for_all_combos_df)

## Collect summarized counts and profiles across trials for each halfs.

In [29]:
counts_df = collect_summarized_counts(contrib_over_combo_df=contrib_over_combo_df,random_seqs = seqs, 
                                   tasks_of_interest = ['tead4'], #','yap1'
                                   second_motif_injection_site = 509,
                                   motif_window = (496, 514), measurement_window = (475, 535))
#counts_df

950it [16:37,  1.05s/it]


In [30]:
#save .tsv.gz for each object
counts_df.to_csv(f'csv/all_counts_summary_predictions_genomic_regions_9mer_singlemodel.csv.gz', index = False)