

# Introduction

This notebook generates in-silico motif-pair perturbations across the canonical `TSC model` motifs and their variants: a side motif is placed at increasing distances from a central motif and the model's predictions are recorded at each distance. This lets us assess whether motif pairs show a preferred spacing (distance) between them.

# Computational setup





In [27]:
# Silence every Python warning for the rest of the session so cell outputs stay readable.
# NOTE: this is a blanket filter, so it also hides warnings that may matter
# (for example pandas deprecation warnings).
import warnings
warnings.filterwarnings("ignore")
# Flip a private flag on TensorFlow's deprecation module; presumably this mutes
# TensorFlow's deprecation messages - TODO confirm against the installed TF version.
# It is set here, ahead of the bpnet import in a later cell.
from tensorflow.python.util import deprecation
deprecation._PRINT_DEPRECATION_WARNINGS = False

#Packages
import os
import sys
import pandas as pd
import numpy as np
from tqdm import tqdm
from pybedtools import BedTool

# Settings
# Work from the analysis directory: the relative `tsv/` and `figures/` paths
# used further down are resolved against it.
os.chdir('/n/projects/kd2200/publication/bpnet/analysis/')
pd.options.display.max_columns = 100

# Custom commands
# Put the project's helper-script directory first on the import path, then pull
# in the rounding helpers it provides.
sys.path.insert(0, '/n/projects/kd2200/publication/bpnet/analysis/scripts/bpnet/scripts')
from data_format_functions import myround, myfloor, myceiling

# Reverse lookup: find the key that maps to a given value
def get_key(val, my_dict):
    """Return the first key of ``my_dict`` whose value equals ``val``.

    Keys are scanned in the dictionary's iteration order and the first match
    wins. When no value matches, the sentinel string ``"key doesn't exist"``
    is returned instead of raising.
    """
    matching_keys = (key for key, value in my_dict.items() if val == value)
    return next(matching_keys, "key doesn't exist")

# TSC model directory; handed to BPNetSeqModel.from_mdir in the next cell
tsc_model_dir = (
    "/n/projects/kd2200/publication/bpnet/model/"
    "dataspec.yaml_default_fold_5/"
)


In [28]:
# Build the sequence model object from the model directory defined in the setup cell.
# `tsc_model_bpnet` is the model passed to `generate_sim` in the simulation cell below.
from bpnet.BPNet import BPNetSeqModel
tsc_model_bpnet = BPNetSeqModel.from_mdir(tsc_model_dir)

In [78]:
from collections import OrderedDict

# Motif name -> motif sequence used in the simulations.
# The name is stored as `central_motif_name` / `side_motif_name` and is later
# compared against the `task` column, so naming the TEAD motif `tead4` or `yap1`
# selects which task is plotted for it.
# Exactly one of the four configurations below should be active at a time.

## Configuration 1: single Tead4 motif, read out on the tead4 task
# motif_seq_dict = OrderedDict([
#     ("tead4", 'GGAATGT'),
#     ("tfap2c", 'GCCTCAGGG'),
#     ("cdx2", 'GCCATAAA'),
#     ("gata3", 'AGATAAG'),
# ])

## Configuration 2: single Tead4 motif, read out on the yap1 task
# motif_seq_dict = OrderedDict([
#     ("yap1", 'GGAATGT'),
#     ("tfap2c", 'GCCTCAGGG'),
#     ("cdx2", 'GCCATAAA'),
#     ("gata3", 'AGATAAG'),
# ])

## Configuration 3: double Tead4 motif, read out on the tead4 task
# motif_seq_dict = OrderedDict([
#     ("tfap2c", 'GCCTCAGGG'),
#     ("cdx2", 'GCCATAAA'),
#     ("gata3", 'AGATAAG'),
#     ("tead4", 'GGAATGCCAGGAATGT'),
# ])

## Configuration 4 (active): double Tead4 motif, read out on the yap1 task
motif_seq_dict = OrderedDict()
motif_seq_dict["tfap2c"] = 'GCCTCAGGG'
motif_seq_dict["cdx2"] = 'GCCATAAA'
motif_seq_dict["gata3"] = 'AGATAAG'
motif_seq_dict["yap1"] = 'GGAATGCCAGGAATGT'

In [79]:
# Enumerate every ordered pair of motif names as "first,second".
# Order matters (the first name is used as the central motif, the second as the
# side motif), hence permutations rather than combinations.
from itertools import permutations
pair_combos = [f"{first},{second}" for first, second in permutations(motif_seq_dict, 2)]

# Note: a motif is never paired with itself (no motif-motif combinations).
pair_combos

['tfap2c,cdx2',
 'tfap2c,gata3',
 'tfap2c,tead4',
 'cdx2,tfap2c',
 'cdx2,gata3',
 'cdx2,tead4',
 'gata3,tfap2c',
 'gata3,cdx2',
 'gata3,tead4',
 'tead4,tfap2c',
 'tead4,cdx2',
 'tead4,gata3']

In [80]:
# Split each pairwise combination into its center motif (row label 0) and
# side motif (row label 1), stacked one pair after another.
#
# `DataFrame.append` is deprecated (removed in pandas 2.0) and re-copies the
# whole frame on every iteration, so the per-pair frames are built first and
# concatenated once. The per-pair row labels (0, 1, 0, 1, ...) are kept.
pair_frames = [pd.DataFrame(pair.split(",")) for pair in pair_combos]

# pd.concat raises on an empty list; fall back to the empty frame the old code produced
motif_keys = pd.concat(pair_frames) if pair_frames else pd.DataFrame([])
motif_keys.head(n=5)

Unnamed: 0,0
0,tfap2c
1,cdx2
0,tfap2c
1,gata3
0,tfap2c


In [81]:
# Get all pair-wise spacing simulations
#
# For every ordered (central, side) motif pair, run `generate_sim` with the side
# motif at positions 505..649 (reported as distances 5..149 in the result table)
# and collect the per-task summary frames.
#
# Changes from the previous version of this cell:
#   * results are gathered in a list and concatenated once, because
#     `DataFrame.append` is deprecated (removed in pandas 2.0) and quadratic in a loop;
#   * the loop no longer reuses the name `motif_keys`, which overwrote the
#     DataFrame built in the previous cell with a plain list;
#   * the preview is the cell's last expression, so it is actually displayed
#     (inside the loop it was a no-op).

from bpnet.simulate import generate_sim

sim_frames = []
for pair in pair_combos:
    central_name, side_name = pair.split(",")
    sim_df, _profiles = generate_sim(
        tsc_model_bpnet,
        central_motif=motif_seq_dict[central_name],
        side_motif=motif_seq_dict[side_name],
        repeat=256,
        side_distances=np.arange(505, 650),
        center_coords=[475, 525],  # Window to compute signal across
        contribution=[],
        correct=True,
    )

    # Tag every row with the motif names so the pairs can be told apart after stacking
    sim_df['central_motif_name'] = central_name
    sim_df['side_motif_name'] = side_name
    sim_frames.append(sim_df)

# Keep each frame's own row labels (no ignore_index) so the saved TSV is unchanged.
# pd.concat raises on an empty list, hence the explicit empty-frame fallback.
spacing_pairs = pd.concat(sim_frames) if sim_frames else pd.DataFrame([])
spacing_pairs.head(n=10)

100%|██████████| 145/145 [01:36<00:00,  1.50it/s]
100%|██████████| 145/145 [01:39<00:00,  1.46it/s]
100%|██████████| 145/145 [01:38<00:00,  1.48it/s]
100%|██████████| 145/145 [01:37<00:00,  1.48it/s]
100%|██████████| 145/145 [01:39<00:00,  1.46it/s]
100%|██████████| 145/145 [01:39<00:00,  1.46it/s]
100%|██████████| 145/145 [01:35<00:00,  1.51it/s]
100%|██████████| 145/145 [01:36<00:00,  1.51it/s]
100%|██████████| 145/145 [01:39<00:00,  1.46it/s]
100%|██████████| 145/145 [01:13<00:00,  1.96it/s]
100%|██████████| 145/145 [01:22<00:00,  1.76it/s]
100%|██████████| 145/145 [01:10<00:00,  2.06it/s]


In [82]:
# Save in tsv
# The output file name must match the motif configuration that is active in
# `motif_seq_dict`; the commented lines are the names for the other configurations.
# The row index is written too and comes back as an `Unnamed: 0` column on reload.

# Create the output directory if needed so the write does not fail on a fresh checkout
os.makedirs("tsv", exist_ok=True)

##when Tead4 and Yap1 task are given same tead4 single motif
#spacing_pairs.to_csv(f"tsv/4a_cttg_insilico_distance_analysis.tsv", sep="\t")
#spacing_pairs.to_csv(f"tsv/4a_ctyg_insilico_distance_analysis.tsv", sep="\t")

##when Tead4 and Yap1 task are given same tead4 double motif
#spacing_pairs.to_csv(f"tsv/4a_cttdblg_insilico_distance_analysis_tead4.tsv", sep="\t")
spacing_pairs.to_csv(f"tsv/4a_cttdblg_insilico_distance_analysis_yap1.tsv", sep="\t")


In [83]:
# Reload the simulation results from disk, so the cells below can be run
# without repeating the simulation.
# Pick the file that matches the motif configuration being analysed; the
# commented lines are the files for the other configurations.

## single Tead4 motif shared by the Tead4 and Yap1 tasks
#spacing_pairs = pd.read_csv(f"tsv/4a_cttg_insilico_distance_analysis.tsv", sep="\t")
#spacing_pairs = pd.read_csv(f"tsv/4a_ctyg_insilico_distance_analysis.tsv", sep="\t")

## double Tead4 motif shared by the Tead4 and Yap1 tasks
#spacing_pairs = pd.read_csv(f"tsv/4a_cttdblg_insilico_distance_analysis_tead4.tsv", sep="\t")
spacing_pairs = pd.read_csv("tsv/4a_cttdblg_insilico_distance_analysis_yap1.tsv", delimiter="\t")
spacing_pairs.head(5)

Unnamed: 0.1,Unnamed: 0,profile/simmetric_kl,profile/counts,profile/counts_frac,profile/max,profile/max_frac,profile/counts_max_ref,profile/counts_max_ref_frac,task,central_motif,side_motif,position,distance,central_motif_name,side_motif_name
0,0,0.034442,1.955727,0.578136,0.035876,0.800894,0.054361,0.615869,cdx2,GCCTCAGGG,GCCATAAA,505,5,tfap2c,cdx2
1,1,0.266865,0.962834,0.129233,0.025464,0.06589,0.017325,0.023688,tfap2c,GCCTCAGGG,GCCATAAA,505,5,tfap2c,cdx2
2,2,0.075594,0.890591,0.66484,0.018842,0.81384,0.026749,0.580436,tead4,GCCTCAGGG,GCCATAAA,505,5,tfap2c,cdx2
3,3,0.017937,0.762339,0.764571,0.013129,0.849914,0.01952,0.637199,yap1,GCCTCAGGG,GCCATAAA,505,5,tfap2c,cdx2
4,4,0.020277,1.076689,0.9274,0.017628,0.881846,0.027256,0.756624,gata3,GCCTCAGGG,GCCATAAA,505,5,tfap2c,cdx2


In [84]:
# Create a new column for motif_pair
# Motif1-Motif2 and Motif2-Motif1 get the same `motif_pair_name` (the two names
# sorted alphabetically and joined with "<>"), so both orientations of a pair
# end up in the same plot facet.
import collections

# Plain zip over the two name columns instead of a row-wise DataFrame.apply:
# same result, without building a Series object for every row.
spacing_pairs['motif_pair_name'] = [
    '<>'.join(sorted(names))
    for names in zip(spacing_pairs['central_motif_name'], spacing_pairs['side_motif_name'])
]

# Sanity check: number of rows per motif pair. This used to be computed in the
# middle of the cell and silently discarded, so it is printed explicitly.
print(collections.Counter(spacing_pairs['motif_pair_name']))
spacing_pairs

Unnamed: 0.1,Unnamed: 0,profile/simmetric_kl,profile/counts,profile/counts_frac,profile/max,profile/max_frac,profile/counts_max_ref,profile/counts_max_ref_frac,task,central_motif,side_motif,position,distance,central_motif_name,side_motif_name,motif_pair_name
0,0,0.034442,1.955727,0.578136,0.035876,0.800894,0.054361,0.615869,cdx2,GCCTCAGGG,GCCATAAA,505,5,tfap2c,cdx2,cdx2<>tfap2c
1,1,0.266865,0.962834,0.129233,0.025464,0.065890,0.017325,0.023688,tfap2c,GCCTCAGGG,GCCATAAA,505,5,tfap2c,cdx2,cdx2<>tfap2c
2,2,0.075594,0.890591,0.664840,0.018842,0.813840,0.026749,0.580436,tead4,GCCTCAGGG,GCCATAAA,505,5,tfap2c,cdx2,cdx2<>tfap2c
3,3,0.017937,0.762339,0.764571,0.013129,0.849914,0.019520,0.637199,yap1,GCCTCAGGG,GCCATAAA,505,5,tfap2c,cdx2,cdx2<>tfap2c
4,4,0.020277,1.076689,0.927400,0.017628,0.881846,0.027256,0.756624,gata3,GCCTCAGGG,GCCATAAA,505,5,tfap2c,cdx2,cdx2<>tfap2c
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8695,720,0.000623,4.446076,1.093506,0.146847,1.077981,0.204961,1.058322,cdx2,GGAATGCCAGGAATGT,AGATAAG,649,149,tead4,gata3,gata3<>tead4
8696,721,0.004054,1.693618,1.071606,0.057376,0.960946,0.085944,0.994849,tfap2c,GGAATGCCAGGAATGT,AGATAAG,649,149,tead4,gata3,gata3<>tead4
8697,722,0.001237,33.790920,0.926350,3.949574,0.864716,5.739555,0.860358,tead4,GGAATGCCAGGAATGT,AGATAAG,649,149,tead4,gata3,gata3<>tead4
8698,723,0.000638,5.715462,0.975305,0.330996,1.000668,0.466558,0.975586,yap1,GGAATGCCAGGAATGT,AGATAAG,649,149,tead4,gata3,gata3<>tead4


In [85]:
# Keep only the rows where the predicted task is the central motif's own factor,
# i.e. `task` equals `central_motif_name`.
# Vectorised column comparison instead of a Python loop with per-row `.iat`
# lookups, and the rows are filtered with the boolean mask directly rather than
# by comparing against the string 'True'.
task_matches_center = spacing_pairs['task'] == spacing_pairs['central_motif_name']

# The flag column is still stored as the strings 'True' / 'False', as before,
# in case anything else reads it in that form.
spacing_pairs['task_and_motif_match'] = task_matches_center.astype(str)
spacing_pairs_subset = spacing_pairs[task_matches_center]

In [86]:
# Line colour (hex) for each task name; used as the manual colour scale of the spacing plot
tf_colors = dict(
    tead4="#007d47",
    tfap2c="#c22f2f",
    yap1="#14a0b5",
    cdx2="#E69F00",
    gata3="#3600e6",
)

In [89]:
# Import only the plotnine names this figure needs; a star import hides where
# names come from and can shadow notebook variables.
import plotnine
from plotnine import (
    aes,
    facet_wrap,
    geom_hline,
    geom_line,
    ggplot,
    scale_color_manual,
    scale_x_continuous,
    scale_y_continuous,
    theme_bw,
)

plotnine.options.figure_size = (8, 8)

# One facet per motif pair: `profile/counts_max_ref_frac` against motif distance,
# one line per task, coloured with `tf_colors`.
# The horizontal line marks y = 1 (presumably "no change relative to the
# reference" - confirm against the definition used by generate_sim).
# data/mapping are passed by keyword so the call does not depend on ggplot's
# positional argument order.
spacing_fig = (
    ggplot(
        data=spacing_pairs_subset,
        mapping=aes(x='distance', y='profile/counts_max_ref_frac', color='task'),
    )
    + geom_line(size=0.9)
    + geom_hline(yintercept=1, alpha=0.5)
    + scale_color_manual(values=tf_colors)
    + scale_x_continuous(limits=[5, 150], breaks=range(0, 150, 25), name='Motif pair distance (bp)')
    + scale_y_continuous(limits=[0, 3])
    + facet_wrap('motif_pair_name', ncol=2, scales="free")
    + theme_bw()
)

In [64]:
##save plots
# The output file name must match the motif configuration that produced
# `spacing_fig`; the commented lines are the names for the other configurations.

# Create the figure directory if needed so the save does not fail on a fresh checkout
os.makedirs('figures/4a_pairwise_insilico_distance_analysis', exist_ok=True)

#spacing_fig.save('figures/4a_pairwise_insilico_distance_analysis/4a_cttg_pairwise_motif_spacing.pdf', height=10, width=10)
#spacing_fig.save('figures/4a_pairwise_insilico_distance_analysis/4a_ctyg_pairwise_motif_spacing.pdf', height=10, width=10)
#spacing_fig.save('figures/4a_pairwise_insilico_distance_analysis/4a_cttdblg_insilico_distance_analysis_tead4.pdf', height=10, width=10)
spacing_fig.save('figures/4a_pairwise_insilico_distance_analysis/4a_cttdblg_insilico_distance_analysis_yap1.pdf', height=10, width=10)