# Introduction

The purpose of this notebook is to generate in-silico motif-pair perturbations across the canonical `ZDTBCG` motifs (Zld, Dl, Twi, Bcd, Cad, Gaf) and their variants. This will allow us to assess whether motif pairs have preferred distances or spacings between them.

# Computational setup

In [1]:
import warnings
warnings.filterwarnings("ignore")
from tensorflow.python.util import deprecation
deprecation._PRINT_DEPRECATION_WARNINGS = False

#Packages
import os
import sys
import pandas as pd
import numpy as np
from tqdm import tqdm
from pybedtools import BedTool

# Settings
# Run from the project root so every relative path below resolves against it.
os.chdir('/n/projects/mw2098/analysis/zelda')
# Allow wide summary tables to display up to 100 columns.
pd.set_option('display.max_columns', 100)
# Directory (relative to the project root) that receives this notebook's figures.
figure_filepath = 'analysis/ZDTBCG/figures/9_collect_insilico_perturbs'

# Custom commands
sys.path.insert(0, f'/n/projects/mw2098/shared_code/bpnet/scripts')
from data_format_functions import myround, myfloor, myceiling

# function to return key for any value 
def get_key(val, my_dict):
    """Return the first key of ``my_dict`` whose value equals ``val``.

    Keys are examined in the dictionary's iteration order. When no value
    matches, the string ``"key doesn't exist"`` is returned instead.
    """
    matching_keys = (key for key, value in my_dict.items() if val == value)
    return next(matching_keys, "key doesn't exist")

Using TensorFlow backend.
2022-04-28 17:05:37,421 [INFO] Note: detected 80 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
2022-04-28 17:05:37,423 [INFO] Note: NumExpr detected 80 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2022-04-28 17:05:37,424 [INFO] NumExpr defaulting to 8 threads.


In [2]:
# Pre-existing variables
# Name of the trained model run; it is also the sub-directory name used in `model_dir`.
model_name = 'seq_width1000-lr0.005-lambda100-n_dil_layers9-conv_kernel_size7-tconv_kernel_size7-filters128'
model_dir = f'/n/projects/mw2098/analysis/zelda/models/ZDTBCG/{model_name}/'
# Sequence injected for each motif, keyed by motif name. The key order also fixes
# the order in which the heterotypic motif pairs are generated from these keys.
motif_seqs = {
    'Zld': 'CAGGTAG',
    'Gaf': 'GAGAGAGAGAGAGAGAG',
    'Cad': 'TTTTATGGCC',
    'Dl': 'GGGAAAACCC',
    'Twi': 'AACACATGTT',
    'Bcd': 'TTAATCC'
}
# Plot color per task. The key order is load-bearing: it defines the category order
# of the `task` column and the positional order of the colors handed to the color
# scale, so the two stay aligned only while both are derived from this dict.
motif_colors = {'Zld': '#38b6f1',
                'Dl': '#d9a404',
                'Twi': '#ef3f54',
                'Bcd': '#1bb04c',
                'Cad': '#c4acb4',
                'Gaf': '#f47b2f'
             }

In [3]:
!mkdir -p {figure_filepath}
!mkdir -p analysis/ZDTBCG/tsv/insilico_perturb/
!mkdir {figure_filepath}/individual_pairs

mkdir: cannot create directory ‘analysis/ZDTBCG/figures/9_collect_insilico_perturbs/individual_pairs’: File exists


# Define motif pairs

Here, we will look at each homotypic and heterotypic motif pair.

In [4]:
from itertools import permutations

# Heterotypic pairs: every ordered (central, side) combination of two distinct motifs.
heterotypic_pairs = list(permutations(motif_seqs, 2))
# Homotypic pairs: each motif paired with itself, appended in this fixed order.
homotypic_pairs = [(name, name) for name in ('Dl', 'Zld', 'Twi', 'Bcd', 'Cad', 'Gaf')]
motif_perms = heterotypic_pairs + homotypic_pairs

# Import BPNet model

Load the BPNet model that was trained.

In [5]:
from bpnet.BPNet import BPNetSeqModel
# Build the model object from `model_dir`. The module-level name `bpn` is what
# `get_sim` below passes to `generate_sim` and calls `sim_pred` on.
bpn = BPNetSeqModel.from_mdir(model_dir)  # wrap SeqModel to BPNetSeqModel to get `sim_pred` method

TF-MoDISco is using the TensorFlow backend.


# Define functions

We want to define functions that will allow us to generate simulated, injected versions of motif pairs and format the results into tidy pandas DataFrames that can be plotted.

In [6]:
from bpnet.simulate import generate_sim

#Get results of bpnet.BPNet.sim_pred into a tidy pd.df
def sim_pred_to_df(sim_pred_profiles):
    """Convert the profile output of ``bpnet.BPNet.sim_pred`` into a tidy DataFrame.

    Parameters
    ----------
    sim_pred_profiles : dict
        Mapping of task key to a two-column, array-like profile. The two columns
        are labelled ``pos`` and ``neg`` (presumably the two strands, in that
        order -- TODO confirm against ``sim_pred``).

    Returns
    -------
    pandas.DataFrame
        Long-format frame with the columns ``position`` (row index within the
        profile), ``task`` (basename of the mapping key), ``strand`` (``'pos'``
        or ``'neg'``) and ``pred`` (the profile value). An empty mapping yields
        an empty frame with these four columns.
    """
    frames = []
    for k in sim_pred_profiles.keys():
        df = pd.DataFrame(sim_pred_profiles[k]).reset_index()
        df['task'] = os.path.basename(k)
        df.columns = ['position', 'pos', 'neg', 'task']
        frames.append(df)

    if not frames:
        # Nothing to melt; keep the output schema so callers can still concatenate.
        return pd.DataFrame(columns=['position', 'task', 'strand', 'pred'])

    # Concatenate once: `DataFrame.append` was removed in pandas 2.0 and re-copied
    # the accumulated frame on every iteration.
    dfp_ref = pd.concat(frames)
    return dfp_ref.melt(id_vars=['position', 'task'], var_name='strand', value_name='pred')

#Get results of bpnet.simulate.generate_sim into a tidy pd.df
def generate_sim_to_df(generate_sim_profiles):
    """Convert the profile output of ``bpnet.simulate.generate_sim`` into a tidy DataFrame.

    Parameters
    ----------
    generate_sim_profiles : sequence
        Entries indexed as ``entry[0]`` (the value stored in the ``distance``
        column) and ``entry[1]['profile']`` (a mapping accepted by
        ``sim_pred_to_df``).

    Returns
    -------
    pandas.DataFrame
        The tidy frames produced by ``sim_pred_to_df`` for every entry, stacked
        in input order, each with an added ``distance`` column. An empty input
        yields an empty DataFrame.
    """
    frames = []
    for entry in generate_sim_profiles:
        dfp = sim_pred_to_df(entry[1]['profile'])
        dfp['distance'] = entry[0]
        frames.append(dfp)

    # Concatenate once: `DataFrame.append` was removed in pandas 2.0 and re-copied
    # the accumulated frame on every iteration.
    return pd.concat(frames) if frames else pd.DataFrame()

#Get perturbations and convert into tidy.df (dfs = summary, dfp = profiles)
def get_sim(motif_comb, motif_seqs, side_distances = np.arange(505, 900), center_coords = [400, 600]):
    """Simulate one motif pair and return tidy summary and profile tables.

    Uses the module-level model ``bpn``.

    Parameters
    ----------
    motif_comb : sequence of two str
        ``(central, side)`` motif names; both must be keys of ``motif_seqs``.
    motif_seqs : dict
        Mapping of motif name to the sequence to inject.
    side_distances : array-like, optional
        Forwarded unchanged to ``generate_sim`` as ``side_distances``.
    center_coords : list, optional
        Forwarded unchanged to ``generate_sim`` as ``center_coords``. The default
        list object is shared between calls; this function never modifies it.

    Returns
    -------
    dfs : pandas.DataFrame
        Summary table returned by ``generate_sim``, with the ``central_motif``
        and ``side_motif`` columns set to the motif *names* from ``motif_comb``.
    dfp : pandas.DataFrame
        Tidy profiles: the prediction for the central motif alone (rows with
        ``distance == 'Reference'``) followed by the profiles for each entry
        returned by ``generate_sim``.
    """
    central_name = motif_comb[0]
    side_name = motif_comb[1]

    dfs, profiles = generate_sim(bpn, central_motif=motif_seqs[central_name], side_motif=motif_seqs[side_name],
                                 side_distances=side_distances, center_coords=center_coords,
                                 contribution=[], correct=True)
    # Label the rows with motif names. Item assignment sets the column whether or
    # not it already exists; attribute assignment (`dfs.central_motif = ...`) only
    # attaches a plain Python attribute when the column is missing.
    dfs['central_motif'] = central_name
    dfs['side_motif'] = side_name

    #Get reference profiles (central motif only)
    profiles_ref = bpn.sim_pred(motif_seqs[central_name], repeat = 64)
    dfp_ref = sim_pred_to_df(profiles_ref)
    dfp_ref['distance'] = 'Reference'

    #Get perturbed profiles
    dfp_alt = generate_sim_to_df(profiles)

    #Combine (`DataFrame.append` was removed in pandas 2.0)
    dfp = pd.concat([dfp_ref, dfp_alt])

    return dfs, dfp

# Generate distance-separated motif pair measurements

After knocking out the 'side' injected motif, measure the effect on the 'central' motif. Do this for varying distances to investigate the effects of motif pair spacing between each set.

In [7]:
%%script false --no-raise-error
# NOTE: the `%%script false` magic above keeps this cell from executing, so the
# summaries are not regenerated on every run; remove it to rebuild the cached TSV.
summaries = []
for perm in motif_perms:
    dfs, _ = get_sim(motif_seqs = motif_seqs, motif_comb=perm, side_distances = np.arange(505, 900))
    summaries.append(dfs)
# Concatenate once: `DataFrame.append` was removed in pandas 2.0 and re-copied the
# accumulated frame on every iteration.
dfs_all = pd.concat(summaries)
dfs_all.to_csv('analysis/ZDTBCG/tsv/insilico_perturb/insilico_summaries.tsv.gz', sep = '\t')

Read in the summaries after they are generated and assign motif pair names and information for simultaneous plotting.

In [8]:
# Load the cached summary table from the tab-separated, gzip-compressed file.
summaries_path = 'analysis/ZDTBCG/tsv/insilico_perturb/insilico_summaries.tsv.gz'
dfs_all = pd.read_csv(summaries_path, sep='\t')
dfs_all.head(5)

Unnamed: 0.1,Unnamed: 0,profile/simmetric_kl,profile/counts,profile/counts_frac,profile/max,profile/max_frac,profile/counts_max_ref,profile/counts_max_ref_frac,task,central_motif,side_motif,position,distance
0,0,0.023655,144.907623,0.393253,0.516568,0.337716,0.80763,0.264805,Zld,Zld,Gaf,505,5
1,1,0.007098,55.613613,0.631416,0.169938,0.475241,0.308911,0.440448,Dl,Zld,Gaf,505,5
2,2,0.019757,91.101067,0.574631,0.409906,0.688328,0.498221,0.432846,Twi,Zld,Gaf,505,5
3,3,0.016586,135.749771,0.4223,0.479455,0.41443,0.750359,0.338939,Bcd,Zld,Gaf,505,5
4,4,0.014367,120.477982,0.575408,0.429203,0.477198,0.579262,0.370217,Cad,Zld,Gaf,505,5


In [9]:
#Assign motif_pair names for faceting
central = dfs_all['central_motif']
side = dfs_all['side_motif']
# Orientation-independent label: the two names joined in sorted (string-comparison)
# order, so e.g. Zld/Gaf and Gaf/Zld both become 'Gaf_Zld'. Computed column-wise
# instead of looping over rows.
dfs_all['motif_pair'] = np.where(central > side, side + '_' + central, central + '_' + side)
# Orientation-specific label: '<central>_<side>'.
dfs_all['motif_pair_raw'] = central + '_' + side

#Mark the BPNet tasks as 'featured' for plotting
# A row is featured when its task is the central motif. Exact equality is used
# rather than a substring test, so one motif name contained in another cannot
# produce a false match. `astype(object)` keeps the comparison valid if this cell
# is re-run after `task` has become categorical.
dfs_all['featured_task'] = dfs_all['task'].astype(object) == central

#Make categorical variable column (category order follows the `motif_colors` keys)
dfs_all['task'] = pd.Categorical(dfs_all['task'], categories = list(motif_colors.keys()), ordered = False)
dfs_all.head(n=4)

Unnamed: 0.1,Unnamed: 0,profile/simmetric_kl,profile/counts,profile/counts_frac,profile/max,profile/max_frac,profile/counts_max_ref,profile/counts_max_ref_frac,task,central_motif,side_motif,position,distance,motif_pair,motif_pair_raw,featured_task
0,0,0.023655,144.907623,0.393253,0.516568,0.337716,0.80763,0.264805,Zld,Zld,Gaf,505,5,Gaf_Zld,Zld_Gaf,True
1,1,0.007098,55.613613,0.631416,0.169938,0.475241,0.308911,0.440448,Dl,Zld,Gaf,505,5,Gaf_Zld,Zld_Gaf,False
2,2,0.019757,91.101067,0.574631,0.409906,0.688328,0.498221,0.432846,Twi,Zld,Gaf,505,5,Gaf_Zld,Zld_Gaf,False
3,3,0.016586,135.749771,0.4223,0.479455,0.41443,0.750359,0.338939,Bcd,Zld,Gaf,505,5,Gaf_Zld,Zld_Gaf,False


Plot the motif pair summaries at their respective distances. 

In [10]:
# Distinct orientation-independent pair labels; the plotting loop below iterates over these.
dfs_all.motif_pair.unique()

array(['Gaf_Zld', 'Cad_Zld', 'Dl_Zld', 'Twi_Zld', 'Bcd_Zld', 'Cad_Gaf',
       'Dl_Gaf', 'Gaf_Twi', 'Bcd_Gaf', 'Cad_Dl', 'Cad_Twi', 'Bcd_Cad',
       'Dl_Twi', 'Bcd_Dl', 'Bcd_Twi', 'Dl_Dl', 'Zld_Zld', 'Twi_Twi',
       'Bcd_Bcd', 'Cad_Cad', 'Gaf_Gaf'], dtype=object)

In [11]:
import plotnine
from plotnine import *
plotnine.options.figure_size = (8, 2)

# One figure per orientation-independent motif pair, faceted by orientation
# (`motif_pair_raw`), saved as both PNG and PDF.
for pair in tqdm(dfs_all.motif_pair.unique()):
    # Copy the subset so the derived columns are added to an independent frame
    # rather than to a slice of `dfs_all` (avoids pandas' SettingWithCopy ambiguity).
    df = dfs_all[dfs_all.motif_pair==pair].copy()
    # log2 of the fraction columns; the max-based one is not plotted here but is
    # kept available on `df`.
    df['profile/counts_max_ref_log2'] = np.log2(df['profile/counts_max_ref_frac'])
    df['profile/counts_log2'] = np.log2(df['profile/counts_frac'])

    # The colors are a plain list, so they are matched by position; this relies on
    # `task` being categorical with categories in the `motif_colors` key order.
    gsum = (ggplot(data = df, mapping = aes(x='distance', y='profile/counts_log2')) +
     geom_line(aes(color = 'task', alpha = 'featured_task')) +
     facet_wrap('~motif_pair_raw', nrow = 1) +
     geom_hline(yintercept=0, alpha=0.2) +
     scale_x_continuous(breaks = range(0,400,50), name = 'Center-to-center distance (bp)') +
     scale_y_continuous(name = 'Sum log2(fc) of preds.') +
     scale_color_manual(values = list(motif_colors.values()),
                        name = "Task") +
     scale_alpha_manual(values = [.4,1], name = "Main task") +
     theme_minimal())
    gsum.save(f'{figure_filepath}/individual_pairs/insilico_perturb_sum_{pair}.png', height = 2, width = 8)
    gsum.save(f'{figure_filepath}/individual_pairs/insilico_perturb_sum_{pair}.pdf', height = 2, width = 8)

100%|██████████| 21/21 [00:33<00:00,  1.61s/it]
