# Introduction

The purpose of this notebook is to generate combinatorial mutation perturbations of motifs across a given set of regions and to plot the resulting predictions alongside the contribution scores. The steps involved in this process are as follows:

1. Import enhancer regions based on annotated bed file.
2. Import motif regions based on curated set.
3. Match (1) and (2) for general overlaps to cut out motifs for filtering.
4. Repeat step 3, but match motifs to the exact contribution windows and assign an `example_idx` to motifs and enhancers for perturbation analysis.
5. Perturb the actual region sequences based on these motif locations in a combinatorial fashion.
6. Plot the predictions and contribution scores
    + Annotate motif locations in addition to which motifs are being removed.

We will be using the `ZDTBCG` model with enhancer regions curated by Kaelan based on annotated enhancers.

# Computational setup

In [1]:
import warnings
warnings.filterwarnings("ignore")
from tensorflow.python.util import deprecation
deprecation._PRINT_DEPRECATION_WARNINGS = False

#Packages
import os
import sys
import pandas as pd
import numpy as np
from tqdm import tqdm
from pybedtools import BedTool
from bpnet.cli.contrib import bpnet_contrib
from bpnet.preproc import resize_interval

# Settings
os.chdir('/l/Zeitlinger/ZeitlingerLab/Manuscripts/Zelda_and_Nucleosomes/Analysis/analysis/')
pd.set_option('display.max_columns', 100)

# Custom commands
sys.path.insert(0, f'scripts/py')
from bpnet_data_format_functions import myround, myfloor, myceiling, df_to_intervals, tidy_bpnet_predictions_nexus, tidy_bpnet_contributions
from bpnet_motif_functions import remove_palindromic_motif_duplicates, filter_overlapping_motifs_by_priority
from bpnet_perturb_functions import random_seq_onehot, generate_perturbs_across_window, plot_perturbs_across_window

# Shared configuration: every path below is relative to the working directory.

# Where figures are written
figure_filepath = 'figures/5b_binding_enhancer_perturbs'

# Genome FASTA
fasta_file = '../data/indexes/bowtie2/dm6.fa'

# BPNet model directory plus the fold1 contribution and TF-MoDISco outputs
model_dir = 'bpnet/models/optimized_model/fold1/'
contrib_file = 'bpnet/preds/fold1/contrib.h5'
modisco_dir = 'bpnet/modisco/fold1'

# Curated motif instances and grouped regions
curated_motifs = 'bed/mapped_motifs/all_instances_curated_0based.bed'
curated_regions = 'bed/mapped_motifs/all_grouped_regions_0based.bed'

# Enhancer regions: annotated input, 3-column copy, and contribution-score output
enhancer_regions_file = 'bed/enhancers/enhancers_for_models.bed'
enhancer_bed_file = 'bed/enhancers/enhancers_for_models.bed3'
enhancer_contrib_file = 'bpnet/preds/fold1/enhancers_for_models.h5'

# Model tasks and one plotting color per task (paired in order)
tasks = ['Zld', 'Dl', 'Twi', 'Bcd', 'Cad', 'GAF']
task_color_dict = dict(zip(tasks, ['#d53e4f', '#fc8d59', '#fee08b', '#e6f598', '#99d594', '#3288bd']))

Using TensorFlow backend.
2022-08-04 10:14:26,999 [INFO] Note: NumExpr detected 64 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2022-08-04 10:14:27,003 [INFO] NumExpr defaulting to 8 threads.


In [2]:
# Create the output directories (including parents) if they do not exist yet.
# os.makedirs avoids passing an unquoted path through the shell.
os.makedirs(figure_filepath, exist_ok=True)
os.makedirs('tsv/perturbs/binding/enhancer', exist_ok=True)

# Generate contributions across grouped regions

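We will generate contribution scores across our set of enhancer regions. Running the enhancers as their own set keeps the indexes consistent: each enhancer's row in the BED file corresponds to its `example_idx` in the contribution file.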

In [3]:
#%%script false --no-raise-error
# The commented-out cell magic above must stay on the first line; uncommenting it skips this cell.
# Write the first three tab-separated columns of the enhancer regions file to
# `enhancer_bed_file`, overwriting that file if it already exists.
!cat {enhancer_regions_file} | cut -f 1,2,3 > {enhancer_bed_file}

In [4]:
#%%script false --no-raise-error
# Compute DeepLIFT contribution scores over the 3-column enhancer BED file and
# write them to `enhancer_contrib_file`.
# overwrite=True: presumably replaces an existing output file, so the scores are
# recomputed on every run (about two minutes in the logged run).
bpnet_contrib(
    model_dir=model_dir,
    fasta_file=fasta_file,
    regions=enhancer_bed_file,
    method='deeplift',
    output_file=enhancer_contrib_file,
    overwrite=True,
)

2022-08-04 10:14:31,107 [INFO] Loading the config files
2022-08-04 10:14:31,112 [INFO] Creating the dataset
TF-MoDISco is using the TensorFlow backend.
2022-08-04 10:14:54,390 [INFO] Using the following interpretation targets:


Bcd/profile/wn
Bcd/counts/pre-act
Cad/profile/wn
Cad/counts/pre-act
Dl/profile/wn
Dl/counts/pre-act
GAF/profile/wn
GAF/counts/pre-act
Twi/profile/wn
Twi/counts/pre-act
Zld/profile/wn
Zld/counts/pre-act


  0%|          | 0/1.0 [00:00<?, ?it/s]

DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  True
DeepExplain: running 

100%|██████████| 1/1.0 [02:15<00:00, 135.98s/it]
2022-08-04 10:17:12,254 [INFO] Done. Contribution score file was saved to: bpnet/preds/fold1/enhancers_for_models.h5


# Connect curated motifs with enhancer ids 

The enhancer contribution file will have each enhancer as an `example_idx`. Match the curated motifs to the correct range such that we can generate perturbations.

In [5]:
# Intersect the curated motif instances with the enhancer regions so that the
# motif table can be cut down to motifs that touch an enhancer.
# NOTE(review): with default options bedtools intersect reports the overlapping
# portion of each motif, so coordinates here may be clipped - confirm only names are needed.
curated_motifs_bt = BedTool(curated_motifs)
enhancer_regions_bt = BedTool(enhancer_bed_file)
motif_ov_df = curated_motifs_bt.intersect(enhancer_regions_bt).to_dataframe()

In [6]:
#Import motifs (six BED columns; the chromosome column is renamed to `example_chrom`)
motifs_df = BedTool(curated_motifs).to_dataframe()
motifs_df.columns = ['example_chrom','start','end','name','score','strand']

#Filter motifs by their overlap with enhancers.
#The explicit .copy() makes the subset an independent frame, so the column
#assignments below are not chained assignments on a filtered slice.
motifs_df = motifs_df[motifs_df.name.isin(motif_ov_df.name)].copy()

#Separate motif name: split each name on '_' once and reuse the pieces
#(first piece -> pattern_name, second -> motif_id, third -> region_id)
name_parts = [n.split('_') for n in motifs_df.name]
motifs_df['pattern_len'] = motifs_df['end'] - motifs_df['start']
motifs_df['pattern_name'] = [parts[0] for parts in name_parts]
motifs_df['motif_id'] = [parts[1] for parts in name_parts]
motifs_df['region_id'] = [parts[2] for parts in name_parts]
motifs_df.shape

(302, 10)

Get enhancer coordinates over the exact contribution window.

In [7]:
# NOTE(review): this import sits mid-notebook rather than in the import cell at the top.
from bpnet.cli.contrib import ContribFile

#Extract enhancer coordinates
# NOTE(review): `contrib_file` is bound to a path string in the setup cell and is
# rebound here to a ContribFile object opened on the enhancer contribution file.
contrib_file = ContribFile(enhancer_contrib_file)
# One row per scored region; the preview below shows chrom/start/end/strand/idx columns.
enhancer_coords_df = contrib_file.get_ranges()

In [8]:
# Preview the first rows of the enhancer coordinate table.
enhancer_coords_df.head()

Unnamed: 0,chrom,start,end,strand,interval_from_task,idx
0,chr3R,8698828,8699828,.,,0
1,chr3R,8700693,8701693,.,,1
2,chr3R,8694217,8695217,.,,2
3,chr2R,25236999,25237999,.,,3
4,chr2R,25225884,25226884,.,,4


Check which enhancer window each motif falls within. Motifs that are not fully contained in any enhancer window are filtered out.

In [None]:
# Assign every motif to each enhancer window that contains it and record the
# motif position relative to the window start.
# Matches are collected in a list and turned into a DataFrame once; growing a
# DataFrame with .append inside the loop is quadratic and .append no longer
# exists in pandas >= 2.0.
enhancer_rows = list(enhancer_coords_df.itertuples(index=False))  # built once, reused for every motif

matched_rows = []
matched_index = []
for i, motif in tqdm(motifs_df.iterrows(), total=len(motifs_df)):
    for enh in enhancer_rows:
        # Strict inequalities: a motif that starts or ends exactly on the window
        # boundary is not counted as contained.
        if motif.example_chrom == enh.chrom and motif.start > enh.start and motif.end < enh.end:
            row_new = motif.to_dict()  # fresh dict, so the iterated row is never mutated
            row_new['example_idx'] = enh.idx
            row_new['pattern_start'] = motif.start - enh.start
            row_new['pattern_end'] = motif.end - enh.start
            row_new['pattern'] = motif.pattern_name
            matched_rows.append(row_new)
            matched_index.append(i)

# Rows keep the index label of the motif they came from; columns are the motif
# columns followed by the four new ones, in insertion order.
motifs_with_eidx = pd.DataFrame(matched_rows, index=matched_index)

246it [00:08, 30.06it/s]

In [None]:
# Preview the motifs that were matched to an enhancer window.
motifs_with_eidx.head()

Show how many regions have motifs mapped across them.

In [None]:
# Inner value_counts: number of motifs per enhancer (example_idx).
# Outer value_counts: number of enhancers sharing each motif count.
motifs_with_eidx.example_idx.value_counts().value_counts()

In [None]:
# Show the first 10 matched motifs ordered by enhancer index.
motifs_with_eidx.sort_values(['example_idx']).head(n = 10)

# Generate perturbation plots

Here we will carry out the following steps to generate perturbation plots:

1. Import the formatted CWM-scanned motifs from above.
2. For each enhancer:
    + Perturb the motifs from (1) in a combinatorial fashion.
    + Generate (1) profile predictions, (2) contribution scores, and (3) a list of the mutations generated.
    + Tidy information and save as .tsv.gz for more efficient plotting in R.

In [None]:
#Reassign motifs for clarity
# Work on a copy so the `motifs_with_eidx` frame displayed in earlier cells is
# not modified by this cell.
dfi = motifs_with_eidx.copy()
# Number repeated motifs of the same pattern within an enhancer: <pattern>-1, <pattern>-2, ...
dfi['pattern_name_unique'] = dfi['pattern_name'] + '-' + (dfi.groupby(['example_idx','pattern_name']).cumcount()+1).astype(str)
int_columns = ['start','end','example_idx','pattern_start','pattern_end','pattern_len']
dfi[int_columns] = dfi[int_columns].astype(int)

#Reassign enhancers for clarity
# Enhancer names come from the annotated BED file and are attached by position,
# i.e. row k of the BED file names row k of the contribution ranges.
enhancer_names = BedTool(enhancer_regions_file).to_dataframe().name
if len(enhancer_names) != len(enhancer_coords_df):
    # Index-aligned assignment would silently produce NaN/misassigned names here.
    raise ValueError(
        f'{len(enhancer_names)} enhancers in {enhancer_regions_file} but '
        f'{len(enhancer_coords_df)} ranges in {enhancer_contrib_file}; names cannot be matched by order.'
    )
enhancers_df = enhancer_coords_df.copy()
enhancers_df['name'] = enhancer_names.values

Iterate through enhancers and then collect perturbation profiles for each single motif mutation across the enhancers.

In [None]:
# NOTE(review): these two settings are not referenced anywhere in this cell.
max_perturbs_per_mut = 1
xrange = [100, 900]

# For every enhancer with mapped motifs: predict profiles for each mutation,
# tidy them into one long table and write that table to disk.
for ei in tqdm(dfi.example_idx.unique()):
    #Subset motifs by window
    dfi_across_window = dfi[dfi.example_idx==ei]
    enhancer_df = enhancers_df[enhancers_df.idx==ei]
    # Scalar metadata of this enhancer (first matching row).
    enhancer_name = enhancer_df['name'].iloc[0]
    enhancer_idx = enhancer_df['idx'].iloc[0]
    enhancer_start = int(enhancer_df['start'].iloc[0])

    #Generate perturbation predictions, contribution, and a mutant list
    # comb_max = 1: only single-motif mutations; contributions are not requested.
    perturb_preds, _, muts = generate_perturbs_across_window(dfi = dfi_across_window,
                                                             contrib_file = enhancer_contrib_file,
                                                             comb_max = 1,
                                                             model_dir = model_dir,
                                                             return_contrib = False)

    #Save profile predictions
    # Entry i of every task's prediction array belongs to mutation muts[i].
    profile_pred_frames = []
    for i,m in enumerate(muts):
        preds_for_mut = {k: v[i] for k,v in perturb_preds.items()}
        profile_pred_df = tidy_bpnet_predictions_nexus(preds_for_mut, tasks = perturb_preds.keys())
        profile_pred_df['mut']=m
        profile_pred_frames.append(profile_pred_df)
    # Concatenate once instead of growing a DataFrame with .append in the loop.
    profile_preds_df = pd.concat(profile_pred_frames)

    # Assign scalars so every row is annotated. Assigning the one-row Series
    # `enhancer_df.name` / `enhancer_df.idx` aligned on index labels and left
    # all non-matching rows as NaN.
    profile_preds_df['enhancer_name'] = enhancer_name
    profile_preds_df['enhancer_idx'] = enhancer_idx
    profile_preds_df['genomic_position_0based'] = enhancer_start + profile_preds_df.position
    # NOTE(review): to_csv uses its default ',' separator, so the file is
    # comma-delimited despite the .tsv.gz name; left as-is for existing readers.
    profile_preds_df.to_csv(f"tsv/perturbs/binding/enhancer/{enhancer_name}_predictions.tsv.gz",
                            index = False)

Extract enhancer information for subsequent plotting in later analysis. 

In [None]:
# Write one motif-information table per enhancer, annotated with the enhancer's
# name, index and 0-based window coordinates.
for ei in tqdm(dfi.example_idx.unique()):
    #Subset motifs by window
    # .copy() gives an independent frame, so the column assignments below are
    # not chained assignments on a filtered slice of `dfi`.
    dfi_across_window = dfi[dfi.example_idx==ei].copy()
    enhancer_df = enhancers_df[enhancers_df.idx==ei]

    #Save motif information
    dfi_across_window['enhancer_name'] = enhancer_df.name.iloc[0]
    dfi_across_window['enhancer_idx'] = enhancer_df.idx.iloc[0]
    dfi_across_window['enhancer_start_0based'] = enhancer_df.start.iloc[0]
    dfi_across_window['enhancer_end_0based'] = enhancer_df.end.iloc[0]
    dfi_across_window.to_csv(f"tsv/perturbs/binding/enhancer/{enhancer_df['name'].iloc[0]}_motif_info_0based.csv.gz",
                             index = False)

In [None]:
# Display the motif table of the last enhancer processed by the loop above.
dfi_across_window