# Introduction

The purpose of this `.ipynb` is to use the `TSC TFs tranied BPNet model` to predict binding on genomic regions where Tfap2c and Tead4 have distance replationship and plot change in predicted binding upon their distance manipulation.

# Computational setup

In [38]:
import warnings
warnings.filterwarnings("ignore")
from tensorflow.python.util import deprecation
deprecation._PRINT_DEPRECATION_WARNINGS = False

#Packages
import os
import sys
import pandas as pd
import numpy as np
from pybedtools import BedTool
from genomelake.extractors import FastaExtractor
from bpnet.extractors import Interval
from bpnet.cli.contrib import bpnet_contrib
from bpnet.cli.modisco import cwm_scan
from bpnet.cli.contrib import ContribFile
from concise.preprocessing import encodeDNA
from bpnet.cli.contrib import ContribFile
from bpnet.simulate import random_seq, insert_motif
from bpnet.plot.tracks import plot_tracks, to_neg
from plotnine import *
from tqdm import tqdm
from bpnet.BPNet import BPNetSeqModel
from bpnet.utils import create_tf_session

#create_tf_session('0', .4)

# Settings
os.chdir(f'/n/projects/kd2200/publication/bpnet/analysis/test/')
pd.set_option('display.max_columns', 100)


# Custom commands
sys.path.insert(0, f'/n/projects/kd2200/publication/bpnet/analysis/scripts/bpnet/scripts')
from data_format_functions import myround, myfloor, myceiling, df_to_intervals,tidy_bpnet_predictions_nexus, one_hot_decode
from motif_functions import remove_palindromic_motif_duplicates, filter_overlapping_motifs_by_priority


# function to return key for any value 
def get_key(val, my_dict): 
    for key, value in my_dict.items(): 
        if val == value: 
            return key 
    return "key doesn't exist"

#Function to save contribution information with 4 columns and a task (does not use hypoth. contrib)
def tidy_bpnet_contributions(seq, contrib, tasks, contrib_type = 'profile'):
    contrib_df = pd.DataFrame()
    for task in tasks:
        c = contrib[f'{task}/{contrib_type}'] * seq
        df = pd.DataFrame(c, columns = ['A','C','G','T'])
        df['task'] = task
        contrib_df=contrib_df.append(df)
    return(contrib_df)

In [39]:
#Pre-existing variables
fasta_file = f'/n/projects/kd2200/publication/bpnet/fasta/mm10.fa'
model_dir = f'/n/projects/kd2200/publication/bpnet/model/dataspec.yaml_default_fold_5/'
contrib_file = f"/n/projects/kd2200/publication/bpnet/contrib/dataspec.yaml_default_fold_5/contrib.h5"
modisco_dir = f'/n/projects/kd2200/publication/bpnet/modisco/profile/dataspec.yaml_default_fold_5/'
modisco_all_region_cwm_dir = f'/n/projects/kd2200/publication/bpnet/cwm_all_regions/'
tasks = ['tead4','tfap2c','yap1','gata3','cdx2']

# Independent variables
motifs_of_interest_dict = {'tead4': 'tead4/m0_p0',
                           'tfap2c': 'tfap2c/m0_p0',
                            'cdx2': 'cdx2/m0_p0',
                            'gata3': 'gata3/m0_p4'}
# Dependent variables
enhancers_path = f'enhancers_genes_tead4_tfap2c_dis_0based.bed'
enhancers_bed3 = f'enhancers_genes_tead4_tfap2c_dis_0based.bed3'
tfap2c_path = f'genes_tfap2c_dis_0based.bed'
tead4_path = f'genes_tead4_dis_0based.bed'
prefix = 'tt_enhancers'
enhancer_bed_file = f'{prefix}.bed'
cwm_scan_prefix = f'cwm_scan/{prefix}'
enhancer_contrib_file = f"contrib/{prefix}_contrib.h5"

In [42]:
%%script false --no-raise-error
!mkdir -p {cwm_scan_prefix}
!mkdir -p bed
!mkdir -p contrib
!mkdir -p figures/1_{prefix}_tt_distance

# Generate contributions based on enhancer coordinates

Generate contributions, then run CWM-scanning to map all pertinent motifs.

In [43]:
!cut -f 1-3 {enhancers_path} > {enhancers_bed3}

In [44]:
%%script false --no-raise-error
!bpnet contrib --regions {enhancers_bed3} --fasta-file {fasta_file} --method deeplift --overwrite  {model_dir} {enhancer_contrib_file}

In [45]:
%%script false --no-raise-error
# For each task, CWM-scan the enhancers. The `example_idx` will be the same across tasks
for task in ['tead4','tfap2c','cdx2','gata3','yap1']:
    print(task)
    cwm_scan(modisco_dir = f'{modisco_dir}/{task}', 
             output_file = f'{cwm_scan_prefix}/enhancer_cwm_scan_instances.{task}.tsv.gz', 
             contrib_file = enhancer_contrib_file)     

2023-12-28 12:11:03,898 [INFO] Using tasks: ['tead4']
2023-12-28 12:11:03,900 [INFO] Loading the contribution scores from: contrib/tt_enhancers_contrib.h5
2023-12-28 12:11:03,907 [INFO] Centroid matches already exist.
2023-12-28 12:11:03,909 [INFO] Loading centroid matches from /n/projects/kd2200/publication/bpnet/modisco/profile/dataspec.yaml_default_fold_5/tead4/cwm-scan-seqlets.trim-frac=0.08.csv.gz
2023-12-28 12:11:04,285 [INFO] Scanning for patterns
100%|██████████| 11/11 [00:21<00:00,  1.94s/it]
2023-12-28 12:11:25,672 [INFO] Merging
2023-12-28 12:11:25,721 [INFO] Append ranges
2023-12-28 12:11:25,732 [INFO] Table info
2023-12-28 12:11:25,745 [INFO] Writing the resuling pd.DataFrame of shape (838, 30) to cwm_scan/tt_enhancers/enhancer_cwm_scan_instances.tead4.tsv.gz
2023-12-28 12:11:25,747 [INFO] Writing a tsv file
2023-12-28 12:11:25,801 [INFO] Done!
2023-12-28 12:11:25,826 [INFO] Using tasks: ['tfap2c']
2023-12-28 12:11:25,826 [INFO] Loading the contribution scores from: contri

# Collect annotated genomic loci

Import the regions for downstream analysis. Because the enhancers are already a bed file, no conversion is needed.

In [46]:
#Import regions and the genomic coordinates.

enhancers_df = BedTool(enhancers_path).to_dataframe().sort_values(['name']).reset_index(drop = True)
tfap2c_df = BedTool(tfap2c_path).to_dataframe().sort_values(['name']).reset_index(drop = True)
tead4_df = BedTool(tead4_path).to_dataframe().sort_values(['name']).reset_index(drop = True)
enhancers_bed = BedTool.from_dataframe(enhancers_df)
tfap2c_bed = BedTool.from_dataframe(tfap2c_df)
tead4_bed = BedTool.from_dataframe(tead4_df)

#Ensure everything is in the same order and unique
assert(all(enhancers_df.name.values==tfap2c_df.name.values)==all(enhancers_df.name.values==tead4_df.name.values), 
       'Either motifs occur more than once in region OR the motifs and enhancers are not in the same order')
    

In [None]:
enhancers_df

In [None]:
tfap2c_df

Import enhancers, motifs, and collect coordinates for sequence mutation.

In [47]:
#Assign in-window coordinates to these motifs (watch enhancer id!)
tfap2c_df['window_enhancer_start'] = enhancers_df.start
tead4_df['window_enhancer_start'] = enhancers_df.start
tfap2c_df['window_enhancer_end'] = enhancers_df.end
tead4_df['window_enhancer_end'] = enhancers_df.end

tfap2c_df['window_motif_start'] = [m.start - m.window_enhancer_start for i,m in tfap2c_df.iterrows()]
tead4_df['window_motif_start'] = [m.start - m.window_enhancer_start for i,m in tead4_df.iterrows()]

tfap2c_df['window_motif_end'] = [m.end - m.window_enhancer_start for i,m in tfap2c_df.iterrows()]
tead4_df['window_motif_end'] = [m.end - m.window_enhancer_start for i,m in tead4_df.iterrows()]

tfap2c_df['window_motif_center'] = np.floor((tfap2c_df.window_motif_end - tfap2c_df.window_motif_start)/2) + tfap2c_df.window_motif_start
tead4_df['window_motif_center'] = np.floor((tead4_df.window_motif_end - tead4_df.window_motif_start)/2) + tead4_df.window_motif_start

# Mark where the motif boundaries are and store in enhancer_df
enhancers_df['tfap2c_start_genomic'] = tfap2c_df.start
enhancers_df['tfap2c_end_genomic'] = tfap2c_df.end
enhancers_df['tead4_start_genomic'] = tead4_df.start
enhancers_df['tead4_end_genomic'] = tead4_df.end

enhancers_df['tfap2c_center_genomic'] = np.floor((tfap2c_df.end - tfap2c_df.start)/2) + tfap2c_df.start
enhancers_df['tead4_center_genomic'] = np.floor((tead4_df.end - tead4_df.start)/2) + tead4_df.start

#Mark distance between the two motifs, with tfap2c as the anchor.
enhancers_df['tead4_minus_tfap2c_distance'] = enhancers_df['tfap2c_center_genomic'] - enhancers_df['tead4_center_genomic']

#enhancers_df[enhancers_df.name==119]

In [27]:
fasta_file

'/n/projects/kd2200/publication/bpnet/fasta/mm10.fa'

In [None]:
tfap2c_df.head(n=15)

In [None]:
tfap2c_df.head(n=15)

Confirm that the sequences found by BPNet match the sequences based off of the genomic coordinates of the .fasta file. To do this, we need to collect sequences and hyper-contributions across each enhancer in order to mutate them at the targeted sites.

In [48]:
# Get sequence and contribution 
c = ContribFile(enhancer_contrib_file)
seqs = c.get_seq()
coords = c.get_ranges()
hyp_contrib = c.get_hyp_contrib()

In [37]:
x =tead4_bed[1]
x
#[one_hot_decode(i) for i in FastaExtractor(fasta_file)([Interval.from_pybedtools(i) for i in x])]
tfap2c_df

Unnamed: 0,chrom,start,end,name,score,strand,window_enhancer_start,window_enhancer_end,window_motif_start,window_motif_end,window_motif_center
0,chr1,33981094,33981106,0,0,+,33980621,33981621,473,485,479.0
1,chr1,34716429,34716441,1,0,-,146482883,146483883,-111766454,-111766442,-111766448.0
2,chr1,38259714,38259726,2,0,+,3206597,3207597,35053117,35053129,35053123.0
3,chr1,53939591,53939603,3,0,-,4716425,4717425,49223166,49223178,49223172.0
4,chr1,60852606,60852618,4,0,+,18749498,18750498,42103108,42103120,42103114.0
...,...,...,...,...,...,...,...,...,...,...,...
150,chr19,54982668,54982680,150,0,+,79261639,79262639,-24278971,-24278959,-24278965.0
151,chr19,55714436,55714448,151,0,+,84128795,84129795,-28414359,-28414347,-28414353.0
152,chrX,93690800,93690812,152,0,+,87089787,87090787,6601013,6601025,6601019.0
153,chrX,167414659,167414671,153,0,+,32632323,32633323,134782336,134782348,134782342.0


Create a check that looks at every sequence to make sure it is correct.

In [49]:
#Extract BPNet-contained sequences
tfap2c_bpnet_seqs = [one_hot_decode(seqs[row.name][row.window_motif_start:row.window_motif_end]) for i,row in tfap2c_df.iterrows()]
tfap2c_fasta_seqs = [one_hot_decode(i) for i in FastaExtractor(fasta_file)([Interval.from_pybedtools(i) for i in tfap2c_bed])]
#assert tfap2c_bpnet_seqs==tfap2c_fasta_seqs, 'BPNet sequences and genomic sequences do not match. Correct this before proceeding.'

#tead4_bpnet_seqs = [one_hot_decode(seqs[row.name][row.window_motif_start:row.window_motif_end]) for i,row in tead4_df.iterrows()]
#tead4_fasta_seqs = [one_hot_decode(i) for i in FastaExtractor(fasta_file)([Interval.from_pybedtools(i) for i in tead4_bed])]
#assert tead4_bpnet_seqs==tead4_fasta_seqs, 'BPNet sequences and genomic sequences do not match. Correct this before proceeding.'

ValueError: encoded array not the same length as given seq

# Begin generating perturbations based on 2-mutation dynamic approach

TODO: Explain approach in normal words.

Define distances that we want to inject motifs across.

In [50]:
distances = range(-200, 200, 1)
mutation_count = 2 #how many bases we will mutate in a targeted sense

## Generate all sequences 

Across every enhancer, motif, and distance generate the WT, mutant, and mutant-distance injection. Combine into array while storing index information in a separate `pd.df`.

In [51]:
#Preallocate objects for writing.
wt_seqs = []
mut_seqs = []
mut_inj_seqs = []
wt_info_all_df = pd.DataFrame()
mut_inj_info_all_df = pd.DataFrame()

#test = enhancers_df.head(n=118)
for enhancer_id,enhancer_row in tqdm(enhancers_df.iterrows()):
    
    for anchor_motif in ['tfap2c', 'tead4']:
        #Define motif coordinates
        if anchor_motif=='tfap2c':
            anchor_motif_coords = tfap2c_df.loc[enhancer_id] 
            moved_motif = 'tead4'
            moved_motif_coords = tead4_df.loc[enhancer_id]
        else:
            anchor_motif_coords = tead4_df.loc[enhancer_id]
            moved_motif = 'tfap2c'
            moved_motif_coords = tfap2c_df.loc[enhancer_id]
        #Define input sequence across enhancer
        wt_seq = seqs[enhancer_id] 
        wt_seqs.append(wt_seq)
        
        #Find the minimum hyp contribution across all positions
        hyp_contrib_across_motif_arr = hyp_contrib[moved_motif][enhancer_id][moved_motif_coords.window_motif_start:moved_motif_coords.window_motif_end]
        min_contrib_across_motif_arr = np.min(hyp_contrib_across_motif_arr, axis = 1)
        min_indices_across_motif_arr = np.stack([np.where(pos==min_contrib_across_motif_arr[i])[0].flatten() for i,pos in enumerate(hyp_contrib_across_motif_arr)], axis = 0).flatten()
        seq_across_motif_arr = wt_seq[moved_motif_coords.window_motif_start:moved_motif_coords.window_motif_end]

        #Find the top contributing indices across the motif
        contrib_across_motif_arr = np.sum(hyp_contrib_across_motif_arr * seq_across_motif_arr, axis = 1)
        top_contributing_indices = (-contrib_across_motif_arr).argsort()[:mutation_count]

        #Reassign the worst contributing nt for each mutation position.
        mut_motif_seq = []
        for i in range(seq_across_motif_arr.shape[0]):
            if i in top_contributing_indices:
                row = np.array([0,0,0,0], dtype = 'float32')
                row[min_indices_across_motif_arr[i]] = 1
            else: 
                row = seq_across_motif_arr[i]
            mut_motif_seq.append(row)
        mut_motif_seq = np.stack(mut_motif_seq)

        #Reinject the motif sequence back into the entire window
        mut_seq = wt_seq.copy()
        mut_seq[moved_motif_coords.window_motif_start:moved_motif_coords.window_motif_end] = mut_motif_seq
        mut_seqs.append(mut_seq)
  
        #Record all perturbation information in a single pd.df (defined by enhancer_id)
        wt_info_df = pd.DataFrame([enhancer_id, anchor_motif, anchor_motif_coords.window_motif_start, anchor_motif_coords.window_motif_end, anchor_motif_coords.window_motif_center,
                                moved_motif, moved_motif_coords.window_motif_start, moved_motif_coords.window_motif_end, moved_motif_coords.window_motif_center]).transpose()
        wt_info_df.columns = ['enhancer_id',
                           'anchor_motif','anchor_motif_window_start_0based','anchor_motif_window_end_0based','anchor_motif_window_center_0based',
                           'moved_motif','moved_motif_window_orig_start_0based','moved_motif_window_orig_end_0based','moved_motif_window_orig_center_0based']
        wt_info_df['rank_positions'] = ','.join([str(x) for x in top_contributing_indices])
        wt_info_df['injected_motif_seq'] = one_hot_decode(seq_across_motif_arr)
        wt_info_df['mutated_motif_seq'] = one_hot_decode(mut_motif_seq)
        wt_info_df['anchor_motif_seq'] = one_hot_decode(wt_seq[anchor_motif_coords.window_motif_start:anchor_motif_coords.window_motif_end])
        wt_info_all_df = wt_info_all_df.append(wt_info_df)
        
        for d in distances:
            
            #Add check to make sure distance injection doesn't move off window
            check1 = (d + anchor_motif_coords.window_motif_start) <= 0
            check2 = (d + anchor_motif_coords.window_motif_end) >= (anchor_motif_coords.window_enhancer_end - anchor_motif_coords.window_enhancer_start - 1) 
            if not (check1 or check2):

                #Inject original motif into the designated distance 
                mut_inj_seq = mut_seq.copy()

                #Define distance boundaries based on anchor center
                injected_moved_motif_center = int(anchor_motif_coords.window_motif_center + d)
                upstream_to_center_moved_motif_distance = moved_motif_coords.window_motif_center - moved_motif_coords.window_motif_start
                downstream_to_center_moved_motif_distance = moved_motif_coords.window_motif_end - moved_motif_coords.window_motif_center 
                injected_moved_motif_boundaries = (int(injected_moved_motif_center - upstream_to_center_moved_motif_distance), 
                                                   int(injected_moved_motif_center + downstream_to_center_moved_motif_distance))
                
                #Add check to make sure distance doesn't inject over another motif.
                inj_motif_int = pd.Interval(injected_moved_motif_boundaries[0], injected_moved_motif_boundaries[1])
                anchor_motif_int = pd.Interval(anchor_motif_coords.window_motif_start, anchor_motif_coords.window_motif_end)
                if not inj_motif_int.overlaps(anchor_motif_int):
                    #Add check to make sure your mutation and injected sequence cover the same coordinates
                    test_mut = mut_inj_seq[injected_moved_motif_boundaries[0]:injected_moved_motif_boundaries[1]].shape == seq_across_motif_arr.shape
                    assert(test_mut, 'You are injecting a misproportioned sequence, check coordinates.')
                    mut_inj_seq[injected_moved_motif_boundaries[0]:injected_moved_motif_boundaries[1]] = seq_across_motif_arr
                    mut_inj_seqs.append(mut_inj_seq)

                    #Record all perturbation information in a single pd.df (distance_id)
                    mut_inj_info_df = wt_info_df
                    mut_inj_info_df['distance_between_moved_center_and_anchor_center'] = d
                    mut_inj_info_df['distance_id'] = str(mut_inj_info_df.enhancer_id.values[0]) + '_' + str(d)
                    mut_inj_info_all_df = mut_inj_info_all_df.append(mut_inj_info_df)
                ##below commented out is the alternative approach to get rank, high contributing nucleotides and mutated nucleotides:
                # df = pd.DataFrame([top_contributing_indices, list(range(mutation_count)), 
                #              list(one_hot_decode(seq_across_motif_arr[top_contributing_indices])),
                #              list(one_hot_decode(mut_motif_seq[top_contributing_indices]))]).transpose()
                # df.columns = ['mutation_index_across_motif', 'mutation_rank', 'wt_nt','mut_nt']
                # df.pivot(columns='mutation_rank', values=['mutation_index_across_motif', 'wt_nt','mut_nt'])
wt_seqs = np.array(wt_seqs)
mut_seqs = np.array(mut_seqs)
mut_inj_seqs = np.array(mut_inj_seqs)

#Add checks to make sure correct dimensions match enhancer ids
assert wt_seqs.shape[0]==mut_seqs.shape[0]==wt_info_all_df.shape[0], 'WT and mut arrays dont match their index'
assert mut_inj_seqs.shape[0]==mut_inj_info_all_df.shape[0], 'WT and mut arrays dont match their index'


155it [39:34, 15.32s/it]


In [54]:
mut_inj_info_all_df.head

<bound method NDFrame.head of    enhancer_id anchor_motif anchor_motif_window_start_0based  \
0            0       tfap2c                              473   
0            0       tfap2c                              473   
0            0       tfap2c                              473   
0            0       tfap2c                              473   
0            0       tfap2c                              473   
..         ...          ...                              ...   
0          154        tead4                              372   
0          154        tead4                              372   
0          154        tead4                              372   
0          154        tead4                              372   
0          154        tead4                              372   

   anchor_motif_window_end_0based anchor_motif_window_center_0based  \
0                             485                               479   
0                             485                          

Export full seqs to csv.gz and make sure full seqs match.

In [53]:
wt_test_df = pd.DataFrame([list(one_hot_decode(wt_seqs[i])) for i in range(wt_seqs.shape[0])]).transpose()
mut_test_df = pd.DataFrame([list(one_hot_decode(mut_seqs[i])) for i in range(mut_seqs.shape[0])]).transpose()
mut_inj_test_df = pd.DataFrame([list(one_hot_decode(mut_inj_seqs[i])) for i in range(mut_inj_seqs.shape[0])]).transpose()
wt_test_df.to_csv('tmp/wt_seqs.tsv', sep = '\t')
mut_test_df.to_csv('tmp/mut_seqs.tsv', sep = '\t')
mut_inj_test_df.to_csv('tmp/mut_inj_seqs.tsv', sep = '\t')

# Predict each sequence set

Predict the WT, mutated, and mutated-injected sequences that were generated above. First, import the model of interest to generate predictions.

In [55]:
model = BPNetSeqModel.from_mdir(model_dir)









The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.





Get predictions from each set of sequences.

In [56]:
#Get predictions from sequences
wt_preds = model.predict(wt_seqs)
mut_preds = model.predict(mut_seqs)
mut_inj_preds = model.predict(mut_inj_seqs)

Measure the whole-window counts summary of these perturbations, and link them into a single data frame. Add motif-range measurements in addition to whole-window measurements. Make this dynamic based on the injection site of distance.

First, define the window range for the motif-range measurements.

In [57]:
motif_range_width = 50

Next, collect measurements.

In [58]:
#Measure WT and mutant task predictions across the whole window
wt_preds_df = pd.DataFrame.from_dict({k: np.sum(v, axis = (1,2)) for k,v in wt_preds.items()})
wt_preds_df.columns = [i + '_wt_whole-range' for i in wt_preds_df.columns]
mut_preds_df = pd.DataFrame.from_dict({k:np.sum(v, axis = (1,2)) for k,v in mut_preds.items()})
mut_preds_df.columns = [i + '_mut_whole-range' for i in mut_preds_df.columns]
wt_info_w_preds_df = pd.concat([wt_info_all_df.reset_index(), wt_preds_df.reset_index(), mut_preds_df.reset_index()], axis=1)

In [None]:
wt_info_w_preds_df.shape[0]

In [59]:
#Measure mut_inj predictions across the whole window for every distance
mut_inj_preds_df = pd.DataFrame.from_dict({k: np.sum(v, axis = (1,2)) for k,v in mut_inj_preds.items()})
mut_inj_preds_df.columns = [i + '_mut-inj_whole-range' for i in mut_inj_preds_df.columns]
mut_inj_info_w_preds_df = pd.concat([mut_inj_info_all_df.reset_index(), mut_inj_preds_df.reset_index()], axis=1)
mut_inj_info_w_preds_df['genomic_moved_minus_anchor_motif_distance'] = mut_inj_info_w_preds_df.moved_motif_window_orig_center_0based - mut_inj_info_w_preds_df.anchor_motif_window_center_0based
whole_window_summary_df = mut_inj_info_w_preds_df.merge(wt_info_w_preds_df[['enhancer_id', 'anchor_motif', 'moved_motif'] + list(wt_preds_df.columns) +  list(mut_preds_df.columns)], 
                              on=['enhancer_id', 'anchor_motif', 'moved_motif'], how = 'left')

In [60]:
mut_inj_info_w_preds_df.shape[0]

117486

For each set of enhancers, anchor/move motifs, and distances, extract the motif-window measurements.

In [61]:
motif_window_predictions = pd.DataFrame()
for i,row in tqdm(whole_window_summary_df.iterrows()):
    injected_center = row.anchor_motif_window_center_0based - row.distance_between_moved_center_and_anchor_center
    injected_coords = (injected_center - (row.moved_motif_window_orig_center_0based - row.moved_motif_window_orig_start_0based),
                       injected_center - (row.moved_motif_window_orig_end_0based - row.moved_motif_window_orig_center_0based))
    injected_motif_window = (int(injected_center - np.floor(motif_range_width/2)), 
                             int(injected_center + np.floor(motif_range_width/2)))
    
    #Define enhancer_id and anchor_motif to index WT_preds and MUT_preds
    enhancer_id = row.enhancer_id
    anchor_motif = row.anchor_motif
    wt_and_mut_index = int(wt_info_w_preds_df[(wt_info_w_preds_df.enhancer_id==enhancer_id) & (wt_info_w_preds_df.anchor_motif==anchor_motif)].index.values[0])
    print(wt_and_mut_index)                   
    #Measure motif range across 3 types for injected window
    i_wt_preds_df = pd.DataFrame({k: np.sum(v[wt_and_mut_index, injected_motif_window[0]:injected_motif_window[1], :], axis = (0,1)) 
                                          for k,v in wt_preds.items()}, index=[0])
    i_wt_preds_df.columns = [i + '_wt_motif-range_inj-window' for i in i_wt_preds_df.columns]
    i_mut_preds_df = pd.DataFrame({k:np.sum(v[wt_and_mut_index, injected_motif_window[0]:injected_motif_window[1], :], axis = (0,1)) 
                                           for k,v in mut_preds.items()}, index=[0])
    i_mut_preds_df.columns = [i + '_mut_motif-range_inj-window' for i in i_mut_preds_df.columns]    
    i_mut_inj_preds_df = pd.DataFrame({k: np.sum(v[i, injected_motif_window[0]:injected_motif_window[1], :], axis = (0,1)) 
                                               for k,v in mut_inj_preds.items()}, index=[0])
    i_mut_inj_preds_df.columns = [i + '_mut-inj_motif-range_inj-window' for i in i_mut_inj_preds_df.columns]
                      
    #Measure motif range across 3 types for anchor window
    anchor_motif_window = (int(row.anchor_motif_window_center_0based - np.floor(motif_range_width/2)), 
                           int(row.anchor_motif_window_center_0based + np.floor(motif_range_width/2)))                            
    a_wt_preds_df = pd.DataFrame({k: np.sum(v[wt_and_mut_index, anchor_motif_window[0]:anchor_motif_window[1], :], axis = (0,1)) 
                                          for k,v in wt_preds.items()}, index=[0])
    a_wt_preds_df.columns = [i + '_wt_motif-range_anchor-window' for i in a_wt_preds_df.columns]
    a_mut_preds_df = pd.DataFrame({k:np.sum(v[wt_and_mut_index, anchor_motif_window[0]:anchor_motif_window[1], :], axis = (0,1)) 
                                           for k,v in mut_preds.items()}, index=[0])
    a_mut_preds_df.columns = [i + '_mut_motif-range_anchor-window' for i in a_mut_preds_df.columns]   
    a_mut_inj_preds_df = pd.DataFrame({k: np.sum(v[i, anchor_motif_window[0]:anchor_motif_window[1], :], axis = (0,1)) 
                                               for k,v in mut_inj_preds.items()}, index=[0])
    a_mut_inj_preds_df.columns = [i + '_mut-inj_motif-range_anchor-window' for i in a_mut_inj_preds_df.columns]
    
    #Measure motif range across 3 types for mutation window
    moved_orig_motif_window = (int(row.moved_motif_window_orig_center_0based - np.floor(motif_range_width/2)), 
                               int(row.moved_motif_window_orig_center_0based + np.floor(motif_range_width/2)))                            
    m_wt_preds_df = pd.DataFrame({k: np.sum(v[wt_and_mut_index, moved_orig_motif_window[0]:moved_orig_motif_window[1], :], axis = (0,1)) 
                                          for k,v in wt_preds.items()}, index=[0])
    m_wt_preds_df.columns = [i + '_wt_motif-range_mut-window' for i in m_wt_preds_df.columns]
    m_mut_preds_df = pd.DataFrame({k:np.sum(v[wt_and_mut_index, moved_orig_motif_window[0]:moved_orig_motif_window[1], :], axis = (0,1)) 
                                           for k,v in mut_preds.items()}, index=[0])
    m_mut_preds_df.columns = [i + '_mut_motif-range_mut-window' for i in m_mut_preds_df.columns]    
    m_mut_inj_preds_df = pd.DataFrame({k: np.sum(v[i, moved_orig_motif_window[0]:moved_orig_motif_window[1], :], axis = (0,1)) 
                                               for k,v in mut_inj_preds.items()}, index=[0])
    m_mut_inj_preds_df.columns = [i + '_mut-inj_motif-range_mut-window' for i in m_mut_inj_preds_df.columns]
                               
    motif_window_prediction = pd.concat([a_wt_preds_df, a_mut_preds_df, a_mut_inj_preds_df,
                                         m_wt_preds_df, m_mut_preds_df, m_mut_inj_preds_df,
                                         i_wt_preds_df, i_mut_preds_df, i_mut_inj_preds_df],axis = 1)  

    motif_window_predictions = motif_window_predictions.append(motif_window_prediction)
                               
assert motif_window_predictions.shape[0]==whole_window_summary_df.shape[0]
final_pred_summary_df = pd.concat([whole_window_summary_df,motif_window_predictions.reset_index()], axis = 1)                              

117486it [27:06, 72.21it/s]


In [62]:
final_pred_summary_df.head(n=5)

Unnamed: 0,index,enhancer_id,anchor_motif,anchor_motif_window_start_0based,anchor_motif_window_end_0based,anchor_motif_window_center_0based,moved_motif,moved_motif_window_orig_start_0based,moved_motif_window_orig_end_0based,moved_motif_window_orig_center_0based,rank_positions,injected_motif_seq,mutated_motif_seq,anchor_motif_seq,distance_between_moved_center_and_anchor_center,distance_id,index.1,cdx2_mut-inj_whole-range,tfap2c_mut-inj_whole-range,tead4_mut-inj_whole-range,yap1_mut-inj_whole-range,gata3_mut-inj_whole-range,genomic_moved_minus_anchor_motif_distance,cdx2_wt_whole-range,tfap2c_wt_whole-range,tead4_wt_whole-range,yap1_wt_whole-range,gata3_wt_whole-range,cdx2_mut_whole-range,tfap2c_mut_whole-range,tead4_mut_whole-range,yap1_mut_whole-range,gata3_mut_whole-range,index.2,cdx2_wt_motif-range_anchor-window,tfap2c_wt_motif-range_anchor-window,tead4_wt_motif-range_anchor-window,yap1_wt_motif-range_anchor-window,gata3_wt_motif-range_anchor-window,cdx2_mut_motif-range_anchor-window,tfap2c_mut_motif-range_anchor-window,tead4_mut_motif-range_anchor-window,yap1_mut_motif-range_anchor-window,gata3_mut_motif-range_anchor-window,cdx2_mut-inj_motif-range_anchor-window,tfap2c_mut-inj_motif-range_anchor-window,tead4_mut-inj_motif-range_anchor-window,yap1_mut-inj_motif-range_anchor-window,gata3_mut-inj_motif-range_anchor-window,cdx2_wt_motif-range_mut-window,tfap2c_wt_motif-range_mut-window,tead4_wt_motif-range_mut-window,yap1_wt_motif-range_mut-window,gata3_wt_motif-range_mut-window,cdx2_mut_motif-range_mut-window,tfap2c_mut_motif-range_mut-window,tead4_mut_motif-range_mut-window,yap1_mut_motif-range_mut-window,gata3_mut_motif-range_mut-window,cdx2_mut-inj_motif-range_mut-window,tfap2c_mut-inj_motif-range_mut-window,tead4_mut-inj_motif-range_mut-window,yap1_mut-inj_motif-range_mut-window,gata3_mut-inj_motif-range_mut-window,cdx2_wt_motif-range_inj-window,tfap2c_wt_motif-range_inj-window,tead4_wt_motif-range_inj-window,yap1_wt_motif-range_inj-window,gata3_wt_motif-range_inj-window,cdx2_mut_motif-range_inj-window,tfap2c_mut_motif-range_inj-window,tead4_mut_motif-range_inj-window,yap1_mut_motif-range_inj-window,gata3_mut_motif-range_inj-window,cdx2_mut-inj_motif-range_inj-window,tfap2c_mut-inj_motif-range_inj-window,tead4_mut-inj_motif-range_inj-window,yap1_mut-inj_motif-range_inj-window,gata3_mut-inj_motif-range_inj-window
0,0,0,tfap2c,473,485,479,tead4,516,526,521,67,CCTGGAATGT,CCTGGAGAGT,CCCCCTGAGGCA,-200,0_-200,0,32.586403,17.43948,41.252026,17.924112,11.688992,42,40.289711,22.881042,82.098846,25.748079,12.507404,32.175621,18.674313,21.664646,13.90106,12.43005,0,3.78507,5.57162,9.336608,3.556051,1.061003,1.98358,4.03511,1.170352,0.901606,0.900368,1.864665,3.676927,1.489836,1.030099,0.835613,5.880242,5.489297,44.698868,7.368331,1.461488,2.872119,3.414649,2.980212,1.528275,1.171574,2.771478,3.135449,3.741745,1.714167,1.067181,2.067455,0.840098,1.286956,0.963832,0.46517,1.758967,0.807522,0.957595,0.81986,0.50248,1.610182,0.697082,1.25751,0.873194,0.457523
1,0,0,tfap2c,473,485,479,tead4,516,526,521,67,CCTGGAATGT,CCTGGAGAGT,CCCCCTGAGGCA,-199,0_-199,1,33.794708,17.476225,42.365692,17.766081,12.389852,42,40.289711,22.881042,82.098846,25.748079,12.507404,32.175621,18.674313,21.664646,13.90106,12.43005,0,3.78507,5.57162,9.336608,3.556051,1.061003,1.98358,4.03511,1.170352,0.901606,0.900368,1.830293,3.551563,1.324419,1.040776,0.900902,5.880242,5.489297,44.698868,7.368331,1.461488,2.872119,3.414649,2.980212,1.528275,1.171574,2.722175,3.169663,3.403313,1.693549,1.144201,2.089602,0.852262,1.302774,0.977092,0.464553,1.776769,0.81688,0.968045,0.827349,0.502215,1.674533,0.72736,1.219269,0.910385,0.484783
2,0,0,tfap2c,473,485,479,tead4,516,526,521,67,CCTGGAATGT,CCTGGAGAGT,CCCCCTGAGGCA,-198,0_-198,2,33.564041,17.3447,39.337379,17.659176,11.963973,42,40.289711,22.881042,82.098846,25.748079,12.507404,32.175621,18.674313,21.664646,13.90106,12.43005,0,3.78507,5.57162,9.336608,3.556051,1.061003,1.98358,4.03511,1.170352,0.901606,0.900368,1.922206,3.684247,1.677499,1.035957,0.869245,5.880242,5.489297,44.698868,7.368331,1.461488,2.872119,3.414649,2.980212,1.528275,1.171574,2.793713,3.068845,4.105606,1.703379,1.11165,2.117435,0.865505,1.338787,0.993072,0.462888,1.799637,0.826136,0.990214,0.835365,0.501156,1.666581,0.716631,1.310965,0.905573,0.46322
3,0,0,tfap2c,473,485,479,tead4,516,526,521,67,CCTGGAATGT,CCTGGAGAGT,CCCCCTGAGGCA,-197,0_-197,3,34.37653,18.410126,42.624908,18.048157,12.253555,42,40.289711,22.881042,82.098846,25.748079,12.507404,32.175621,18.674313,21.664646,13.90106,12.43005,0,3.78507,5.57162,9.336608,3.556051,1.061003,1.98358,4.03511,1.170352,0.901606,0.900368,1.855693,3.767128,1.407524,1.061882,0.875847,5.880242,5.489297,44.698868,7.368331,1.461488,2.872119,3.414649,2.980212,1.528275,1.171574,2.795211,3.304552,3.568894,1.751415,1.134392,2.145015,0.879383,1.375235,1.010723,0.463652,1.820538,0.836888,1.016771,0.847623,0.502921,1.760733,0.773272,1.338766,0.963139,0.479209
4,0,0,tfap2c,473,485,479,tead4,516,526,521,67,CCTGGAATGT,CCTGGAGAGT,CCCCCTGAGGCA,-196,0_-196,4,32.639538,17.831081,33.506481,16.410358,12.066856,42,40.289711,22.881042,82.098846,25.748079,12.507404,32.175621,18.674313,21.664646,13.90106,12.43005,0,3.78507,5.57162,9.336608,3.556051,1.061003,1.98358,4.03511,1.170352,0.901606,0.900368,1.918224,3.825709,1.486437,0.99667,0.870556,5.880242,5.489297,44.698868,7.368331,1.461488,2.872119,3.414649,2.980212,1.528275,1.171574,2.772931,3.166693,3.726338,1.651039,1.100751,2.176472,0.893857,1.404888,1.029042,0.466761,1.844383,0.848398,1.036326,0.859559,0.506482,1.747933,0.770797,1.29025,0.908727,0.484424


In [63]:
## save each enhancer_id as seperate tsv
for id in final_pred_summary_df['enhancer_id'].unique():
    subset_df = final_pred_summary_df[final_pred_summary_df['enhancer_id'] == id]
    filename = f'two_mutation/tsv/individual_pred_summary_by_enhancer_id/' + str(id) + '_pred_summary_df' + '.tsv.gz'
    subset_df.to_csv(filename, index=False, sep='\t')  

In [65]:
#save `final_pred_summary_df` as .tsv.gz
final_pred_summary_df.to_csv(f'two_mutation/tsv/combined/{prefix}_final_pred_summary_df.tsv.gz', sep = '\t', index = False)
final_pred_summary_df.shape

(117486, 79)

In [66]:
%%script false --no-raise-error
#read csv
final_pred_summary_df= pd.read_csv(f'two_mutation/tsv/combined/{prefix}_final_pred_summary_df.tsv.gz', sep = '\t')
final_pred_summary_df.shape

(117486, 79)

Measure entire profile signals across desired distances of mut/inj, mut, and wt predictions. Store in a tidy `pd.df` and export to a `tsv.gz` for plotting using `plotnine` or `ggplot` in R. First, specify distances that you want to preserve based on injection distance from the anchor motif.

In [72]:
distances_to_store_profile = list(range(0, 200, 15)) + [i * -1 for i in list(range(0, 200, 15))]
distances_to_store_profile

#distances_to_store_profile = list(range(0, 200, 10)) + [i * -1 for i in list(range(0, 200, 10))]
#distances_to_store_profile

[0,
 15,
 30,
 45,
 60,
 75,
 90,
 105,
 120,
 135,
 150,
 165,
 180,
 195,
 0,
 -15,
 -30,
 -45,
 -60,
 -75,
 -90,
 -105,
 -120,
 -135,
 -150,
 -165,
 -180,
 -195]

Next, extract and tidy profiles, eventually saving them as a `.tsv.gz`.

In [73]:
all_profiles_df = pd.DataFrame()

for task in model.tasks:
    ## Generate WT profiles
    #Allocate all positive reads as 2D matrix and then a data frame
    wt_pos_profile_df = pd.DataFrame(data=wt_preds[task][:,:,0], columns=list(range(0, model.input_seqlen())))
    wt_pos_profile_df['enhancer_id'] = wt_info_w_preds_df.enhancer_id
    wt_pos_profile_df['anchor_motif'] = wt_info_w_preds_df.anchor_motif
    wt_pos_profile_df['moved_motif'] = wt_info_w_preds_df.moved_motif
    wt_pos_profile_df = wt_pos_profile_df.melt(id_vars = ['enhancer_id', 'anchor_motif', 'moved_motif'], 
                                               var_name = 'position', value_name = 'signal')
    wt_pos_profile_df['type'] = 'wt'
    wt_pos_profile_df['strand'] = 'pos'
    wt_pos_profile_df['task'] = task

    #Allocate all negative reads as 2D matrix and then a data frame
    wt_neg_profile_df = pd.DataFrame(data=wt_preds[task][:,:,1], columns=list(range(0, model.input_seqlen())))
    wt_neg_profile_df['enhancer_id'] = wt_info_w_preds_df.enhancer_id
    wt_neg_profile_df['anchor_motif'] = wt_info_w_preds_df.anchor_motif
    wt_neg_profile_df['moved_motif'] = wt_info_w_preds_df.moved_motif
    wt_neg_profile_df = wt_neg_profile_df.melt(id_vars = ['enhancer_id', 'anchor_motif', 'moved_motif'], 
                                               var_name = 'position', value_name = 'signal')
    wt_neg_profile_df['type'] = 'wt'
    wt_neg_profile_df['strand'] = 'neg'
    wt_neg_profile_df['signal'] = -1 * wt_neg_profile_df['signal']
    wt_neg_profile_df['task'] = task

    ## Generate mut profiles
    #Allocate all positive reads as 2D matrix and then a data frame
    mut_pos_profile_df = pd.DataFrame(data=mut_preds[task][:,:,0], columns=list(range(0, model.input_seqlen())))
    mut_pos_profile_df['enhancer_id'] = wt_info_w_preds_df.enhancer_id
    mut_pos_profile_df['anchor_motif'] = wt_info_w_preds_df.anchor_motif
    mut_pos_profile_df['moved_motif'] = wt_info_w_preds_df.moved_motif
    mut_pos_profile_df = mut_pos_profile_df.melt(id_vars = ['enhancer_id', 'anchor_motif', 'moved_motif'], 
                                                 var_name = 'position', value_name = 'signal')
    mut_pos_profile_df['type'] = 'mut'
    mut_pos_profile_df['strand'] = 'pos'
    mut_pos_profile_df['task'] = task

    #Allocate all negative reads as 2D matrix and then a data frame
    mut_neg_profile_df = pd.DataFrame(data=mut_preds[task][:,:,1], columns=list(range(0, model.input_seqlen())))
    mut_neg_profile_df['enhancer_id'] = wt_info_w_preds_df.enhancer_id
    mut_neg_profile_df['anchor_motif'] = wt_info_w_preds_df.anchor_motif
    mut_neg_profile_df['moved_motif'] = wt_info_w_preds_df.moved_motif
    mut_neg_profile_df = mut_neg_profile_df.melt(id_vars = ['enhancer_id', 'anchor_motif', 'moved_motif'], 
                                                 var_name = 'position', value_name = 'signal')
    mut_neg_profile_df['type'] = 'mut'
    mut_neg_profile_df['strand'] = 'neg'
    mut_neg_profile_df['signal'] = -1 * mut_neg_profile_df['signal']
    mut_neg_profile_df['task'] = task

    ## Generate mut and injectedprofiles
    #Allocate all positive reads as 2D matrix and then a data frame
    mut_inj_pos_profile_df = pd.DataFrame(data=mut_inj_preds[task][:,:,0], columns=list(range(0, model.input_seqlen())))
    mut_inj_pos_profile_df['enhancer_id'] = mut_inj_info_w_preds_df.enhancer_id
    mut_inj_pos_profile_df['anchor_motif'] = mut_inj_info_w_preds_df.anchor_motif
    mut_inj_pos_profile_df['moved_motif'] = mut_inj_info_w_preds_df.moved_motif
    mut_inj_pos_profile_df['type'] = ['mut_inj/' +  str(i) for i in mut_inj_info_w_preds_df.distance_between_moved_center_and_anchor_center.values]
    mut_inj_pos_profile_df = mut_inj_pos_profile_df.melt(id_vars = ['enhancer_id', 'anchor_motif', 'moved_motif', 'type'], 
                                                         var_name = 'position', value_name = 'signal')
    mut_inj_pos_profile_df['strand'] = 'pos'
    mut_inj_pos_profile_df['task'] = task
    
    #Label distances desired to store
    distances_to_store_profile_label = ['mut_inj/' +  str(i) for i in distances_to_store_profile]

    #Filter out distances not desired to store        
    mut_inj_pos_profile_df = mut_inj_pos_profile_df[mut_inj_pos_profile_df['type'].isin(list(distances_to_store_profile_label))]
    
    #Allocate all negative reads as 2D matrix and then a data frame
    mut_inj_neg_profile_df = pd.DataFrame(data=mut_inj_preds[task][:,:,1], columns=list(range(0, model.input_seqlen())))
    mut_inj_neg_profile_df['enhancer_id'] = mut_inj_info_w_preds_df.enhancer_id
    mut_inj_neg_profile_df['anchor_motif'] = mut_inj_info_w_preds_df.anchor_motif
    mut_inj_neg_profile_df['moved_motif'] = mut_inj_info_w_preds_df.moved_motif
    mut_inj_neg_profile_df['type'] = ['mut_inj/' +  str(i) for i in mut_inj_info_w_preds_df.distance_between_moved_center_and_anchor_center.values]
    mut_inj_neg_profile_df = mut_inj_neg_profile_df.melt(id_vars = ['enhancer_id', 'anchor_motif', 'moved_motif', 'type'], 
                                                         var_name = 'position', value_name = 'signal')
    mut_inj_neg_profile_df['strand'] = 'neg'
    mut_inj_neg_profile_df['signal'] = -1 * mut_inj_neg_profile_df['signal']
    mut_inj_neg_profile_df['task'] = task
    mut_inj_neg_profile_df = mut_inj_neg_profile_df[mut_inj_neg_profile_df['type'].isin(list(distances_to_store_profile_label))]
    
    all_profile_df = pd.concat([wt_pos_profile_df, wt_neg_profile_df, mut_pos_profile_df, mut_neg_profile_df, 
                                mut_inj_pos_profile_df, mut_inj_neg_profile_df], axis = 0).reset_index()
    all_profiles_df = all_profiles_df.append(all_profile_df)


In [74]:
#Save as tsv.gz
all_profiles_df.to_csv(f'two_mutation/tsv/combined/{prefix}_all_profiles_multiples_15_df.tsv.gz', sep = '\t', index = False)
all_profiles_df.shape

(86800000, 9)

In [75]:
pdf= pd.read_csv(f'two_mutation/tsv/combined/{prefix}_all_profiles_multiples_15_df.tsv.gz',sep = '\t')

In [None]:
pdf

In [76]:
## save each enhancer_id as seperate tsv
for id in pdf['enhancer_id'].unique():
    subset_df = pdf[pdf['enhancer_id'] == id]
    filename = f'two_mutation/tsv/individual_profile_by_enhancer_id_dis_15_multiples/' + str(id) + '_profiles_df' + '.tsv.gz'
    subset_df.to_csv(filename, index=False, sep='\t')  