# Introduction

The goal of this analysis is to perform genomic perturbations and analyses on the ATAC and OSKNZ binding models. Here we seek to:

+ assess the impact of all mapped motifs on binding and accessibility
+ assess the impact of all mapped motif pairs on binding and accessibility
+ assess the profile changes of mapped motif perturbations across selected enhancers

# Computational setup

In [1]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

import warnings
warnings.filterwarnings("ignore")

#Packages
import os
import sys
import keras
import json
import pandas as pd
import numpy as np
import itertools
import keras.backend as K
from keras.models import load_model
from pybedtools import BedTool
from tqdm import tqdm
import pickle as pkl
from glob import glob

# Settings

## Working options
# All relative paths used below are resolved against this analysis directory.
os.chdir(f'/n/projects/mw2098/publications/2024_weilert_acc/code/2_analysis/')
pd.set_option('display.max_columns', 100)
# Location of the BPReveal checkout; its `src` folder is put on sys.path below.
bpreveal_path = '/n/projects/mw2098/publications/2024_weilert_acc/public/software/bpreveal_404/'
# Interpreter written into the generated SLURM scripts (see write_genomic_perturb_script).
python_path = '/home/mw2098/anaconda3/envs/bpreveal_404/bin/python'

# Make `losses` importable from the BPReveal source tree.
sys.path.insert(0, f'{bpreveal_path}/src')
import losses

## Custom functions
# Project-local helper modules live under scripts/py/functions (relative to the chdir above).
sys.path.insert(0, f'scripts/py/functions')
from functional import shuffle_seqs, one_hot_encode_sequence, one_hot_encode_sequences, \
    one_hot_decode_sequence, insert_motif, logitsToProfile
from motifs import extract_seqs_from_df, resize_coordinates
from perturb import generate_alt_sequences

## Custom variables
figure_path = 'figures/9_perturb_motifs'
# Captured after os.chdir, so this is the analysis directory; generate_slurm_array
# writes it into the `cd` line of the scripts it generates.
working_dir = os.getcwd()
# Genome FASTA handed to the perturbation scripts as `--fasta_file`.
genome = '../0_setup/fa/mm10.fa'
# Curated motif instances (BED) for the primary and the Xiong motif sets.
motifs_path = 'bed/mapped_motifs/all_instances_curated_0based.bed'
xiong_motifs_path = 'bed/mapped_motifs/all_xiong_instances_curated_0based.bed'

# Per-motif metadata tables that are merged onto the BED instances by `motif_id`.
motifs_w_metadata_path = 'tsv/mapped_motifs/all_instances_curated_1based.tsv.gz'
xiong_motifs_w_metadata_path = 'tsv/mapped_motifs/all_xiong_instances_curated_1based.tsv.gz'
# Region (island) windows that motifs are matched to via `region_id`.
regions_path = 'bed/mapped_motifs/all_islands_curated_0based_sized_to_input.bed'
xiong_regions_path = 'bed/mapped_motifs/all_xiong_islands_curated_0based_sized_to_input.bed'
# Table whose `seq` column selects the showcase ("median") motif sequences.
showcase_medians_path = 'tsv/insilico/marginalizations/motifs/all_median_profiles.tsv.gz'

# Motif name -> task (TF) name used by the binding model.
motif_to_task_dict = {
    'Oct4-Sox2': 'oct4',
    'Oct4': 'oct4',
    'Sox2': 'sox2',
    'Klf4': 'klf4',
    'Zic3': 'zic3',
    'Nanog': 'nanog',
}

# Task name -> peak file. The insertion order here defines the task order
# that is later joined into `--tasks_in_model_order` for the binding model.
region_dict = {
    task: f'narrowpeak/mesc_{task}_nexus_peaks.narrowPeak'
    for task in ('oct4', 'sox2', 'klf4', 'nanog', 'zic3')
}

# ATAC time-course timepoints in hours: 0, 3, ..., 15.
concentration_atac_timepoints = list(range(0, 16, 3))

# One entry per model: its tasks, coverage tracks, model directory and channel count.
modeling_design_dict = {}
modeling_design_dict['bpnet_osknz'] = {
    'tasks': list(region_dict),
    'cov': {
        task: {
            strand_key: f'bw/mesc_{task}_nexus_combined_{strand_name}.bw'
            for strand_key, strand_name in (('pos', 'positive'), ('neg', 'negative'))
        }
        for task in region_dict
    },
    'model_dir': 'models/bpnet_osknz_fold1.model',
    'num-channels': 2,
}
modeling_design_dict['atac_wt'] = {
    'tasks': ['atac'],
    'cov': 'bw/mesc_native_atac_cutsites_combined.bw',
    'model_dir': 'models/atac_wt_fold1_residual.model/',
    'num-channels': 1,
}
# Add one single-channel ATAC model per time-course timepoint.
for timepoint in concentration_atac_timepoints:
    timepoint_label = f'atac_{timepoint}h'
    modeling_design_dict[timepoint_label] = {
        'tasks': ['atac'],
        'cov': f'bw/GSE174774_mesc_{timepoint_label}_combined.bw',
        'model_dir': f'models/{timepoint_label}_fold1_residual.model/',
        'num-channels': 1,
    }

# Model input/output window sizes (bp) and the `--trials` value passed to the perturbation scripts.
input_length, output_length = 2032, 1000
trials = 16

## Filesystem commands
!mkdir -p {figure_path}
!mkdir -p tsv/genomic/motifs
!mkdir -p scripts/genomic/motifs
!mkdir -p tsv/genomic/enhancers
!mkdir -p pisa

2025-07-02 09:13:38.873473: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-07-02 09:13:38.873513: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-07-02 09:13:38.874919: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-07-02 09:13:38.882132: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Custom functions

### Function to queue slurm jobs

In [2]:
def generate_slurm_array(cmds, output_file, simultaneous_jobs = 4):
    """Write a SLURM array script that runs one command per array task.

    The script starts with a fixed `#SBATCH` header (GPU partition, array
    range `1-len(cmds)` throttled to `simultaneous_jobs` concurrent tasks),
    activates the `bpreveal_404` conda environment, changes into the
    module-level `working_dir`, and then contains one
    `if [[ ${SLURM_ARRAY_TASK_ID} == i ]]` guard per command (1-based).

    Parameters
    ----------
    cmds : sequence of str
        Shell commands; the i-th command runs in array task i + 1.
    output_file : str
        Path the script is written to (overwritten if it exists).
    simultaneous_jobs : int
        Maximum number of array tasks allowed to run at the same time.
    """
    script_lines = [
        '#!/usr/bin/bash',
        '#SBATCH --job-name bpnet',
        '#SBATCH --output=slurm_%j.log',
        f'#SBATCH --array=1-{len(cmds)}%{simultaneous_jobs}',
        '#SBATCH --mem=200gb',
        '#SBATCH --time=72:00:00',
        '#SBATCH --partition=gpu',
        '#SBATCH --gres=gpu:a100:1',
        '#SBATCH --cpus-per-task=10',
        'source /home/mw2098/.bashrc',
        'conda deactivate',
        'conda activate bpreveal_404',
        f'cd {working_dir}',
    ]

    # Each command gets its own guard keyed on the 1-based array task id.
    # The pieces keep their own trailing newlines, as in the joined output below.
    for task_id, cmd in enumerate(cmds, start=1):
        script_lines.extend([
            "if [[ ${{SLURM_ARRAY_TASK_ID}} == {0:d} ]] ; then\n".format(task_id),
            "    {0:s}\n".format(cmd),
            "fi\n\n",
        ])

    with open(output_file, mode='wt') as slurm:
        slurm.write('\n'.join(script_lines) + '\n')

### Function to generate script for genomic perturbations

In [3]:
def write_genomic_perturb_script(motifs_df_path, motif_groups_df_path, 
                                 model_dir, fasta_file,
                                 tasks_in_model_order,
                                 output_prefix, output_cmd_path, 
                                 region_index_column = 'region_id', 
                                 motif_window_start_column = 'motif_window_start',
                                 motif_window_end_column = 'motif_window_end',
                                 motif_name_column = 'motif',
                                 motif_chrom_column = 'chrom',
                                 summary_window = 'use_whole_window', #`use_whole_window`, `keep_entire_profile`, or int
                                 comb_max = 2, comb_min = 1, 
                                 nodes = 24, trials = 16, 
                                 input_seqlen = 2114, 
                                 output_seqlen = 1000,
                                 python_executable = None):
    """Write a SLURM batch script that runs `generate_genomic_perturbs.py`.

    The script consists of a fixed `#SBATCH` header (GPU partition), conda
    environment activation, a `cd` into the current working directory (taken
    at call time), and one multi-line invocation of
    `scripts/py/tools/generate_genomic_perturbs.py` with the arguments below
    forwarded verbatim as command-line flags.

    Parameters
    ----------
    motifs_df_path, motif_groups_df_path, model_dir, fasta_file,
    tasks_in_model_order, output_prefix :
        Forwarded to the flags of the same name.
    output_cmd_path : str
        Path the SLURM script is written to (overwritten if it exists).
    region_index_column, motif_window_start_column, motif_window_end_column,
    motif_name_column, motif_chrom_column :
        Column names forwarded to the flags of the same name.
    summary_window : int or str
        An int is emitted as `--summary_window <int>`; any other value is
        emitted as a bare flag `--<value>` (e.g. `--use_whole_window`,
        `--keep_entire_profile`).
    comb_max, comb_min, nodes, trials, input_seqlen, output_seqlen :
        Forwarded to the flags of the same name.
    python_executable : str, optional
        Interpreter used to launch the tool. Defaults to the module-level
        `python_path`, which preserves the previous behavior.

    Returns
    -------
    None
    """
    # Fall back to the notebook-level interpreter when none is given.
    interpreter = python_path if python_executable is None else python_executable

    working_dir = os.getcwd()
    gpu_header = ['#!/bin/bash',
                  '#SBATCH --job-name genomic_perturbations',
                  '#SBATCH --output=slurm_%j.log',
                  '#SBATCH --mem=500gb', '#SBATCH --time=72:00:00',
                  '#SBATCH --partition=gpu', '#SBATCH --gres=gpu:a100:1',
                  '#SBATCH --cpus-per-task=30',
                  'source /home/mw2098/.bashrc',
                  'conda deactivate',
                  'conda activate bpreveal_404',
                  f'cd {working_dir}']
    cmd = [
        f'{interpreter} scripts/py/tools/generate_genomic_perturbs.py \\',
        f'--motifs_df_path {motifs_df_path} \\',
        f'--motif_groups_df_path {motif_groups_df_path} \\',
        f'--model_dir {model_dir} \\',
        f'--output_prefix {output_prefix} \\',
        f'--tasks_in_model_order {tasks_in_model_order} \\',
        f'--region_index_column {region_index_column} \\',
        f'--motif_window_start_column {motif_window_start_column} \\',
        f'--motif_window_end_column {motif_window_end_column} \\',
        f'--motif_name_column {motif_name_column} \\',
        f'--motif_chrom_column {motif_chrom_column} \\',
        f'--fasta_file {fasta_file} \\',
        f'--input_seqlen {input_seqlen} --output_seqlen {output_seqlen} \\',
        f'--comb_max {comb_max} --comb_min {comb_min} \\',
        f'--nodes {nodes} --trials {trials} \\'
    ]

    # The summary mode is the final argument, so its line has no trailing
    # backslash. bool is excluded explicitly because it is a subclass of int.
    if isinstance(summary_window, int) and not isinstance(summary_window, bool):
        cmd.append(f'--summary_window {summary_window} ')
    else:
        cmd.append(f'--{summary_window} ')

    # The context manager guarantees the handle is closed even if a write fails.
    with open(output_cmd_path, "w") as output_script:
        output_script.write('\n'.join(gpu_header + cmd) + '\n')
    return None

# Perform genomic perturbations and summarize effects

## Format motifs for perturbations

To do this, we must first consolidate and organize motifs.

Here, we need to import the motifs that were curated and add the correct columns such that the bpnet_generate_perturbations script can be satisfied. To do this, we need a 0-based coordinate .tsv file with the following columns: `motif`, `region_id`, `chrom`, `motif_window_start`, `motif_window_end`, `motif_length`.

In [4]:
# Load the curated motif instances; the BED file must have exactly these six columns.
motifs_df = BedTool(motifs_path).to_dataframe()
motifs_df.columns = ['chrom', 'start', 'end', 'name', 'score', 'strand']

# Motif length, then the '_'-delimited fields of `name` (<motif>_<motif_id>_<region_id>).
# The extracted fields stay strings at this point.
motifs_df['motif_length'] = motifs_df['end'].sub(motifs_df['start'])
for field_index, field_name in enumerate(('motif', 'motif_id', 'region_id')):
    motifs_df[field_name] = [name.split('_')[field_index] for name in motifs_df['name']]
print(motifs_df.shape)

motifs_df.head()

(377687, 10)


Unnamed: 0,chrom,start,end,name,score,strand,motif_length,motif,motif_id,region_id
0,chr1,3154541,3154556,Oct4-Sox2_118382_4,0,+,15,Oct4-Sox2,118382,4
1,chr1,3155627,3155642,Oct4-Sox2_118383_5,0,+,15,Oct4-Sox2,118383,5
2,chr1,3155712,3155727,Oct4-Sox2_118384_5,0,+,15,Oct4-Sox2,118384,5
3,chr1,3263877,3263892,Oct4-Sox2_118385_7,0,-,15,Oct4-Sox2,118385,7
4,chr1,3343669,3343684,Oct4-Sox2_118386_9,0,+,15,Oct4-Sox2,118386,9


Format regions in order to mark the internal motif window coordinates.

In [5]:
# Load the region (island) windows and prefix every column with `region_`.
regions_df = BedTool(regions_path).to_dataframe()
regions_df.columns = [f'region_{field}' for field in ('chrom', 'start', 'end', 'id', 'score', 'strand')]
# region_id is cast to str so it matches the string ids parsed from the motif names.
regions_df = regions_df.astype({'region_id': str})

Match grouped regions to motifs and collect motif position within the windows to obtain `motif_window_start` and `motif_window_end`. We do this now because it is far easier to work with 0-based coordinates in python than in R.

In [6]:
# Attach each motif's region bounds (left join keeps motifs without a matching region).
motifs_df = pd.merge(
    motifs_df,
    regions_df[['region_start', 'region_end', 'region_id']],
    how='left',
    on='region_id',
)
# Motif coordinates expressed relative to the start of its region window.
motifs_df['motif_window_start'] = motifs_df['start'].sub(motifs_df['region_start'])
motifs_df['motif_window_end'] = motifs_df['end'].sub(motifs_df['region_start'])

In [7]:
# Print the shape first and keep `head` as the final expression: a notebook only
# renders the value of a cell's last expression, so the preview was previously discarded.
print(motifs_df.shape)
motifs_df.head(n=10)

(377687, 14)


Filter motifs on edges of the windows.

In [8]:
# Keep motifs that sit at least 20 bp inside both edges of their region window.
# Rows with missing region bounds compare as False and are dropped as well.
# The explicit copy makes the result an independent frame, so the later
# `motif_id` column assignment does not write onto a filtered selection.
motifs_df = motifs_df[
    ((motifs_df['start'] - motifs_df['region_start']) >= 20)
    & ((motifs_df['region_end'] - motifs_df['end']) >= 20)
].copy()
print(motifs_df.shape)

(377687, 14)


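Filter motifs that are of poor `fann` similarity matching to seqlet cluster. Note that the code below only left-joins the curated metadata onto the motifs (the row count is unchanged), so it does not itself remove any motifs; any such filtering must already be reflected in the curated inputs.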

In [9]:
# Load the per-motif metadata and drop the columns that already exist on motifs_df.
motifs_w_metadata_df = pd.read_csv(motifs_w_metadata_path, sep='\t')
motifs_w_metadata_df = motifs_w_metadata_df.drop(columns=['start', 'end', 'motif', 'strand'])
# motif_id was parsed from the name as a string; cast it to int before joining on it.
motifs_df['motif_id'] = motifs_df['motif_id'].astype(int)
# Any remaining overlapping metadata columns receive a `_y` suffix.
motifs_df = pd.merge(motifs_df, motifs_w_metadata_df, on='motif_id', how='left', suffixes=('', '_y'))
print(motifs_df.shape)

(377687, 42)


In [10]:
# Persist the formatted motif table that the perturbation scripts read.
motifs_df.to_csv(
    'tsv/genomic/motifs/instances_formatted_for_genomic_perturbations_0based.tsv.gz',
    sep='\t',
    index=False,
)
# Motifs per motif type, then the distribution of motifs-per-region.
print(motifs_df['motif'].value_counts())
print(motifs_df['region_id'].value_counts().value_counts())

motif
Klf4         118382
Zic3          92751
Nanog         65959
Sox2          59781
Oct4-Sox2     40682
Oct4            132
Name: count, dtype: int64
count
1     80995
2     33923
3     20888
4     13138
5      8353
6      4835
7      2647
8      1387
9       680
10      329
11      152
12       64
13       40
14       19
15        6
18        4
22        3
17        3
21        2
16        2
20        1
27        1
25        1
24        1
46        1
36        1
35        1
28        1
Name: count, dtype: int64


In [11]:
# Add each region's `island_count` (one row per region_id/island_count pair from the motif table).
regions_df = pd.merge(
    regions_df,
    motifs_df[['region_id', 'island_count']].drop_duplicates(keep='first', ignore_index=True),
    how='left',
    on='region_id',
)
print(regions_df.head())
# Drop the `region_` prefix so the saved table uses standard BED-style column names.
regions_df = regions_df.rename(columns={
    'region_chrom': 'chrom',
    'region_start': 'start',
    'region_end': 'end',
    'region_score': 'score',
    'region_strand': 'strand',
})
regions_df.to_csv(
    'tsv/genomic/motifs/regions_formatted_for_genomic_perturbations_0based.tsv.gz',
    sep='\t',
    index=False,
)

  region_chrom  region_start  region_end region_id  region_score  \
0         chr1       3034951     3036983         0             0   
1         chr1       3051532     3053564         1             0   
2         chr1       3058391     3060423         2             0   
3         chr1       3061941     3063973         3             0   
4         chr1       3153574     3155606         4             0   

  region_strand  island_count  
0             .             3  
1             .             3  
2             .             5  
3             .             4  
4             .             3  


## Generate binding and accessibility perturbations

Allow combinations of up to 2 and up to 3 mutations. This is primarily used for cooperativity and comparisons among island effects.

In [12]:
# One whole-window perturbation script per model (single motifs and motif pairs).
for model_name, model_spec in modeling_design_dict.items():
    run_name = f'genomic_pertubs_whole_window_{model_name}'
    slurm_script_path = f'scripts/genomic/motifs/{run_name}.slurm'
    write_genomic_perturb_script(
        motifs_df_path='tsv/genomic/motifs/instances_formatted_for_genomic_perturbations_0based.tsv.gz',
        motif_groups_df_path='tsv/genomic/motifs/regions_formatted_for_genomic_perturbations_0based.tsv.gz',
        model_dir=model_spec['model_dir'],
        fasta_file=genome,
        tasks_in_model_order=','.join(model_spec['tasks']),
        output_prefix=f'tsv/genomic/motifs/{run_name}',
        output_cmd_path=slurm_script_path,
        input_seqlen=input_length,
        output_seqlen=output_length,
        trials=trials,
        comb_min=1,
        comb_max=2,
    )
    # Echo the submission command for copy/paste.
    print(f'sbatch {slurm_script_path}')

sbatch scripts/genomic/motifs/genomic_pertubs_whole_window_bpnet_osknz.slurm
sbatch scripts/genomic/motifs/genomic_pertubs_whole_window_atac_wt.slurm
sbatch scripts/genomic/motifs/genomic_pertubs_whole_window_atac_0h.slurm
sbatch scripts/genomic/motifs/genomic_pertubs_whole_window_atac_3h.slurm
sbatch scripts/genomic/motifs/genomic_pertubs_whole_window_atac_6h.slurm
sbatch scripts/genomic/motifs/genomic_pertubs_whole_window_atac_9h.slurm
sbatch scripts/genomic/motifs/genomic_pertubs_whole_window_atac_12h.slurm
sbatch scripts/genomic/motifs/genomic_pertubs_whole_window_atac_15h.slurm


Write another genomic perturbation script with windowed limitations to calculate directional cooperativity for the binding model.

In [13]:
# Binding model only: summarize effects in a 75 bp window instead of the whole window.
bpnet_design = modeling_design_dict['bpnet_osknz']
windowed_script_path = 'scripts/genomic/motifs/genomic_pertubs_75bp_window_bpnet_osknz.slurm'
write_genomic_perturb_script(
    motifs_df_path='tsv/genomic/motifs/instances_formatted_for_genomic_perturbations_0based.tsv.gz',
    motif_groups_df_path='tsv/genomic/motifs/regions_formatted_for_genomic_perturbations_0based.tsv.gz',
    model_dir=bpnet_design['model_dir'],
    fasta_file=genome,
    tasks_in_model_order=','.join(bpnet_design['tasks']),
    output_prefix='tsv/genomic/motifs/genomic_pertubs_75bp_window_bpnet_osknz',
    output_cmd_path=windowed_script_path,
    input_seqlen=input_length,
    output_seqlen=output_length,
    trials=trials,
    summary_window=75,
    comb_min=1,
    comb_max=2,
)
print(f'sbatch {windowed_script_path}')

sbatch scripts/genomic/motifs/genomic_pertubs_75bp_window_bpnet_osknz.slurm


In [14]:
# Preview the first five rows of the fully annotated motif table.
motifs_df.head(n=5)

Unnamed: 0,chrom,start,end,name,score,strand,motif_length,motif,motif_id,region_id,region_start,region_end,motif_window_start,motif_window_end,seqnames,width,short_name,metacluster_name,pattern_name,sequence,region_index,seq_match,seq_match_quantile,task,model,mapping_state,seq,region_id_y,region_start_1based,region_end_1based,island_content,island_content_ordered,island_content_unique,island_count,acc_contrib,oct4_contrib,sox2_contrib,nanog_contrib,klf4_contrib,zic3_contrib,bind_contrib,is_across_erv
0,chr1,3154541,3154556,Oct4-Sox2_118382_4,0,+,15,Oct4-Sox2,118382,4,3153574,3155606,967,982,chr1,15,pos_0,pos_patterns,pattern_0,ATTTGCATTTGAATA,11,13.426259,0.97222,oct4,bpnet_osknz,both,ATTTGCATTTGAATA,4,3154091,3155090,Klf4:1_Oct4-Sox2:1_Zic3:1,Zic3_Oct4-Sox2_Klf4,Klf4_Oct4-Sox2_Zic3,3,0.234222,0.416462,0.137217,0.131604,0.027325,0.026796,0.416462,True
1,chr1,3155627,3155642,Oct4-Sox2_118383_5,0,+,15,Oct4-Sox2,118383,5,3154660,3156692,967,982,chr1,15,pos_0,pos_patterns,pattern_0,ATTAGCATTATAAAG,12,12.903305,0.915456,oct4,bpnet_osknz,both,ATTAGCATTATAAAG,5,3155177,3156176,Oct4-Sox2:2,Oct4-Sox2_Oct4-Sox2,Oct4-Sox2,2,0.283484,0.120707,0.043296,0.185654,-0.007187,0.044399,0.120707,True
2,chr1,3155712,3155727,Oct4-Sox2_118384_5,0,+,15,Oct4-Sox2,118384,5,3154660,3156692,1052,1067,chr1,15,pos_0,pos_patterns,pattern_0,TTTACCATAAGAAAA,12,11.593916,0.540766,oct4,bpnet_osknz,bind,TTTACCATAAGAAAA,5,3155177,3156176,Oct4-Sox2:2,Oct4-Sox2_Oct4-Sox2,Oct4-Sox2,2,0.127052,0.255989,0.102353,0.179189,0.026077,0.048198,0.255989,True
3,chr1,3263877,3263892,Oct4-Sox2_118385_7,0,-,15,Oct4-Sox2,118385,7,3262904,3264936,973,988,chr1,15,pos_0,pos_patterns,pattern_0,AGTGGCATATCAAAG,13,11.694463,0.587819,oct4,bpnet_osknz,bind,AGTGGCATATCAAAG,7,3263421,3264420,Nanog:1_Oct4-Sox2:1_Zic3:1,Zic3_Oct4-Sox2_Nanog,Nanog_Oct4-Sox2_Zic3,3,0.165599,0.196452,0.129591,0.25929,0.117235,0.244606,0.196452,True
4,chr1,3343669,3343684,Oct4-Sox2_118386_9,0,+,15,Oct4-Sox2,118386,9,3342695,3344727,974,989,chr1,15,pos_0,pos_patterns,pattern_0,ATTTACAACTGAATA,15,11.00761,0.354589,oct4,bpnet_osknz,both,ATTTACAACTGAATA,9,3343212,3344211,Nanog:2_Oct4-Sox2:1,Nanog_Oct4-Sox2_Nanog,Nanog_Oct4-Sox2,3,0.726059,0.346188,0.214333,0.460609,0.156565,0.198544,0.346188,True


Predict effects when all motifs are mutated.

In [15]:
# One script per model in which both combination bounds are set to the
# `island_count` column name rather than to a fixed number.
for model_name, model_spec in modeling_design_dict.items():
    run_name = f'genomic_pertubs_all_motifs_{model_name}'
    slurm_script_path = f'scripts/genomic/motifs/{run_name}.slurm'
    write_genomic_perturb_script(
        motifs_df_path='tsv/genomic/motifs/instances_formatted_for_genomic_perturbations_0based.tsv.gz',
        motif_groups_df_path='tsv/genomic/motifs/regions_formatted_for_genomic_perturbations_0based.tsv.gz',
        model_dir=model_spec['model_dir'],
        fasta_file=genome,
        tasks_in_model_order=','.join(model_spec['tasks']),
        output_prefix=f'tsv/genomic/motifs/{run_name}',
        output_cmd_path=slurm_script_path,
        input_seqlen=input_length,
        output_seqlen=output_length,
        trials=trials,
        comb_min='island_count',
        comb_max='island_count',
    )
    print(f'sbatch {slurm_script_path}')

sbatch scripts/genomic/motifs/genomic_pertubs_all_motifs_bpnet_osknz.slurm
sbatch scripts/genomic/motifs/genomic_pertubs_all_motifs_atac_wt.slurm
sbatch scripts/genomic/motifs/genomic_pertubs_all_motifs_atac_0h.slurm
sbatch scripts/genomic/motifs/genomic_pertubs_all_motifs_atac_3h.slurm
sbatch scripts/genomic/motifs/genomic_pertubs_all_motifs_atac_6h.slurm
sbatch scripts/genomic/motifs/genomic_pertubs_all_motifs_atac_9h.slurm
sbatch scripts/genomic/motifs/genomic_pertubs_all_motifs_atac_12h.slurm
sbatch scripts/genomic/motifs/genomic_pertubs_all_motifs_atac_15h.slurm


## Filter median motifs for showcase isolation/context contrast

In previous scripts, we filtered the center 100 motifs based on median isolation score signals to depict the profiles across different motifs. We will take all motifs that match those exact sequences and keep the profiles of contribution scores. 

In [16]:
# Keep only motif instances whose sequence appears among the showcase median sequences.
median_seqs_df = pd.read_csv(showcase_medians_path, sep='\t')
median_motifs_df = (
    motifs_df.loc[motifs_df['seq'].isin(median_seqs_df['seq'].values)]
    .reset_index(drop=True)
)

In [17]:
# Persist the median-sequence motif subset and report how many of each motif it contains.
median_motifs_df.to_csv(
    'tsv/genomic/motifs/medians_formatted_for_genomic_perturbations_0based.tsv.gz',
    sep='\t',
    index=False,
)
print(median_motifs_df['motif'].value_counts())

motif
Nanog        5853
Klf4          549
Sox2          363
Zic3          358
Oct4           27
Oct4-Sox2      26
Name: count, dtype: int64


## Generate binding and accessibility profiles for median motif sequences

In [18]:
# One script per model for the median-sequence motifs: single-motif perturbations
# only, run with the `keep_entire_profile` summary mode.
for model_name, model_spec in modeling_design_dict.items():
    run_name = f'genomic_pertubs_median_motifs_{model_name}'
    slurm_script_path = f'scripts/genomic/motifs/{run_name}.slurm'
    write_genomic_perturb_script(
        motifs_df_path='tsv/genomic/motifs/medians_formatted_for_genomic_perturbations_0based.tsv.gz',
        motif_groups_df_path='tsv/genomic/motifs/regions_formatted_for_genomic_perturbations_0based.tsv.gz',
        model_dir=model_spec['model_dir'],
        fasta_file=genome,
        tasks_in_model_order=','.join(model_spec['tasks']),
        output_prefix=f'tsv/genomic/motifs/{run_name}',
        output_cmd_path=slurm_script_path,
        input_seqlen=input_length,
        output_seqlen=output_length,
        trials=trials,
        summary_window='keep_entire_profile',
        comb_min=1,
        comb_max=1,
    )
    print(f'sbatch {slurm_script_path}')

sbatch scripts/genomic/motifs/genomic_pertubs_median_motifs_bpnet_osknz.slurm
sbatch scripts/genomic/motifs/genomic_pertubs_median_motifs_atac_wt.slurm
sbatch scripts/genomic/motifs/genomic_pertubs_median_motifs_atac_0h.slurm
sbatch scripts/genomic/motifs/genomic_pertubs_median_motifs_atac_3h.slurm
sbatch scripts/genomic/motifs/genomic_pertubs_median_motifs_atac_6h.slurm
sbatch scripts/genomic/motifs/genomic_pertubs_median_motifs_atac_9h.slurm
sbatch scripts/genomic/motifs/genomic_pertubs_median_motifs_atac_12h.slurm
sbatch scripts/genomic/motifs/genomic_pertubs_median_motifs_atac_15h.slurm


## Convert .pkl file to .tsv.gz

In [19]:
def _melt_profile_predictions(index_df, pred_matrix, model_name, task, channel):
    """Return a long-format frame with one row per (index row, window position).

    `pred_matrix` is placed column-wise next to `index_df`, melted into
    `window_position`/`pred` columns, and tagged with the model, task and
    channel labels. Assumes `pred_matrix` is 2-D (rows x positions) with one
    row per row of `index_df` -- TODO confirm against the prediction writer.
    """
    wide_df = pd.concat(
        [index_df, pd.DataFrame(pred_matrix, columns=range(pred_matrix.shape[1]))],
        axis = 1)
    long_df = wide_df.melt(id_vars = index_df.columns,
                           var_name = 'window_position', value_name = 'pred')
    long_df['model_name'] = model_name
    long_df['task'] = task
    long_df['channel'] = channel
    return long_df


# For every model, gather the chunked prediction pickles of the median-motif run
# into one long-format table and save it as a .tsv.gz.
for pertubation_name, perturbation_details in modeling_design_dict.items():
    tasks = perturbation_details['tasks']
    # A plain string `cov` marks the single-track (ATAC) models; the binding
    # model instead stores a per-task dict of channels.
    is_single_track = isinstance(perturbation_details['cov'], str)

    index_paths = glob(f'tsv/genomic/motifs/genomic_pertubs_median_motifs_{pertubation_name}_index_*.tsv.gz')

    # Collect the pieces and concatenate once after the loop; growing a
    # DataFrame with pd.concat inside the loop re-copies all earlier rows on
    # every iteration.
    pred_frames = []
    for index_path in tqdm(index_paths):
        # Each index table has a sibling pickle with the same chunk suffix.
        pred_path = index_path.replace('_index', '').replace('.tsv.gz', '.pkl')

        indexes_df = pd.read_csv(index_path, sep = '\t')
        # NOTE: pickle files are assumed to come from this project's own
        # perturbation jobs; never unpickle untrusted files.
        with open(pred_path, 'rb') as f:
            preds_dict = pkl.load(f)

        if is_single_track:
            pred_frames.append(_melt_profile_predictions(
                indexes_df, np.squeeze(preds_dict['atac']),
                pertubation_name, 'atac', 'atac'))
        else:
            for task in tasks:
                # The channel position in the last axis follows the key order of `cov[task]`.
                for i, channel in enumerate(perturbation_details['cov'][task]):
                    pred_frames.append(_melt_profile_predictions(
                        indexes_df, np.squeeze(preds_dict[task][:,:,i]),
                        pertubation_name, task, channel))

    # With no chunks found, fall back to an empty frame (as before) instead of
    # letting pd.concat raise on an empty list.
    preds_all_df = pd.concat(pred_frames) if pred_frames else pd.DataFrame()
    preds_all_df.to_csv(f'tsv/genomic/motifs/genomic_pertubs_median_motifs_formatted_{pertubation_name}.tsv.gz', sep = '\t', index = False)

# Shape of the last model's table only.
print(preds_all_df.shape)

100%|████████████████████████████████████████████████████████| 20/20 [13:35<00:00, 40.76s/it]
100%|████████████████████████████████████████████████████████| 20/20 [00:09<00:00,  2.11it/s]
100%|████████████████████████████████████████████████████████| 20/20 [00:09<00:00,  2.06it/s]
100%|████████████████████████████████████████████████████████| 20/20 [00:10<00:00,  1.96it/s]
100%|████████████████████████████████████████████████████████| 20/20 [00:09<00:00,  2.03it/s]
100%|████████████████████████████████████████████████████████| 20/20 [00:10<00:00,  1.96it/s]
100%|████████████████████████| 20/20 [00:09<00:00,  2.04it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:09<00:00,  2.13it/s]


(13999000, 8)


Visualize this alongside isolation scores of median values in later .Rmd files.

# Perform Xiong genomic perturbations and summarize effects

For the motifs mapped by the 0h timepoint from the ATAC-seq time course from Xiong et al, perform perturbations on those explicit motifs.

## Format motifs for perturbations

To do this, we must first consolidate and organize motifs.

Here, we need to import the motifs that were curated and add the correct columns such that the bpnet_generate_perturbations script can be satisfied. To do this, we need a 0-based coordinate .tsv file with the following columns: `motif`, `region_id`, `chrom`, `motif_window_start`, `motif_window_end`, `motif_length`.

In [20]:
# Load the curated Xiong motif instances; the BED file must have exactly these six columns.
motifs_df = BedTool(xiong_motifs_path).to_dataframe()
motifs_df.columns = ['chrom', 'start', 'end', 'name', 'score', 'strand']

# Motif length, then the '_'-delimited fields of `name` (<motif>_<motif_id>_<region_id>).
# The extracted fields stay strings at this point.
motifs_df['motif_length'] = motifs_df['end'].sub(motifs_df['start'])
for field_index, field_name in enumerate(('motif', 'motif_id', 'region_id')):
    motifs_df[field_name] = [name.split('_')[field_index] for name in motifs_df['name']]
print(motifs_df.shape)

motifs_df.head()

(157560, 10)


Unnamed: 0,chrom,start,end,name,score,strand,motif_length,motif,motif_id,region_id
0,chr1,3154541,3154555,Oct4-Sox2_0_2,0,+,14,Oct4-Sox2,0,2
1,chr1,3155627,3155641,Oct4-Sox2_1_3,0,+,14,Oct4-Sox2,1,3
2,chr1,3155712,3155726,Oct4-Sox2_2_3,0,+,14,Oct4-Sox2,2,3
3,chr1,3343669,3343683,Oct4-Sox2_3_6,0,+,14,Oct4-Sox2,3,6
4,chr1,3483033,3483047,Oct4-Sox2_4_10,0,+,14,Oct4-Sox2,4,10


Format regions in order to mark the internal motif window coordinates.

In [21]:
# Load the Xiong region (island) windows and prefix every column with `region_`.
regions_df = BedTool(xiong_regions_path).to_dataframe()
regions_df.columns = [f'region_{field}' for field in ('chrom', 'start', 'end', 'id', 'score', 'strand')]
# region_id is cast to str so it matches the string ids parsed from the motif names.
regions_df = regions_df.astype({'region_id': str})
regions_df.head()

Unnamed: 0,region_chrom,region_start,region_end,region_id,region_score,region_strand
0,chr1,3034865,3036897,0,0,.
1,chr1,3061961,3063993,1,0,.
2,chr1,3153593,3155625,2,0,.
3,chr1,3154660,3156692,3,0,.
4,chr1,3154899,3156931,4,0,.


Match grouped regions to motifs and collect motif position within the windows to obtain `motif_window_start` and `motif_window_end`. We do this now because it is far easier to work with 0-based coordinates in python than in R.

In [22]:
# Attach each motif's region bounds (left join keeps motifs without a matching region).
motifs_df = pd.merge(
    motifs_df,
    regions_df[['region_start', 'region_end', 'region_id']],
    how='left',
    on='region_id',
)
# Motif coordinates expressed relative to the start of its region window.
motifs_df['motif_window_start'] = motifs_df['start'].sub(motifs_df['region_start'])
motifs_df['motif_window_end'] = motifs_df['end'].sub(motifs_df['region_start'])

In [23]:
# Report the table size, then render the first ten rows as the cell's output.
print(motifs_df.shape)
motifs_df.head(10)

(157560, 14)


Unnamed: 0,chrom,start,end,name,score,strand,motif_length,motif,motif_id,region_id,region_start,region_end,motif_window_start,motif_window_end
0,chr1,3154541,3154555,Oct4-Sox2_0_2,0,+,14,Oct4-Sox2,0,2,3153593,3155625,948,962
1,chr1,3155627,3155641,Oct4-Sox2_1_3,0,+,14,Oct4-Sox2,1,3,3154660,3156692,967,981
2,chr1,3155712,3155726,Oct4-Sox2_2_3,0,+,14,Oct4-Sox2,2,3,3154660,3156692,1052,1066
3,chr1,3343669,3343683,Oct4-Sox2_3_6,0,+,14,Oct4-Sox2,3,6,3342660,3344692,1009,1023
4,chr1,3483033,3483047,Oct4-Sox2_4_10,0,+,14,Oct4-Sox2,4,10,3482000,3484032,1033,1047
5,chr1,3549773,3549787,Oct4-Sox2_5_12,0,-,14,Oct4-Sox2,5,12,3548761,3550793,1012,1026
6,chr1,3681703,3681717,Oct4-Sox2_6_16,0,+,14,Oct4-Sox2,6,16,3680703,3682735,1000,1014
7,chr1,3977384,3977398,Oct4-Sox2_7_21,0,-,14,Oct4-Sox2,7,21,3976282,3978314,1102,1116
8,chr1,3982153,3982167,Oct4-Sox2_8_23,0,+,14,Oct4-Sox2,8,23,3981205,3983237,948,962
9,chr1,4214144,4214158,Oct4-Sox2_9_29,0,+,14,Oct4-Sox2,9,29,4213137,4215169,1007,1021


Filter motifs on edges of the windows.

In [24]:
# Keep motifs that sit at least 20 bp inside both edges of their region window.
# Rows with missing region bounds compare as False and are dropped as well.
# The explicit copy makes the result an independent frame, so the later
# `motif_id` column assignment does not write onto a filtered selection.
motifs_df = motifs_df[
    ((motifs_df['start'] - motifs_df['region_start']) >= 20)
    & ((motifs_df['region_end'] - motifs_df['end']) >= 20)
].copy()
print(motifs_df.shape)

(157560, 14)


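Filter motifs that are of poor `fann` similarity matching to seqlet cluster. Note that the code below only left-joins the curated metadata onto the motifs (the row count is unchanged), so it does not itself remove any motifs; any such filtering must already be reflected in the curated inputs.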

In [25]:
# Load the per-motif Xiong metadata and drop the columns that already exist on motifs_df.
motifs_w_metadata_df = pd.read_csv(xiong_motifs_w_metadata_path, sep='\t')
motifs_w_metadata_df = motifs_w_metadata_df.drop(columns=['start', 'end', 'motif', 'strand'])
# motif_id was parsed from the name as a string; cast it to int before joining on it.
motifs_df['motif_id'] = motifs_df['motif_id'].astype(int)
# Any remaining overlapping metadata columns receive a `_y` suffix.
motifs_df = pd.merge(motifs_df, motifs_w_metadata_df, on='motif_id', how='left', suffixes=('', '_y'))
print(motifs_df.shape)

(157560, 27)


In [26]:
# Persist the formatted Xiong motif table that the perturbation scripts read.
motifs_df.to_csv(
    'tsv/genomic/motifs/xiong_instances_formatted_for_genomic_perturbations_0based.tsv.gz',
    sep='\t',
    index=False,
)
# Motifs per motif type, then the distribution of motifs-per-region.
print(motifs_df['motif'].value_counts())
print(motifs_df['region_id'].value_counts().value_counts())

motif
Klf4         85191
Sox2         48704
Oct4-Sox2    23665
Name: count, dtype: int64
count
1     68944
2     22400
3      7735
4      2834
5       989
6       352
7       140
8        61
10       29
9        22
11        8
12        5
13        2
22        2
15        2
14        1
Name: count, dtype: int64


In [27]:
# Save the Xiong regions with standard BED-style column names for the accessibility perturbations.
# NOTE: unlike the primary motif set, no `island_count` column is merged onto
# the regions here, so the saved table has only the six region columns.
print(regions_df.head())
regions_df = regions_df.rename(columns={
    'region_chrom': 'chrom',
    'region_start': 'start',
    'region_end': 'end',
    'region_score': 'score',
    'region_strand': 'strand',
})
regions_df.to_csv(
    'tsv/genomic/motifs/xiong_regions_formatted_for_genomic_perturbations_0based.tsv.gz',
    sep='\t',
    index=False,
)

  region_chrom  region_start  region_end region_id  region_score region_strand
0         chr1       3034865     3036897         0             0             .
1         chr1       3061961     3063993         1             0             .
2         chr1       3153593     3155625         2             0             .
3         chr1       3154660     3156692         3             0             .
4         chr1       3154899     3156931         4             0             .


## Generate binding and accessibility perturbations

Allow combinations of up to 2 and up to 3 mutations. This is primarily used for cooperativity and comparisons among island effects.

In [28]:
# One whole-window perturbation script per model for the Xiong motif set
# (single motifs and motif pairs).
for model_name, model_spec in modeling_design_dict.items():
    run_name = f'genomic_perturbs_xiong_whole_window_{model_name}'
    slurm_script_path = f'scripts/genomic/motifs/{run_name}.slurm'
    write_genomic_perturb_script(
        motifs_df_path='tsv/genomic/motifs/xiong_instances_formatted_for_genomic_perturbations_0based.tsv.gz',
        motif_groups_df_path='tsv/genomic/motifs/xiong_regions_formatted_for_genomic_perturbations_0based.tsv.gz',
        model_dir=model_spec['model_dir'],
        fasta_file=genome,
        tasks_in_model_order=','.join(model_spec['tasks']),
        output_prefix=f'tsv/genomic/motifs/{run_name}',
        output_cmd_path=slurm_script_path,
        input_seqlen=input_length,
        output_seqlen=output_length,
        trials=trials,
        comb_min=1,
        comb_max=2,
    )
    print(f'sbatch {slurm_script_path}')

sbatch scripts/genomic/motifs/genomic_perturbs_xiong_whole_window_bpnet_osknz.slurm
sbatch scripts/genomic/motifs/genomic_perturbs_xiong_whole_window_atac_wt.slurm
sbatch scripts/genomic/motifs/genomic_perturbs_xiong_whole_window_atac_0h.slurm
sbatch scripts/genomic/motifs/genomic_perturbs_xiong_whole_window_atac_3h.slurm
sbatch scripts/genomic/motifs/genomic_perturbs_xiong_whole_window_atac_6h.slurm
sbatch scripts/genomic/motifs/genomic_perturbs_xiong_whole_window_atac_9h.slurm
sbatch scripts/genomic/motifs/genomic_perturbs_xiong_whole_window_atac_12h.slurm
sbatch scripts/genomic/motifs/genomic_perturbs_xiong_whole_window_atac_15h.slurm
