# Introduction

The goal of this analysis is to select and predict CRISPR perturbations of desired motifs at a specific locus.

# Computational setup

In [1]:
import warnings
warnings.filterwarnings("ignore")
from tensorflow.python.util import deprecation
deprecation._PRINT_DEPRECATION_WARNINGS = False

#Packages
import os
import sys
import pandas as pd
import numpy as np
from tqdm import tqdm
from pybedtools import BedTool
import keras.backend as K
from keras.models import load_model
import json

## Working options
# Every relative path used below (models/, bw/, tsv/, scripts/, ...) is resolved against this directory.
os.chdir(f'/n/projects/mw2098/publications/2024_weilert_acc/code/2_analysis/')
pd.set_option('display.max_columns', 100)
# Location of the BPReveal checkout and of the Python interpreter used in the generated SLURM commands.
bpreveal_path = '/n/projects/mw2098/publications/2024_weilert_acc/public/software/bpreveal_404/'
python_path = '/home/mw2098/anaconda3/envs/bpreveal_404/bin/python'

# `losses` is importable only because the BPReveal src directory is put on sys.path first.
sys.path.insert(0, f'{bpreveal_path}/src')
import losses

## Custom functions
# Project-local helpers; the path is relative, so this must run after the os.chdir above.
sys.path.insert(0, f'scripts/py/functions')
from functional import shuffle_seqs, one_hot_encode_sequence, one_hot_encode_sequences, \
    one_hot_decode_sequence, insert_motif, logitsToProfile
from motifs import extract_seqs_from_df, resize_coordinates

#Pre-existing variables
# Peak calls for every ChIP-nexus task, keyed by transcription factor name.
region_dict = {
    tf: f'narrowpeak/mesc_{tf}_nexus_peaks.narrowPeak'
    for tf in ('oct4', 'sox2', 'klf4', 'nanog', 'zic3')
}
concentration_atac_timepoints = list(range(0, 16, 3))

# Model registry. Each entry records the tasks, coverage tracks, saved model location and
# number of profile channels. Entries are generated per fold so that every fold owns its
# own (independent) 'tasks' list and 'cov' dictionary.
modeling_design_dict = {
    **{
        f'bpnet_osknz_fold{fold}': {
            'tasks': list(region_dict.keys()),
            'cov': {
                tf: {'pos': f'bw/mesc_{tf}_nexus_combined_positive.bw',
                     'neg': f'bw/mesc_{tf}_nexus_combined_negative.bw'}
                for tf in region_dict.keys()
            },
            'model_dir': f'models/bpnet_osknz_fold{fold}.model',
            'num-channels': 2
        }
        for fold in (1, 2, 3)
    },
    **{
        f'atac_wt_fold{fold}': {
            'tasks': ['atac'],
            'cov': 'bw/mesc_native_atac_cutsites_combined.bw',
            'model_dir': f'models/atac_wt_fold{fold}_residual.model/',
            'num-channels': 1
        }
        for fold in (1, 2, 3)
    },
}
# Time-course ATAC models (fold 1 only), appended after the WT models.
for timepoint in concentration_atac_timepoints:
    modeling_design_dict.update({
        f'atac_{timepoint}h_fold1': {
            'tasks': ['atac'],
            'cov': f'bw/GSE174774_mesc_atac_{timepoint}h_combined.bw',
            'model_dir': f'models/atac_{timepoint}h_fold1_residual.model/',
            'num-channels': 1
        }
    })

# Model geometry: the flank is the part of the input window not covered by the output window, per side.
input_length = 2032
output_length = 1000
flank_length = (input_length - output_length)//2
trials = 256
seed = 2356

# Input/output locations (relative to the working directory).
genome = '../0_setup/fa/mm10.fa'
figure_path = 'figures/13_perturb_crispr'
regions_path = 'bed/mapped_motifs/all_islands_curated_0based_sized_to_input.bed'
motifs_path = 'tsv/mapped_motifs/all_instances_curated_0based_w_perturb.tsv.gz'

# Header lines for single (non-array) GPU SLURM jobs.
working_dir = os.getcwd()
gpu_header = [
    '#!/usr/bin/bash',
    '#SBATCH --job-name bpnet_training',
    '#SBATCH --output=slurm_%j.log',
    '#SBATCH --mem=200gb',
    '#SBATCH --time=72:00:00',
    '#SBATCH --partition=gpu',
    '#SBATCH --gres=gpu:a100:1',
    '#SBATCH --cpus-per-task=10',
    'source /home/mw2098/.bashrc',
    'conda deactivate',
    'conda activate bpreveal_404',
    f'cd {working_dir}',
]

!mkdir -p {figure_path} tsv/genomic/crispr
!mkdir fasta
!mkdir json/crispr
!mkdir shap/crispr
!mkdir pisa


2024-11-12 13:47:07.676473: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-12 13:47:07.676506: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-12 13:47:07.678170: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-12 13:47:07.686425: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


mkdir: cannot create directory ‘fasta’: File exists
mkdir: cannot create directory ‘json/crispr’: File exists
mkdir: cannot create directory ‘shap/crispr’: File exists
mkdir: cannot create directory ‘pisa’: File exists


Create SLURM array job function.

In [2]:
def generate_slurm_array(cmds, output_file, simultaneous_jobs = 4, job_dir = None):
    """Write a SLURM job-array script that runs one command per array task.

    The script consists of a fixed GPU header followed by one
    ``if [[ ${SLURM_ARRAY_TASK_ID} == k ]]`` block per command, so array task
    ``k`` (1-based) executes ``cmds[k-1]``.

    Parameters
    ----------
    cmds : iterable of str
        Shell commands, one per array task. Must not be empty.
    output_file : str
        Path of the script to write (overwritten if it exists).
    simultaneous_jobs : int, default 4
        Maximum number of array tasks allowed to run at once (the ``%N`` suffix
        of ``--array``).
    job_dir : str, optional
        Directory the job changes into before running its command. Defaults to
        the notebook-level ``working_dir``.

    Raises
    ------
    ValueError
        If ``cmds`` is empty; ``--array=1-0`` would not describe any task.
    """
    cmds = list(cmds)
    if not cmds:
        raise ValueError('generate_slurm_array requires at least one command.')
    if job_dir is None:
        # Fall back to the notebook-level working directory (original behaviour).
        job_dir = working_dir

    total_jobs = len(cmds)
    script_lines = ['#!/usr/bin/bash',
                    '#SBATCH --job-name bpnet_training',
                    '#SBATCH --output=slurm_%j.log',
                    f'#SBATCH --array=1-{total_jobs}%{simultaneous_jobs}',
                    '#SBATCH --mem=200gb',
                    '#SBATCH --time=72:00:00',
                    '#SBATCH --partition=gpu',
                    '#SBATCH --gres=gpu:a100:1',
                    '#SBATCH --cpus-per-task=10',
                    'source /home/mw2098/.bashrc',
                    'conda deactivate',
                    'conda activate bpreveal_404',
                    f'cd {job_dir}']

    # One guarded block per command; the doubled braces render as a literal ${...} after .format().
    # The trailing newlines are kept so the written script matches the previous layout exactly.
    for task_id, cmd in enumerate(cmds, start = 1):
        script_lines.extend(["if [[ ${{SLURM_ARRAY_TASK_ID}} == {0:d} ]] ; then\n".format(task_id),
                             "    {0:s}\n".format(cmd),
                             "fi\n\n"])

    with open(output_file, mode='wt') as slurm:
        slurm.write('\n'.join(script_lines))
        slurm.write('\n')

Import regions of interest.

In [3]:
# The regions file is a headerless, tab-separated 6-column BED table.
regions_df = pd.read_csv(
    regions_path,
    sep = '\t',
    names = ['chrom', 'start', 'end', 'region_id', 'score', 'strand'],
)

# Perturb Avsec et al published site

This is the site CRISPRed in Figure 6 of Avsec et al (2021). What was not previously noted was that there is a low-affinity Oct4-Sox2 motif between the Sox2 and Nanog motifs that is highly contributing to accessibility and binding. This is an ideal candidate to explore (1) can low-affinity motifs be cooperative, (2) by increasing affinity, can we increase cooperativity and (3) how does the distance of the two motifs influence the designated cooperativity?

For this case, motifA will be Sox2, and motifB will be Oct4-Sox2.

In [4]:
sox2_motif_id = 283919
oct4sox2_motif_id = 139897
region_id = 97554

#Isolate motifs
motifs_df = pd.read_csv(motifs_path,  sep = '\t')
# Shift pattern_center by the flank length.
# NOTE(review): presumably this converts output-window coordinates to input-window coordinates — confirm.
motifs_df['pattern_center'] = motifs_df['pattern_center'] + flank_length

# Take explicit copies: adding a column to a filtered slice is chained assignment, which
# pandas only reports through a warning (and warnings are silenced in the setup cell).
sox2_motif_df = motifs_df[motifs_df.motif_id==sox2_motif_id].copy()
oct4sox2_motif_df = motifs_df[motifs_df.motif_id==oct4sox2_motif_id].copy()

# Downstream cells read `.values[0]` and merge on `motif`, so each id must match exactly one instance.
assert len(sox2_motif_df)==1, f'Expected one Sox2 motif with motif_id {sox2_motif_id}, found {len(sox2_motif_df)}'
assert len(oct4sox2_motif_df)==1, f'Expected one Oct4-Sox2 motif with motif_id {oct4sox2_motif_id}, found {len(oct4sox2_motif_df)}'

sox2_motif_df['wt_seq'] = extract_seqs_from_df(sox2_motif_df, genome)
oct4sox2_motif_df['wt_seq'] = extract_seqs_from_df(oct4sox2_motif_df, genome)

#Keep metadata of use
columns_of_interest = ['motif', 'wt_seq', 'motif_window_start', 'motif_window_end', 'pattern_center', 'seq_match_quantile']
sox2_df = sox2_motif_df[columns_of_interest]
oct4sox2_df = oct4sox2_motif_df[columns_of_interest]
print(sox2_df)
print(oct4sox2_df)

       motif     wt_seq  motif_window_start  motif_window_end  pattern_center  \
209969  Sox2  GCCTTTGTT                 961               970             965   

        seq_match_quantile  
209969             0.96575  
            motif           wt_seq  motif_window_start  motif_window_end  \
207188  Oct4-Sox2  AATTATAATGATAAT                 995              1010   

        pattern_center  seq_match_quantile  
207188            1002            0.605285  


In [5]:
# Disabled duplicate of the next cell, left in place for reference only.
# region_start = sox2_motif_df.region_start.values[0]
# region_start
# #original: 85539378  (NOTE(review): presumably a previously used region start — confirm or remove)

Print region start coordinate.

In [6]:
# Start coordinate of the region that holds the Sox2 motif; used to convert
# window-relative positions into genomic positions.
region_start = sox2_motif_df['region_start'].iloc[0]
region_start

85538671

Overwrite distance injection for `dist_coop` scenario.

In [7]:
# Oct4-Sox2 center minus Sox2 center, in bp.
print('Current center-to-center distance: ',
      oct4sox2_df['pattern_center'].to_numpy() - sox2_df['pattern_center'].to_numpy())

Current center-to-center distance:  [37]


In [8]:
# Target center-to-center spacing for the `dist_coop` scenario, and the extra shift
# needed on top of the current spacing to reach it.
distal_center_to_center_distance = 140
distal_to_add = distal_center_to_center_distance - (
    oct4sox2_df['pattern_center'].values[0] - sox2_df['pattern_center'].values[0]
)
distal_to_add

103

## Define Sox2

Define all CRISPR scenarios for Sox2. Sox2 will not change in position or be enhanced by affinity.

In [9]:
# One row per state. The Sox2 sequence is kept as WT (GCCTTTGTT) in the 'AB' and 'A'
# states and replaced by GCCTAGGTT in the 'B' and 'null' states.
sox2_states_df = pd.DataFrame({
        'state': ['AB', 'A', 'B', 'null'],
        'forward_seq': ['GCCTTTGTT', 'GCCTTTGTT', 'GCCTAGGTT', 'GCCTAGGTT'],
    })\
    .assign(motif = 'Sox2')\
    .merge(sox2_df.drop(columns = ['seq_match_quantile']))
sox2_states_df

Unnamed: 0,state,forward_seq,motif,wt_seq,motif_window_start,motif_window_end,pattern_center
0,AB,GCCTTTGTT,Sox2,GCCTTTGTT,961,970,965
1,A,GCCTTTGTT,Sox2,GCCTTTGTT,961,970,965
2,B,GCCTAGGTT,Sox2,GCCTTTGTT,961,970,965
3,,GCCTAGGTT,Sox2,GCCTTTGTT,961,970,965


## Define Oct4-Sox2

First, define default state where the low-affinity WT Oct4-Sox2 is mutated

In [10]:
# Build the table row-wise so each column keeps its natural dtype (in particular
# `distance_to_add` stays integer instead of being coerced to object by a transpose).
# Every scenario is defined by the Oct4-Sox2 sequence used when the motif is intact and by
# the shift applied to its position; the motif is intact in states 'AB' and 'B' and
# replaced by the mutated sequence (AATCATAAGGATAAT) in states 'A' and 'null'.
oct4sox2_states_df = pd.DataFrame(
    [(scenario, state, forward_seq, distance_to_add)
     for scenario, intact_seq, distance_to_add in [('WT_coop',   'AATTATAATGATAAT', 0),
                                                   ('enh_coop',  'AATTGTAATGCTAAT', 0),
                                                   ('dist_coop', 'AATTATAATGATAAT', distal_to_add)]
     for state, forward_seq in [('AB', intact_seq), ('A', 'AATCATAAGGATAAT'),
                                ('B', intact_seq), ('null', 'AATCATAAGGATAAT')]],
    columns = ['scenario', 'state', 'forward_seq', 'distance_to_add'])
oct4sox2_states_df['motif'] = 'Oct4-Sox2'
# Attach the WT sequence and window coordinates (joined on the shared `motif` column).
oct4sox2_states_df = oct4sox2_states_df.merge(oct4sox2_df.drop(['seq_match_quantile'], axis = 1))

In [11]:
# Shift the Oct4-Sox2 coordinates by `distance_to_add`. The shift is applied to the WT
# coordinates held in `oct4sox2_df` rather than to the current column values, so re-running
# this cell yields the same result instead of shifting the motif a second time.
oct4sox2_states_df = oct4sox2_states_df.assign(**{
    coord: oct4sox2_df[coord].values[0] + oct4sox2_states_df['distance_to_add']
    for coord in ('motif_window_start', 'motif_window_end', 'pattern_center')
})

In [12]:
# Display the Oct4-Sox2 scenario table after the `distance_to_add` coordinate shift.
oct4sox2_states_df

Unnamed: 0,scenario,state,forward_seq,distance_to_add,motif,wt_seq,motif_window_start,motif_window_end,pattern_center
0,WT_coop,AB,AATTATAATGATAAT,0,Oct4-Sox2,AATTATAATGATAAT,995,1010,1002
1,WT_coop,A,AATCATAAGGATAAT,0,Oct4-Sox2,AATTATAATGATAAT,995,1010,1002
2,WT_coop,B,AATTATAATGATAAT,0,Oct4-Sox2,AATTATAATGATAAT,995,1010,1002
3,WT_coop,,AATCATAAGGATAAT,0,Oct4-Sox2,AATTATAATGATAAT,995,1010,1002
4,enh_coop,AB,AATTGTAATGCTAAT,0,Oct4-Sox2,AATTATAATGATAAT,995,1010,1002
5,enh_coop,A,AATCATAAGGATAAT,0,Oct4-Sox2,AATTATAATGATAAT,995,1010,1002
6,enh_coop,B,AATTGTAATGCTAAT,0,Oct4-Sox2,AATTATAATGATAAT,995,1010,1002
7,enh_coop,,AATCATAAGGATAAT,0,Oct4-Sox2,AATTATAATGATAAT,995,1010,1002
8,dist_coop,AB,AATTATAATGATAAT,103,Oct4-Sox2,AATTATAATGATAAT,1098,1113,1105
9,dist_coop,A,AATCATAAGGATAAT,103,Oct4-Sox2,AATTATAATGATAAT,1098,1113,1105


## Merge coordinates together

In [13]:
# Pair every Oct4-Sox2 scenario row with the Sox2 row of the same state;
# columns of the two motifs are told apart by the _OS / _S suffixes.
crispr_scenarios_df = oct4sox2_states_df.merge(
    sox2_states_df,
    how = 'left',
    on = 'state',
    suffixes = ('_OS', '_S'),
)
# Genomic coordinate = window-relative coordinate + region start (OS columns first, then S).
crispr_scenarios_df = crispr_scenarios_df.assign(**{
    f'genomic_{label}{suffix}': crispr_scenarios_df[f'{window_col}{suffix}'] + region_start
    for suffix in ('_OS', '_S')
    for label, window_col in (('start', 'motif_window_start'),
                              ('end', 'motif_window_end'),
                              ('center', 'pattern_center'))
})

In [14]:
# Display the merged Oct4-Sox2 / Sox2 scenario table including the genomic coordinates.
crispr_scenarios_df

Unnamed: 0,scenario,state,forward_seq_OS,distance_to_add,motif_OS,wt_seq_OS,motif_window_start_OS,motif_window_end_OS,pattern_center_OS,forward_seq_S,motif_S,wt_seq_S,motif_window_start_S,motif_window_end_S,pattern_center_S,genomic_start_OS,genomic_end_OS,genomic_center_OS,genomic_start_S,genomic_end_S,genomic_center_S
0,WT_coop,AB,AATTATAATGATAAT,0,Oct4-Sox2,AATTATAATGATAAT,995,1010,1002,GCCTTTGTT,Sox2,GCCTTTGTT,961,970,965,85539666,85539681,85539673,85539632,85539641,85539636
1,WT_coop,A,AATCATAAGGATAAT,0,Oct4-Sox2,AATTATAATGATAAT,995,1010,1002,GCCTTTGTT,Sox2,GCCTTTGTT,961,970,965,85539666,85539681,85539673,85539632,85539641,85539636
2,WT_coop,B,AATTATAATGATAAT,0,Oct4-Sox2,AATTATAATGATAAT,995,1010,1002,GCCTAGGTT,Sox2,GCCTTTGTT,961,970,965,85539666,85539681,85539673,85539632,85539641,85539636
3,WT_coop,,AATCATAAGGATAAT,0,Oct4-Sox2,AATTATAATGATAAT,995,1010,1002,GCCTAGGTT,Sox2,GCCTTTGTT,961,970,965,85539666,85539681,85539673,85539632,85539641,85539636
4,enh_coop,AB,AATTGTAATGCTAAT,0,Oct4-Sox2,AATTATAATGATAAT,995,1010,1002,GCCTTTGTT,Sox2,GCCTTTGTT,961,970,965,85539666,85539681,85539673,85539632,85539641,85539636
5,enh_coop,A,AATCATAAGGATAAT,0,Oct4-Sox2,AATTATAATGATAAT,995,1010,1002,GCCTTTGTT,Sox2,GCCTTTGTT,961,970,965,85539666,85539681,85539673,85539632,85539641,85539636
6,enh_coop,B,AATTGTAATGCTAAT,0,Oct4-Sox2,AATTATAATGATAAT,995,1010,1002,GCCTAGGTT,Sox2,GCCTTTGTT,961,970,965,85539666,85539681,85539673,85539632,85539641,85539636
7,enh_coop,,AATCATAAGGATAAT,0,Oct4-Sox2,AATTATAATGATAAT,995,1010,1002,GCCTAGGTT,Sox2,GCCTTTGTT,961,970,965,85539666,85539681,85539673,85539632,85539641,85539636
8,dist_coop,AB,AATTATAATGATAAT,103,Oct4-Sox2,AATTATAATGATAAT,1098,1113,1105,GCCTTTGTT,Sox2,GCCTTTGTT,961,970,965,85539769,85539784,85539776,85539632,85539641,85539636
9,dist_coop,A,AATCATAAGGATAAT,103,Oct4-Sox2,AATTATAATGATAAT,1098,1113,1105,GCCTTTGTT,Sox2,GCCTTTGTT,961,970,965,85539769,85539784,85539776,85539632,85539641,85539636


## Generate predictions based on the different scenarios

First, prepare WT genomic sequences according to input accessibility features.

In [15]:
# Copy the filtered slice before adding a column (avoids chained assignment on a view).
crispr_region_df = regions_df[regions_df.region_id==region_id].copy()

# Every motif coordinate above is relative to `region_start`, so the selected region must be
# unique and must start there; otherwise the motifs would be injected at the wrong offsets.
# NOTE(review): the saved output of the next cell shows region_id 97547 while `region_id` is
# set to 97554 — confirm which is current.
assert crispr_region_df.shape[0]==1, f'Expected exactly one region with region_id {region_id}, found {crispr_region_df.shape[0]}'
assert crispr_region_df.start.values[0]==region_start, 'Selected region does not start at region_start; check region_id.'

crispr_region_df['wt_seq'] = extract_seqs_from_df(crispr_region_df, genome)

In [16]:
# Show the selected region, then the first and last 10 bases of its WT sequence.
print(crispr_region_df)
print(crispr_region_df['wt_seq'].iloc[0][:10],
      crispr_region_df['wt_seq'].iloc[0][-10:],
      sep = '\n')

       chrom     start       end  region_id  score strand  \
97547  chr10  85538671  85540703      97547      0      .   

                                                  wt_seq  
97547  TCCTACAGACCCCTTTGTTGTCCCACTCTTTAATAGAAATACCTGA...  
TCCTACAGAC
TATCAAACAC


In [17]:
# Write the region without header or index; the `wt_seq` column added above is written too.
crispr_region_df.to_csv(
    'tsv/genomic/crispr/crispr_coop_coord.bed',
    sep = '\t',
    header = False,
    index = False,
)

Next, import model.

In [18]:
# Load the fold-1 WT ATAC model; the custom loss names must be supplied for deserialization.
acc_model = load_model(
    modeling_design_dict['atac_wt_fold1']['model_dir'],
    custom_objects = dict(multinomialNll = losses.multinomialNll,
                          reweightableMse = losses.dummyMse),
)

Next, inject motifs.

In [19]:
wt_seq = crispr_region_df.wt_seq.values[0]

# WT (unshifted) Oct4-Sox2 window. Removing `distance_to_add` recovers it from any row, so
# it does not depend on row order, and it is defined even when no `dist_coop` row exists
# (a later cell slices the injected sequences with these two variables).
original_os_start = (crispr_scenarios_df.motif_window_start_OS - crispr_scenarios_df.distance_to_add).values[0]
original_os_end = (crispr_scenarios_df.motif_window_end_OS - crispr_scenarios_df.distance_to_add).values[0]

crispr_seqs = []
for i,row in crispr_scenarios_df.iterrows():
    #Overwrite Sox2 motif (identical for every scenario)
    s_seq = wt_seq[:(row.motif_window_start_S)] + row.forward_seq_S + wt_seq[(row.motif_window_end_S):]

    if row.scenario=='dist_coop':
        new_os_start = row.motif_window_start_OS
        new_os_end = row.motif_window_end_OS

        #Move the Oct4-Sox2 motif by swapping windows: the WT bases found at the new window
        #are written into the original window, and the scenario sequence is written at the
        #new window, so the total length is unchanged.
        final_seq = s_seq[:(original_os_start)] + \
        wt_seq[(new_os_start):(new_os_end)] + \
        s_seq[(original_os_end):(new_os_start)] + \
        row.forward_seq_OS + \
        s_seq[(new_os_end):]
    else:
        #Overwrite Oct4-Sox2 motif in place
        final_seq = s_seq[:(row.motif_window_start_OS)] + row.forward_seq_OS + s_seq[(row.motif_window_end_OS):]

    # Every edit is a same-length substitution, so the model input length must be preserved.
    assert len(final_seq)==input_length
    crispr_seqs.append(final_seq)

crispr_scenarios_df['inj_seq'] = crispr_seqs
crispr_scenarios_df['scenario_index'] = list(range(crispr_scenarios_df.shape[0]))

In [20]:
# Extract the 600 bp starting at genomic position 85539400 from every injected sequence.
# The offset into the sequence is computed once (as the lambda's default argument).
crispr_scenarios_df['inj_seq_85539400_to_85540000'] = crispr_scenarios_df['inj_seq'].map(
    lambda seq, offset = 85539400 - crispr_region_df.start.values[0]: seq[offset:(offset + 600)]
)
crispr_scenarios_df

Unnamed: 0,scenario,state,forward_seq_OS,distance_to_add,motif_OS,wt_seq_OS,motif_window_start_OS,motif_window_end_OS,pattern_center_OS,forward_seq_S,motif_S,wt_seq_S,motif_window_start_S,motif_window_end_S,pattern_center_S,genomic_start_OS,genomic_end_OS,genomic_center_OS,genomic_start_S,genomic_end_S,genomic_center_S,inj_seq,scenario_index,inj_seq_85539400_to_85540000
0,WT_coop,AB,AATTATAATGATAAT,0,Oct4-Sox2,AATTATAATGATAAT,995,1010,1002,GCCTTTGTT,Sox2,GCCTTTGTT,961,970,965,85539666,85539681,85539673,85539632,85539641,85539636,TCCTACAGACCCCTTTGTTGTCCCACTCTTTAATAGAAATACCTGA...,0,TTTGTTAGACCAGTGGAAGTGGGGATAGAGGTGGGAAGAGAGGATG...
1,WT_coop,A,AATCATAAGGATAAT,0,Oct4-Sox2,AATTATAATGATAAT,995,1010,1002,GCCTTTGTT,Sox2,GCCTTTGTT,961,970,965,85539666,85539681,85539673,85539632,85539641,85539636,TCCTACAGACCCCTTTGTTGTCCCACTCTTTAATAGAAATACCTGA...,1,TTTGTTAGACCAGTGGAAGTGGGGATAGAGGTGGGAAGAGAGGATG...
2,WT_coop,B,AATTATAATGATAAT,0,Oct4-Sox2,AATTATAATGATAAT,995,1010,1002,GCCTAGGTT,Sox2,GCCTTTGTT,961,970,965,85539666,85539681,85539673,85539632,85539641,85539636,TCCTACAGACCCCTTTGTTGTCCCACTCTTTAATAGAAATACCTGA...,2,TTTGTTAGACCAGTGGAAGTGGGGATAGAGGTGGGAAGAGAGGATG...
3,WT_coop,,AATCATAAGGATAAT,0,Oct4-Sox2,AATTATAATGATAAT,995,1010,1002,GCCTAGGTT,Sox2,GCCTTTGTT,961,970,965,85539666,85539681,85539673,85539632,85539641,85539636,TCCTACAGACCCCTTTGTTGTCCCACTCTTTAATAGAAATACCTGA...,3,TTTGTTAGACCAGTGGAAGTGGGGATAGAGGTGGGAAGAGAGGATG...
4,enh_coop,AB,AATTGTAATGCTAAT,0,Oct4-Sox2,AATTATAATGATAAT,995,1010,1002,GCCTTTGTT,Sox2,GCCTTTGTT,961,970,965,85539666,85539681,85539673,85539632,85539641,85539636,TCCTACAGACCCCTTTGTTGTCCCACTCTTTAATAGAAATACCTGA...,4,TTTGTTAGACCAGTGGAAGTGGGGATAGAGGTGGGAAGAGAGGATG...
5,enh_coop,A,AATCATAAGGATAAT,0,Oct4-Sox2,AATTATAATGATAAT,995,1010,1002,GCCTTTGTT,Sox2,GCCTTTGTT,961,970,965,85539666,85539681,85539673,85539632,85539641,85539636,TCCTACAGACCCCTTTGTTGTCCCACTCTTTAATAGAAATACCTGA...,5,TTTGTTAGACCAGTGGAAGTGGGGATAGAGGTGGGAAGAGAGGATG...
6,enh_coop,B,AATTGTAATGCTAAT,0,Oct4-Sox2,AATTATAATGATAAT,995,1010,1002,GCCTAGGTT,Sox2,GCCTTTGTT,961,970,965,85539666,85539681,85539673,85539632,85539641,85539636,TCCTACAGACCCCTTTGTTGTCCCACTCTTTAATAGAAATACCTGA...,6,TTTGTTAGACCAGTGGAAGTGGGGATAGAGGTGGGAAGAGAGGATG...
7,enh_coop,,AATCATAAGGATAAT,0,Oct4-Sox2,AATTATAATGATAAT,995,1010,1002,GCCTAGGTT,Sox2,GCCTTTGTT,961,970,965,85539666,85539681,85539673,85539632,85539641,85539636,TCCTACAGACCCCTTTGTTGTCCCACTCTTTAATAGAAATACCTGA...,7,TTTGTTAGACCAGTGGAAGTGGGGATAGAGGTGGGAAGAGAGGATG...
8,dist_coop,AB,AATTATAATGATAAT,103,Oct4-Sox2,AATTATAATGATAAT,1098,1113,1105,GCCTTTGTT,Sox2,GCCTTTGTT,961,970,965,85539769,85539784,85539776,85539632,85539641,85539636,TCCTACAGACCCCTTTGTTGTCCCACTCTTTAATAGAAATACCTGA...,8,TTTGTTAGACCAGTGGAAGTGGGGATAGAGGTGGGAAGAGAGGATG...
9,dist_coop,A,AATCATAAGGATAAT,103,Oct4-Sox2,AATTATAATGATAAT,1098,1113,1105,GCCTTTGTT,Sox2,GCCTTTGTT,961,970,965,85539769,85539784,85539776,85539632,85539641,85539636,TCCTACAGACCCCTTTGTTGTCCCACTCTTTAATAGAAATACCTGA...,9,TTTGTTAGACCAGTGGAAGTGGGGATAGAGGTGGGAAGAGAGGATG...


In [21]:
# Save the scenario table (including the injected sequences) as a gzipped TSV.
# NOTE(review): the state label 'null' is one of pandas' default NA strings, so reading this
# file back with pd.read_csv turns it into NaN unless keep_default_na=False (or a custom
# na_values) is passed — confirm that downstream readers handle this.
crispr_scenarios_df.to_csv('tsv/genomic/crispr/crispr_coop_scenarios.tsv.gz', sep = '\t', index = False)

In [22]:
# Sequence now present at the WT Oct4-Sox2 window in every injected sequence.
[seq[slice(original_os_start, original_os_end)] for seq in crispr_scenarios_df['inj_seq'].tolist()]

['AATTATAATGATAAT',
 'AATCATAAGGATAAT',
 'AATTATAATGATAAT',
 'AATCATAAGGATAAT',
 'AATTGTAATGCTAAT',
 'AATCATAAGGATAAT',
 'AATTGTAATGCTAAT',
 'AATCATAAGGATAAT',
 'TGGAAGTTCTCCATT',
 'TGGAAGTTCTCCATT',
 'TGGAAGTTCTCCATT',
 'TGGAAGTTCTCCATT']

Predict injected sequences.

In [23]:
#Predict every scenario with every model and convert logits/logcounts into per-position
#profiles in counts (ChIP-nexus strands or ATAC cut sites, depending on the model).
models_of_interest = ['bpnet_osknz_fold1', 'atac_wt_fold1', 'atac_wt_fold2', 'atac_wt_fold3'] + \
[f'atac_{timepoint}h_fold1' for timepoint in concentration_atac_timepoints]

# Tidy frames are collected in a list and concatenated once at the end;
# growing a DataFrame with pd.concat inside the loop is quadratic.
crispr_frames = []
for model_name in models_of_interest:

    acc_model = load_model(modeling_design_dict[model_name]['model_dir'], 
                           custom_objects = {'multinomialNll' : losses.multinomialNll, 'reweightableMse': losses.dummyMse})
    crispr_preds = acc_model.predict(one_hot_encode_sequences(crispr_scenarios_df['inj_seq'].values))

    tasks = modeling_design_dict[model_name]['tasks']
    tasks_n = len(tasks)
    channels = list(range(modeling_design_dict[model_name]['num-channels']))
    
    # Indexing follows the original cell: output j holds the logits of task j and
    # output j + tasks_n holds its log-counts.
    for j,task in enumerate(tasks):
        for i in crispr_scenarios_df['scenario_index'].values:
            profile = logitsToProfile(logitsAcrossSingleRegion = crispr_preds[j][i], 
                                      logCountsAcrossSingleRegion = crispr_preds[j+tasks_n][i])
            #Convert to tidy pd.df
            df = pd.DataFrame(profile, columns = channels)
            df['position'] = list(range(df.shape[0]))
            df = df.melt(id_vars = 'position', var_name = 'channel', value_name = 'pred')
            # Output position 0 sits one flank into the input window.
            df['genomic_position'] = df['position'] + region_start + flank_length
            df['task'] = task
            df['model_name'] = model_name
            df['scenario_index'] = i
            crispr_frames.append(df)
crispr_df = pd.concat(crispr_frames)
crispr_df = crispr_df.merge(crispr_scenarios_df[['scenario_index', 'scenario', 'state', 'genomic_center_OS', 'genomic_center_S']], how = 'left')
crispr_df.head(n=10)











Unnamed: 0,position,channel,pred,genomic_position,task,model_name,scenario_index,scenario,state,genomic_center_OS,genomic_center_S
0,0,0,0.646414,85539187,oct4,bpnet_osknz_fold1,0,WT_coop,AB,85539673,85539636
1,1,0,1.069548,85539188,oct4,bpnet_osknz_fold1,0,WT_coop,AB,85539673,85539636
2,2,0,0.971074,85539189,oct4,bpnet_osknz_fold1,0,WT_coop,AB,85539673,85539636
3,3,0,0.84752,85539190,oct4,bpnet_osknz_fold1,0,WT_coop,AB,85539673,85539636
4,4,0,0.921221,85539191,oct4,bpnet_osknz_fold1,0,WT_coop,AB,85539673,85539636
5,5,0,0.897421,85539192,oct4,bpnet_osknz_fold1,0,WT_coop,AB,85539673,85539636
6,6,0,0.587497,85539193,oct4,bpnet_osknz_fold1,0,WT_coop,AB,85539673,85539636
7,7,0,0.853134,85539194,oct4,bpnet_osknz_fold1,0,WT_coop,AB,85539673,85539636
8,8,0,0.991755,85539195,oct4,bpnet_osknz_fold1,0,WT_coop,AB,85539673,85539636
9,9,0,0.972739,85539196,oct4,bpnet_osknz_fold1,0,WT_coop,AB,85539673,85539636


In [24]:
# Save the residual-model predictions as a gzipped TSV.
crispr_df.to_csv(
    'tsv/genomic/crispr/crispr_coop_predictions.tsv.gz',
    sep = '\t',
    index = False,
)

Predict the same sequences with the bias-inclusive (`combined`) models, so that the predictions can be compared fairly with the experimental measurements.

In [25]:
#Predict every scenario with the `combined` variant of every model and convert
#logits/logcounts into per-position profiles in counts.
models_of_interest = ['bpnet_osknz_fold1', 'atac_wt_fold1', 'atac_wt_fold2', 'atac_wt_fold3'] + \
[f'atac_{timepoint}h_fold1' for timepoint in concentration_atac_timepoints]

# Tidy frames are collected in a list and concatenated once at the end;
# growing a DataFrame with pd.concat inside the loop is quadratic.
crispr_frames = []
for model_name in models_of_interest:
    # Swap the residual model for its `combined` counterpart; paths without
    # 'residual' (the BPNet model) are left unchanged by the replace.
    model_dir = modeling_design_dict[model_name]['model_dir'].replace('residual', 'combined')
    acc_model = load_model(model_dir, custom_objects = {'multinomialNll' : losses.multinomialNll, 'reweightableMse': losses.dummyMse})
    crispr_preds = acc_model.predict(one_hot_encode_sequences(crispr_scenarios_df['inj_seq'].values))
    
    tasks = modeling_design_dict[model_name]['tasks']
    tasks_n = len(tasks)
    channels = list(range(modeling_design_dict[model_name]['num-channels']))
    
    # Indexing follows the original cell: output j holds the logits of task j and
    # output j + tasks_n holds its log-counts.
    for j,task in enumerate(tasks):
        for i in crispr_scenarios_df['scenario_index'].values:
            profile = logitsToProfile(logitsAcrossSingleRegion = crispr_preds[j][i], 
                                      logCountsAcrossSingleRegion = crispr_preds[j+tasks_n][i])
            #Convert to tidy pd.df
            df = pd.DataFrame(profile, columns = channels)
            df['position'] = list(range(df.shape[0]))
            df = df.melt(id_vars = 'position', var_name = 'channel', value_name = 'pred')
            # Output position 0 sits one flank into the input window.
            df['genomic_position'] = df['position'] + region_start + flank_length
            df['task'] = task
            df['model_name'] = model_name
            df['scenario_index'] = i
            crispr_frames.append(df)
crispr_df = pd.concat(crispr_frames)
crispr_df = crispr_df.merge(crispr_scenarios_df[['scenario_index', 'scenario', 'state', 'genomic_center_OS', 'genomic_center_S']], how = 'left')
crispr_df.head(n=10)



Unnamed: 0,position,channel,pred,genomic_position,task,model_name,scenario_index,scenario,state,genomic_center_OS,genomic_center_S
0,0,0,0.646414,85539187,oct4,bpnet_osknz_fold1,0,WT_coop,AB,85539673,85539636
1,1,0,1.069548,85539188,oct4,bpnet_osknz_fold1,0,WT_coop,AB,85539673,85539636
2,2,0,0.971074,85539189,oct4,bpnet_osknz_fold1,0,WT_coop,AB,85539673,85539636
3,3,0,0.84752,85539190,oct4,bpnet_osknz_fold1,0,WT_coop,AB,85539673,85539636
4,4,0,0.921221,85539191,oct4,bpnet_osknz_fold1,0,WT_coop,AB,85539673,85539636
5,5,0,0.897421,85539192,oct4,bpnet_osknz_fold1,0,WT_coop,AB,85539673,85539636
6,6,0,0.587497,85539193,oct4,bpnet_osknz_fold1,0,WT_coop,AB,85539673,85539636
7,7,0,0.853134,85539194,oct4,bpnet_osknz_fold1,0,WT_coop,AB,85539673,85539636
8,8,0,0.991755,85539195,oct4,bpnet_osknz_fold1,0,WT_coop,AB,85539673,85539636
9,9,0,0.972739,85539196,oct4,bpnet_osknz_fold1,0,WT_coop,AB,85539673,85539636


In [26]:
# Save the combined-model predictions as a gzipped TSV.
crispr_df.to_csv(
    'tsv/genomic/crispr/crispr_coop_predictions_with_bias.tsv.gz',
    sep = '\t',
    index = False,
)

Predict effects of mutating out Zic3 motif.

In [27]:
# Replace the 11 bp window [zic3_center-6, zic3_center+5) of every injected sequence with
# the 11 bp mutated sequence, so the sequence length is unchanged.
zic3_center = flank_length + 427
zic3_mut = 'CATTGCAAAAG'
crispr_scenarios_df['inj_seq_zic3_mut'] = [
    ''.join((seq[:(zic3_center-6)], zic3_mut, seq[(zic3_center+5):]))
    for seq in crispr_scenarios_df['inj_seq'].values
]

In [28]:
#Predict every Zic3-mutated scenario with the `combined` variant of every model and convert
#logits/logcounts into per-position profiles in counts.
models_of_interest = ['bpnet_osknz_fold1', 'atac_wt_fold1', 'atac_wt_fold2', 'atac_wt_fold3'] + \
[f'atac_{timepoint}h_fold1' for timepoint in concentration_atac_timepoints]

# Tidy frames are collected in a list and concatenated once at the end;
# growing a DataFrame with pd.concat inside the loop is quadratic.
crispr_frames = []
for model_name in models_of_interest:
    # Swap the residual model for its `combined` counterpart; paths without
    # 'residual' (the BPNet model) are left unchanged by the replace.
    model_dir = modeling_design_dict[model_name]['model_dir'].replace('residual', 'combined')
    acc_model = load_model(model_dir, custom_objects = {'multinomialNll' : losses.multinomialNll, 'reweightableMse': losses.dummyMse})
    crispr_preds = acc_model.predict(one_hot_encode_sequences(crispr_scenarios_df['inj_seq_zic3_mut'].values))
    
    tasks = modeling_design_dict[model_name]['tasks']
    tasks_n = len(tasks)
    channels = list(range(modeling_design_dict[model_name]['num-channels']))
    
    # Indexing follows the original cell: output j holds the logits of task j and
    # output j + tasks_n holds its log-counts.
    for j,task in enumerate(tasks):
        for i in crispr_scenarios_df['scenario_index'].values:
            profile = logitsToProfile(logitsAcrossSingleRegion = crispr_preds[j][i], 
                                      logCountsAcrossSingleRegion = crispr_preds[j+tasks_n][i])
            #Convert to tidy pd.df
            df = pd.DataFrame(profile, columns = channels)
            df['position'] = list(range(df.shape[0]))
            df = df.melt(id_vars = 'position', var_name = 'channel', value_name = 'pred')
            # Output position 0 sits one flank into the input window.
            df['genomic_position'] = df['position'] + region_start + flank_length
            df['task'] = task
            df['model_name'] = model_name
            df['scenario_index'] = i
            crispr_frames.append(df)
crispr_df = pd.concat(crispr_frames)
crispr_df = crispr_df.merge(crispr_scenarios_df[['scenario_index', 'scenario', 'state', 'genomic_center_OS', 'genomic_center_S']], how = 'left')
crispr_df.head(n=10)



Unnamed: 0,position,channel,pred,genomic_position,task,model_name,scenario_index,scenario,state,genomic_center_OS,genomic_center_S
0,0,0,0.42271,85539187,oct4,bpnet_osknz_fold1,0,WT_coop,AB,85539673,85539636
1,1,0,0.710536,85539188,oct4,bpnet_osknz_fold1,0,WT_coop,AB,85539673,85539636
2,2,0,0.624773,85539189,oct4,bpnet_osknz_fold1,0,WT_coop,AB,85539673,85539636
3,3,0,0.687584,85539190,oct4,bpnet_osknz_fold1,0,WT_coop,AB,85539673,85539636
4,4,0,0.73555,85539191,oct4,bpnet_osknz_fold1,0,WT_coop,AB,85539673,85539636
5,5,0,0.697468,85539192,oct4,bpnet_osknz_fold1,0,WT_coop,AB,85539673,85539636
6,6,0,0.484215,85539193,oct4,bpnet_osknz_fold1,0,WT_coop,AB,85539673,85539636
7,7,0,0.69577,85539194,oct4,bpnet_osknz_fold1,0,WT_coop,AB,85539673,85539636
8,8,0,0.797302,85539195,oct4,bpnet_osknz_fold1,0,WT_coop,AB,85539673,85539636
9,9,0,0.832765,85539196,oct4,bpnet_osknz_fold1,0,WT_coop,AB,85539673,85539636


In [29]:
# Save the Zic3-mutant, combined-model predictions as a gzipped TSV.
crispr_df.to_csv(
    'tsv/genomic/crispr/crispr_coop_predictions_with_zic3_mut_with_bias.tsv.gz',
    sep = '\t',
    index = False,
)

## Generate contribution scores for each mutation set

First, obtain the sequences of interest and save as .fasta file.

In [30]:
fasta_path = f'fasta/crispr_coop_seqs.fa'
# The context manager closes the file even if writing a record fails.
with open(fasta_path, 'w') as myfile:
    for i,row in crispr_scenarios_df.iterrows():
        # Record name is the scenario immediately followed by the state (e.g. 'WT_coopAB').
        header = '>' + row.scenario + row.state
        myfile.write("%s\n" % header)
        myfile.write("%s\n" % row.inj_seq)

Next, write a SHAP configuration (.json) for every model and output head, and collect the corresponding interpretation commands into a single SLURM job array.

In [31]:
array_cmds = []

for model_name,model_info in modeling_design_dict.items():

    # One SHAP configuration per output head. Heads are enumerated from 'tasks', which for
    # the BPNet models lists the same factors, in the same order, as the keys of 'cov';
    # the ATAC models have the single head 'atac'.
    for head_id, task in enumerate(model_info['tasks']):
        shap_dict = {
            # 'genome': genome,
            'fasta-file': fasta_path, 
            'input-length': input_length,
            'output-length': output_length,
            'num-shuffles': 20,
            'verbosity': 'DEBUG',
            'model-file': model_info['model_dir'],
            'heads': len(model_info['tasks']),
            'head-id': head_id,
            # All profile channels of the head: [0] for ATAC, [0,1] for the two ChIP-nexus strands.
            'profile-task-ids': list(range(model_info['num-channels'])),
            'profile-h5': f'shap/crispr/crispr_coop_{model_name}_{task}_profile.h5',
            'counts-h5': f'shap/crispr/crispr_coop_{model_name}_{task}_counts.h5'
        }
        shap_json_file = f'json/crispr/shapFlat_crispr_coop_{model_name}_{task}.json'
        with open(shap_json_file, 'w') as outfile:
            outfile.write(json.dumps(shap_dict, indent=4))

        #Set up the SHAP command and add it to the job array
        array_cmds.append(f'{python_path} {bpreveal_path}/src/interpretFlat.py {shap_json_file}')

generate_slurm_array(cmds = array_cmds, output_file = 'scripts/crispr_coop_bpnet_shap.slurm')
print('sbatch scripts/crispr_coop_bpnet_shap.slurm')

sbatch scripts/crispr_coop_bpnet_shap.slurm


## Perform PISA across each CRISPR set

Define the .json configuration files with which we want to perform PISA. We will perform PISA across each of these different configurations to ensure that OS/S motifs in this region are mostly cooperating with one another. 

In [32]:
# Queue one interpretPisa run for every (model, head, channel) combination and
# bundle the resulting commands into a single SLURM job array.
array_cmds = []
for model_name, model_info in modeling_design_dict.items():
    tasks = model_info['tasks']

    for head_counter, task in enumerate(tasks):
        for channel_counter in range(model_info['num-channels']):
            run_tag = f'{model_name}_{task}_{channel_counter}'

            # Shared settings first, then the head/channel-specific entries.
            enhancer_pisa_dict = {
                "genome": genome,
                'bed-file': 'tsv/genomic/crispr/crispr_coop_coord.bed',
                "model-file": model_info['model_dir'],
                "input-length": input_length,
                "output-length": output_length,
                "heads": len(tasks),
                "num-shuffles": 20,
                "verbosity": "DEBUG",
                'head-id': head_counter,
                'task-id': channel_counter,
                'output-h5': f"pisa/crispr_coop_pisa_{run_tag}.h5",
            }

            enhancer_pisa_file = f'json/interpretPISA_crispr_coop_{run_tag}.json'
            with open(enhancer_pisa_file, 'w') as outfile:
                json.dump(enhancer_pisa_dict, outfile, indent=4)

            # One command per configuration file, appended to the job array.
            array_cmds.append(f'{python_path} {bpreveal_path}/src/interpretPisa.py {enhancer_pisa_file}')

generate_slurm_array(cmds = array_cmds, output_file = 'scripts/bpnet_pisa_array_enhancers_across_crispr_coop.slurm')
print('sbatch scripts/bpnet_pisa_array_enhancers_across_crispr_coop.slurm')

sbatch scripts/bpnet_pisa_array_enhancers_across_crispr_coop.slurm


# Perturb Akr1cl site with 2 identical Sox2 motifs

This site was identified because (1) it has 2 Sox2 motifs with identical sequences within the same nucleosome distance, and (2) these two motifs show opposing functions of pioneering vs non-pioneering. This showcases that motif affinity is not the only driver of pioneering, and that context also matters. 

In this case, motifA will be the distal Sox2 and motifB will be the proximal Sox2.

In [33]:
# Identifiers of the two Sox2 motifs under study and of the region holding them.
sox2_distal_motif_id = 253082
sox2_proximal_motif_id = 253084
region_id = 3491

#Isolate motifs
motifs_df = pd.read_csv(motifs_path,  sep = '\t')
# Shift motif centers by the flank length.
# NOTE(review): presumably this moves centers into input-window coordinates - confirm.
motifs_df['pattern_center'] = motifs_df['pattern_center'] + flank_length

# Take explicit copies of the filtered rows so that adding 'wt_seq' writes to an
# independent frame instead of a slice of motifs_df (chained assignment).
sox2_distal_motif_df = motifs_df[motifs_df.motif_id==sox2_distal_motif_id].copy()
sox2_distal_motif_df['wt_seq'] = extract_seqs_from_df(sox2_distal_motif_df, genome)
sox2_proximal_motif_df = motifs_df[motifs_df.motif_id==sox2_proximal_motif_id].copy()
sox2_proximal_motif_df['wt_seq'] = extract_seqs_from_df(sox2_proximal_motif_df, genome)

#Keep metadata of use
columns_of_interest = ['motif', 'seqnames', 'wt_seq', 'motif_window_start', 'motif_window_end', 'pattern_center']
sox2_distal_motif_df = sox2_distal_motif_df[columns_of_interest]
sox2_proximal_motif_df = sox2_proximal_motif_df[columns_of_interest]
print(sox2_distal_motif_df)
print(sox2_proximal_motif_df)

     motif seqnames     wt_seq  motif_window_start  motif_window_end  \
4290  Sox2     chr1  CCCTTTGTC                 864               873   

      pattern_center  
4290             868  
     motif seqnames     wt_seq  motif_window_start  motif_window_end  \
4292  Sox2     chr1  CCCTTTGTC                 963               972   

      pattern_center  
4292             967  


Print region start coordinate.

In [34]:
# Region start recorded on the distal Sox2 motif's row of the motif table.
distal_motif_rows = motifs_df.loc[motifs_df.motif_id == sox2_distal_motif_id]
region_start = distal_motif_rows['region_start'].values[0]
region_start

65036790

Overwrite distance injection for `dist_coop` scenario.

In [35]:
# Proximal minus distal motif center, as an array with one entry per motif row.
center_to_center_distance = (
    sox2_proximal_motif_df.pattern_center.values
    - sox2_distal_motif_df.pattern_center.values
)
print('Current center-to-center distance: ', center_to_center_distance)

Current center-to-center distance:  [99]


## Define distal Sox2

Define all CRISPR scenarios for the distal Sox2. Sox2 will not change in position or be enhanced by affinity.

In [36]:
# Scenario table for the distal Sox2 site: one row per state label with the
# sequence to place at that site, joined to the site's motif metadata.
sox2_distal_states_df = (
    pd.DataFrame({
        'state': ['AB', 'B', 'A'],
        'forward_seq': ['CCCTTTGTC', 'CCCTAGGTC', 'CCCTTTGTC'],
    })
    .assign(motif = 'Sox2')
    .merge(sox2_distal_motif_df)
)
sox2_distal_states_df

Unnamed: 0,state,forward_seq,motif,seqnames,wt_seq,motif_window_start,motif_window_end,pattern_center
0,AB,CCCTTTGTC,Sox2,chr1,CCCTTTGTC,864,873,868
1,B,CCCTAGGTC,Sox2,chr1,CCCTTTGTC,864,873,868
2,A,CCCTTTGTC,Sox2,chr1,CCCTTTGTC,864,873,868


## Define proximal Sox2

Define all CRISPR scenarios for the proximal Sox2. Sox2 will not change in position or be enhanced by affinity.

In [37]:
# Scenario table for the proximal Sox2 site: one row per state label with the
# sequence to place at that site, joined to the site's motif metadata.
sox2_proximal_states_df = (
    pd.DataFrame({
        'state': ['AB', 'A', 'B'],
        'forward_seq': ['CCCTTTGTC', 'CCCTAGGTC', 'CCCTTTGTC'],
    })
    .assign(motif = 'Sox2')
    .merge(sox2_proximal_motif_df)
)
sox2_proximal_states_df

Unnamed: 0,state,forward_seq,motif,seqnames,wt_seq,motif_window_start,motif_window_end,pattern_center
0,AB,CCCTTTGTC,Sox2,chr1,CCCTTTGTC,963,972,967
1,A,CCCTAGGTC,Sox2,chr1,CCCTTTGTC,963,972,967
2,B,CCCTTTGTC,Sox2,chr1,CCCTTTGTC,963,972,967


## Merge coordinates together

In [38]:
# Pair the distal and proximal definitions of each state on one row.
crispr_scenarios_df = sox2_distal_states_df.merge(
    sox2_proximal_states_df,
    how = 'left',
    on = 'state',
    suffixes = ('_distal', '_proximal'),
)

# Add region_start to the window start/end and center of each site to obtain
# the corresponding genomic_* columns (distal first, then proximal).
for site in ('distal', 'proximal'):
    window_cols = [f'motif_window_start_{site}', f'motif_window_end_{site}', f'pattern_center_{site}']
    genomic_cols = [f'genomic_start_{site}', f'genomic_end_{site}', f'genomic_center_{site}']
    crispr_scenarios_df[genomic_cols] = crispr_scenarios_df[window_cols] + region_start

crispr_scenarios_df

Unnamed: 0,state,forward_seq_distal,motif_distal,seqnames_distal,wt_seq_distal,motif_window_start_distal,motif_window_end_distal,pattern_center_distal,forward_seq_proximal,motif_proximal,seqnames_proximal,wt_seq_proximal,motif_window_start_proximal,motif_window_end_proximal,pattern_center_proximal,genomic_start_distal,genomic_end_distal,genomic_center_distal,genomic_start_proximal,genomic_end_proximal,genomic_center_proximal
0,AB,CCCTTTGTC,Sox2,chr1,CCCTTTGTC,864,873,868,CCCTTTGTC,Sox2,chr1,CCCTTTGTC,963,972,967,65037654,65037663,65037658,65037753,65037762,65037757
1,B,CCCTAGGTC,Sox2,chr1,CCCTTTGTC,864,873,868,CCCTTTGTC,Sox2,chr1,CCCTTTGTC,963,972,967,65037654,65037663,65037658,65037753,65037762,65037757
2,A,CCCTTTGTC,Sox2,chr1,CCCTTTGTC,864,873,868,CCCTAGGTC,Sox2,chr1,CCCTTTGTC,963,972,967,65037654,65037663,65037658,65037753,65037762,65037757


## Generate predictions based on the different scenarios

First, prepare WT genomic sequences according to input accessibility features.

In [39]:
# Select the region of interest. The explicit copy makes the 'wt_seq' assignment
# below write to an independent frame rather than a slice of regions_df.
crispr_region_df = regions_df[regions_df.region_id==region_id].copy()
crispr_region_df['wt_seq'] = extract_seqs_from_df(crispr_region_df, genome)

In [40]:
# Show the region row, then the first and last 10 bases of its extracted sequence.
region_wt_seq = crispr_region_df.wt_seq.values[0]
print(crispr_region_df)
print(region_wt_seq[:10])
print(region_wt_seq[-10:])

     chrom     start       end  region_id  score strand  \
3491  chr1  65036790  65038822       3491      0      .   

                                                 wt_seq  
3491  TTGATGTTAGCCATTTGATTGGTGTAAGGCAGAATTTTTAGCTTTG...  
TTGATGTTAG
AGCATAAAAG


In [41]:
# Write the selected region row to a tab-separated file with no header and no index.
# NOTE(review): the frame also carries the 'wt_seq' column at this point, so the
# sequence is written as an extra trailing field - confirm this is intended.
crispr_region_df.to_csv(
    'tsv/genomic/crispr/crispr_context_coord.bed',
    sep = '\t',
    header = False,
    index = False,
)

Next, import model.

In [42]:
# Load the model stored under the 'atac_wt_fold1' entry, supplying the custom
# loss functions by the names used in the call below.
acc_custom_objects = {
    'multinomialNll' : losses.multinomialNll,
    'reweightableMse': losses.dummyMse,
}
acc_model = load_model(
    modeling_design_dict['atac_wt_fold1']['model_dir'],
    custom_objects = acc_custom_objects,
)

Next, inject motifs.

In [43]:
# Build one edited copy of the wild-type region sequence per scenario.
wt_seq = crispr_region_df.wt_seq.values[0]

crispr_seqs = []
for _, scenario in crispr_scenarios_df.iterrows():
    final_seq = wt_seq
    # Overwrite the proximal Sox2 window first, then the distal one.
    for site in ('proximal', 'distal'):
        window_start = scenario[f'motif_window_start_{site}']
        window_end = scenario[f'motif_window_end_{site}']
        final_seq = final_seq[:window_start] + scenario[f'forward_seq_{site}'] + final_seq[window_end:]
    # The edited sequence must still have the model's input length.
    assert len(final_seq)==input_length
    crispr_seqs.append(final_seq)

crispr_scenarios_df['inj_seq'] = crispr_seqs
crispr_scenarios_df['scenario_index'] = list(range(len(crispr_scenarios_df)))

In [44]:
# Extract the 300 bp of each injected sequence that start at genomic position
# 65037600, expressed as an offset from the region start.
window_offset = 65037600 - crispr_region_df.start.values[0]
crispr_scenarios_df['inj_seq_65037600_to_65037900'] = [
    inj_seq[window_offset:(window_offset + 300)]
    for inj_seq in crispr_scenarios_df['inj_seq'].values
]
crispr_scenarios_df

Unnamed: 0,state,forward_seq_distal,motif_distal,seqnames_distal,wt_seq_distal,motif_window_start_distal,motif_window_end_distal,pattern_center_distal,forward_seq_proximal,motif_proximal,seqnames_proximal,wt_seq_proximal,motif_window_start_proximal,motif_window_end_proximal,pattern_center_proximal,genomic_start_distal,genomic_end_distal,genomic_center_distal,genomic_start_proximal,genomic_end_proximal,genomic_center_proximal,inj_seq,scenario_index,inj_seq_65037600_to_65037900
0,AB,CCCTTTGTC,Sox2,chr1,CCCTTTGTC,864,873,868,CCCTTTGTC,Sox2,chr1,CCCTTTGTC,963,972,967,65037654,65037663,65037658,65037753,65037762,65037757,TTGATGTTAGCCATTTGATTGGTGTAAGGCAGAATTTTTAGCTTTG...,0,AACTGCTGCTATTTTCTGACGGGCCACATCTGTCTCCTTTCTGTCC...
1,B,CCCTAGGTC,Sox2,chr1,CCCTTTGTC,864,873,868,CCCTTTGTC,Sox2,chr1,CCCTTTGTC,963,972,967,65037654,65037663,65037658,65037753,65037762,65037757,TTGATGTTAGCCATTTGATTGGTGTAAGGCAGAATTTTTAGCTTTG...,1,AACTGCTGCTATTTTCTGACGGGCCACATCTGTCTCCTTTCTGTCC...
2,A,CCCTTTGTC,Sox2,chr1,CCCTTTGTC,864,873,868,CCCTAGGTC,Sox2,chr1,CCCTTTGTC,963,972,967,65037654,65037663,65037658,65037753,65037762,65037757,TTGATGTTAGCCATTTGATTGGTGTAAGGCAGAATTTTTAGCTTTG...,2,AACTGCTGCTATTTTCTGACGGGCCACATCTGTCTCCTTTCTGTCC...


In [45]:
# Save the scenario table as a tab-separated file without the index column.
crispr_scenarios_df.to_csv(
    'tsv/genomic/crispr/crispr_context_scenarios.tsv.gz',
    sep = '\t',
    index = False,
)

In [46]:
# Sanity check: show what each injected sequence holds at the distal Sox2 window
# (window bounds taken from the first scenario row).
original_distal_sox2_start, original_distal_sox2_end = crispr_scenarios_df[
    ['motif_window_start_distal', 'motif_window_end_distal']
].values[0]
[inj_seq[original_distal_sox2_start:original_distal_sox2_end] for inj_seq in crispr_scenarios_df.inj_seq.values]

['CCCTTTGTC', 'CCCTAGGTC', 'CCCTTTGTC']

In [47]:
# Sanity check: show what each injected sequence holds at the proximal Sox2 window
# (window bounds taken from the first scenario row).
original_proximal_sox2_start, original_proximal_sox2_end = crispr_scenarios_df[
    ['motif_window_start_proximal', 'motif_window_end_proximal']
].values[0]
[inj_seq[original_proximal_sox2_start:original_proximal_sox2_end] for inj_seq in crispr_scenarios_df.inj_seq.values]

['CCCTTTGTC', 'CCCTTTGTC', 'CCCTAGGTC']

Predict injected sequences.

In [48]:
#Convert logits and logcounts to human-readable ChIP-nexus profile with counts
crispr_df = pd.DataFrame()
for model_name in list(modeling_design_dict.keys()):
    model = load_model(modeling_design_dict[model_name]['model_dir'], 
                           custom_objects = {'multinomialNll' : losses.multinomialNll, 'reweightableMse': losses.dummyMse})
    crispr_preds = model.predict(one_hot_encode_sequences(crispr_scenarios_df['inj_seq'].values))
    
    for i in crispr_scenarios_df['scenario_index'].values:
        tasks = modeling_design_dict[model_name]['tasks']
        for j,task in enumerate(tasks):
            profile = logitsToProfile(logitsAcrossSingleRegion = crispr_preds[j][i], 
                                      logCountsAcrossSingleRegion = crispr_preds[j + len(tasks)][i])
            #Convert to tidy pd.df
            df = pd.DataFrame(profile, columns = list(range(modeling_design_dict[model_name]['num-channels'])))
            df['position'] = list(range(df.shape[0]))
            df['genomic_position'] = df['position'] + region_start + flank_length
            df = df.melt(id_vars = ['position', 'genomic_position'], var_name = 'channel', value_name = 'pred')
            df['task'] = task
            df['model_name'] = model_name
            df['scenario_index'] = i
            crispr_df = pd.concat([crispr_df, df])
crispr_df = crispr_df.merge(crispr_scenarios_df[['scenario_index', 'state', 'genomic_center_distal', 'genomic_center_proximal']], how = 'left')
crispr_df.head(n=10)



Unnamed: 0,position,genomic_position,channel,pred,task,model_name,scenario_index,state,genomic_center_distal,genomic_center_proximal
0,0,65037306,0,0.005036,oct4,bpnet_osknz_fold1,0,AB,65037658,65037757
1,1,65037307,0,0.00942,oct4,bpnet_osknz_fold1,0,AB,65037658,65037757
2,2,65037308,0,0.033758,oct4,bpnet_osknz_fold1,0,AB,65037658,65037757
3,3,65037309,0,0.029952,oct4,bpnet_osknz_fold1,0,AB,65037658,65037757
4,4,65037310,0,0.01548,oct4,bpnet_osknz_fold1,0,AB,65037658,65037757
5,5,65037311,0,0.005962,oct4,bpnet_osknz_fold1,0,AB,65037658,65037757
6,6,65037312,0,0.006315,oct4,bpnet_osknz_fold1,0,AB,65037658,65037757
7,7,65037313,0,0.004534,oct4,bpnet_osknz_fold1,0,AB,65037658,65037757
8,8,65037314,0,0.008796,oct4,bpnet_osknz_fold1,0,AB,65037658,65037757
9,9,65037315,0,0.013199,oct4,bpnet_osknz_fold1,0,AB,65037658,65037757


In [49]:
# Save the tidy prediction table as a tab-separated file without the index column.
crispr_df.to_csv(
    'tsv/genomic/crispr/crispr_context_predictions.tsv.gz',
    sep = '\t',
    index = False,
)

Predict biased sequences for comparison with the experimental predictions in a fair manner.

In [50]:
#Convert logits and logcounts to human-readable ChIP-nexus profile with counts
crispr_df = pd.DataFrame()
for model_name in list(modeling_design_dict.keys()):
    model_dir = modeling_design_dict[model_name]['model_dir'].replace('residual', 'combined')
    model = load_model(model_dir, custom_objects = {'multinomialNll' : losses.multinomialNll, 'reweightableMse': losses.dummyMse})
    crispr_preds = model.predict(one_hot_encode_sequences(crispr_scenarios_df['inj_seq'].values))
    
    for i in crispr_scenarios_df['scenario_index'].values:
        tasks = modeling_design_dict[model_name]['tasks']
        for j,task in enumerate(tasks):
            profile = logitsToProfile(logitsAcrossSingleRegion = crispr_preds[j][i], 
                                      logCountsAcrossSingleRegion = crispr_preds[j + len(tasks)][i])
            #Convert to tidy pd.df
            df = pd.DataFrame(profile, columns = list(range(modeling_design_dict[model_name]['num-channels'])))
            df['position'] = list(range(df.shape[0]))
            df['genomic_position'] = df['position'] + region_start + flank_length
            df = df.melt(id_vars = ['position', 'genomic_position'], var_name = 'channel', value_name = 'pred')
            df['task'] = task
            df['model_name'] = model_name
            df['scenario_index'] = i
            crispr_df = pd.concat([crispr_df, df])
crispr_df = crispr_df.merge(crispr_scenarios_df[['scenario_index', 'state', 'genomic_center_distal', 'genomic_center_proximal']], how = 'left')
crispr_df.head(n=10)



Unnamed: 0,position,genomic_position,channel,pred,task,model_name,scenario_index,state,genomic_center_distal,genomic_center_proximal
0,0,65037306,0,0.005036,oct4,bpnet_osknz_fold1,0,AB,65037658,65037757
1,1,65037307,0,0.00942,oct4,bpnet_osknz_fold1,0,AB,65037658,65037757
2,2,65037308,0,0.033758,oct4,bpnet_osknz_fold1,0,AB,65037658,65037757
3,3,65037309,0,0.029952,oct4,bpnet_osknz_fold1,0,AB,65037658,65037757
4,4,65037310,0,0.01548,oct4,bpnet_osknz_fold1,0,AB,65037658,65037757
5,5,65037311,0,0.005962,oct4,bpnet_osknz_fold1,0,AB,65037658,65037757
6,6,65037312,0,0.006315,oct4,bpnet_osknz_fold1,0,AB,65037658,65037757
7,7,65037313,0,0.004534,oct4,bpnet_osknz_fold1,0,AB,65037658,65037757
8,8,65037314,0,0.008796,oct4,bpnet_osknz_fold1,0,AB,65037658,65037757
9,9,65037315,0,0.013199,oct4,bpnet_osknz_fold1,0,AB,65037658,65037757


In [51]:
# Save the tidy prediction table as a tab-separated file without the index column.
crispr_df.to_csv(
    'tsv/genomic/crispr/crispr_context_predictions_with_bias.tsv.gz',
    sep = '\t',
    index = False,
)

## Generate contribution scores for each mutation set

First, obtain the sequences of interest and save as .fasta file.

In [52]:
# Write one FASTA record per CRISPR scenario row: the header is '>' followed by
# the state label, and the body is the injected sequence.
fasta_path = 'fasta/crispr_context_seqs.fa'

# A context manager guarantees the handle is flushed and closed even if a row
# raises part-way through the loop.
with open(fasta_path, 'w') as myfile:
    for i, row in crispr_scenarios_df.iterrows():
        header = '>' + row.state
        myfile.write("%s\n" % header)
        myfile.write("%s\n" % row.inj_seq)

Next, define the counts output head by which to SHAP the scores back.

In [53]:
# Build one interpretFlat configuration (.json) and one command line per model
# head, then bundle every command into a single SLURM job array.
array_cmds = []

for model_name, model_info in modeling_design_dict.items():

    #For ATAC vs CHIP-nexus, only the head layout and model path differ;
    #everything else about the SHAP configuration is shared.
    if model_info['tasks'][0] == 'atac':
        model_file = model_info['model_dir']
        num_heads = 1
        profile_task_ids = [0]
        head_tasks = ['atac']
    else:
        model_file = f'models/{model_name}.model'
        num_heads = len(model_info['tasks'])
        profile_task_ids = [0, 1]
        head_tasks = list(model_info['cov'].keys())

    for head_counter, task in enumerate(head_tasks):
        #Set up SHAP parameters for this head. Key order is kept stable so the
        #written .json files are unchanged.
        shap_dict = {
            'fasta-file': fasta_path,
            'input-length': input_length,
            'output-length': output_length,
            'num-shuffles': 20,
            'verbosity': 'DEBUG',
            'model-file': model_file,
            'heads': num_heads,
            'head-id': head_counter,
            'profile-task-ids': profile_task_ids,
            'profile-h5': f'shap/crispr/crispr_context_{model_name}_{task}_profile.h5',
            'counts-h5': f'shap/crispr/crispr_context_{model_name}_{task}_counts.h5',
        }
        shap_json_file = f'json/crispr/shapFlat_crispr_context_{model_name}_{task}.json'
        with open(shap_json_file, 'w') as outfile:
            outfile.write(json.dumps(shap_dict, indent=4))

        #Queue the SHAP command for the job array, pointing at the file just written
        array_cmds.append(f'{python_path} {bpreveal_path}/src/interpretFlat.py {shap_json_file}')

generate_slurm_array(cmds = array_cmds, output_file = 'scripts/crispr_context_bpnet_shap.slurm')
print('sbatch scripts/crispr_context_bpnet_shap.slurm')

sbatch scripts/crispr_context_bpnet_shap.slurm
