# Introduction

The goals of this analysis are to run TF-MoDISco and to map motif hits for the trained models.

# Computational setup

In [1]:
# Enable IPython's autoreload extension so that edits to imported modules
# are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2
# NOTE(review): reloading the extension right after loading it looks redundant — confirm before removing.
%reload_ext autoreload

import warnings
# Blanket filter: no category or module is given, so every warning is suppressed for the session.
warnings.filterwarnings("ignore")

#Packages
import os
import sys
import json
import glob
import pandas as pd
import numpy as np
from tqdm import tqdm

# Settings

## Working options
# Every relative path used below is resolved against this analysis directory,
# so the chdir has to happen before anything else touches the filesystem.
os.chdir('/n/projects/mw2098/publications/2024_weilert_acc/code/2_analysis/')
pd.set_option('display.max_columns', 100)

## Custom functions
# Put the project helper directory first on the module search path, then import from it.
sys.path.insert(0, 'scripts/py/functions')
from motifs import (
    import_modisco_seqlets_txt,
    map_modisco_coordinates_to_genome,
    resize_coordinates,
)

## Configuration variables
# Output locations and external tool paths.
figure_path = 'figures/4_map_motifs'
bpreveal_path = '/n/projects/mw2098/publications/2024_weilert_acc/public/software/bpreveal_404/'
python_path = '/home/mw2098/anaconda3/envs/bpreveal_404/bin/python'
meme_db = '../../public/databases/JASPAR/JASPAR2022_CORE_vertebrates_non-redundant_pfms_meme.txt'
genome = '../0_setup/fa/mm10.fa'
output_length = 1000

# Per-model settings keyed by a short label. Every entry carries the same fields:
#   input_length, trials, model_dir, tasks, prefix (file-name stem), and
#   bed_path_dict (fold label -> BED file).
# The 'bias' entry has a single BED file stored under an empty fold label.
# The `xiong_{N}h` entries differ only in their hour label, so they are generated
# from one template and unpacked in place to keep the original key order.
acc_vs_bind_dict = {
    'acc': {
        'input_length': 2032,
        'trials': 256,
        'model_dir': 'models/atac_wt_fold1_residual.model',
        'tasks': ['atac'],
        'prefix': 'atac_wt',
        'bed_path_dict': {f'fold{i}': f'bed/bpreveal/atac_wt_fold{i}_all.bed' for i in range(1, 4)},
    },
    'bind': {
        'input_length': 2032,
        'trials': 256,
        'model_dir': 'models/bpnet_osknz_fold1.model',
        'tasks': ['oct4', 'sox2', 'klf4', 'nanog', 'zic3'],
        'prefix': 'bpnet_osknz',
        'bed_path_dict': {f'fold{i}': f'bed/bpreveal/bpnet_osknz_fold{i}_all.bed' for i in range(1, 4)},
    },
    'bias': {
        'input_length': 1154,
        'trials': 256,
        'model_dir': 'models/optimize/atac_bias.model',
        'tasks': ['atac'],
        'prefix': 'atac_bias',
        'bed_path_dict': {'': 'bed/bpreveal/optimize/atac_wt_trans_all.bed'},
    },
    **{
        f'xiong_{hours}h': {
            'input_length': 2032,
            'trials': 256,
            'model_dir': f'models/atac_{hours}h_fold1_residual.model',
            'tasks': ['atac'],
            'prefix': f'atac_{hours}h',
            'bed_path_dict': {
                f'fold{i}': f'bed/bpreveal/atac_{hours}h_fold{i}_all.bed' for i in range(1, 4)
            },
        }
        for hours in (0, 3, 6, 9, 12, 15)
    },
}

## Filesystem commands
# Create the TF-MoDISco output directory; a pre-existing directory is not an error.
os.makedirs('modiscolite', exist_ok=True)

# Run TF-MoDISco

In [2]:
working_dir = os.getcwd()

# SLURM preamble shared by every job script generated in this notebook:
# resource requests, environment activation, then `cd` into the analysis directory.
bash_header = ['#!/bin/bash', 
               '#SBATCH --job-name=modiscolite',
               '#SBATCH --ntasks=1',
               '#SBATCH --cpus-per-task=70',
               '#SBATCH --mem=100gb',
               '#SBATCH --time=48:00:00',
               '#SBATCH --output=slurm_%j.log',
               'source /home/mw2098/.bashrc',
               'conda deactivate',
               'ml meme',
               'conda activate bpreveal_404',
               f'cd {working_dir}']

# Write one SLURM script per model / fold / task / contribution mode and print
# the matching `sbatch` command. Each script chains three steps:
#   1. shapToNumpy.py on the SHAP h5 (writes the seqs/scores arrays under npz/)
#   2. `modisco motifs` on those arrays
#   3. `modisco report` against the MEME database
for k, v in acc_vs_bind_dict.items():
    prefix = v['prefix']
    tasks = v['tasks']
    bed_path_dict = v['bed_path_dict']
    for fold, bed_path in bed_path_dict.items():
        for task in tasks:
            for mode in ['counts', 'profile']:

                # Entries with an empty fold label would produce a doubled underscore.
                # Collapse it in the run name ONLY: a blanket replace over the finished
                # script would also rewrite the header, interpreter path and working
                # directory if any of them happened to contain '__'.
                run_name = f'{prefix}_{fold}_{task}_{mode}'.replace('__', '_')

                # The script file name keeps the raw (un-collapsed) pattern so that
                # previously generated script names stay the same.
                script_path = f'scripts/bpreveal_modiscolite_{k}_{fold}_{task}_{mode}.slurm'

                # Step 1: export SHAP scores to numpy
                shap_to_npz = [f'{python_path} {bpreveal_path}/src/shapToNumpy.py \\',
                               f'--h5 shap/{run_name}.h5 \\',
                               f'--seqs npz/{run_name}_seqs \\',
                               f'--scores npz/{run_name}_scores']

                # Step 2: run TF-MoDISco for this mode (counts or profile).
                # -n / -w are passed straight through to `modisco motifs`;
                # see the tfmodisco-lite CLI documentation for their semantics.
                modisco = [f'mkdir -p modiscolite/{run_name}',
                           'modisco motifs \\',
                           f'-s npz/{run_name}_seqs.npy \\',
                           f'-a npz/{run_name}_scores.npy \\',
                           '-n 50000 \\',
                           f'-w {output_length} \\',
                           f'--output modiscolite/{run_name}/modisco.h5']

                # Step 3: build the report next to the modisco.h5 it summarises
                modisco_report = [f'modisco report -i modiscolite/{run_name}/modisco.h5 \\',
                                  f'-o modiscolite/{run_name} \\',
                                  f'-m {meme_db}']

                cmds = bash_header + shap_to_npz + modisco + modisco_report

                #Write the script
                with open(script_path, mode='wt') as bash:
                    bash.write('\n'.join(cmds))
                    bash.write('\n')
                print(f'sbatch {script_path}')

sbatch scripts/bpreveal_modiscolite_acc_fold1_atac_counts.slurm
sbatch scripts/bpreveal_modiscolite_acc_fold1_atac_profile.slurm
sbatch scripts/bpreveal_modiscolite_acc_fold2_atac_counts.slurm
sbatch scripts/bpreveal_modiscolite_acc_fold2_atac_profile.slurm
sbatch scripts/bpreveal_modiscolite_acc_fold3_atac_counts.slurm
sbatch scripts/bpreveal_modiscolite_acc_fold3_atac_profile.slurm
sbatch scripts/bpreveal_modiscolite_bind_fold1_oct4_counts.slurm
sbatch scripts/bpreveal_modiscolite_bind_fold1_oct4_profile.slurm
sbatch scripts/bpreveal_modiscolite_bind_fold1_sox2_counts.slurm
sbatch scripts/bpreveal_modiscolite_bind_fold1_sox2_profile.slurm
sbatch scripts/bpreveal_modiscolite_bind_fold1_klf4_counts.slurm
sbatch scripts/bpreveal_modiscolite_bind_fold1_klf4_profile.slurm
sbatch scripts/bpreveal_modiscolite_bind_fold1_nanog_counts.slurm
sbatch scripts/bpreveal_modiscolite_bind_fold1_nanog_profile.slurm
sbatch scripts/bpreveal_modiscolite_bind_fold1_zic3_counts.slurm
sbatch scripts/bprevea

When processing the tfmodisco-lite `modisco.h5` output, note that the file is organized internally as follows:

modisco.h5: metacluster --> pattern_id -->
+ contrib_scores: CWM represented as a [30 x 4] array
+ hypothetical_contribs: hypothetical CWM represented as a [30 x 4] array
+ sequence: PPM represented as a [30 x 4] array
+ seqlets -->
    + sequence, contrib_scores, hypothetical_contribs in a [n_seqlets x 30 x 4] array 
    + is_revcomp, n_seqlets, start, end, example_idx (peak index) in a [n_seqlets] array

# CWM scan motifs

For these representations, we need to CWM-scan motifs based on our mapping criteria and seqlet allocations. 

In [3]:
# The motifScan configuration files are written under json/. Make sure the
# directory exists so a fresh checkout does not fail on the first open().
os.makedirs('json', exist_ok=True)

# For every model / fold / task / contribution mode: write a motifScan JSON
# configuration, wrap the motifScan.py call in a SLURM script (reusing
# `bash_header`), and print the matching `sbatch` command.
for k, v in acc_vs_bind_dict.items():
    prefix = v['prefix']
    tasks = v['tasks']
    bed_path_dict = v['bed_path_dict']
    for fold, bed_path in bed_path_dict.items():
        for task in tasks:
            for mode in ['counts', 'profile']:

                # raw_name keeps the doubled underscore that an empty fold label
                # produces; run_name collapses it. The shap/ and modiscolite/ paths
                # use the collapsed form, while the json/ and scripts/ file names
                # keep the raw form.
                # NOTE(review): the json/ and scripts/ names are left un-collapsed on
                # purpose here so existing artifact names do not change — confirm
                # whether they should be normalised as well.
                raw_name = f'{prefix}_{fold}_{task}_{mode}'
                run_name = raw_name.replace('__', '_')
                run_dir = f'modiscolite/{run_name}'
                contrib_h5 = f'shap/{run_name}.h5'

                motif_scan_dict = {
                    'seqlet-cutoff-settings': {
                        'quantile-json': f'json/motifSeqletCutoffs_{raw_name}_quantiles.json',
                        'modisco-h5': f'{run_dir}/modisco.h5',
                        'modisco-contrib-h5': contrib_h5,
                        'patterns': 'all',
                        'seq-match-quantile': None,
                        'contrib-match-quantile': 0.2,
                        'contrib-magnitude-quantile': 0.01,
                        'trim-threshold': 0.3,
                        'trim-padding': 1,
                        'background-probs': [0.3, 0.2, 0.2, 0.3], #derived from approximations of annotated GC content genome-wide
                        'seqlets-tsv': f'{run_dir}/seqlets.tsv',
                        'verbosity': 'WARNING'
                    },
                    'scan-settings': {
                        'scan-contrib-h5': contrib_h5,
                        'hits-tsv': f'{run_dir}/hits.tsv',
                        # presumably sized just under the 70 CPUs requested in bash_header — confirm
                        'num-threads': 68
                    },
                    'verbosity': 'WARNING'
                }

                # Serialise the configuration exactly as before (4-space indent).
                motif_scan_file = f'json/motifScan_{raw_name}.json'
                with open(motif_scan_file, 'w') as outfile:
                    json.dump(motif_scan_dict, outfile, indent=4)

                motif_scan = [f'{python_path} {bpreveal_path}/src/motifScan.py {motif_scan_file}']
                cmds = bash_header + motif_scan

                #Write the script
                script_path = f'scripts/bpreveal_motifscan_{k}_{fold}_{task}_{mode}.slurm'
                with open(script_path, mode='wt') as bash:
                    bash.write('\n'.join(cmds))
                    bash.write('\n')
                print(f'sbatch {script_path}')

sbatch scripts/bpreveal_motifscan_acc_fold1_atac_counts.slurm
sbatch scripts/bpreveal_motifscan_acc_fold1_atac_profile.slurm
sbatch scripts/bpreveal_motifscan_acc_fold2_atac_counts.slurm
sbatch scripts/bpreveal_motifscan_acc_fold2_atac_profile.slurm
sbatch scripts/bpreveal_motifscan_acc_fold3_atac_counts.slurm
sbatch scripts/bpreveal_motifscan_acc_fold3_atac_profile.slurm
sbatch scripts/bpreveal_motifscan_bind_fold1_oct4_counts.slurm
sbatch scripts/bpreveal_motifscan_bind_fold1_oct4_profile.slurm
sbatch scripts/bpreveal_motifscan_bind_fold1_sox2_counts.slurm
sbatch scripts/bpreveal_motifscan_bind_fold1_sox2_profile.slurm
sbatch scripts/bpreveal_motifscan_bind_fold1_klf4_counts.slurm
sbatch scripts/bpreveal_motifscan_bind_fold1_klf4_profile.slurm
sbatch scripts/bpreveal_motifscan_bind_fold1_nanog_counts.slurm
sbatch scripts/bpreveal_motifscan_bind_fold1_nanog_profile.slurm
sbatch scripts/bpreveal_motifscan_bind_fold1_zic3_counts.slurm
sbatch scripts/bpreveal_motifscan_bind_fold1_zic3_pr

## Derive quantile information on hits from seqlet distributions

It is helpful to quantify the relative importance or significance of a mapped motif hit based on the original TF-MoDISco seqlet distribution. 

In [4]:
# Only the 'acc' and 'bind' entries get quantile annotation. For each
# fold / task / mode, write a SLURM script that runs motifAddQuantiles.py on the
# run's seqlets.tsv and hits.tsv, then print the matching `sbatch` command.
for k in ['acc', 'bind']:
    v = acc_vs_bind_dict.get(k)
    prefix = v['prefix']
    tasks = v['tasks']
    bed_path_dict = v['bed_path_dict']
    for fold, bed_path in bed_path_dict.items():
        for task in tasks:
            for mode in ['counts', 'profile']:
                # Name the two paths once instead of re-formatting them per line.
                run_dir = f'modiscolite/{prefix}_{fold}_{task}_{mode}'
                script_path = f'scripts/bpreveal_motifquantiles_{k}_{fold}_{task}_{mode}.slurm'

                motif_quantiles = [
                    f'{python_path} {bpreveal_path}/src/motifAddQuantiles.py --verbose \\',
                    f'--seqlet-tsv {run_dir}/seqlets.tsv \\',
                    f'--scan-tsv {run_dir}/hits.tsv',
                ]
                cmds = bash_header + motif_quantiles

                # Write the script: newline-joined commands plus a trailing newline.
                with open(script_path, mode='wt') as script_file:
                    script_file.write('\n'.join(cmds) + '\n')
                print(f'sbatch {script_path}')

sbatch scripts/bpreveal_motifquantiles_acc_fold1_atac_counts.slurm
sbatch scripts/bpreveal_motifquantiles_acc_fold1_atac_profile.slurm
sbatch scripts/bpreveal_motifquantiles_acc_fold2_atac_counts.slurm
sbatch scripts/bpreveal_motifquantiles_acc_fold2_atac_profile.slurm
sbatch scripts/bpreveal_motifquantiles_acc_fold3_atac_counts.slurm
sbatch scripts/bpreveal_motifquantiles_acc_fold3_atac_profile.slurm
sbatch scripts/bpreveal_motifquantiles_bind_fold1_oct4_counts.slurm
sbatch scripts/bpreveal_motifquantiles_bind_fold1_oct4_profile.slurm
sbatch scripts/bpreveal_motifquantiles_bind_fold1_sox2_counts.slurm
sbatch scripts/bpreveal_motifquantiles_bind_fold1_sox2_profile.slurm
sbatch scripts/bpreveal_motifquantiles_bind_fold1_klf4_counts.slurm
sbatch scripts/bpreveal_motifquantiles_bind_fold1_klf4_profile.slurm
sbatch scripts/bpreveal_motifquantiles_bind_fold1_nanog_counts.slurm
sbatch scripts/bpreveal_motifquantiles_bind_fold1_nanog_profile.slurm
sbatch scripts/bpreveal_motifquantiles_bind_f