# Introduction

Get contribution score for tead single and tead double motif sequences patterns that were part of affinity plot in analysis 8_

# Computational setup

## Environment

In [1]:
import warnings
warnings.filterwarnings("ignore")
from tensorflow.python.util import deprecation
deprecation._PRINT_DEPRECATION_WARNINGS = False

#Packages
import os
import sys
import pandas as pd
import numpy as np
import plotnine
import seaborn as sns
from pybedtools import BedTool
from tqdm import tqdm
from bpnet.utils import create_tf_session
from bpnet.BPNet import BPNetSeqModel
from bpnet.simulate import motif_coords
from bpnet.preproc import resize_interval
from concise.preprocessing import encodeDNA
from bpnet.extractors import extract_seq
from plotnine import *
from Bio.Seq import Seq

# Settings
os.chdir(f'/n/projects/kd2200/publication/bpnet/analysis/')
pd.set_option('display.max_columns', 100)
create_tf_session('0',1)

# Custom functions
sys.path.insert(0, f'/n/projects/kd2200/publication/bpnet/analysis/scripts/bpnet/scripts/')
from perturb_functions import insert_motif, one_hot_decode_sequence, one_hot_encode_sequences
from motif_functions import remove_palindromic_motif_duplicates
from data_format_functions import tidy_bpnet_predictions_nexus,tidy_bpnet_contributions
from motifs import extract_seqs_from_df


Using TensorFlow backend.






2024-01-13 19:54:19,192 [INFO] Note: detected 80 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
2024-01-13 19:54:19,193 [INFO] Note: NumExpr detected 80 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2024-01-13 19:54:19,195 [INFO] NumExpr defaulting to 8 threads.












2024-01-13 19:54:25,001 [INFO] Failed to extract font properties from /usr/share/fonts/google-noto-emoji/NotoColorEmoji.ttf: In FT2Font: Can not load face.  Unknown file format.


## Variables

In [38]:
%matplotlib inline

# Pre-existing variables
fasta_file = f'/n/projects/kd2200/publication/bpnet/fasta/mm10.fa'
model_dir = f'/n/projects/kd2200/publication/bpnet/model/dataspec.yaml_default_fold_5/'

# Independent variables
#tead double motif patterns
#tead4_motifs_path = f'tsv/tdbl_motifs_overlap_with_affinity_seq_1based.tsv.gz'

#tead single motif patterns
tead4_motifs_path = f'tsv/td4_single_motifs_overlap_with_affinity_seq_1based.tsv.gz'

## Load BPNet model

In [9]:
model = BPNetSeqModel.from_mdir(model_dir)

# Analysis code
## Import Tead single or double motifs (one pattern set at a time)

In [39]:
motifs_df = pd.read_csv(tead4_motifs_path, sep = '\t')
motifs_df['seq'] = motifs_df['sequence']
motifs_df

Unnamed: 0,seqnames,start,end,width,strand,pattern,contrib_weighted_p,match_weighted_p,example_idx,pattern_start,pattern_end,pattern_center,pattern_len,match_weighted,match_weighted_cat,match_max,match_max_task,contrib_weighted,contrib_weighted_cat,contrib_max,contrib_max_task,seq_match,seq_match_p,seq_match_cat,match.tead4,contrib.tead4,pattern_short,example_start,example_end,example_strand,example_interval_from_task,row_idx,sequence,signal,seq
0,chr3,88235936,88235944,9,+,metacluster_0/pattern_0,0.988075,0.984224,130732,498,507,503,9,0.821801,high,0.821801,tead4,4.207777,high,4.207777,tead4,9.816225,0.839503,high,0.821801,4.207777,m0_p0,88235437,88236437,.,tead4,72910,ACATTCCTG,3.472007,ACATTCCTG
1,chr11,60253759,60253767,9,-,metacluster_0/pattern_0,1.000000,0.982857,8080,408,417,412,9,0.819200,high,0.819200,tead4,5.259412,high,5.259412,tead4,10.154043,1.000000,high,0.819200,5.259412,m0_p0,60253350,60254350,.,cdx2,4132,ACATTCCAG,3.219776,ACATTCCAG
2,chr11,60253759,60253767,9,-,metacluster_0/pattern_0,1.000000,0.982484,62925,538,547,542,9,0.818891,high,0.818891,tead4,5.298977,high,5.298977,tead4,10.154042,1.000000,high,0.818891,5.298977,m0_p0,60253220,60254220,.,tfap2c,31356,ACATTCCAG,3.219776,ACATTCCAG
3,chr11,60253759,60253767,9,-,metacluster_0/pattern_0,1.000000,0.982857,114969,481,490,485,9,0.819240,high,0.819240,tead4,5.264923,high,5.264923,tead4,10.154042,1.000000,high,0.819240,5.264923,m0_p0,60253277,60254277,.,tead4,58562,ACATTCCAG,3.219776,ACATTCCAG
4,chr11,60253759,60253767,9,-,metacluster_0/pattern_0,1.000000,0.982857,149011,350,359,354,9,0.819344,high,0.819344,tead4,5.263971,high,5.263971,tead4,10.154042,1.000000,high,0.819344,5.263971,m0_p0,60253408,60254408,.,yap1,88526,ACATTCCAG,3.219776,ACATTCCAG
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67236,chr5,73979778,73979786,9,-,metacluster_0/pattern_0,0.481366,0.661863,180384,371,380,375,9,0.738766,high,0.738766,tead4,1.829661,medium,1.829661,tead4,8.911318,0.604969,medium,0.738766,1.829661,m0_p0,73979406,73980406,.,gata3,103307,GAATTCCAG,0.000000,GAATTCCAG
67237,chrX,37754710,37754718,9,+,metacluster_0/pattern_0,0.994037,0.673168,187578,823,832,828,9,0.741360,high,0.741360,tead4,4.384552,high,4.384552,tead4,9.816225,0.839503,high,0.741360,4.384552,m0_p0,37753886,37754886,.,gata3,106580,ACATTCCTG,0.000000,ACATTCCTG
67238,chr11,95859392,95859400,9,-,metacluster_0/pattern_0,0.230311,0.204099,59468,464,473,468,9,0.629860,low,0.629860,tead4,1.129133,low,1.129133,tead4,6.996727,0.215155,low,0.629860,1.129133,m0_p0,95858927,95859927,.,tfap2c,29832,GAATTCCAA,-0.056048,GAATTCCAA
67239,chr11,95859392,95859400,9,-,metacluster_0/pattern_0,0.237640,0.204099,148968,530,539,534,9,0.629836,low,0.629836,tead4,1.145555,low,1.145555,tead4,6.996727,0.215155,low,0.629836,1.145555,m0_p0,95858861,95859861,.,yap1,88500,GAATTCCAA,-0.056048,GAATTCCAA


## denote coordinate based on eith Tead single or double motifs

In [40]:
#tead double motif patterns
#sites = (0,17)
#motifs_df['sites_seq'] = [s[sites[0]:sites[1]] for s in motifs_df.seq.values]

#tead single motif patterns
sites = (0,10)
motifs_df['sites_seq'] = [s[sites[0]:sites[1]] for s in motifs_df.seq.values]

In [41]:
motifs_df

Unnamed: 0,seqnames,start,end,width,strand,pattern,contrib_weighted_p,match_weighted_p,example_idx,pattern_start,pattern_end,pattern_center,pattern_len,match_weighted,match_weighted_cat,match_max,match_max_task,contrib_weighted,contrib_weighted_cat,contrib_max,contrib_max_task,seq_match,seq_match_p,seq_match_cat,match.tead4,contrib.tead4,pattern_short,example_start,example_end,example_strand,example_interval_from_task,row_idx,sequence,signal,seq,sites_seq
0,chr3,88235936,88235944,9,+,metacluster_0/pattern_0,0.988075,0.984224,130732,498,507,503,9,0.821801,high,0.821801,tead4,4.207777,high,4.207777,tead4,9.816225,0.839503,high,0.821801,4.207777,m0_p0,88235437,88236437,.,tead4,72910,ACATTCCTG,3.472007,ACATTCCTG,ACATTCCTG
1,chr11,60253759,60253767,9,-,metacluster_0/pattern_0,1.000000,0.982857,8080,408,417,412,9,0.819200,high,0.819200,tead4,5.259412,high,5.259412,tead4,10.154043,1.000000,high,0.819200,5.259412,m0_p0,60253350,60254350,.,cdx2,4132,ACATTCCAG,3.219776,ACATTCCAG,ACATTCCAG
2,chr11,60253759,60253767,9,-,metacluster_0/pattern_0,1.000000,0.982484,62925,538,547,542,9,0.818891,high,0.818891,tead4,5.298977,high,5.298977,tead4,10.154042,1.000000,high,0.818891,5.298977,m0_p0,60253220,60254220,.,tfap2c,31356,ACATTCCAG,3.219776,ACATTCCAG,ACATTCCAG
3,chr11,60253759,60253767,9,-,metacluster_0/pattern_0,1.000000,0.982857,114969,481,490,485,9,0.819240,high,0.819240,tead4,5.264923,high,5.264923,tead4,10.154042,1.000000,high,0.819240,5.264923,m0_p0,60253277,60254277,.,tead4,58562,ACATTCCAG,3.219776,ACATTCCAG,ACATTCCAG
4,chr11,60253759,60253767,9,-,metacluster_0/pattern_0,1.000000,0.982857,149011,350,359,354,9,0.819344,high,0.819344,tead4,5.263971,high,5.263971,tead4,10.154042,1.000000,high,0.819344,5.263971,m0_p0,60253408,60254408,.,yap1,88526,ACATTCCAG,3.219776,ACATTCCAG,ACATTCCAG
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67236,chr5,73979778,73979786,9,-,metacluster_0/pattern_0,0.481366,0.661863,180384,371,380,375,9,0.738766,high,0.738766,tead4,1.829661,medium,1.829661,tead4,8.911318,0.604969,medium,0.738766,1.829661,m0_p0,73979406,73980406,.,gata3,103307,GAATTCCAG,0.000000,GAATTCCAG,GAATTCCAG
67237,chrX,37754710,37754718,9,+,metacluster_0/pattern_0,0.994037,0.673168,187578,823,832,828,9,0.741360,high,0.741360,tead4,4.384552,high,4.384552,tead4,9.816225,0.839503,high,0.741360,4.384552,m0_p0,37753886,37754886,.,gata3,106580,ACATTCCTG,0.000000,ACATTCCTG,ACATTCCTG
67238,chr11,95859392,95859400,9,-,metacluster_0/pattern_0,0.230311,0.204099,59468,464,473,468,9,0.629860,low,0.629860,tead4,1.129133,low,1.129133,tead4,6.996727,0.215155,low,0.629860,1.129133,m0_p0,95858927,95859927,.,tfap2c,29832,GAATTCCAA,-0.056048,GAATTCCAA,GAATTCCAA
67239,chr11,95859392,95859400,9,-,metacluster_0/pattern_0,0.237640,0.204099,148968,530,539,534,9,0.629836,low,0.629836,tead4,1.145555,low,1.145555,tead4,6.996727,0.215155,low,0.629836,1.145555,m0_p0,95858861,95859861,.,yap1,88500,GAATTCCAA,-0.056048,GAATTCCAA,GAATTCCAA


## Group results based on pattern match summaries.

In [42]:
summarized_tead4_sites_df = motifs_df.groupby(['sites_seq']).size().reset_index()
summarized_tead4_sites_df.columns = ['sites_seq', 'frequency']
summarized_tead4_sites_df = summarized_tead4_sites_df.sort_values('frequency', ascending = False)
summarized_tead4_sites_df = summarized_tead4_sites_df.reset_index()
summarized_tead4_sites_df

Unnamed: 0,index,sites_seq,frequency
0,4,ACATTCCAG,7804
1,10,ACATTCCTG,6024
2,1,AAATTCCTG,5668
3,17,GCATTCCAG,4830
4,0,AAATTCCAG,4293
5,19,GCATTCCTG,4026
6,9,ACATTCCTC,3758
7,15,GAATTCCAG,3354
8,2,ACATTCCAA,3180
9,11,ACATTCCTT,2690


## Get contributions in silico for motifs. 

1. For each motifs
2. Import random sequences.
3. Inject each motifs in random seq.
4. Get predict_all across all trials
5. Average contribution across appropriate areas for trial for all the motifs

In [43]:
trials = 64
random_seqs = np.load('/n/projects/mw2098/analysis/chrombpnet/models/atac/mm10/data/pkl/random_seqs_seed_10_trials_256_array.npz')['seqs_1he'][:trials]
flanks = (random_seqs.shape[1]-1000)//2
random_seqs = random_seqs[:, flanks:(1000+flanks)]
random_seqs.shape

FileNotFoundError: [Errno 2] No such file or directory: '/n/projects/mw2098/analysis/chrombpnet/models/atac/mm10/data/pkl/random_seqs_seed_10_trials_256_array.npz'

In [None]:
## functions for generating random sequences & encoding and decoding in 1he format

# seq in form of ACGT letters
def generate_random_seq(seqlen, weights = [.25, .25, .25, .25]):
    """
    Purpose: Generate a random DNA sequence of a specified length.
    """
    import random
    seq = random.choices(['A','C','G','T'], weights = weights, k=seqlen)
    return(''.join(seq))

# 1he seq
def one_hot_encode_sequences(sequences):
    """
    Purpose: Given an array of sequences, one-hot-encode into a [region x position x 4] array.
    """
    return(np.stack([one_hot_encode_sequence(s) for s in sequences]))


def one_hot_encode_sequence(sequence):
    """
    Kudos to Charles: /n/projects/cm2363/bpnet-nucleosomes/work/localimportance/allLocalImportances.py
    Purpose: Given a SINGLE sequence string, one-hot encode the data.
        + default control_profiles and control_logcounts is to be zeroed out
        + naively detects whether the sequence is one-hot-encoded.
    """
    onehot_mapping = {
    'A': [1,0,0,0],
    'C': [0,1,0,0],
    'G': [0,0,1,0],
    'T': [0,0,0,1],
    'a': [1,0,0,0],
    'c': [0,1,0,0],
    'g': [0,0,1,0],
    't': [0,0,0,1],
    'N': [0,0,0,0]
    }
    return np.array([onehot_mapping[x] for x in sequence])


def one_hot_decode_sequence(array):
    """
    Purpose: Given an array [position x 4], decode sequence to a string.
    """
    onehot_decoder = {
    0: 'A',
    1: 'C',
    2: 'G',
    3: 'T'
    }

    idxs = np.where(array)[1]
    return (''.join([onehot_decoder[i] for i in idxs]))

In [44]:
# generate 64 random seq
import random
random.seed(10)
trials = 64
random_seqs = [generate_random_seq(seqlen = 1000) for i in range(trials)]

In [45]:
# make it 1he
random_seqs_1he =one_hot_encode_sequences(random_seqs)
flanks = (random_seqs_1he.shape[1]-1000)//2
random_seqs_1he = random_seqs_1he[:, flanks:(1000+flanks)]
random_seqs_1he.shape

(64, 1000, 4)

In [46]:
combination_index_df = pd.DataFrame()
combination_w_trials_list = []

for i,row in tqdm(summarized_tead4_sites_df.iterrows()):
        
    #define half sites
    sites = row.sites_seq
    
    #Create sequences
    for j,s in enumerate(random_seqs_1he):
        seq = one_hot_decode_sequence(s)
        seq_w_motif = insert_motif(seq = seq, motif = sites, position = 500)
        combination_w_trials_list.append(seq_w_motif)
        row['trial_index'] = j
        row['combo_index'] = i
        combination_index_df = combination_index_df.append(row)

#Create row index that combines trial index and ocmbo index and matches index of contrib predictions
combination_index_df['row_index'] = (combination_index_df.combo_index * 64 + combination_index_df.trial_index).astype(int)

21it [00:04,  4.62it/s]


In [47]:
len(combination_w_trials_list)

1344

In [48]:
%%script false --no-raise-error

import pickle

#One hot encode sequences
combination_w_trials_seq_1he = one_hot_encode_sequences(combination_w_trials_list)

#Predict all sequences
combinations_w_trials_pred_dict = model.predict_all(combination_w_trials_seq_1he, contrib_method='deeplift')

with open('tsv/td4_single_motifs_overlap_with_affinity_seq.pkl', 'wb') as f:
    pickle.dump(combinations_w_trials_pred_dict, f)
    

# import pkl file

In [17]:
# tead double motif patterns
# import pickle
# with open('tsv/tdbl_motifs_overlap_with_affinity_seq.pkl', 'rb') as f:
#     combinations_w_trials_pred_dict = pickle.load(f)

# tead single motif patterns 
import pickle
with open('tsv/td4_single_motifs_overlap_with_affinity_seq.pkl', 'rb') as f:
    combinations_w_trials_pred_dict = pickle.load(f)


In [49]:
def one_hot_decode_sequence(array):
    """
    Purpose: Given an array [position x 4], decode sequence to a string.
    """
    onehot_decoder = {
    0: 'A',
    1: 'C',
    2: 'G',
    3: 'T'
    }

    idxs = np.where(array)[1]
    return (''.join([onehot_decoder[i] for i in idxs]))

In [50]:

#sanity check for tead4 double
# xx =combinations_w_trials_pred_dict[1]['contrib_score']['tead4/profile'][492:509]* combinations_w_trials_pred_dict[1]['seq'][492:509]
# DF = pd.DataFrame(xx)
# DF

#sanity check for tead4 single
xx =combinations_w_trials_pred_dict[1]['contrib_score']['tead4/profile'][496:505]* combinations_w_trials_pred_dict[1]['seq'][496:505]
DF = pd.DataFrame(xx)
DF

Unnamed: 0,0,1,2,3
0,0.080321,-0.0,0.0,-0.0
1,0.0,0.118716,-0.0,-0.0
2,0.305941,-0.0,-0.0,-0.0
3,-0.0,-0.0,-0.0,0.392919
4,0.0,-0.0,-0.0,0.188562
5,-0.0,0.177476,-0.0,-0.0
6,-0.0,0.237509,-0.0,-0.0
7,-0.148166,-0.0,0.0,-0.0
8,-0.0,0.0,0.018731,-0.0


In [51]:
# for tead4 double
# contrib_by_seq = []
# for j,i in tqdm(enumerate(range(0, len(combinations_w_trials_pred_dict), trials))):
#     contrib_by_trial = []
#     for k in range(trials):
#          #Collect contribution scores under proper coordinates for all trials 
#         contrib = [(combinations_w_trials_pred_dict[i+k]['contrib_score']['tead4/profile'][492:509] * combinations_w_trials_pred_dict[i+k]['seq'][492:509])]
#         contrib_by_trial.append(contrib)
#     contrib_avg = np.mean(np.squeeze(np.array(contrib_by_trial)), axis = 0)
#     contrib_by_seq.append([contrib_avg])
# contrib_by_seqs = np.squeeze(np.array(contrib_by_seq))
# cwm = np.mean(contrib_by_seqs, axis = 0)

# cwm_for_plotting =pd.DataFrame(cwm, columns = ['A','C','G','T']).transpose()


# for tead4 single
contrib_by_seq = []
for j,i in tqdm(enumerate(range(0, len(combinations_w_trials_pred_dict), trials))):
    contrib_by_trial = []
    for k in range(trials):
         #Collect contribution scores under proper coordinates for all trials 
        contrib = [(combinations_w_trials_pred_dict[i+k]['contrib_score']['tead4/profile'][496:505] * combinations_w_trials_pred_dict[i+k]['seq'][496:505])]
        contrib_by_trial.append(contrib)
    contrib_avg = np.mean(np.squeeze(np.array(contrib_by_trial)), axis = 0)
    contrib_by_seq.append([contrib_avg])
contrib_by_seqs = np.squeeze(np.array(contrib_by_seq))
cwm = np.mean(contrib_by_seqs, axis = 0)

cwm_for_plotting =pd.DataFrame(cwm, columns = ['A','C','G','T']).transpose()

21it [00:00, 2697.14it/s]


In [52]:
contrib_by_seq[0]

[array([[ 0.11514929,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.14759827,  0.        ,  0.        ],
        [ 0.41881018,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.51584648],
        [ 0.        ,  0.        ,  0.        ,  0.21992702],
        [ 0.        ,  0.25915034,  0.        ,  0.        ],
        [ 0.        ,  0.28805769,  0.        ,  0.        ],
        [-0.13116744,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.0708409 ,  0.        ]])]

In [53]:
contrib_by_seq[1]

[array([[ 0.10410749,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.12404795,  0.        ,  0.        ],
        [ 0.37628353,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.44455924],
        [ 0.        ,  0.        ,  0.        ,  0.19566168],
        [ 0.        ,  0.21734186,  0.        ,  0.        ],
        [ 0.        ,  0.17162083,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , -0.11619184],
        [ 0.        ,  0.        ,  0.06157233,  0.        ]])]

In [54]:
#saved cwm matrix as tsv

#cwm_for_plotting.to_csv('tsv/tead4_double_motif_contrib_forpatterns.tsv', sep = '\t', index = False)
cwm_for_plotting.to_csv('tsv/tead4_single_motif_contrib_forpatterns.tsv', sep = '\t', index = False)

In [55]:
#saved summary as csv

#summarized_tead4_sites_df.to_csv('tsv/tead4_double_motif_freq_forpatterns.csv', sep = '\t', index = False)
summarized_tead4_sites_df.to_csv('tsv/tead4_single_motif_freq_forpatterns.csv', sep = '\t', index = False)

## Below code was run in R using library(ggseqlogo)

In [None]:
#import tsv
tdbl <- read_tsv("tsv/tead4_double_motif_contrib_forpatterns.tsv",col_names = TRUE)
td4 <- read_tsv("tsv/tead4_single_motif_contrib_forpatterns.tsv",col_names = TRUE)

#make df
tdbl <- as.data.frame(tdbl)
td4 <- as.data.frame(td4)

#make matrix
#rownames(tdbl)<- c("A","C","G","T")
#rownames(td4)<- c("A","C","G","T")
tdbl <- as.matrix.data.frame(tdbl)
td4 <- as.matrix.data.frame(td4)

#plot
g_tdbl <- ggseqlogo(tdbl, method='custom', seq_type='dna',scales = 'none') + ylab('CWM')
g_td4 <- ggseqlogo(td4, method='custom', seq_type='dna',scales = 'none') + ylab('CWM')

# save plot
ggsave("figures/9_tead4_double_motif_analysis/cwm_tead4_double_patterns_contrib.pdf",g_tdbl, height = 4, width = 6)
ggsave("figures/9_tead4_double_motif_analysis/cwm_tead4_single_patterns_contrib.pdf",g_td4, height = 4, width = 6)

In [56]:
cwm_for_plotting

Unnamed: 0,0,1,2,3,4,5,6,7,8
A,0.046283,0.00094,0.292517,0.0,0.0,0.0,0.0,-0.0448,-0.017395
C,0.0,0.063863,0.0,0.0,0.0,0.147612,0.141794,-0.00552,-0.006434
G,0.023388,-0.014333,0.0,0.0,0.0,0.0,0.0,0.0,0.024126
T,0.0,0.0,0.0,0.339111,0.133347,0.0,0.0,-0.034902,-0.012882
