# Introduction

The vmatch pattern of two Tead motifs generated in Rmd (9_tead4_double_motif_analysis) is used to predict Tead4 binding and averaged it

# Computational setup

## Environment

In [1]:
import warnings
warnings.filterwarnings("ignore")
from tensorflow.python.util import deprecation
deprecation._PRINT_DEPRECATION_WARNINGS = False

#Packages
import os
import sys
import pandas as pd
import numpy as np
import plotnine
import seaborn as sns
from pybedtools import BedTool
from tqdm import tqdm
from bpnet.utils import create_tf_session
from bpnet.BPNet import BPNetSeqModel
from bpnet.simulate import motif_coords
from bpnet.preproc import resize_interval
from concise.preprocessing import encodeDNA
from bpnet.extractors import extract_seq
from plotnine import *
from Bio.Seq import Seq

# Settings
os.chdir(f'/n/projects/kd2200/publication/bpnet/analysis/')
pd.set_option('display.max_columns', 100)
create_tf_session('0',1)

# Custom functions
sys.path.insert(0, f'/n/projects/kd2200/publication/bpnet/analysis/scripts/bpnet/scripts')
from perturb_functions import insert_motif, one_hot_decode_sequence, one_hot_encode_sequences,one_hot_encode_sequence
from motif_functions import remove_palindromic_motif_duplicates
from data_format_functions import tidy_bpnet_predictions_nexus,tidy_bpnet_contributions
sys.path.insert(0, f'/n/projects/kd2200/publication/bpnet/analysis/scripts/py')
from motifs import extract_seqs_from_df

Using TensorFlow backend.
2024-01-28 17:15:54,083 [INFO] Note: detected 80 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
2024-01-28 17:15:54,086 [INFO] Note: NumExpr detected 80 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2024-01-28 17:15:54,087 [INFO] NumExpr defaulting to 8 threads.
2024-01-28 17:15:59,765 [INFO] Failed to extract font properties from /usr/share/fonts/google-noto-emoji/NotoColorEmoji.ttf: In FT2Font: Can not load face.  Unknown file format.


In [95]:
%matplotlib inline

# Pre-existing variables
fasta_file = f'/n/projects/kd2200/publication/bpnet/fasta/mm10.fa'
model_dir = f'/n/projects/kd2200/publication/bpnet/model/dataspec.yaml_default_fold_5/'

# Independent variables
#tead4_motifs_path = f'tsv/combine_double_motifs_diffspacing.tsv.gz'
#tead4_motifs_path = f'tsv/zero_double_motifs_spacing.tsv.gz'
#tead4_motifs_path = f'tsv/one_double_motifs_spacing.tsv.gz'
#tead4_motifs_path = f'tsv/two_double_motifs_spacing.tsv.gz'
#tead4_motifs_path = f'tsv/three_double_motifs_spacing.tsv.gz'
#tead4_motifs_path = f'tsv/four_double_motifs_spacing.tsv.gz'
#tead4_motifs_path = f'tsv/five_double_motifs_spacing.tsv.gz'
tead4_motifs_path = f'tsv/six_double_motifs_spacing.tsv.gz'


In [96]:
model = BPNetSeqModel.from_mdir(model_dir)

In [97]:
motifs_df = pd.read_csv(tead4_motifs_path, sep = '\t')
motifs_df.shape
motifs_df  

Unnamed: 0,seqnames,start,end,width,strand,DNA,at_space,pattern_short
0,chr1,3646676,3646698,23,+,GAATTCCAGTGGTACATTCCCTC,six,m6_p0
1,chr1,4125620,4125642,23,+,AAATTCCTGTTCTAAATTCCTGT,six,m6_p0
2,chr1,8662812,8662834,23,+,GAATTCCTGCCCTGAATTCCCAT,six,m6_p0
3,chr1,11045119,11045141,23,+,ACATTCCTGTGTTTAATTCCTTT,six,m6_p0
4,chr1,12658340,12658362,23,+,ACATTCCAGAAATATATTCCCTG,six,m6_p0
...,...,...,...,...,...,...,...,...
1038,chrX,153894017,153894039,23,-,ACATTCCTAGGATACATTCCTAG,six,m6_p0
1039,chrX,163440930,163440952,23,-,GAATTCCAAGCACCTATTCCAAC,six,m6_p0
1040,chrX,164097949,164097971,23,-,ACATTCCAAAGGTGAATTCCTCA,six,m6_p0
1041,chrY,2373413,2373435,23,+,AAATTCCAAGCATATATTCCTTG,six,m6_p0


In [98]:
motifs_df['fwd_seq'] = extract_seqs_from_df(coords_df = motifs_df, fasta_path = fasta_file, chrom_column = 'seqnames', start_column = 'start', end_column = 'end')

In [99]:
motifs_oriented_df = pd.DataFrame()
for i,row in tqdm(motifs_df.iterrows()):
    if row.strand=='-':
        fwd_seq = Seq(row.fwd_seq)
        rev_seq = str(fwd_seq.reverse_complement())
        row['seq'] = rev_seq
    else:
        row['seq'] = row.fwd_seq
    motifs_oriented_df = motifs_oriented_df.append(row)

1043it [00:05, 188.90it/s]


In [100]:
motifs_oriented_df["row_id"]=motifs_oriented_df.index

In [101]:
motifs_oriented_df

Unnamed: 0,DNA,at_space,end,fwd_seq,pattern_short,seq,seqnames,start,strand,width,row_id
0,GAATTCCAGTGGTACATTCCCTC,six,3646698.0,AATTCCAGTGGTACATTCCCTC,m6_p0,AATTCCAGTGGTACATTCCCTC,chr1,3646676.0,+,23.0,0
1,AAATTCCTGTTCTAAATTCCTGT,six,4125642.0,AATTCCTGTTCTAAATTCCTGT,m6_p0,AATTCCTGTTCTAAATTCCTGT,chr1,4125620.0,+,23.0,1
2,GAATTCCTGCCCTGAATTCCCAT,six,8662834.0,AATTCCTGCCCTGAATTCCCAT,m6_p0,AATTCCTGCCCTGAATTCCCAT,chr1,8662812.0,+,23.0,2
3,ACATTCCTGTGTTTAATTCCTTT,six,11045141.0,CATTCCTGTGTTTAATTCCTTT,m6_p0,CATTCCTGTGTTTAATTCCTTT,chr1,11045119.0,+,23.0,3
4,ACATTCCAGAAATATATTCCCTG,six,12658362.0,CATTCCAGAAATATATTCCCTG,m6_p0,CATTCCAGAAATATATTCCCTG,chr1,12658340.0,+,23.0,4
...,...,...,...,...,...,...,...,...,...,...,...
1038,ACATTCCTAGGATACATTCCTAG,six,153894039.0,TAGGAATGTATCCTAGGAATGT,m6_p0,ACATTCCTAGGATACATTCCTA,chrX,153894017.0,-,23.0,1038
1039,GAATTCCAAGCACCTATTCCAAC,six,163440952.0,TTGGAATAGGTGCTTGGAATTC,m6_p0,GAATTCCAAGCACCTATTCCAA,chrX,163440930.0,-,23.0,1039
1040,ACATTCCAAAGGTGAATTCCTCA,six,164097971.0,GAGGAATTCACCTTTGGAATGT,m6_p0,ACATTCCAAAGGTGAATTCCTC,chrX,164097949.0,-,23.0,1040
1041,AAATTCCAAGCATATATTCCTTG,six,2373435.0,AATTCCAAGCATATATTCCTTG,m6_p0,AATTCCAAGCATATATTCCTTG,chrY,2373413.0,+,23.0,1041


In [102]:
## functions for generating random sequences & encoding and decoding from 1he format

# seq in form of ACGT letters
def generate_random_seq(seqlen, weights = [.25, .25, .25, .25]):
    """
    Purpose: Generate a random DNA sequence of a specified length.
    """
    import random
    seq = random.choices(['A','C','G','T'], weights = weights, k=seqlen)
    return(''.join(seq))

# 1he seq
def one_hot_encode_sequences(sequences):
    """
    Purpose: Given an array of sequences, one-hot-encode into a [region x position x 4] array.
    """
    return(np.stack([one_hot_encode_sequence(s) for s in sequences]))


def one_hot_encode_sequence(sequence):
    """
    Kudos to Charles: /n/projects/cm2363/bpnet-nucleosomes/work/localimportance/allLocalImportances.py
    Purpose: Given a SINGLE sequence string, one-hot encode the data.
        + default control_profiles and control_logcounts is to be zeroed out
        + naively detects whether the sequence is one-hot-encoded.
    """
    onehot_mapping = {
    'A': [1,0,0,0],
    'C': [0,1,0,0],
    'G': [0,0,1,0],
    'T': [0,0,0,1],
    'a': [1,0,0,0],
    'c': [0,1,0,0],
    'g': [0,0,1,0],
    't': [0,0,0,1],
    'N': [0,0,0,0]
    }
    return np.array([onehot_mapping[x] for x in sequence])

def one_hot_decode_sequence(array):
    """
    Purpose: Given an array [position x 4], decode sequence to a string.
    """
    onehot_decoder = {
    0: 'A',
    1: 'C',
    2: 'G',
    3: 'T'
    }

    idxs = np.where(array)[1]
    return (''.join([onehot_decoder[i] for i in idxs]))

In [103]:
# generate 64 random seq
import random
random.seed(10)
trials = 64
random_seqs = [generate_random_seq(seqlen = 1000) for i in range(trials)]

In [104]:
# make it 1he
random_seqs_1he =one_hot_encode_sequences(random_seqs)
flanks = (random_seqs_1he.shape[1]-1000)//2
random_seqs_1he = random_seqs_1he[:, flanks:(1000+flanks)]
random_seqs_1he.shape

(64, 1000, 4)

In [105]:
## get random seq decoded as list
seqs = []
for j,s in enumerate(random_seqs_1he):
    seqs.append(one_hot_decode_sequence(s))

In [106]:
def collect_summarized_counts(motifs_oriented_df, random_seqs, tasks_of_interest, motif_window, measurement_window):

    """
    Purpose: Given WT genomic tead4 spaced seqs -> predict binding and average it
    Inputs:
    Outputs:
    """
    
    from tqdm import tqdm
    from concise.preprocessing import encodeDNA
    def generate_seq(random_seq, central_motif, side_motif=None, side_distances=[], seqlen=1000):
        from bpnet.simulate import insert_motif
        injected_seq = insert_motif(random_seq, central_motif, seqlen // 2)
        for d in side_distances:
            injected_seq = insert_motif(injected_seq, side_motif, d)
        return injected_seq
    
    summary_for_all_combos_df = pd.DataFrame()
    for i,row in tqdm(motifs_oriented_df.iterrows()):
        
        #define half sites
        first_half_site = row.seq
        second_half_site = row.seq
        combo_index = row.row_id

        #center_injection_site = (motif_window[1]-motif_window[0])//2 + motif_window[0]
        #Generate sequences with variant states
        six_seqs = [generate_seq(random_seq = i, central_motif = first_half_site, side_motif = '', 
                                  side_distances = []) for i in random_seqs]


        #Collect into dictionary
        seq_dict = {'six': six_seqs}

        #Predict each variant state({inj_state -> {task -> [64 x 1000 x 2]}))
        preds_dict = {k: model.predict(encodeDNA(v)) for k,v in seq_dict.items()}
        
        # Average across trials and measurement window({inj_state -> {task -> #}})
        preds_single_dict = {inj_state: {task: np.mean(np.sum(v1[:, measurement_window[0]:measurement_window[1], :], axis = (1,2))) 
                     for task,v1 in v.items()} 
         for inj_state,v in preds_dict.items()}

        #Summarize counts together pd.df based on predictions
        summary_df = pd.DataFrame()
        for inj_state, v in preds_single_dict.items():
            df = pd.DataFrame.from_dict(v, orient = 'index').transpose()
            df['inj_state'] = inj_state
            df = df.melt(id_vars = ['inj_state'], var_name = 'task', value_name = 'counts')
            summary_df = summary_df.append(df)
        summary_df['first_half_site'] = first_half_site
        summary_df['combo_index'] = combo_index
        summary_for_all_combos_df = summary_for_all_combos_df.append(summary_df)
    return(summary_for_all_combos_df)

In [107]:
counts_df = collect_summarized_counts(motifs_oriented_df=motifs_oriented_df,random_seqs = seqs, 
                                   tasks_of_interest = ['tead4'],
                                   motif_window = (488, 514), measurement_window = (470, 530))
counts_df

1043it [01:05, 15.86it/s]


Unnamed: 0,inj_state,task,counts,first_half_site,combo_index
0,six,cdx2,3.263674,AATTCCAGTGGTACATTCCCTC,0
1,six,tfap2c,1.276643,AATTCCAGTGGTACATTCCCTC,0
2,six,tead4,6.459865,AATTCCAGTGGTACATTCCCTC,0
3,six,yap1,1.666966,AATTCCAGTGGTACATTCCCTC,0
4,six,gata3,0.991754,AATTCCAGTGGTACATTCCCTC,0
...,...,...,...,...,...
0,six,cdx2,3.387744,ACATTCCTACTCCCAATTCCTG,1042
1,six,tfap2c,1.270150,ACATTCCTACTCCCAATTCCTG,1042
2,six,tead4,4.661751,ACATTCCTACTCCCAATTCCTG,1042
3,six,yap1,1.420984,ACATTCCTACTCCCAATTCCTG,1042


In [108]:
# subset the tead4 task for averaging and used this values to plot in R 
x = counts_df[counts_df['task'] == "tead4"]
df = x.groupby(['task']).agg({'counts' : 'mean'}).reset_index()

In [16]:
df_zero

Unnamed: 0,task,counts
0,tead4,3.651979


In [30]:
df_one

Unnamed: 0,task,counts
0,tead4,6.618369


In [45]:
df_two

Unnamed: 0,task,counts
0,tead4,13.13456


In [59]:
df_three

Unnamed: 0,task,counts
0,tead4,5.612301


In [74]:
df_four

Unnamed: 0,task,counts
0,tead4,7.136119


In [89]:
df_five

Unnamed: 0,task,counts
0,tead4,4.652863


In [109]:
df_six

Unnamed: 0,task,counts
0,tead4,5.205096


In [None]:
df_zero

In [81]:
df_one

Unnamed: 0,task,counts
0,tead4,9.155832


In [113]:
df_two

Unnamed: 0,task,counts
0,tead4,22.75915


In [131]:
df_three

Unnamed: 0,task,counts
0,tead4,8.368184


In [147]:
df_four

Unnamed: 0,task,counts
0,tead4,9.310171


In [162]:
df_five

Unnamed: 0,task,counts
0,tead4,7.212682


In [178]:
df_six

Unnamed: 0,task,counts
0,tead4,6.170466


In [41]:
#save in case if you need it
counts_df.to_csv(f'csv/combine_diff_spacing_tead4_preds.csv.gz', index = False)