In [2]:
import shutil

import numpy as np

from utils.motif_utils import get_attention_sequences, motif_analysis
from utils.motif_probability_calculator import process_motif_files, combine_tomtom_files

In [None]:
# Per-k-mer motif analysis.
# For every k-mer size in 3..6 and every entry of `min_lengths`, this cell:
#   1. loads the attention scores saved for that k-mer together with the DNA sequences,
#   2. runs `motif_analysis`, passing `target_directory` as `save_file_dir`,
#   3. calls `process_motif_files` and `combine_tomtom_files` on that same directory.
model_date = "2025-02-27_V2"
min_lengths = [5]
print(f"Model date: {model_date}")

# The sequence file does not depend on kmer or min_len, so define it once.
dna_sequence_file = "./data/enhancer_identification/combined_dna_sequences.txt"

for kmer in range(3, 7):
    for min_len in min_lengths:
        # Include the k-mer size so the log lines of different runs can be told apart.
        print(f"Processing with: kmer={kmer}, min_len={min_len}")

        # Define file paths
        target_directory = f"./analysis/attention/{model_date}/{kmer}-mer/min_len={min_len}"
        attention_score_file = f"outputs/attention_scores/{model_date}/{kmer}-mer_train_attention_scores.npy"

        # Retrieve attention scores and sequences.
        # NOTE(review): this load only depends on `kmer`; it could move to the outer
        # loop if `motif_analysis` is confirmed not to modify its inputs.
        pos_sequences, neg_sequences, pos_attention_scores, neg_attention_scores = get_attention_sequences(
            attention_score_file, dna_sequence_file
        )

        # Perform motif analysis (only the positive-class attention scores are passed on).
        motif_analysis(
            pos_seqs=pos_sequences,
            neg_seqs=neg_sequences,
            pos_atten_scores=pos_attention_scores,
            window_size=10,
            save_file_dir=target_directory,
            min_len=min_len,
            pval_cutoff=0.05,
        )

        # The post-processing helpers receive the directory with a trailing slash.
        probability_file_path = target_directory + "/"
        process_motif_files(probability_file_path)
        combine_tomtom_files(probability_file_path, f"{min_len}_stats.txt")

Model date: 2025-02-27_V2
Processing with: min_len=5
Size: (2968, 200)
Before filtering, the length of motif: 6520
After filtering, the length of motif: 468
After merger, the number of groups: 52
After number filter, the number of group: 18
Processing with: min_len=5
Size: (2968, 200)
Before filtering, the length of motif: 6774
After filtering, the length of motif: 991
After merger, the number of groups: 55
After number filter, the number of group: 28
Processing with: min_len=5
Size: (2968, 200)
Before filtering, the length of motif: 8654
After filtering, the length of motif: 282
After merger, the number of groups: 45
After number filter, the number of group: 13
Processing with: min_len=5
Size: (2968, 200)
Before filtering, the length of motif: 8231
After filtering, the length of motif: 477
After merger, the number of groups: 46
After number filter, the number of group: 15


In [6]:
# Recursively delete the analysis/attention output tree, i.e. the directory the
# motif-analysis cells of this notebook pass to `motif_analysis` as `save_file_dir`.
# WARNING: destructive -- every previously generated result below that directory is lost.
# `ignore_errors=True` mirrors `rm -rf`: a missing directory is not treated as an error.
shutil.rmtree("analysis/attention", ignore_errors=True)

In [6]:
# Motif analysis on attention scores averaged over several k-mer models.
# For each `min_len`, the per-k-mer attention scores are loaded, stacked and
# averaged element-wise, and the averaged positive scores are handed to
# `motif_analysis`; results go to the ".../average/min_len=<n>" directory.
kmer_range = [3, 4, 5, 6]
model_date = "2025-02-27_V2"

min_lengths = [5]

for min_len in min_lengths:
    print(f"Processing with: min_len={min_len}")
    dna_sequence_file = "./data/enhancer_identification/combined_dna_sequences.txt"

    # Per-k-mer score accumulators; the sequences are captured once (see below).
    all_pos_attention_scores, all_neg_attention_scores = [], []
    pos_sequences = neg_sequences = None

    for kmer in kmer_range:
        # Attention scores saved for this k-mer size
        attention_score_file = f"outputs/attention_scores/{model_date}/{kmer}-mer_train_attention_scores.npy"

        # Load sequences and attention scores for this k-mer
        pos_seq, neg_seq, pos_attention_scores, neg_attention_scores = get_attention_sequences(
            attention_score_file, dna_sequence_file
        )

        all_pos_attention_scores.append(pos_attention_scores)
        all_neg_attention_scores.append(neg_attention_scores)

        # Keep the sequences from the first k-mer only; they are expected to be
        # identical for every k-mer (this is not verified here).
        if pos_sequences is None:
            pos_sequences, neg_sequences = pos_seq, neg_seq

    # Element-wise mean across the k-mer axis created by np.stack
    combined_pos_attention_scores = np.stack(all_pos_attention_scores).mean(axis=0)
    combined_neg_attention_scores = np.stack(all_neg_attention_scores).mean(axis=0)

    # Output location for the averaged analysis
    target_directory = f"./analysis/attention/{model_date}/average/min_len={min_len}"

    # Run the motif analysis on the averaged positive-class attention scores
    motif_analysis(
        pos_seqs=pos_sequences, neg_seqs=neg_sequences,
        pos_atten_scores=combined_pos_attention_scores,
        window_size=10, save_file_dir=target_directory,
        min_len=min_len, pval_cutoff=0.05,
    )

    # Post-process the files in the output directory (path passed with trailing slash)
    probability_file_path = f"{target_directory}/"
    process_motif_files(probability_file_path)
    combine_tomtom_files(probability_file_path, f"{min_len}_stats.txt")

Processing with: min_len=5
Size: (2968, 200)
Size: (2968, 200)
Size: (2968, 200)
Size: (2968, 200)
Before filtering, the length of motif: 7114
After filtering, the length of motif: 704
After merger, the number of groups: 51
After number filter, the number of group: 19
