# Motif Detection of Every Gene Detected in Experimental Data

Teague McCracken 
03/30/2025

In [1]:
# Initialize the notebook, set environment to transcriptomics
import pandas as pd
import subprocess
import os
import multiprocessing
from pathlib import Path
import numpy as np

data_path = '/home/temccrac/Programs/git_clones/ECE759_Project/cellular_clarity'
checkpoint_path = '/home/temccrac/Programs/cellular_clarity_project/checkpoints' # on Teague's server 

## Create the New Reference Genome, trying Araport11

In [2]:
# File paths for step 1 using Araport 11 annotations, 2016, had ~448 missing genes, None are DEGs
gtf = "/home/temccrac/Programs/data/genomes/Araport11/Araport11_renamed.gff"
#gene_list = "/home/temccrac/Programs/git_clones/ECE759_Project/cellular_clarity/genes_of_interest.txt" # our results
gene_list = "/home/temccrac/Programs/git_clones/ECE759_Project/cellular_clarity/paperresults/genes_of_interest.txt"
genome_fa = "/home/temccrac/Programs/data/genomes/Araport11/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa"
genome_sizes = "/home/temccrac/Programs/data/genomes/Araport11/TAIR10.chrom.sizes"

## Step 1 - Extract promoter sequences for each gene

Inputs needed: <br>
1. gtf_file (reference genome information) <br>
2. gene_list_file (detected genes from experiment) <br>
3. genome.fa <br>
4. genomes size file Arabidopsis.genome (contains genome size information) <br>

Outputs: all outputs go to project checkpoint folder <br>
1. filtered_gtf (reduced version of Arabidopsis gtf file) <br>
2. tss.bed (bed format with transcription start sites) <br>
3. promoters.bed (bed format with 1000 bp upstream regions (promoters) of genes) <br>
4. promoters.fasta (the final output, sequences of promoter regions of each gene of interest) <br>

In [3]:
# Core functions for promoter extraction
def filter_gtf_by_genes(gtf_file, gene_list_file, output_gtf):
    with open(gene_list_file, 'r') as f:
        genes = set(f.read().splitlines())

    with open(gtf_file, 'r') as gtf_in, open(output_gtf, 'w') as gtf_out:
        for line in gtf_in:
            if line.startswith('#'):
                continue
            if any(gene in line for gene in genes):
                gtf_out.write(line)

def extract_tss_bed_original(filtered_gtf, output_bed): #with tair .gtf annotations
    awk_cmd = (
        '''awk '$3 == "transcript" {
            match($0, /gene_id "([^"]+)"/, m);
            gene = m[1];
            if ($7 == "+") {
                print $1 "\\t" $4-1 "\\t" $4 "\\t" gene "\\t.\\t" $7;
            } else {
                print $1 "\\t" $5-1 "\\t" $5 "\\t" gene "\\t.\\t" $7;
            }
        }' '''
    )
    full_cmd = f"{awk_cmd} {filtered_gtf} > {output_bed}"
    subprocess.run(full_cmd, shell=True, check=True)

def extract_tss_bed_v2(filtered_gtf, output_bed): # command for using araport11 .gff
    awk_cmd = (
    '''awk '$3 == "mRNA" {
        # Extract the gene ID from "Parent=..." in the 9th field
        match($9, /Parent=([^;]+)/, m);
        gene = m[1];

        # For plus-strand, TSS is the start ($4). For minus-strand, TSS is the end ($5).
        if ($7 == "+") {
            print $1 "\\t" $4-1 "\\t" $4 "\\t" gene "\\t.\\t" $7;
        } else {
            print $1 "\\t" $5-1 "\\t" $5 "\\t" gene "\\t.\\t" $7;
        }
    }' '''
    )
    full_cmd = f"{awk_cmd} {filtered_gtf} > {output_bed}"
    subprocess.run(full_cmd, shell=True, check=True)

def extract_tss_bed(filtered_gtf, output_bed):
    awk_cmd = (
    '''awk '{
        match($9, /Parent=([^;]+)/, m);
        gene = m[1];
        if (gene == "") {
            match($9, /ID=([^;]+)/, m);
            gene = m[1];
        }
        if ($7 == "+") {
            print $1 "\\t" $4-1 "\\t" $4 "\\t" gene "\\t.\\t" $7;
        } else {
            print $1 "\\t" $5-1 "\\t" $5 "\\t" gene "\\t.\\t" $7;
        }
    }' '''
    )
    full_cmd = f"{awk_cmd} {filtered_gtf} > {output_bed}"
    subprocess.run(full_cmd, shell=True, check=True)

def create_promoter_bed(tss_bed, genome_sizes, output_bed, length=1000):
    with open(output_bed, 'w') as out_f:
        subprocess.run([
            "bedtools", "flank",
            "-i", tss_bed,
            "-g", genome_sizes,
            "-l", str(length),
            "-r", "0",
            "-s"
        ], stdout=out_f, check=True)

def extract_promoter_fasta(genome_fa, promoter_bed, output_fasta):
    subprocess.run([
        "bedtools", "getfasta",
        "-fi", genome_fa,
        "-bed", promoter_bed,
        "-fo", output_fasta,
        "-s",
        "-name"  # preserves gene IDs in FASTA headers
    ], check=True)

# Functions to execute the above commands in parallel
def run_promoter_pipeline(chunk_id, gene_list_chunk, gtf, genome_fa, genome_sizes, outdir):
    filtered_gtf = f"{outdir}/filtered_{chunk_id}.gtf"
    tss_bed = f"{outdir}/tss_{chunk_id}.bed"
    promoter_bed = f"{outdir}/promoters_{chunk_id}.bed"
    fasta_out = f"{outdir}/promoters_{chunk_id}.fa"

    # Write chunked gene list
    gene_list_file = f"{outdir}/genes_{chunk_id}.txt"
    with open(gene_list_file, 'w') as f:
        f.write('\n'.join(gene_list_chunk))

    # Run sub-steps
    filter_gtf_by_genes(gtf, gene_list_file, filtered_gtf)
    extract_tss_bed(filtered_gtf, tss_bed)
    create_promoter_bed(tss_bed, genome_sizes, promoter_bed)
    extract_promoter_fasta(genome_fa, promoter_bed, fasta_out)

def parallel_promoter_extraction(genes_file, gtf, genome_fa, genome_sizes, outdir, num_cpus=10):
    with open(genes_file) as f:
        genes = f.read().splitlines()

    # Split gene list into chunks
    chunk_size = len(genes) // num_cpus
    chunks = [genes[i:i + chunk_size] for i in range(0, len(genes), chunk_size)]

    os.makedirs(outdir, exist_ok=True)

    args = [(i, chunks[i], gtf, genome_fa, genome_sizes, outdir) for i in range(len(chunks))]
    with multiprocessing.Pool(num_cpus) as pool:
        pool.starmap(run_promoter_pipeline, args)

    # Merge all .fa files
    merged_fasta = os.path.join(outdir, "promoters_merged.fa")
    with open(merged_fasta, 'w') as outfile:
        for i in range(len(chunks)):
            chunk_fasta = os.path.join(outdir, f"promoters_{i}.fa")
            with open(chunk_fasta, 'r') as infile:
                outfile.write(infile.read())

    print("✅ All promoters extracted and merged.")

In [4]:
# Run steps for Step 1
parallel_promoter_extraction(
    genes_file=gene_list,
    gtf=gtf,
    genome_fa=genome_fa,
    genome_sizes=genome_sizes,
    outdir=checkpoint_path,
    num_cpus=10
)

print("✅ Promoter FASTA extraction complete.")

✅ All promoters extracted and merged.
✅ Promoter FASTA extraction complete.


After running, I deleted all the unneccessary intermediate files and kept only the promoter_merged.fa file

Now, I check that all the genes of interests have promoters that were saved in the merged promoter file

In [3]:
def check_promoter_coverage(gene_list_file, promoters_fasta):
    # Read expected gene list from a plain text file (one gene per line)
    with open(gene_list_file) as f:
        expected_genes = set(f.read().splitlines())

    # Function to standardize gene IDs (remove version suffixes, etc.)
    def standardize_gene_id(gene_id):
        return gene_id.split('.')[0]

    # Standardize expected gene IDs
    expected_genes = {standardize_gene_id(gene) for gene in expected_genes}

    # Read FASTA headers from the promoters file and standardize the gene IDs
    found_genes = set(
        standardize_gene_id(line[1:].strip().split("::")[0])
        for line in open(promoters_fasta) if line.startswith(">")
    )

    # Compute missing and extra genes
    missing = expected_genes - found_genes
    extra = found_genes - expected_genes

    print(f"✅ Found promoters for {len(found_genes)} / {len(expected_genes)} genes.")
    if missing:
        print(f"❌ Missing {len(missing)} genes:")
        for gene in list(missing)[:10]:
            print("  ", gene)
    else:
        print("🎉 No missing genes.")

    if extra:
        print(f"⚠️ {len(extra)} unexpected gene(s) in output (not in your original list):")
        for gene in list(extra)[:5]:
            print("  ", gene)
    return missing, extra, expected_genes

# Example usage:
missing, extra, expected_genes = check_promoter_coverage(
    gene_list_file=gene_list,
    promoters_fasta="/home/temccrac/Programs/cellular_clarity_project/checkpoints/promoters_merged.fa"
)

✅ Found promoters for 34260 / 33602 genes.
❌ Missing 488 genes:
   AT3G62422
   AT3G42385
   AT2G38544
   AT3G32905
   AT2G33440
   AT5G17795
   AT1G48598
   AT2G12557
   AT5G64341
   AT5G47928
⚠️ 1146 unexpected gene(s) in output (not in your original list):
   ath-miR167b
   AT2G08470
   AT1G05337
   AT2G08345
   AT1G04117


Looking at which ones are missing from the reference annotation and then I will save this for our records

In [76]:
pd.Series(list(missing)).to_csv(os.path.join(data_path, 'Araport11_2016_missing_final.txt'), index=False, header=False)
missing = list(pd.read_csv(os.path.join(data_path, 'Araport11_2016_missing_final.txt'), header=None)[0])

In [7]:
DEGs_pr = pd.read_csv(os.path.join(data_path, 'paperresults/DEGs_clusters_epidermis_paperresults.csv'), index_col=None) # We are working with the Epidermis data
notmissing = pd.Series(list(set(DEGs_pr['AGI']) - set(missing)))
len(notmissing)
notmissing.to_csv(os.path.join(data_path, 'Araport11_2016_notmissing.txt'), index=False, header=False)

In [13]:
missing_DEGs = set(missing).intersection(set(DEGs_pr['AGI']))
missing_DEGs = pd.Series(list(missing_DEGs))
missing_DEGs.to_csv(os.path.join(data_path, 'Araport11_2016_missingDEGs.txt'), index=False, header=False)

In [15]:
# Loading checkpoints
notmissing_DEGs = pd.read_csv(os.path.join(data_path, 'Araport11_2016_notmissing.txt'))
#missing_DEGs = pd.read_csv(os.path.join(data_path, 'Araport11_2016_missingDEGs.txt'))

## Step 2 - Clustering / DEG Detection

I will use the papers DEG and cluster results

## Step 3 - Motif Detection Using AME

combined_motifs.meme is the DAP-Seq database

In [4]:
import os
import argparse
import subprocess
import multiprocessing
from Bio import SeqIO
import random
random.seed(33)

In [57]:
motif_detection_path = '/home/temccrac/Programs/data/cellular_clarity/dap_seq_data/dap_data_v4/ArabidopsisDAPv1.meme'
#promoters_fasta = '/home/temccrac/Programs/cellular_clarity_project/checkpoints/promoters_Araport11_2016.fa'
promoters_fasta = '/home/temccrac/Programs/cellular_clarity_project/checkpoints/promoters_merged.fa'
#out_dir = "/home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment"
ame_executable = "/home/temccrac/meme/bin/ame"                        # AME executable (or provide full path)
additional_params = ""                        # Additional parameters for AME (if any)
cpus = 10                                      # Number of parallel processes

DEGs_pr = pd.read_csv(os.path.join(data_path, 'paperresults/DEGs_clusters_epidermis_paperresults.csv'), index_col=None) # We are working with the Epidermis data
DEGs_included = DEGs_pr[~DEGs_pr['AGI'].isin(missing)]
cluster_sizes = DEGs_included['cluster_id'].value_counts()
print('smallest cluster size is', cluster_sizes.min(), 'genes')

foreground_groups = DEGs_included.groupby("cluster_id")["AGI"].apply(list).to_dict()

group_sizes = [len(genes) for genes in foreground_groups.values()]
average_size = sum(group_sizes) / len(group_sizes)
non_DEGs = pd.read_csv(os.path.join(data_path, 'paperresults/non_DEGs.csv'), header=None, index_col = None)
non_DEGs_list = non_DEGs[0].tolist()
non_DEGs_list = [id for id in non_DEGs_list if id not in missing]
non_DEGs_sampled = random.sample(non_DEGs_list, len(DEGs_included))

groups = np.array_split(non_DEGs_sampled, 50)

# Convert each numpy array to a list and store them in a dictionary
background_groups = {i+1: list(group) for i, group in enumerate(groups)}

smallest cluster size is 3 genes


In [33]:
def run_ame(sequence_fasta, ame_output_dir, motif_detection_path, ame_executable="ame", additional_params=""):
    """
    Runs the AME command using the provided motif file (second positional argument)
    and the sequence FASTA (first positional argument). 
    """
    # According to AME usage, the first positional argument is the sequence file,
    # and the second positional argument is the motif file.
    cmd = [
        ame_executable,
        "--oc", ame_output_dir,
        sequence_fasta,          # First positional argument: sequence file (foreground or background)
        motif_detection_path     # Second positional argument: motif file in MEME format
    ]
    if additional_params:
        cmd.extend(additional_params.split())
    print("Running AME command:", " ".join(cmd))
    subprocess.run(cmd, check=True)

def process_group_both(group_name, fg_gene_ids, bg_gene_ids, promoters_dict, out_dir, motif_detection_path, ame_executable, additional_params):
    """
    For a given group:
      - Write two FASTA files: one for the foreground group (DEGs) and one for the background group (non-DEGs).
      - Create separate output directories for AME results on the foreground and background.
      - Run AME separately on the foreground and background FASTA files.
    """
    # Define output FASTA file paths
    fg_fasta = os.path.join(out_dir, f"{group_name}_foreground.fa")
    bg_fasta = os.path.join(out_dir, f"{group_name}_background.fa")
    
    # Write the foreground FASTA (DEG promoters)
    with open(fg_fasta, "w") as fg_handle:
        for gene in fg_gene_ids:
            if gene in promoters_dict:
                SeqIO.write(promoters_dict[gene], fg_handle, "fasta")
            else:
                print(f"Warning: Foreground gene {gene} not found in promoters dictionary.")
    
    # Write the background FASTA (non-DEG promoters)
    with open(bg_fasta, "w") as bg_handle:
        for gene in bg_gene_ids:
            if gene in promoters_dict:
                SeqIO.write(promoters_dict[gene], bg_handle, "fasta")
            else:
                print(f"Warning: Background gene {gene} not found in promoters dictionary.")
    
    # Create separate output directories for foreground and background AME results
    ame_out_fg = os.path.join(out_dir, f"{group_name}_ame_foreground")
    ame_out_bg = os.path.join(out_dir, f"{group_name}_ame_background")
    os.makedirs(ame_out_fg, exist_ok=True)
    os.makedirs(ame_out_bg, exist_ok=True)
    
    # Run AME separately for foreground and background
    run_ame(fg_fasta, ame_out_fg, motif_detection_path, ame_executable, additional_params)
    run_ame(bg_fasta, ame_out_bg, motif_detection_path, ame_executable, additional_params)

def load_promoters(promoters_fasta):
    """
    Loads the large FASTA file of promoters into a dictionary keyed by gene ID.
    Assumes that each record's id has the format 'AGI::...' and uses the part before the first "::" as the key.
    """
    promoters = {}
    for record in SeqIO.parse(promoters_fasta, "fasta"):
        gene_id = record.id.split("::")[0]
        promoters[gene_id] = record
    return promoters


In [None]:
# Load promoter sequences into a dictionary
promoters_dict = load_promoters(promoters_fasta)

# Prepare arguments for each group
args_list = []
for group_name, fg_gene_ids in foreground_groups.items():
    # Get background gene list for this group (if not defined, use an empty list)
    bg_gene_ids = background_groups.get(group_name, [])
    args_list.append((group_name, fg_gene_ids, bg_gene_ids, promoters_dict, out_dir, motif_detection_path, ame_executable, additional_params))

# Run AME for each group in parallel using multiprocessing
pool = multiprocessing.Pool(processes=cpus)
pool.starmap(process_group_both, args_list)
pool.close()
pool.join()

print("All AME analyses are complete.")

In [None]:
# Load promoter sequences into a dictionary
promoters_dict = load_promoters(promoters_fasta)
if string in promoters_dict:
    print("'AT1G48598' is in promoters_dict")
else:
    print("'AT1G48598' is not in promoters_dict")

'AT1G48598 ' is not in promoters_dict


## Step 4 - Processing Motif Information Output

In [None]:
import re

In [None]:
# Takes one cluster's motif output and converts it to a dataframe with all the motifs mapped to genes they were identified in
def process_cluster_moitfs(path):
    try:
        df = pd.read_csv(path, delimiter='\t')
    except pd.errors.EmptyDataError:
        # If the file is completely empty, return an empty DataFrame with desired columns.
        return pd.DataFrame(columns=['motif_ID', 'gene'])
    
    # If the file was read but is empty, return an empty DataFrame with desired columns.
    if df.empty:
        return pd.DataFrame(columns=['motif_ID', 'gene'])
    
    # Proceed with processing if data exists.
    df_subset = df[['motif_ID', 'seq_ID']]
    pattern = r'^(.+?)::'
    df_subset_processed = df_subset.copy()
    df_subset_processed['seq_ID'] = df_subset['seq_ID'].str.extract(pattern)
    df_subset_processed.columns = [df_subset.columns[0], 'gene']
    return df_subset_processed

def analyze_motif_enrichment(ameoutputdir):
    # Create all the file paths
    motif_results_paths = []
    for root, dir, files in os.walk(ameoutputdir):
        for file in files: 
            if 'sequences.tsv' in file: 
                motif_results_paths.append(os.path.join(root, file))

    clust_id_pattern = r'(?<=/)(\d{1,2})(?=_ame)'
    motif_enrichment_dict = {}
    for n, path in enumerate(motif_results_paths):
        # Get the directory containing the file, then extract its basename
        dir_path = os.path.dirname(path)
        key = os.path.basename(dir_path)
        print(path)
        motif_enrichment_dict[key] = process_cluster_moitfs(path)
    
    empty_count = sum(1 for v in motif_enrichment_dict.values() if v.empty)
    print(f"{empty_count} dictionary entries are empty.")
    unique_motifs = set()

    for key, df in motif_enrichment_dict.items():
        if not df.empty:
            # Update the set with unique motif_ID values from the current DataFrame
            unique_motifs.update(df['motif_ID'].unique())
    print("Unique motif_ID count:", len(unique_motifs))
    
    return motif_enrichment_dict

MOTIF: 650 SEQ: 1000/124500

MOTIF: 654 SEQ: 3000/124500

In [None]:
# Analzying motif enrichments for trial 3
motif_enrichment_path = '/home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial3'
dict = analyze_motif_enrichment(motif_enrichment_path)

/home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial3/38_ame/sequences.tsv
/home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial3/1_ame/sequences.tsv
/home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial3/36_ame/sequences.tsv
/home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial3/37_ame/sequences.tsv
/home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial3/12_ame/sequences.tsv
/home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial3/28_ame/sequences.tsv
/home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial3/11_ame/sequences.tsv
/home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial3/43_ame/sequences.tsv
/home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial3/3_ame/sequences.tsv
/home/temccrac/Programs/cellular_clarity_project

MOTIF: 668 SEQ: 2000/124500

MOTIF: 868 SEQ: 9000/124500
Added /home/temccrac/Programs/data/cellular_clarity/dap_seq_data/dap_data_v4/ArabidopsisDAPv1.meme to motif_sources which now has 1 file names.
Motif file name is /home/temccrac/Programs/data/cellular_clarity/dap_seq_data/dap_data_v4/ArabidopsisDAPv1.meme.
The output directory '/home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial4/22_ame_foreground' already exists.
Its contents will be overwritten.
E-value threshold for reporting results: 10
Checking alphabets in 1 motif files.
Loading motifs from file '/home/temccrac/Programs/data/cellular_clarity/dap_seq_data/dap_data_v4/ArabidopsisDAPv1.meme'
MOTIF: 851 SEQ: 12000/12450
Loading single set of sequences.
No FASTA scores were read from sequence ID lines.
Setting FASTA scores equal to the input order of the sequence.
In partition maximization mode.
MOTIF: 1 SEQ: 83/83
Sorting sequences by sequence PWM score to get PWM ranks; breaking ties to put negatives first.
Resorting sequen

In [30]:
# for separate background and foreground
# Find keys with 'foreground' and 'background'
foreground_empty = [key for key, df in motif_enrichment_dict.items() 
                    if 'foreground' in key.lower() and df.empty]
background_empty = [key for key, df in motif_enrichment_dict.items() 
                    if 'background' in key.lower() and df.empty]

print(f"Foreground empty entries: {len(foreground_empty)}")
print("Keys:", foreground_empty)

print(f"Background empty entries: {len(background_empty)}")
print("Keys:", background_empty)

Foreground empty entries: 32
Keys: ['47_ame_foreground', '42_ame_foreground', '18_ame_foreground', '12_ame_foreground', '19_ame_foreground', '30_ame_foreground', '49_ame_foreground', '16_ame_foreground', '46_ame_foreground', '1_ame_foreground', '2_ame_foreground', '15_ame_foreground', '5_ame_foreground', '11_ame_foreground', '39_ame_foreground', '22_ame_foreground', '37_ame_foreground', '34_ame_foreground', '28_ame_foreground', '44_ame_foreground', '45_ame_foreground', '50_ame_foreground', '21_ame_foreground', '27_ame_foreground', '4_ame_foreground', '3_ame_foreground', '41_ame_foreground', '35_ame_foreground', '24_ame_foreground', '33_ame_foreground', '13_ame_foreground', '43_ame_foreground']
Background empty entries: 32
Keys: ['43_ame_background', '33_ame_background', '49_ame_background', '47_ame_background', '35_ame_background', '36_ame_background', '20_ame_background', '26_ame_background', '15_ame_background', '48_ame_background', '16_ame_background', '40_ame_background', '30_ame_b

In [31]:
unique_motifs_fg = set()
unique_motifs_bg = set()

for key, df in motif_enrichment_dict.items():
    if not df.empty:
        if 'foreground' in key.lower():
            unique_motifs_fg.update(df['motif_ID'].unique())
        elif 'background' in key.lower():
            unique_motifs_bg.update(df['motif_ID'].unique())

print("Foreground unique motif_ID count:", len(unique_motifs_fg))
print("Background unique motif_ID count:", len(unique_motifs_bg))


Foreground unique motif_ID count: 25
Background unique motif_ID count: 33


In [15]:
# For this run, we want to combine all DEGs (from DEGs_included['AGI']) and all non_DEGs_sampled
# into one foreground list.
foreground_gene_ids = DEGs_included['AGI'].tolist()  # all DEGs gene IDs
combined_gene_ids = list(set(foreground_gene_ids + non_DEGs_sampled))

print(f"Total genes (combined): {len(combined_gene_ids)}")

# Load promoter sequences into a dictionary (if not already loaded)
def load_promoters(promoters_fasta):
    promoters = {}
    for record in SeqIO.parse(promoters_fasta, "fasta"):
        # Assumes header is in the form: >AT2G20724::...
        gene_id = record.id.split("::")[0]
        promoters[gene_id] = record
    return promoters

promoters_dict = load_promoters(promoters_fasta)

# Write combined FASTA file
combined_fasta = os.path.join(out_dir, "combined_promoters.fa")
with open(combined_fasta, "w") as out_handle:
    missing_genes = []
    for gene in combined_gene_ids:
        if gene in promoters_dict:
            SeqIO.write(promoters_dict[gene], out_handle, "fasta")
        else:
            missing_genes.append(gene)
            print(f"Warning: Gene {gene} not found in promoters dictionary.")
            
print(f"Wrote promoters for {len(combined_gene_ids) - len(missing_genes)} genes to {combined_fasta}")

# Run AME with the combined FASTA file as the foreground.
# Since we want to use AME's default background model, we do not supply the --control option.
ame_output_dir = os.path.join(out_dir, "AME_combined")
os.makedirs(ame_output_dir, exist_ok=True)

# According to AME usage, the first positional argument is the sequence file and the second is the motif file.
cmd = [
    ame_executable,
    "--oc", ame_output_dir,
    combined_fasta,            # Foreground sequences
    motif_detection_path       # Motif file in MEME format
]
if additional_params:
    cmd.extend(additional_params.split())
    
print("Running AME command:", " ".join(cmd))
subprocess.run(cmd, check=True)

print("AME analysis for combined gene set is complete.")


Total genes (combined): 5261
Wrote promoters for 5261 genes to /home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment/combined_promoters.fa
Running AME command: /home/temccrac/meme/bin/ame --oc /home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment/AME_combined /home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment/combined_promoters.fa /home/temccrac/Programs/data/cellular_clarity/dap_seq_data/dap_data_v4/ArabidopsisDAPv1.meme


Added /home/temccrac/Programs/data/cellular_clarity/dap_seq_data/dap_data_v4/ArabidopsisDAPv1.meme to motif_sources which now has 1 file names.
Motif file name is /home/temccrac/Programs/data/cellular_clarity/dap_seq_data/dap_data_v4/ArabidopsisDAPv1.meme.
The output directory '/home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment/AME_combined' already exists.
Its contents will be overwritten.
E-value threshold for reporting results: 10
Checking alphabets in 1 motif files.
Loading motifs from file '/home/temccrac/Programs/data/cellular_clarity/dap_seq_data/dap_data_v4/ArabidopsisDAPv1.meme'
Loading single set of sequences.
No FASTA scores were read from sequence ID lines.
Setting FASTA scores equal to the input order of the sequence.
In partition maximization mode.
MOTIF: 1 SEQ: 5261/5261
Sorting sequences by sequence PWM score to get PWM ranks; breaking ties to put negatives first.
Resorting sequences to their original input order.
Performing partition maximiza

AME analysis for combined gene set is complete.


MOTIF: 872 SEQ: 5261/5261


In [None]:
cluter_12_genes = DEGs_pr['AGI'][DEGs_pr['cluster_id'] == 12]

713    AT1G02530
714    AT1G04180
715    AT1G04610
716    AT1G06090
717    AT1G08920
         ...    
794    AT5G57800
795    AT5G58784
796    AT5G60530
797    AT5G64870
798    AT5G66280
Name: AGI, Length: 86, dtype: object