# Motif Detection of Every Gene Detected in Experimental Data

Teague McCracken 
03/30/2025

In [62]:
# Initialize the notebook, set environment to transcriptomics
import pandas as pd
import subprocess
import os
import multiprocessing
from pathlib import Path
import numpy as np

data_path = '/home/temccrac/Programs/git_clones/ECE759_Project/cellular_clarity'
checkpoint_path = '/home/temccrac/Programs/cellular_clarity_project/checkpoints' # on Teague's server 

## Create the New Reference Genome, trying Araport11

In [2]:
# File paths for step 1, had 5324 missing genes
gtf = "/home/temccrac/Programs/data/genomes/Arabidopsis_thaliana.TAIR10.54.gtf"
#gene_list = "/home/temccrac/Programs/git_clones/ECE759_Project/cellular_clarity/genes_of_interest.txt" # our results
gene_list = "/home/temccrac/Programs/git_clones/ECE759_Project/cellular_clarity/paperresults/genes_of_interest.txt"
genome_fa = "/home/temccrac/Programs/data/genomes/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa"
genome_sizes = "/home/temccrac/Programs/data/genomes/Arabidopsis.genome"

In [63]:
# File paths for step 1 using Araport 11 annotations, 2016, had ~2778 missing genes, only 35 are DEGs
gtf = "/home/temccrac/Programs/data/genomes/Araport11/Araport11_renamed.gff"
#gene_list = "/home/temccrac/Programs/git_clones/ECE759_Project/cellular_clarity/genes_of_interest.txt" # our results
gene_list = "/home/temccrac/Programs/git_clones/ECE759_Project/cellular_clarity/paperresults/genes_of_interest.txt"
genome_fa = "/home/temccrac/Programs/data/genomes/Araport11/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa"
genome_sizes = "/home/temccrac/Programs/data/genomes/Araport11/TAIR10.chrom.sizes"

In [4]:
# File paths for step 1 using Araport 11 Mar9 2021 annotations, missing 2900 genes
gtf = "/home/temccrac/Programs/data/genomes/Araport11_2021/Araport11_renamed.gff"
#gene_list = "/home/temccrac/Programs/git_clones/ECE759_Project/cellular_clarity/genes_of_interest.txt" # our results
gene_list = "/home/temccrac/Programs/git_clones/ECE759_Project/cellular_clarity/paperresults/genes_of_interest.txt"
genome_fa = "/home/temccrac/Programs/data/genomes/Araport11_2021/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa"
genome_sizes = "/home/temccrac/Programs/data/genomes/Araport11_2021/TAIR10.chrom.sizes"

In [None]:
# File paths for step 1 using Araport 11 Feb 2022 annotations, this is the final one I used
gtf = "/home/temccrac/Programs/data/genomes/Araport11_2021/Araport11_renamed.gff"
#gene_list = "/home/temccrac/Programs/git_clones/ECE759_Project/cellular_clarity/genes_of_interest.txt" # our results
gene_list = "/home/temccrac/Programs/git_clones/ECE759_Project/cellular_clarity/paperresults/genes_of_interest.txt"
genome_fa = "/home/temccrac/Programs/data/genomes/Araport11_2021/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa"
genome_sizes = "/home/temccrac/Programs/data/genomes/Araport11_2021/TAIR10.chrom.sizes"

## Step 1 - Extract promoter sequences for each gene

Inputs needed: <br>
1. gtf_file (reference genome information) <br>
2. gene_list_file (detected genes from experiment) <br>
3. genome.fa <br>
4. genomes size file Arabidopsis.genome (contains genome size information) <br>

Outputs: all outputs go to project checkpoint folder <br>
1. filtered_gtf (reduced version of Arabidopsis gtf file) <br>
2. tss.bed (bed format with transcription start sites) <br>
3. promoters.bed (bed format with 1000 bp upstream regions (promoters) of genes) <br>
4. promoters.fasta (the final output, sequences of promoter regions of each gene of interest) <br>

In [None]:
# Core functions for promoter extraction
def filter_gtf_by_genes(gtf_file, gene_list_file, output_gtf):
    with open(gene_list_file, 'r') as f:
        genes = set(f.read().splitlines())

    with open(gtf_file, 'r') as gtf_in, open(output_gtf, 'w') as gtf_out:
        for line in gtf_in:
            if line.startswith('#'):
                continue
            if any(gene in line for gene in genes):
                gtf_out.write(line)

def extract_tss_bed_original(filtered_gtf, output_bed): #with tair .gtf annotations
    awk_cmd = (
        '''awk '$3 == "transcript" {
            match($0, /gene_id "([^"]+)"/, m);
            gene = m[1];
            if ($7 == "+") {
                print $1 "\\t" $4-1 "\\t" $4 "\\t" gene "\\t.\\t" $7;
            } else {
                print $1 "\\t" $5-1 "\\t" $5 "\\t" gene "\\t.\\t" $7;
            }
        }' '''
    )
    full_cmd = f"{awk_cmd} {filtered_gtf} > {output_bed}"
    subprocess.run(full_cmd, shell=True, check=True)

def extract_tss_bed_v2(filtered_gtf, output_bed): # command for using araport11 .gff
    awk_cmd = (
    '''awk '$3 == "mRNA" {
        # Extract the gene ID from "Parent=..." in the 9th field
        match($9, /Parent=([^;]+)/, m);
        gene = m[1];

        # For plus-strand, TSS is the start ($4). For minus-strand, TSS is the end ($5).
        if ($7 == "+") {
            print $1 "\\t" $4-1 "\\t" $4 "\\t" gene "\\t.\\t" $7;
        } else {
            print $1 "\\t" $5-1 "\\t" $5 "\\t" gene "\\t.\\t" $7;
        }
    }' '''
    )
    full_cmd = f"{awk_cmd} {filtered_gtf} > {output_bed}"
    subprocess.run(full_cmd, shell=True, check=True)

def extract_tss_bed(filtered_gtf, output_bed):
    awk_cmd = (
    '''awk '{
        match($9, /Parent=([^;]+)/, m);
        gene = m[1];
        if (gene == "") {
            match($9, /ID=([^;]+)/, m);
            gene = m[1];
        }
        if ($7 == "+") {
            print $1 "\\t" $4-1 "\\t" $4 "\\t" gene "\\t.\\t" $7;
        } else {
            print $1 "\\t" $5-1 "\\t" $5 "\\t" gene "\\t.\\t" $7;
        }
    }' '''
    )
    full_cmd = f"{awk_cmd} {filtered_gtf} > {output_bed}"
    subprocess.run(full_cmd, shell=True, check=True)

def create_promoter_bed(tss_bed, genome_sizes, output_bed, length=1000):
    with open(output_bed, 'w') as out_f:
        subprocess.run([
            "bedtools", "flank",
            "-i", tss_bed,
            "-g", genome_sizes,
            "-l", str(length),
            "-r", "0",
            "-s"
        ], stdout=out_f, check=True)

def extract_promoter_fasta(genome_fa, promoter_bed, output_fasta):
    subprocess.run([
        "bedtools", "getfasta",
        "-fi", genome_fa,
        "-bed", promoter_bed,
        "-fo", output_fasta,
        "-s",
        "-name"  # preserves gene IDs in FASTA headers
    ], check=True)

# Functions to execute the above commands in parallel
def run_promoter_pipeline(chunk_id, gene_list_chunk, gtf, genome_fa, genome_sizes, outdir):
    filtered_gtf = f"{outdir}/filtered_{chunk_id}.gtf"
    tss_bed = f"{outdir}/tss_{chunk_id}.bed"
    promoter_bed = f"{outdir}/promoters_{chunk_id}.bed"
    fasta_out = f"{outdir}/promoters_{chunk_id}.fa"

    # Write chunked gene list
    gene_list_file = f"{outdir}/genes_{chunk_id}.txt"
    with open(gene_list_file, 'w') as f:
        f.write('\n'.join(gene_list_chunk))

    # Run sub-steps
    filter_gtf_by_genes(gtf, gene_list_file, filtered_gtf)
    extract_tss_bed(filtered_gtf, tss_bed)
    create_promoter_bed(tss_bed, genome_sizes, promoter_bed)
    extract_promoter_fasta(genome_fa, promoter_bed, fasta_out)

def parallel_promoter_extraction(genes_file, gtf, genome_fa, genome_sizes, outdir, num_cpus=10):
    with open(genes_file) as f:
        genes = f.read().splitlines()

    # Split gene list into chunks
    chunk_size = len(genes) // num_cpus
    chunks = [genes[i:i + chunk_size] for i in range(0, len(genes), chunk_size)]

    os.makedirs(outdir, exist_ok=True)

    args = [(i, chunks[i], gtf, genome_fa, genome_sizes, outdir) for i in range(len(chunks))]
    with multiprocessing.Pool(num_cpus) as pool:
        pool.starmap(run_promoter_pipeline, args)

    # Merge all .fa files
    merged_fasta = os.path.join(outdir, "promoters_merged.fa")
    with open(merged_fasta, 'w') as outfile:
        for i in range(len(chunks)):
            chunk_fasta = os.path.join(outdir, f"promoters_{i}.fa")
            with open(chunk_fasta, 'r') as infile:
                outfile.write(infile.read())

    print("✅ All promoters extracted and merged.")

In [None]:
# Run steps for Step 1
parallel_promoter_extraction(
    genes_file=gene_list,
    gtf=gtf,
    genome_fa=genome_fa,
    genome_sizes=genome_sizes,
    outdir=checkpoint_path,
    num_cpus=10
)

print("✅ Promoter FASTA extraction complete.")

After running, I deleted all the unneccessary intermediate files and kept only the promoter_merged.fa file

Now, I check that all the genes of interests have promoters that were saved in the merged promoter file

In [65]:
def check_promoter_coverage(gene_list_file, promoters_fasta):
    # Read expected gene list from a plain text file (one gene per line)
    with open(gene_list_file) as f:
        expected_genes = set(f.read().splitlines())

    # Function to standardize gene IDs (remove version suffixes, etc.)
    def standardize_gene_id(gene_id):
        return gene_id.split('.')[0]

    # Standardize expected gene IDs
    expected_genes = {standardize_gene_id(gene) for gene in expected_genes}

    # Read FASTA headers from the promoters file and standardize the gene IDs
    found_genes = set(
        standardize_gene_id(line[1:].strip().split("::")[0])
        for line in open(promoters_fasta) if line.startswith(">")
    )

    # Compute missing and extra genes
    missing = expected_genes - found_genes
    extra = found_genes - expected_genes

    print(f"✅ Found promoters for {len(found_genes)} / {len(expected_genes)} genes.")
    if missing:
        print(f"❌ Missing {len(missing)} genes:")
        for gene in list(missing)[:10]:
            print("  ", gene)
    else:
        print("🎉 No missing genes.")

    if extra:
        print(f"⚠️ {len(extra)} unexpected gene(s) in output (not in your original list):")
        for gene in list(extra)[:5]:
            print("  ", gene)
    return missing

# Example usage:
missing = check_promoter_coverage(
    gene_list_file=gene_list,
    promoters_fasta="/home/temccrac/Programs/cellular_clarity_project/checkpoints/promoters_merged.fa"
)


✅ Found promoters for 34260 / 33602 genes.
❌ Missing 488 genes:
   AT3G30456
   AT3G58676
   AT3G52302
   AT4G35589
   AT3G28321
   AT2G15930
   AT2G45161
   AT2G04370
   AT5G57181
   AT5G28642
⚠️ 1146 unexpected gene(s) in output (not in your original list):
   ath-miR170-5p
   ath-miR2111a-3p
   AT1G07757
   AT1G04837
   ath-miR857


Looking at which ones are missing from the reference annotation and then I will save this for our records

In [69]:
pd.Series(list(missing)).to_csv(os.path.join(data_path, 'Araport11_2016_missing_all.txt'), index=False, header=False)

In [70]:
missing = pd.read_csv(os.path.join(data_path, 'Araport11_2016_missing_all.txt'))

In [24]:
DEGs_pr = pd.read_csv(os.path.join(data_path, 'paperresults/DEGs_clusters_epidermis_paperresults.csv'), index_col=None) # We are working with the Epidermis data
notmissing = pd.Series(list(set(DEGs_pr['AGI']) - set(missing)))
len(notmissing)
notmissing.to_csv(os.path.join(data_path, 'Araport11_2016_notmissing.txt'), index=False, header=False)

In [25]:
missing_DEGs = set(missing).intersection(set(DEGs_pr['AGI']))
missing_DEGs = pd.Series(list(missing_DEGs))
missing_DEGs.to_csv(os.path.join(data_path, 'Araport11_2016_missingDEGs.txt'), index=False, header=False)

In [None]:
# Loading checkpoints
notmissing_DEGs = pd.read_csv(os.path.join(data_path, 'Araport11_2016_notmissing.txt'))
missing_DEGs = pd.read_csv(os.path.join(data_path, 'Araport11_2016_missingDEGs.txt'))

## Step 2 - Clustering / DEG Detection

I will use the papers DEG and cluster results

In [82]:
DEGs_pr

Unnamed: 0,AGI,Alias,cluster_id,DE_6,DE_12,DE_18,DE_24,DE_30,DE_36
0,AT1G01140,"CIPK9, PKS6, SnRK3.12",1,1,0,0,0,0,0
1,AT1G02400,"ATGA2OX4, ATGA2OX6, DTA1, GA2OX6",1,0,1,0,0,0,1
2,AT1G04700,,1,0,0,0,1,0,0
3,AT1G09080,BIP3,1,1,0,0,0,0,0
4,AT1G12040,LRX1,1,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
2734,AT5G23405,,49,0,0,0,1,0,0
2735,AT5G62190,PRH75,49,0,0,0,1,0,0
2736,AT4G03470,,50,1,0,0,0,0,0
2737,AT4G24770,"ATRBP31, ATRBP33, CP31, RBP31",50,1,1,0,0,0,0


## Step 3 - Motif Detection Using AME

combined_motifs.meme is the DAP-Seq database

In [8]:
import os
import argparse
import subprocess
import multiprocessing
from Bio import SeqIO
import random
random.seed(33)

### Running with control sequences

In [88]:
motif_detection_path = '/home/temccrac/Programs/data/cellular_clarity/dap_seq_data/dap_data_v4/ArabidopsisDAPv1.meme'
promoters_fasta = '/home/temccrac/Programs/cellular_clarity_project/checkpoints/promoters_merged.fa'
#out_dir = "/home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment"
ame_executable = "/home/temccrac/meme/bin/ame"                        # AME executable (or provide full path)
additional_params = ""                        # Additional parameters for AME (if any)
cpus = 10                                      # Number of parallel processes

DEGs_pr = pd.read_csv(os.path.join(data_path, 'paperresults/DEGs_clusters_epidermis_paperresults.csv'), index_col=None) # We are working with the Epidermis data
DEGs_included = DEGs_pr[~DEGs_pr['AGI'].isin(missing)]
cluster_sizes = DEGs_included['cluster_id'].value_counts()
print('smallest cluster size is', cluster_sizes.min(), 'genes')

foreground_groups = DEGs_included.groupby("cluster_id")["AGI"].apply(list).to_dict()

group_sizes = [len(genes) for genes in foreground_groups.values()]
average_size = sum(group_sizes) / len(group_sizes)
non_DEGs = pd.read_csv(os.path.join(data_path, 'paperresults/non_DEGs.csv'), header=None, index_col = None)
non_DEGs_list = non_DEGs[0].tolist()
non_DEGs_list = [id for id in non_DEGs_list if id not in missing]

non_DEGs_sampled = random.sample(non_DEGs_list, len(DEGs_included))
groups = np.array_split(non_DEGs_sampled, 50)
background_groups = {i+1: list(group) for i, group in enumerate(groups)}

smallest cluster size is 3 genes


In [89]:
# Set of functions for processing with a control 
def run_ame(fg_fasta, bg_fasta, ame_output_dir, motif_detection_path, ame_executable, additional_params):
    """
    Runs the AME command using the provided motif file (first positional argument),
    the foreground FASTA (second positional argument), and the background FASTA via --control.
    """
    cmd = [
        ame_executable,
        "--oc", ame_output_dir,
        "--control", bg_fasta,
        fg_fasta,               # First positional argument: sequence file (foreground)
        motif_detection_path    # Next positional argument: motif file in MEME format
    ]
    if additional_params:
        cmd.extend(additional_params.split())
    print("Running AME command:", " ".join(cmd))
    subprocess.run(cmd, check=True)

# Function to process one group: create FASTA files and run AME, updated after trial 6
def process_group(group_name, fg_gene_ids, bg_gene_ids, promoters_dict, out_dir, motif_detection_path, ame_executable, additional_params):
    # Define output FASTA file paths for foreground and background
    fg_fasta = os.path.join(out_dir, f"{group_name}_foreground.fa")
    bg_fasta = os.path.join(out_dir, f"{group_name}_background.fa")
    
    # Write the foreground FASTA (genes in this group)
    with open(fg_fasta, "w") as fg_handle:
        for gene in fg_gene_ids:
            if gene in promoters_dict:
                record_val = promoters_dict[gene]
                if isinstance(record_val, list):
                    # Write all promoter records for this gene
                    for record in record_val:
                        SeqIO.write(record, fg_handle, "fasta")
                else:
                    # Write the single promoter record
                    SeqIO.write(record_val, fg_handle, "fasta")
            else:
                print(f"Warning: Foreground gene {gene} not found in promoters dictionary.")
    
    # Write the background FASTA (all other genes for this group)
    with open(bg_fasta, "w") as bg_handle:
        for gene in bg_gene_ids:
            if gene in promoters_dict:
                record_val = promoters_dict[gene]
                if isinstance(record_val, list):
                    for record in record_val:
                        SeqIO.write(record, bg_handle, "fasta")
                else:
                    SeqIO.write(record_val, bg_handle, "fasta")
            else:
                print(f"Warning: Background gene {gene} not found in promoters dictionary.")
    
    # Create an output directory for AME results for this group
    ame_out = os.path.join(out_dir, f"{group_name}_ame")
    os.makedirs(ame_out, exist_ok=True)
    
    # Run AME
    run_ame(fg_fasta, bg_fasta, ame_out, motif_detection_path, ame_executable, additional_params)



# Function to load all promoters from a FASTA file into a dictionary keyed by gene ID
def load_promoters(promoters_fasta):
    """
    Loads the large FASTA file of promoters into a dictionary keyed by gene ID.
    Assumes that each record's id has the format 'AGI::...'
    and uses the part before the first "::" as the key.
    """
    # Append the record to the list of promoters for this gene
    promoters = {}
    for record in SeqIO.parse(promoters_fasta, "fasta"):
        # Standardize the gene id: split header (assumes format 'AGI::...'), then remove version info, added after trial 6
        # meant to include all version of a genes promoter that was extracted using my logic from above
        gene_id_with_version = record.id.split("::")[0]
        gene_id = gene_id_with_version.split('.')[0]
        # Extract the AGI by splitting on "::"
        gene_id = record.id.split("::")[0]
        promoters[gene_id] = record
    return promoters

In [None]:
# Ran this for comparing DEGs in clusters to average sized groups of non-DEGs
# Load the promoter sequences into a dictionary
promoters_dict = load_promoters(promoters_fasta)

# Prepare arguments for each group
args_list = []
for group_name, fg_gene_ids in foreground_groups.items():
    bg_gene_ids = background_groups.get(group_name, [])
    args_list.append((group_name, fg_gene_ids, bg_gene_ids, promoters_dict, out_dir, motif_detection_path, ame_executable, additional_params))

# Run AME for each group in parallel using multiprocessing
pool = multiprocessing.Pool(processes=cpus)
pool.starmap(process_group, args_list)
pool.close()
pool.join()

print("All AME analyses are complete.")

In [None]:
# Trying comparing each cluster to all background genes, Trial 4
out_dir = "/home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial4"
# Load promoter sequences into a dictionary
promoters_dict = load_promoters(promoters_fasta)

# Assuming non_DEGs_list is already defined as a list of all background gene IDs
args_list = []
for group_name, fg_gene_ids in foreground_groups.items():
    # Instead of getting a group-specific background, use the full list for each cluster
    bg_gene_ids = non_DEGs_list  
    args_list.append((group_name, fg_gene_ids, bg_gene_ids, promoters_dict, out_dir, motif_detection_path, ame_executable, additional_params))

# Run AME for each group in parallel using multiprocessing
pool = multiprocessing.Pool(processes=cpus)
pool.starmap(process_group, args_list)
pool.close()
pool.join()

print("All AME analyses are complete.")

In [None]:
# Trying 1000 sequence totals for foreground and background as a "rule of thumb", Trial 5
out_dir = "/home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial5"
# Load promoter sequences into a dictionary
#promoters_dict = load_promoters(promoters_fasta)

args_list = []
for group_name, fg_gene_ids in foreground_groups.items():
    fg_size = len(fg_gene_ids)
    bg_size = 1000 - fg_size  # Calculate desired background group size
    # Randomly sample bg_size genes from non_DEGs_list (without replacement)
    bg_gene_ids = random.sample(non_DEGs_list, bg_size)
    args_list.append((group_name, fg_gene_ids, bg_gene_ids, promoters_dict, out_dir, motif_detection_path, ame_executable, additional_params))

# Run AME for each group in parallel using multiprocessing
pool = multiprocessing.Pool(processes=cpus)
pool.starmap(process_group, args_list)
pool.close()
pool.join()

print("All AME analyses are complete.")


In [None]:
# Trying 20 iterations of this method above in trial 5, then synthesizing results, Trial 6
out_dir = "/home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial6"
# Generate a list of 20 random seeds
random_seeds = [42, 7, 13, 21, 35, 56, 78, 89, 101, 202, 303, 404, 505, 606, 707, 808, 909, 1001, 1101, 1201]

# We'll store the analysis results (e.g., the motif enrichment dictionary for each iteration) in a list
all_iteration_results = []

for i, seed in enumerate(random_seeds):
    print(f"\n=== Iteration {i+1} with seed {seed} ===")
    random.seed(seed)
    
    # Create an output directory for this iteration
    iter_out_dir = os.path.join(out_dir, f"iter_{i+1}")
    os.makedirs(iter_out_dir, exist_ok=True)
    
    # Build arguments for each foreground cluster
    args_list = []
    for group_name, fg_gene_ids in foreground_groups.items():
        fg_size = len(fg_gene_ids)
        bg_size = 1000 - fg_size  # desired background group size
        # Sample background genes from the full non_DEGs_list
        bg_gene_ids = random.sample(non_DEGs_list, bg_size)
        args_list.append((group_name, fg_gene_ids, bg_gene_ids, promoters_dict, iter_out_dir, motif_detection_path, ame_executable, additional_params))
    
    # Run AME for each cluster in parallel for this iteration
    pool = multiprocessing.Pool(processes=cpus)
    pool.starmap(process_group, args_list)
    pool.close()
    pool.join()
    print(f"Iteration {i+1}: AME analyses complete.")

print("All 20 iterations complete.")


In [None]:
# testing
out_dir = "/home/temccrac/Programs/cellular_clarity_project/checkpoints/testing"
# Generate a list of 20 random seeds
random_seeds = [42, 7]

# We'll store the analysis results (e.g., the motif enrichment dictionary for each iteration) in a list
all_iteration_results = []

for i, seed in enumerate(random_seeds):
    print(f"\n=== Iteration {i+1} with seed {seed} ===")
    random.seed(seed)
    
    # Create an output directory for this iteration
    iter_out_dir = os.path.join(out_dir, f"iter_{i+1}")
    os.makedirs(iter_out_dir, exist_ok=True)
    
    # Build arguments for each foreground cluster
    args_list = []
    for group_name, fg_gene_ids in foreground_groups.items():
        fg_size = len(fg_gene_ids)
        bg_size = 1000 - fg_size  # desired background group size
        # Sample background genes from the full non_DEGs_list
        bg_gene_ids = random.sample(non_DEGs_list, bg_size)
        args_list.append((group_name, fg_gene_ids, bg_gene_ids, promoters_dict, iter_out_dir, motif_detection_path, ame_executable, additional_params))
    
    # Run AME for each cluster in parallel for this iteration
    pool = multiprocessing.Pool(processes=cpus)
    pool.starmap(process_group, args_list)
    pool.close()
    pool.join()
    print(f"Iteration {i+1}: AME analyses complete.")

print("All 20 iterations complete.")


### Running without controls 

In [None]:
motif_detection_path = '/home/temccrac/Programs/data/cellular_clarity/dap_seq_data/dap_data_v4/ArabidopsisDAPv1.meme'
promoters_fasta = '/home/temccrac/Programs/cellular_clarity_project/checkpoints/promoters_merged.fa'
#out_dir = "/home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment"
ame_executable = "/home/temccrac/meme/bin/ame"                        # AME executable (or provide full path)
additional_params = ""                        # Additional parameters for AME (if any)
cpus = 10                                      # Number of parallel processes

DEGs_pr = pd.read_csv(os.path.join(data_path, 'paperresults/DEGs_clusters_epidermis_paperresults.csv'), index_col=None) # We are working with the Epidermis data
DEGs_included = DEGs_pr[~DEGs_pr['AGI'].isin(missing)]
cluster_sizes = DEGs_included['cluster_id'].value_counts()
print('smallest cluster size is', cluster_sizes.min(), 'genes')

foreground_groups = DEGs_included.groupby("cluster_id")["AGI"].apply(list).to_dict()

group_sizes = [len(genes) for genes in foreground_groups.values()]
average_size = sum(group_sizes) / len(group_sizes)
non_DEGs = pd.read_csv(os.path.join(data_path, 'paperresults/non_DEGs.csv'), header=None, index_col = None)
non_DEGs_list = non_DEGs[0].tolist()
non_DEGs_list = [id for id in non_DEGs_list if id not in missing]

non_DEGs_sampled = random.sample(non_DEGs_list, len(DEGs_included))
groups = np.array_split(non_DEGs_sampled, 50)
background_groups = {i+1: list(group) for i, group in enumerate(groups)}
random.seed(7)

In [113]:
def run_ame_no_control(fg_fasta, ame_output_dir, motif_detection_path, ame_executable, additional_params):
    """
    Runs the AME command using the provided motif file (last positional argument)
    and the foreground FASTA (preceding the motif file), with the -control --shuffle-- option
    specified to tell AME to use its internal shuffling method as a background.
    
    The resulting command will look like:
    <ame_executable> --oc <ame_output_dir> -control --shuffle-- <fg_fasta> <motif_detection_path>
    """
    # Build the command exactly as you want it
    cmd = [
        ame_executable,
        "--oc", ame_output_dir,
        "-control", "--shuffle--",
        fg_fasta,
        motif_detection_path
    ]
    
    # Optionally extend the command with additional parameters if provided
    if additional_params:
        cmd.extend(additional_params.split())
    
    print("Running AME command:", " ".join(cmd))
    subprocess.run(cmd, check=True)

def process_group_no_control(group_name, fg_gene_ids, promoters_dict, out_dir, motif_detection_path, ame_executable, additional_params):
    """
    Processes one group by creating a FASTA file from the foreground gene list.
    For each gene, all promoter versions (if any) are written.
    Then runs AME using the --shuffle option (i.e. without an explicit background).
    """
    # Define output FASTA file path for foreground
    fg_fasta = os.path.join(out_dir, f"{group_name}_foreground.fa")
    
    # Write the foreground FASTA (genes in this group)
    with open(fg_fasta, "w") as fg_handle:
        for gene in fg_gene_ids:
            if gene in promoters_dict:
                record_val = promoters_dict[gene]
                # If record_val is a list (multiple promoter versions), iterate over all records.
                if isinstance(record_val, list):
                    for record in record_val:
                        SeqIO.write(record, fg_handle, "fasta")
                else:
                    # Otherwise, write the single promoter record.
                    SeqIO.write(record_val, fg_handle, "fasta")
            else:
                print(f"Warning: Foreground gene {gene} not found in promoters dictionary.")
    
    # Create an output directory for AME results for this group
    ame_out = os.path.join(out_dir, f"{group_name}_ame")
    os.makedirs(ame_out, exist_ok=True)
    
    # Run AME without an explicit control (using --shuffle)
    run_ame_no_control(fg_fasta, ame_out, motif_detection_path, ame_executable, additional_params)

def load_promoters(promoters_fasta):
    """
    Loads the large FASTA file of promoters into a dictionary keyed by standardized gene ID.
    Instead of overwriting previous records, multiple promoter records for the same gene are stored in a list.
    Assumes each record's id has the format 'AGI::...' where AGI may include a version indicator (e.g., AT2G20724.1).
    Standardization is done so that all versions are keyed by the base gene ID (e.g., AT2G20724).
    """
    promoters = {}
    for record in SeqIO.parse(promoters_fasta, "fasta"):
        # Get the part before "::"
        gene_id_with_version = record.id.split("::")[0]
        # Standardize by removing any version information (e.g., split on '.' and keep the first element)
        base_gene_id = gene_id_with_version.split('.')[0].strip()
        if base_gene_id in promoters:
            promoters[base_gene_id].append(record)
        else:
            promoters[base_gene_id] = [record]
    return promoters

In [None]:
# Redoing trial 2 as trial 7, method that Selene gave me separately processing the samples with shuffling
out_dir = "/home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial7"
# Build argument lists for foreground groups and background groups separately.
fg_args_list = []
for group_name, fg_gene_ids in foreground_groups.items():
    # Prefix the group name to indicate foreground
    key = f"fg_{group_name}"
    fg_args_list.append((key, fg_gene_ids, promoters_dict, out_dir, motif_detection_path, ame_executable, additional_params))

bg_args_list = []
for group_name, bg_gene_ids in background_groups.items():
    # Prefix the group name to indicate background
    key = f"bg_{group_name}"
    bg_args_list.append((key, bg_gene_ids, promoters_dict, out_dir, motif_detection_path, ame_executable, additional_params))

# Combine both lists
all_args = fg_args_list + bg_args_list

# Run AME for each group in parallel (without an explicit control, using --shuffle)
with multiprocessing.Pool(processes=cpus) as pool:
    pool.starmap(process_group_no_control, all_args)

print("All AME analyses for both foreground and background groups are complete.")

## Step 4 - Processing Motif Information Output

In [138]:
# Takes one cluster's motif output and converts it to a dataframe with all the motifs mapped to genes they were identified in
def process_cluster_motifs_combined(cluster_dir):
    """
    For a given cluster directory, this function reads both the AME summary file (ame.tsv)
    and the sequences file (sequences.tsv). It performs filtering on the AME summary file
    (excluding motifs whose motif_ID contains 'colamp' or whose 'adj_p-value' is <= 0.01),
    then uses the valid motif IDs to filter the sequences mapping file.
    If the sequences.tsv file is empty, it returns an empty DataFrame.
    """
    import os
    import pandas as pd

    ame_file = None
    seq_file = None

    # Find the relevant files in the cluster directory.
    for file in os.listdir(cluster_dir):
        if file.endswith("ame.tsv"):
            ame_file = os.path.join(cluster_dir, file)
        elif file.endswith("sequences.tsv"):
            seq_file = os.path.join(cluster_dir, file)
    
    # Check for file existence.
    if ame_file is None or seq_file is None:
        print(f"Warning: Missing file in {cluster_dir} (ame.tsv or sequences.tsv).")
        return pd.DataFrame(columns=['motif_ID', 'gene'])
    
    # Read the AME summary file.
    try:
        df_ame = pd.read_csv(ame_file, delimiter='\t')
    except pd.errors.EmptyDataError:
        print(f"Warning: AME file {ame_file} is empty.")
        return pd.DataFrame(columns=['motif_ID', 'gene'])
    
    if df_ame.empty:
        return pd.DataFrame(columns=['motif_ID', 'gene'])
    
    # Check for the 'adj_p-value' column.
    if 'adj_p-value' not in df_ame.columns:
        print(f"Warning: 'adj_p-value' not found in {ame_file}. Returning empty DataFrame.")
        return pd.DataFrame(columns=['motif_ID', 'gene'])
    
    # Convert 'adj_p-value' to numeric.
    df_ame['adj_p-value'] = pd.to_numeric(df_ame['adj_p-value'], errors='coerce')
    
    # Filter out rows based on motif_ID and adjusted p-value.
    df_ame = df_ame[~df_ame['motif_ID'].str.contains('colamp', case=False, na=False)]
    df_ame = df_ame[df_ame['adj_p-value'] <= 0.01]
    
    # Read the sequences.tsv file.
    try:
        df_seq = pd.read_csv(seq_file, delimiter='\t')
    except pd.errors.EmptyDataError:
        print(f"Warning: Sequences file {seq_file} is empty. Skipping processing.")
        return pd.DataFrame(columns=['motif_ID', 'gene'])
    
    if df_seq.empty:
        print(f"Warning: Sequences file {seq_file} is empty. Skipping processing.")
        return pd.DataFrame(columns=['motif_ID', 'gene'])
    
    # Filter the sequences file to keep only rows with valid motif_IDs.
    valid_motifs = set(df_ame['motif_ID'].unique())
    df_seq_filtered = df_seq[df_seq['motif_ID'].isin(valid_motifs)].copy()
    
    # Use a regular expression to extract the gene ID from the 'seq_ID' field.
    pattern = r'^(.+?)::'
    df_seq_filtered['gene'] = df_seq_filtered['seq_ID'].str.extract(pattern, expand=False)
    
    # Select only the columns we need.
    df_final = df_seq_filtered[['motif_ID', 'gene']].dropna().copy()
    
    return df_final



def analyze_motif_enrichment(ameoutputdir):
    """
    Walks through the provided output directory (ameoutputdir) to locate cluster subdirectories.
    Each subdirectory is expected to have both an AME summary file (ame.tsv) and a sequences file (sequences.tsv).
    The function processes each cluster using process_cluster_motifs_combined and assembles a dictionary where
    each key is the cluster (as derived from the directory name) and the value is a DataFrame of motif–gene pairs.
    """
    motif_results_dirs = []
    # Walk through the output directory (only at one level if your clusters are directly subdirectories)
    for entry in os.scandir(ameoutputdir):
        if entry.is_dir() and '_ame' in entry.name:  # e.g., directory names like "38_ame_foreground" or "38_ame_background"
            motif_results_dirs.append(entry.path)
    
    motif_enrichment_dict = {}
    for cluster_dir in motif_results_dirs:
        # Optionally, extract a simpler key from the directory name.
        # For example, if the directory name starts with a number:
        key = os.path.basename(cluster_dir)
        # Process the cluster's motif files.
        df_result = process_cluster_motifs_combined(cluster_dir)
        motif_enrichment_dict[key] = df_result
        print(f"Processed cluster directory {cluster_dir} with {len(df_result)} motif-target pairs.")
    
    empty_count = sum(1 for v in motif_enrichment_dict.values() if v.empty)
    print(f"{empty_count} dictionary entries are empty.")
    unique_motifs = set()
    for key, df in motif_enrichment_dict.items():
        if not df.empty:
            unique_motifs.update(df['motif_ID'].unique())
    print("Unique motif_ID count across clusters:", len(unique_motifs))
    
    return motif_enrichment_dict

In [139]:
# Analyze trial 7
# Analzying motif enrichments for trial 7 
motif_enrichment_path = '/home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial7'
dict = analyze_motif_enrichment(motif_enrichment_path)
print('dict created')
# Understanding results 
deg_set = set(DEGs_pr['AGI'])

for cluster, df in dict.items():
    if df.empty:
        print(f"Cluster {cluster} has 0 unique motifs and 0 unique target genes (empty).")
    else:
        unique_motifs = df['motif_ID'].unique()
        unique_targets = df['gene'].unique()
        # Count how many unique targets are in the DEGs_pr['AGI'] list.
        targets_in_DEGs = [gene for gene in unique_targets if gene in deg_set]
        print(f"Cluster {cluster}: {len(unique_motifs)} unique motifs, {len(unique_targets)} unique target genes, "
              f"with {len(targets_in_DEGs)} of them found in DEGs_pr['AGI'].")

Processed cluster directory /home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial7/bg_24_ame with 4891 motif-target pairs.
Processed cluster directory /home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial7/bg_49_ame with 2823 motif-target pairs.
Processed cluster directory /home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial7/bg_11_ame with 3980 motif-target pairs.
Processed cluster directory /home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial7/fg_48_ame with 4904 motif-target pairs.
Processed cluster directory /home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial7/bg_29_ame with 5393 motif-target pairs.
Processed cluster directory /home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial7/bg_45_ame with 2280 motif-target pairs.
Processed cluster directory /home/temccrac/Programs/cellular_clarity_project/check

In [None]:
# Analzying motif enrichments for trial 4 - mostly empty results still 
motif_enrichment_path = '/home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial4'
dict = analyze_motif_enrichment(motif_enrichment_path)

/home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial4/47_ame_foreground/sequences.tsv
/home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial4/42_ame_foreground/sequences.tsv
/home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial4/38_ame/sequences.tsv
/home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial4/43_ame_background/sequences.tsv
/home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial4/49_ame_background/sequences.tsv
/home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial4/47_ame_background/sequences.tsv
/home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial4/1_ame/sequences.tsv
/home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial4/36_ame/sequences.tsv
/home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial4/37_ame/sequen

In [None]:
# Analzying motif enrichments for trial 5 - improved stats dependent on the comparison sequences used
motif_enrichment_path = '/home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial5'
dict = analyze_motif_enrichment(motif_enrichment_path)

/home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial5/38_ame/sequences.tsv
/home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial5/1_ame/sequences.tsv
/home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial5/36_ame/sequences.tsv
/home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial5/37_ame/sequences.tsv
/home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial5/12_ame/sequences.tsv
/home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial5/28_ame/sequences.tsv
/home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial5/11_ame/sequences.tsv
/home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial5/43_ame/sequences.tsv
/home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial5/3_ame/sequences.tsv
/home/temccrac/Programs/cellular_clarity_project

In [116]:
# Analyze trial 6
def process_cluster_motifs(path):
    try:
        df = pd.read_csv(path, delimiter='\t')
    except pd.errors.EmptyDataError:
        return pd.DataFrame(columns=['motif_ID', 'gene'])
    
    if df.empty:
        return pd.DataFrame(columns=['motif_ID', 'gene'])
    
    # Select the subset of interest
    df_subset = df[['motif_ID', 'seq_ID']].copy()
    pattern = r'^(.+?)::'
    # Use .loc to assign the new column to avoid SettingWithCopyWarning
    df_subset.loc[:, 'gene'] = df_subset['seq_ID'].str.extract(pattern, expand=False)
    # Optionally, if you want to drop the original seq_ID column, just select the desired columns.
    df_subset = df_subset[['motif_ID', 'gene']]
    return df_subset

def synthesize_results(out_dir):
    """
    Walks through the provided output directory (out_dir) to collect motif result files 
    (e.g., sequences.tsv) from all subdirectories (each representing a cluster and/or an iteration)
    and synthesizes them into a final dictionary. Each key in the dictionary is derived 
    from the cluster's directory name, and the value is a DataFrame of combined motif-target pairs.
    """
    final_dict = {}
    # Walk recursively through the out_dir
    for root, dirs, files in os.walk(out_dir):
        # Check if the current directory contains a motif output file
        for file in files:
            if file.endswith("sequences.tsv"):
                file_path = os.path.join(root, file)
                # Use the directory name that contains the result file as the cluster key.
                # If your directories are named something like "38_ame_foreground" or "38_ame_background", 
                # you could use a regex to extract the numeric cluster id.
                cluster_key = os.path.basename(root)
                # Optionally: use a regex to extract only the numeric part if needed:
                m = re.search(r'(\d+)', cluster_key)
                if m:
                    cluster_key = m.group(1)
                
                print(f"Processing: {file_path} as cluster {cluster_key}")
                df = process_cluster_motifs(file_path)
                if cluster_key in final_dict:
                    # Concatenate new data and drop duplicate rows (based on motif_ID and gene)
                    final_dict[cluster_key] = pd.concat([final_dict[cluster_key], df]).drop_duplicates(subset=['motif_ID', 'gene'])
                else:
                    final_dict[cluster_key] = df.copy()
    
    return final_dict

# Define your large out_dir that contains multiple cluster results directories.
# For example:
out_dir = "/home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial6"

# Synthesize all the results into one final dictionary.
final_results = synthesize_results(out_dir)

# Summary: Count clusters with no motifs and total unique motifs across clusters.
clusters_with_no_motifs = [cluster for cluster, df in final_results.items() if df.empty]
num_clusters_no_motifs = len(clusters_with_no_motifs)
print(f"Number of clusters with no motifs: {num_clusters_no_motifs}")

# Compute total unique motifs (across all clusters)
all_unique_motifs = set()
for df in final_results.values():
    if not df.empty:
        all_unique_motifs.update(df['motif_ID'].unique())
print("Total unique motif_ID count across clusters:", len(all_unique_motifs))

# (Optional) Print summary per cluster:
for cluster, df in final_results.items():
    unique_motifs = df['motif_ID'].unique()
    unique_targets = df['gene'].unique()
    print(f"Cluster {cluster}: {len(unique_motifs)} unique motifs, {len(unique_targets)} unique target genes.")


Processing: /home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial6/iter_20/38_ame/sequences.tsv as cluster 38
Processing: /home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial6/iter_20/1_ame/sequences.tsv as cluster 1
Processing: /home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial6/iter_20/36_ame/sequences.tsv as cluster 36
Processing: /home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial6/iter_20/37_ame/sequences.tsv as cluster 37
Processing: /home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial6/iter_20/12_ame/sequences.tsv as cluster 12
Processing: /home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial6/iter_20/28_ame/sequences.tsv as cluster 28
Processing: /home/temccrac/Programs/cellular_clarity_project/checkpoints/motif_enrichment_trial6/iter_20/11_ame/sequences.tsv as cluster 11
Processing: /home/temc

In [96]:
# Analysis for testing 
# Define your large out_dir that contains multiple cluster results directories.
# For example:
out_dir = "/home/temccrac/Programs/cellular_clarity_project/checkpoints/testing"

# Synthesize all the results into one final dictionary.
final_results = synthesize_results(out_dir)

# Summary: Count clusters with no motifs and total unique motifs across clusters.
clusters_with_no_motifs = [cluster for cluster, df in final_results.items() if df.empty]
num_clusters_no_motifs = len(clusters_with_no_motifs)
print(f"Number of clusters with no motifs: {num_clusters_no_motifs}")

# Compute total unique motifs (across all clusters)
all_unique_motifs = set()
for df in final_results.values():
    if not df.empty:
        all_unique_motifs.update(df['motif_ID'].unique())
print("Total unique motif_ID count across clusters:", len(all_unique_motifs))

# (Optional) Print summary per cluster:
for cluster, df in final_results.items():
    unique_motifs = df['motif_ID'].unique()
    unique_targets = df['gene'].unique()
    print(f"Cluster {cluster}: {len(unique_motifs)} unique motifs, {len(unique_targets)} unique target genes.")

Processing: /home/temccrac/Programs/cellular_clarity_project/checkpoints/testing/iter_2/38_ame/sequences.tsv as cluster 38
Processing: /home/temccrac/Programs/cellular_clarity_project/checkpoints/testing/iter_2/1_ame/sequences.tsv as cluster 1
Processing: /home/temccrac/Programs/cellular_clarity_project/checkpoints/testing/iter_2/36_ame/sequences.tsv as cluster 36
Processing: /home/temccrac/Programs/cellular_clarity_project/checkpoints/testing/iter_2/37_ame/sequences.tsv as cluster 37
Processing: /home/temccrac/Programs/cellular_clarity_project/checkpoints/testing/iter_2/12_ame/sequences.tsv as cluster 12
Processing: /home/temccrac/Programs/cellular_clarity_project/checkpoints/testing/iter_2/28_ame/sequences.tsv as cluster 28
Processing: /home/temccrac/Programs/cellular_clarity_project/checkpoints/testing/iter_2/11_ame/sequences.tsv as cluster 11
Processing: /home/temccrac/Programs/cellular_clarity_project/checkpoints/testing/iter_2/43_ame/sequences.tsv as cluster 43
Processing: /home/

In [140]:
def create_motif_matrix(motif_enrichment_dict):
    """
    Given a dictionary where each value is a DataFrame with columns 'motif_ID' and 'gene',
    this function builds a binary matrix X.
    
    - Rows: Unique target genes from across all clusters.
    - Columns: Unique motif_IDs from across all clusters.
    - X[i, j] is 1 if gene i was associated with motif j in any cluster, else 0.
    
    Parameters:
        motif_enrichment_dict (dict): Dictionary from analyze_motif_enrichment().
        
    Returns:
        X (pd.DataFrame): Binary matrix with gene IDs as index and motif_IDs as columns.
    """
    # Collect all unique genes and motifs from across clusters.
    all_genes = set()
    all_motifs = set()
    for cluster, df in motif_enrichment_dict.items():
        if not df.empty:
            all_genes.update(df['gene'].unique())
            all_motifs.update(df['motif_ID'].unique())
    
    all_genes = sorted(list(all_genes))
    all_motifs = sorted(list(all_motifs))
    
    # Create a DataFrame initialized with zeros.
    X = pd.DataFrame(0, index=all_genes, columns=all_motifs)
    
    # Fill in the matrix: set to 1 for every gene-motif pair present in any cluster.
    for cluster, df in motif_enrichment_dict.items():
        if not df.empty:
            for _, row in df.iterrows():
                gene = row['gene']
                motif = row['motif_ID']
                # Set the corresponding cell to 1.
                X.loc[gene, motif] = 1
                
    return X

# Example usage:
# Assuming `dict` is the result from analyze_motif_enrichment:
X = create_motif_matrix(dict)
print(X.head())

           ABI3VP1_tnt.AT5G18090_col_a_m1  ABI3VP1_tnt.AT5G60130_col_a_m1  \
AT1G01140                               0                               0   
AT1G01180                               0                               1   
AT1G01210                               0                               0   
AT1G01260                               1                               0   
AT1G01300                               0                               0   

           ABI3VP1_tnt.FUS3_col_a_m1  ABI3VP1_tnt.REM16_col_a_m1  \
AT1G01140                          0                           0   
AT1G01180                          0                           0   
AT1G01210                          0                           0   
AT1G01260                          0                           0   
AT1G01300                          0                           0   

           ABI3VP1_tnt.VRN1_col_a_m1  AP2EREBP_tnt.ABR1_col_a_m1  \
AT1G01140                          0                        

In [141]:
# Identify rows where every column is 0
zero_rows = X[(X == 0).all(axis=1)]

# Print the target genes (row indices) that have all zeros
print("Target genes with no associated motifs (all zero rows):")
print(list(zero_rows.index))

# Optionally, print the count of such rows
print("Number of rows with all zeros:", len(zero_rows))


Target genes with no associated motifs (all zero rows):
[]
Number of rows with all zeros: 0


expecting no all zero rows^

In [143]:
non_DEGs_list

['AT1G01010',
 'AT1G01030',
 'AT1G01050',
 'AT1G01183',
 'AT1G01220',
 'AT1G01230',
 'AT1G01240',
 'AT1G01260',
 'AT1G01290',
 'AT1G01300',
 'AT1G01340',
 'AT1G01350',
 'AT1G01360',
 'AT1G01370',
 'AT1G01380',
 'AT1G01400',
 'AT1G01440',
 'AT1G01470',
 'AT1G01490',
 'AT1G01500',
 'AT1G01540',
 'AT1G01550',
 'AT1G01570',
 'AT1G01580',
 'AT1G01590',
 'AT1G01630',
 'AT1G01710',
 'AT1G01720',
 'AT1G01725',
 'AT1G01740',
 'AT1G01780',
 'AT1G01800',
 'AT1G01810',
 'AT1G01820',
 'AT1G01870',
 'AT1G01900',
 'AT1G01910',
 'AT1G01930',
 'AT1G01940',
 'AT1G01950',
 'AT1G01960',
 'AT1G01980',
 'AT1G02000',
 'AT1G02010',
 'AT1G02020',
 'AT1G02090',
 'AT1G02100',
 'AT1G02130',
 'AT1G02140',
 'AT1G02145',
 'AT1G02170',
 'AT1G02210',
 'AT1G02220',
 'AT1G02250',
 'AT1G02270',
 'AT1G02340',
 'AT1G02390',
 'AT1G02400',
 'AT1G02460',
 'AT1G02480',
 'AT1G02520',
 'AT1G02540',
 'AT1G02560',
 'AT1G02590',
 'AT1G02600',
 'AT1G02630',
 'AT1G02650',
 'AT1G02670',
 'AT1G02681',
 'AT1G02720',
 'AT1G02730',
 'AT1G

In [142]:
X.shape

(5240, 371)

In [144]:
len(DEGs_pr['AGI'])

2739

In [145]:
X.to_csv(os.path.join('/home/temccrac/Programs/git_clones/ECE759_Project/model_inputs', 'motif_features'))