# Motif Detection of Every Gene Detected in Experimental Data

Teague McCracken 
03/30/2025

## Step 1 - Extract promoter sequences for each gene

In [None]:
# Initialize the notebook, set environment to transcriptomics
import os
import re
import subprocess

import pandas as pd

# File paths
# Reference genome files are resolved against one directory. Set the GENOME_DIR
# environment variable to use a different location without editing this cell;
# when it is unset the original default directory is used.
genome_dir = os.environ.get("GENOME_DIR", "/home/temccrac/Programs/data/genomes")
gtf = os.path.join(genome_dir, "Arabidopsis_thaliana.TAIR10.54.gtf")
gene_list = "genes_of_interest.txt"
genome_fa = os.path.join(genome_dir, "Arabidopsis_thaliana.TAIR10.dna.toplevel.fa")
genome_sizes = os.path.join(genome_dir, "Arabidopsis.genome")

# Intermediate and final outputs (relative paths, so they land in the working directory)
filtered_gtf = "filtered.gtf"
tss_bed = "tss.bed"
promoter_bed = "promoters.bed"
output_fasta = "promoters.fa"

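Inputs needed: <br>
1. `gtf` – GTF annotation of the reference genome <br>
2. `gene_list` – text file listing the genes detected in the experiment, one gene identifier per line <br>
3. `genome_fa` – reference genome FASTA <br>
4. `genome_sizes` – genome file passed to `bedtools flank -g`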

In [None]:
def filter_gtf_by_genes(gtf_file, gene_list_file, output_gtf):
    """Copy the GTF records of the requested genes into ``output_gtf``.

    A record is kept only when the value of its ``gene_id "..."`` attribute is
    exactly equal to one of the identifiers in ``gene_list_file``. Comment
    lines (starting with ``#``) and records without a ``gene_id`` attribute
    are dropped.

    Args:
        gtf_file: Path of the GTF annotation to read.
        gene_list_file: Path of a text file with one gene identifier per line.
            Surrounding whitespace is stripped and blank lines are ignored.
        output_gtf: Path of the filtered GTF to write (overwritten if present).
    """
    with open(gene_list_file, 'r') as f:
        genes = {entry.strip() for entry in f}
    # A blank line would otherwise become the identifier '' — never a real gene.
    genes.discard('')

    gene_id_pattern = re.compile(r'gene_id "([^"]+)"')

    with open(gtf_file, 'r') as gtf_in, open(output_gtf, 'w') as gtf_out:
        for line in gtf_in:
            if line.startswith('#'):
                continue
            found = gene_id_pattern.search(line)
            # Exact set membership: no partial-ID matches, and one hash lookup
            # per record instead of a scan over every requested gene.
            if found and found.group(1) in genes:
                gtf_out.write(line)

def extract_tss_bed(filtered_gtf, output_bed):
    """Write one 1-bp BED6 record at the start site of every transcript.

    For each GTF record whose feature column is ``transcript`` the output line
    is ``chrom, pos-1, pos, gene_id, ".", strand`` (tab-separated), where
    ``pos`` is the GTF start column on the ``+`` strand and the GTF end column
    for any other strand value.

    The file is parsed in Python so the step needs neither a particular awk
    implementation nor shell quoting of the file paths.

    Args:
        filtered_gtf: Path of the GTF file to read.
        output_bed: Path of the BED file to write (overwritten if present).

    Raises:
        ValueError: If a transcript record has a non-integer coordinate.
    """
    gene_id_pattern = re.compile(r'gene_id "([^"]+)"')

    with open(filtered_gtf, 'r') as gtf_in, open(output_bed, 'w') as bed_out:
        for line in gtf_in:
            if line.startswith('#'):
                continue
            fields = line.rstrip('\n').split('\t')
            # GTF has nine tab-separated columns; anything shorter is not a record.
            if len(fields) < 9 or fields[2] != 'transcript':
                continue
            found = gene_id_pattern.search(fields[8])
            if found is None:
                # Without a gene_id the interval cannot be attributed to a gene.
                continue
            strand = fields[6]
            tss = int(fields[3]) if strand == '+' else int(fields[4])
            # GTF is 1-based inclusive, BED is 0-based half-open: [tss-1, tss).
            bed_out.write(
                f"{fields[0]}\t{tss - 1}\t{tss}\t{found.group(1)}\t.\t{strand}\n"
            )

def create_promoter_bed(tss_bed, genome_sizes, output_bed, length=1000):
    """Run ``bedtools flank`` on ``tss_bed`` and save its stdout to ``output_bed``.

    The command asks for ``length`` bases with ``-l``, zero bases with ``-r``
    and passes ``-s``; ``genome_sizes`` is handed to bedtools as the ``-g``
    genome file.

    Args:
        tss_bed: Path of the input BED file.
        genome_sizes: Path of the genome file given to ``-g``.
        output_bed: Path that receives the command's standard output.
        length: Value passed to ``-l`` (default 1000).

    Raises:
        subprocess.CalledProcessError: If bedtools exits with a non-zero status.
    """
    flank_cmd = ["bedtools", "flank"]
    flank_cmd += ["-i", tss_bed]
    flank_cmd += ["-g", genome_sizes]
    flank_cmd += ["-l", str(length)]
    flank_cmd += ["-r", "0"]
    flank_cmd.append("-s")

    with open(output_bed, 'w') as bed_sink:
        subprocess.run(flank_cmd, stdout=bed_sink, check=True)

def extract_promoter_fasta(genome_fa, promoter_bed, output_fasta):
    """Run ``bedtools getfasta`` for the intervals in ``promoter_bed``.

    Args:
        genome_fa: Path of the genome FASTA passed as ``-fi``.
        promoter_bed: Path of the BED file passed as ``-bed``.
        output_fasta: Path passed as ``-fo``; bedtools writes the sequences there.

    Raises:
        subprocess.CalledProcessError: If bedtools exits with a non-zero status.
    """
    getfasta_cmd = ["bedtools", "getfasta"]
    for flag, value in (("-fi", genome_fa), ("-bed", promoter_bed), ("-fo", output_fasta)):
        getfasta_cmd.extend((flag, value))
    getfasta_cmd.append("-s")  # strand flag, kept as the last argument

    subprocess.run(getfasta_cmd, check=True)


In [None]:


# Run the pipeline; each step reads the file written by the previous one.
filter_gtf_by_genes(gtf_file=gtf, gene_list_file=gene_list, output_gtf=filtered_gtf)
extract_tss_bed(filtered_gtf=filtered_gtf, output_bed=tss_bed)
create_promoter_bed(tss_bed=tss_bed, genome_sizes=genome_sizes, output_bed=promoter_bed)
extract_promoter_fasta(genome_fa=genome_fa, promoter_bed=promoter_bed, output_fasta=output_fasta)

print("✅ Promoter FASTA extraction complete.")