In [1]:
import pandas as pd
from glob import glob
import subprocess

In [2]:
# outpath='../../results/FindMotifs/quantseq_groups'
outpath='../../results/FindMotifs/full_utrs'

In [3]:
%%bash
patterns='AAA AATAAA GAT TTT ATT GGA,GGT'
# outpath=../../results/FindMotifs/quantseq_groups
outpath=../../results/FindMotifs/full_utrs


# utrs=../../results/GeneGroups_Quantseq/3UtrLoci/QuantseqGroups.bed
utrs=../../data/3UtrAtlas/ThreePrimeUtrsOfMostExpressedTxInS200WT2iL.bed
genome=../../data/genomes/Goodwright_m39/GRCm39.primary_assembly.genome.fa


mkdir -p $outpath

# Extract 3'UTR fasta - ignore strand
bedtools getfasta -name -fi $genome -bed $utrs > ${outpath}/full_utrs_noRevComp.fa

# Connect headers to relevant strand
awk '{new_var=$4"::"$1":"$2"-"$3; print new_var"\t"$6}' $utrs > ${outpath}/id_strand.tsv


# Locate subsequences - both positive and negative strand
for p in $patterns
do
  # Replace comma with space to pass multiple strings
  motifs=${p//','/' -p '}
  cat ${outpath}/full_utrs_noRevComp.fa \
  | seqkit locate --ignore-case --max-mismatch 0 --hide-matched -p $motifs --bed \
    > ${outpath}/full_utrs_noRevComp_locate_${p}.tsv
done

In [4]:
df_ids = pd.read_csv(f'{outpath}/id_strand.tsv', sep='\t', header=None, names=['id', 'strand']).astype(str)
files = glob(f'{outpath}/full_utrs_noRevComp_locate_*.tsv')

for f in files:
    m = f.split('_')[-1].split('.')[0]
    df_motifs = pd.read_csv(f, sep='\t', names=['id', 'start', 'end', 'motif', 'score', 'strand'])
    df_motifs[['id', 'strand']] = df_motifs[['id', 'strand']].astype(str)


    # Use the inner merge to only keep identified sites for the relevant strand
    df_motifs = df_ids.merge(df_motifs, on=['id', 'strand'], how='inner')

    # Separate id elements to get genomic coordinates
    df_motifs['reg_start'] = df_motifs.id.apply(lambda x: x.split(':')[-1].split('-')[0]).astype(int)
    df_motifs['reg_end'] = df_motifs.id.apply(lambda x: x.split(':')[-1].split('-')[1]).astype(int)
    df_motifs['chr'] = df_motifs.id.apply(lambda x: x.split(':')[2])
    df_motifs['regulation'] = df_motifs.id.apply(lambda x: x.split('::')[0])

    # Get genomic coordinates of motifs
    df_motifs['mot_start'] = df_motifs[['reg_start', 'start']].sum(axis='columns')
    df_motifs['mot_end'] = df_motifs[['reg_start', 'end']].sum(axis='columns')

    # Save bed file - checked in IGV -  worked perfectly
    bed = df_motifs[['chr', 'mot_start', 'mot_end', 'regulation', 'score', 'strand']]
    bed.to_csv(f'{outpath}/motifs_{m}.bed.gz', sep='\t', header=None, index=False, quoting=False)

In [9]:
%%bash
motifs='AAA AATAAA GAT TTT ATT GGA,GGT'
# outpath=../../results/FindMotifs/quantseq_groups
outpath=../../results/FindMotifs/full_utrs

for m in $motifs; do
    zcat $outpath/motifs_${m}.bed.gz | sort -k1,1 -k2,2n | bedtools merge -s -c 4,5,6 -o distinct,sum,distinct | gzip > $outpath/motifs_${m}_merged.bed.gz
done

In [6]:
df_motifs.head()

Unnamed: 0,id,strand,start,end,motif,score,reg_start,reg_end,chr,regulation,mot_start,mot_end
0,ENSMUSG00000063316::chr11:101336310-101336355,+,25,28,AAA,0,101336310,101336355,chr11,ENSMUSG00000063316,101336335,101336338
1,ENSMUSG00000063316::chr11:101336310-101336355,+,26,29,AAA,0,101336310,101336355,chr11,ENSMUSG00000063316,101336336,101336339
2,ENSMUSG00000063316::chr11:101336310-101336355,+,27,30,AAA,0,101336310,101336355,chr11,ENSMUSG00000063316,101336337,101336340
3,ENSMUSG00000063316::chr11:101336310-101336355,+,32,35,AAA,0,101336310,101336355,chr11,ENSMUSG00000063316,101336342,101336345
4,ENSMUSG00000063316::chr11:101336310-101336355,+,33,36,AAA,0,101336310,101336355,chr11,ENSMUSG00000063316,101336343,101336346


In [7]:
bed.head()

Unnamed: 0,chr,mot_start,mot_end,regulation,score,strand
0,chr11,101336335,101336338,ENSMUSG00000063316,0,+
1,chr11,101336336,101336339,ENSMUSG00000063316,0,+
2,chr11,101336337,101336340,ENSMUSG00000063316,0,+
3,chr11,101336342,101336345,ENSMUSG00000063316,0,+
4,chr11,101336343,101336346,ENSMUSG00000063316,0,+
