In [1]:
import pandas as pd
import pybedtools as pbt
import os
import sys

scriptpath = "../../src/"
sys.path.append(os.path.abspath(scriptpath))
import GtfBedLibrary as gbl

In [2]:
# --- Analysis parameters ---
# Downstream extension sizes handed to StrandedSlop, one output file per size.
windows = [300, 500]
# Window size whose extended-interval file is read back for FASTA extraction;
# it has to be one of the values in `windows`, otherwise that file is never written.
FastaW = 300

# --- Genome reference files ---
# Genome sequence used to extract FASTA records.
fasta = '../../data/genomes/Goodwright_m39/GRCm39.primary_assembly.genome.fa'
# Chromosome sizes file, passed as the genome (`g`) argument to slop.
fai = '../../data/genomes/Goodwright_m39/chromsizes.txt'

# --- Output location ---
outpath = '../../data/3PrimeStarts/Quantseq'
os.makedirs(outpath, exist_ok=True)

# --- Input: filtered BED of Quantseq groups ---
BedRegions = gbl.ReadBed('../../results/GeneGroups_Quantseq/3UtrLoci/QuantseqGroups.bed')

In [3]:
# Preview the first rows of the imported regions as a sanity check.
BedRegions.head()

Unnamed: 0,chrom,start,end,name,score,strand,6
0,chr10,24648303,24649802,DOWN,-6.899042,-,ENSMUSG00000019989
1,chr4,99546268,99546859,DOWN,-5.904965,+,ENSMUSG00000067261
2,chrX,85239198,85239553,DOWN,-5.527307,+,ENSMUSG00000025056
3,chr15,85760430,85760774,DOWN,-4.238394,+,ENSMUSG00000022385
4,chr13,114588825,114590341,DOWN,-4.188287,-,ENSMUSG00000021765


## Functions

In [4]:
def Get3UtrStarts(df):
    """Collapse each interval to the 1-nt position at its strand-aware start.

    For '+' strand rows the interval becomes ``[start, start + 1)``; for '-'
    strand rows it becomes ``[end - 1, end)``. Rows with any other strand
    value are returned unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least the columns ``start``, ``end`` and ``strand``.

    Returns
    -------
    pandas.DataFrame
        A new DataFrame with the adjusted coordinates. The input is left
        untouched so that re-running earlier notebook cells (e.g. a preview
        of the original regions) keeps showing the original intervals.
    """
    # Work on a copy: mutating the caller's frame creates hidden notebook state.
    starts = df.copy()
    is_plus = starts['strand'] == '+'
    is_minus = starts['strand'] == '-'
    starts.loc[is_plus, 'end'] = starts.loc[is_plus, 'start'] + 1
    starts.loc[is_minus, 'start'] = starts.loc[is_minus, 'end'] - 1
    return starts

def StrandedSlop(bt, fai, UpstreamW=0, DownstreamW=300):
    """Extend every interval of ``bt`` via its ``slop`` method, strand-aware.

    ``slop`` is called with ``s=True`` so that the left/right amounts are
    applied relative to each feature's strand.

    Parameters
    ----------
    bt : object exposing ``slop`` (a pybedtools BedTool in this notebook)
    fai : genome / chromosome-sizes file forwarded as ``g``
    UpstreamW : amount forwarded as ``l`` (default 0)
    DownstreamW : amount forwarded as ``r`` (default 300)

    Returns
    -------
    Whatever ``bt.slop`` returns.
    """
    slop_options = dict(s=True, l=UpstreamW, r=DownstreamW, g=fai)
    return bt.slop(**slop_options)

## Get intervals

In [5]:
# Reduce every region to its strand-aware 1-nt start, convert to a sorted
# BedTool, and write it to disk (the saved BedTool is the cell's output).
btquantseq = (
    pbt.BedTool
    .from_dataframe(Get3UtrStarts(BedRegions))
    .sort()
)
btquantseq.saveas(f'{outpath}/quantseq3PUtrStarts.bed.gz')

<BedTool(../../data/3PrimeStarts/Quantseq/quantseq3PUtrStarts.bed.gz)>

In [6]:
# Save a separate bed file for each quantseq group (3'UTR start positions).
# Group by the scalar column name rather than a one-element list: with
# groupby(['name']) pandas >= 2.0 yields 1-tuple keys such as ('CONTROL',),
# which would end up verbatim in the output file name.
for group, df in gbl.ParseBedToolToDf(btquantseq).groupby('name'):
    print(group)
    bt = pbt.BedTool.from_dataframe(df)
    bt.saveas(f'{outpath}/{group}_quantseq3PUtrStarts.bed.gz')

CONTROL
DOWN
UP


In [7]:
# For each window size, extend the 1-nt 3'UTR start positions downstream
# (strand-aware, nothing added upstream) and save one sorted interval file per size.
for DownstreamW in windows:
    btquantseqSlop = StrandedSlop(
        btquantseq,
        fai=fai,
        UpstreamW=0,
        DownstreamW=DownstreamW,
    ).sort()
    btquantseqSlop.saveas(
        f'{outpath}/Btquantseq3PStartsExtended{DownstreamW}Downstream.bed.gz'
    )

In [8]:
# Read back the extended-interval file for the FastaW window (written by the
# slop loop above) and preview it.
dfquantseqSlop = gbl.ReadBed(f'{outpath}/Btquantseq3PStartsExtended{FastaW}Downstream.bed.gz')
dfquantseqSlop.head()

Unnamed: 0,chrom,start,end,name,score,strand,6
0,chr1,4846388,4846689,CONTROL,-0.221081,-,ENSMUSG00000033845
1,chr1,4966584,4966885,DOWN,-1.70947,+,ENSMUSG00000033813
2,chr1,9616843,9617144,DOWN,-1.120703,+,ENSMUSG00000061024
3,chr1,10094905,10095206,CONTROL,0.359694,-,ENSMUSG00000025917
4,chr1,10206367,10206668,CONTROL,-0.109736,+,ENSMUSG00000056763


In [9]:
# Write one strand-aware FASTA file per quantseq group.
# Group by the scalar column name rather than a one-element list: with
# groupby(['name']) pandas >= 2.0 yields 1-tuple keys such as ('CONTROL',),
# which would end up verbatim in the output file name.
for group, df in dfquantseqSlop.groupby('name'):
    print(group)
    bt = pbt.BedTool.from_dataframe(df)
    bt.sequence(fi=fasta, s=True, fo=f'{outpath}/{group}_quantseqSlop_Downstream{FastaW}.fasta')

CONTROL
DOWN
UP
