In [1]:
import pandas as pd
import pybedtools as pbt
import os
import sys

# Put the project's source directory on the import path before importing the
# local helper module (presumably GtfBedLibrary lives in ../../src — confirm).
scriptpath = "../../src/"
sys.path.append(os.path.abspath(scriptpath))
import GtfBedLibrary as gbl

In [2]:
# Output directory for the 3' end files written by this notebook
outpath = '../../data/3PrimeEnds/Quantseq'
os.makedirs(outpath, exist_ok=True)

# Import filtered bed
BedRegions = gbl.ReadBed('../../results/GeneGroups_Quantseq/3UtrLoci/QuantseqGroups.bed')

# Chromosome sizes file, passed to bedtools slop as the genome (g=) argument
fai = '../../data/genomes/Goodwright_m39/chromsizes.txt'

# Genome sequence used when extracting fasta records
fasta = '../../data/genomes/Goodwright_m39/GRCm39.primary_assembly.genome.fa'

# Upstream window sizes (nt) for which extended interval files are written
windows = [300, 500]
# Window size whose extended interval file is read back for fasta extraction
FastaW = 300

# Fail early: only window sizes listed in `windows` get an extended bed file,
# so any other FastaW would later read a missing (or stale) file.
if FastaW not in windows:
    raise ValueError(f'FastaW={FastaW} must be one of windows={windows}')

In [3]:
# Preview the first rows of the loaded regions.
BedRegions.head()

Unnamed: 0,chrom,start,end,name,score,strand,6
0,chr10,24648303,24649802,DOWN,-6.899042,-,ENSMUSG00000019989
1,chr4,99546268,99546859,DOWN,-5.904965,+,ENSMUSG00000067261
2,chrX,85239198,85239553,DOWN,-5.527307,+,ENSMUSG00000025056
3,chr15,85760430,85760774,DOWN,-4.238394,+,ENSMUSG00000022385
4,chr13,114588825,114590341,DOWN,-4.188287,-,ENSMUSG00000021765


In [4]:
# Build a readable identifier per region: <name>_<chrom>:<start>-<end>(<strand>).
# Vectorised string concatenation on just the needed columns replaces the
# row-wise apply, which cast the whole frame to str and ran a Python lambda per row.
id_parts = BedRegions[['name', 'chrom', 'start', 'end', 'strand']].astype(str)
BedRegions['id'] = (
    id_parts['name'] + '_' + id_parts['chrom'] + ':'
    + id_parts['start'] + '-' + id_parts['end']
    + '(' + id_parts['strand'] + ')'
)

In [5]:
# Display the table to check the newly added 'id' column.
BedRegions

Unnamed: 0,chrom,start,end,name,score,strand,6,id
0,chr10,24648303,24649802,DOWN,-6.899042,-,ENSMUSG00000019989,DOWN_chr10:24648303-24649802(-)
1,chr4,99546268,99546859,DOWN,-5.904965,+,ENSMUSG00000067261,DOWN_chr4:99546268-99546859(+)
2,chrX,85239198,85239553,DOWN,-5.527307,+,ENSMUSG00000025056,DOWN_chrX:85239198-85239553(+)
3,chr15,85760430,85760774,DOWN,-4.238394,+,ENSMUSG00000022385,DOWN_chr15:85760430-85760774(+)
4,chr13,114588825,114590341,DOWN,-4.188287,-,ENSMUSG00000021765,DOWN_chr13:114588825-114590341(-)
...,...,...,...,...,...,...,...,...
4871,chr2,152454543,152454649,UP,6.869512,+,ENSMUSG00000044863,UP_chr2:152454543-152454649(+)
4872,chr15,85419637,85421592,UP,7.345146,-,ENSMUSG00000022382,UP_chr15:85419637-85421592(-)
4873,chr8,95739612,95740845,UP,8.463418,+,ENSMUSG00000031785,UP_chr8:95739612-95740845(+)
4874,chr18,67500590,67500855,UP,8.887039,+,ENSMUSG00000024526,UP_chr18:67500590-67500855(+)


## Functions

In [6]:
def Get3UtrEnds(df):
    """Collapse each interval to the single position at its strand-aware end.

    For '+' rows, `start` becomes `end - 1`; for '-' rows, `end` becomes
    `start + 1`. Rows with any other strand value are left as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least the columns 'start', 'end' and 'strand'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the adjusted coordinates. The input frame is not
        modified, so it can still be displayed or reused with its original
        coordinates.
    """
    # Copy first: assigning through .loc would otherwise change the caller's frame.
    ends = df.copy()
    is_plus = ends['strand'] == '+'
    is_minus = ends['strand'] == '-'
    ends.loc[is_plus, 'start'] = ends.loc[is_plus, 'end'] - 1
    ends.loc[is_minus, 'end'] = ends.loc[is_minus, 'start'] + 1
    return ends

def StrandedSlop(bt, fai, UpstreamW=300, DownstreamW=0):
    """Extend the intervals of `bt` via its `slop` method with s=True.

    `UpstreamW` is forwarded as `l`, `DownstreamW` as `r` and `fai` as the
    genome argument `g`. The object returned by `slop` is handed back as is.
    With s=True, bedtools slop interprets l/r relative to each feature's strand.
    """
    return bt.slop(g=fai, l=UpstreamW, r=DownstreamW, s=True)

## Get intervals

In [7]:
# Get 3'UTR ends.
# Pass a copy so BedRegions keeps its original coordinates (its 'id' column is
# derived from them), regardless of whether Get3UtrEnds modifies its argument.
btquantseq = pbt.BedTool.from_dataframe(Get3UtrEnds(BedRegions.copy())).sort()
btquantseq.saveas(f'{outpath}/quantseq3PEnds.bed.gz')

# Save a separate bed file for each quantseq group (3'ends).
# Group by the scalar key 'name' rather than ['name']: with a one-element list,
# pandas >= 2.0 yields 1-tuples such as ('UP',), which would leak into the file names.
for group, df in gbl.ParseBedToolToDf(btquantseq).groupby('name'):
    print(group)
    bt = pbt.BedTool.from_dataframe(df)
    bt.saveas(f'{outpath}/{group}_quantseq3PEnds.bed.gz')

# Get interval file with x nt upstream of 3'UTR end.
for UpstreamW in windows:
    btquantseqSlop = StrandedSlop(btquantseq, fai=fai, UpstreamW=UpstreamW).sort()
    btquantseqSlop.saveas(f'{outpath}/Btquantseq3PEndsExtended{UpstreamW}Upstream.bed.gz')

# Read back the extended intervals for the FastaW window; this file is only
# written by the loop above when FastaW is one of `windows`.
dfquantseqSlop = gbl.ReadBed(f'{outpath}/Btquantseq3PEndsExtended{FastaW}Upstream.bed.gz')

# Get fasta files: one per group, with s=True passed on to bedtools getfasta.
for group, df in dfquantseqSlop.groupby('name'):
    print(group)
    bt = pbt.BedTool.from_dataframe(df)
    bt.sequence(fi=fasta, s=True, fo=f'{outpath}/{group}_quantseqSlop_Upstream{FastaW}.fasta')

CONTROL
DOWN
UP
CONTROL
DOWN
UP
