In [1]:
import pandas as pd
import pybedtools as pbt
import os
import sys

# Add the project-local source directory to the module search path so that
# GtfBedLibrary can be imported. The relative path is resolved against the
# current working directory by os.path.abspath.
scriptpath = "../../src/"
sys.path.append(os.path.abspath(scriptpath))
import GtfBedLibrary as gbl

In [2]:
# Destination directory for the interval files written by this notebook
# (created on demand).
outpath = '../../data/3PrimeEnds/Quantseq'
os.makedirs(outpath, exist_ok=True)

# Load the filtered 3'UTR table (tab-separated, no header row), then keep seven
# of its columns: file column 7 is moved into the BED "name" slot and file
# column 6 is carried along as an extra trailing column. Missing values in any
# kept column are replaced by the placeholder 'notexpressed'.
BedRegions = pd.read_csv(
    '../../results/GeneGroups_Quantseq/3UtrLoci/NaiveGenesUtrs.bed',
    sep='\t',
    header=None,
)
BedRegions = BedRegions.loc[:, [0, 1, 2, 7, 4, 5, 6]].fillna('notexpressed')
# The last column keeps the integer label 6.
BedRegions.columns = ['chrom', 'start', 'end', 'name', 'score', 'strand', 6]


# Genome file passed to the strand-aware slop step below
# (presumably chromosome sizes, judging by the file name).
fai = '../../data/genomes/Goodwright_m39/chromsizes.txt'

# Genome FASTA; not referenced by the cells visible in this notebook section.
fasta = '../../data/genomes/Goodwright_m39/GRCm39.primary_assembly.genome.fa'

# Upstream extension lengths (nt) applied to the 3' end intervals.
windows = [300, 500]
# Not referenced by the cells visible in this notebook section.
FastaW = 300

In [3]:
# Preview the first rows to check the column selection and renaming.
BedRegions.head()

Unnamed: 0,chrom,start,end,name,score,strand,6
0,chr4,99546268,99546859,DOWN,.,+,ENSMUSG00000067261
1,chr8,73074760,73075500,DOWN,.,+,ENSMUSG00000055148
2,chr6,122690511,122691592,DOWN,.,+,ENSMUSG00000012396
3,chr3,133169439,133172500,DOWN,.,-,ENSMUSG00000040943
4,chr6,71565955,71566621,DOWN,.,-,ENSMUSG00000053470


In [4]:
# Build a per-region identifier of the form "<name>_<chrom>:<start>-<end>(<strand>)".
# Every column is cast to str first, then the pieces are concatenated column-wise
# (vectorised) instead of formatting row by row with DataFrame.apply. The
# resulting strings are the same, without one Python call per row, and the
# result is a well-defined Series on an empty frame.
BedRegions['id'] = BedRegions.astype(str).pipe(
    lambda cols: (
        cols['name'] + '_' + cols['chrom']
        + ':' + cols['start'] + '-' + cols['end']
        + '(' + cols['strand'] + ')'
    )
)

In [5]:
# Display the frame to inspect the newly added 'id' column.
BedRegions

Unnamed: 0,chrom,start,end,name,score,strand,6,id
0,chr4,99546268,99546859,DOWN,.,+,ENSMUSG00000067261,DOWN_chr4:99546268-99546859(+)
1,chr8,73074760,73075500,DOWN,.,+,ENSMUSG00000055148,DOWN_chr8:73074760-73075500(+)
2,chr6,122690511,122691592,DOWN,.,+,ENSMUSG00000012396,DOWN_chr6:122690511-122691592(+)
3,chr3,133169439,133172500,DOWN,.,-,ENSMUSG00000040943,DOWN_chr3:133169439-133172500(-)
4,chr6,71565955,71566621,DOWN,.,-,ENSMUSG00000053470,DOWN_chr6:71565955-71566621(-)
5,chr19,23142273,23145498,DOWN,.,+,ENSMUSG00000033863,DOWN_chr19:23142273-23145498(+)
6,chr14,122716652,122717264,DOWN,.,+,ENSMUSG00000061524,DOWN_chr14:122716652-122717264(+)
7,chr1,136555702,136557791,DOWN,.,+,ENSMUSG00000041483,DOWN_chr1:136555702-136557791(+)
8,chrX,57079906,57081919,DOWN,.,+,ENSMUSG00000067860,DOWN_chrX:57079906-57081919(+)
9,chr8,89753862,89755008,DOWN,.,-,ENSMUSG00000031665,DOWN_chr8:89753862-89755008(-)


## Functions

In [6]:
def Get3UtrEnds(df):
    """Collapse each interval to the single position at its strand-aware 3' end.

    For '+' strand rows ``start`` is set to ``end - 1``; for '-' strand rows
    ``end`` is set to ``start + 1``. Rows with any other strand value are
    returned unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least the columns ``start``, ``end`` and ``strand``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the adjusted coordinates. The input frame is left
        untouched, so coordinates and any columns derived from them earlier
        stay consistent in the caller's copy and re-running cells is safe.
    """
    # Work on a copy: the original implementation overwrote the caller's frame.
    ends = df.copy()
    on_plus = ends['strand'] == '+'
    on_minus = ends['strand'] == '-'
    ends.loc[on_plus, 'start'] = ends.loc[on_plus, 'end'] - 1
    ends.loc[on_minus, 'end'] = ends.loc[on_minus, 'start'] + 1
    return ends

def StrandedSlop(bt, fai, UpstreamW=300, DownstreamW=0):
    """Extend intervals via ``bt.slop`` with strand-aware left/right widths.

    Parameters
    ----------
    bt : object with a ``slop`` method
        In this notebook a pybedtools ``BedTool``.
    fai : str
        Genome file forwarded to ``slop`` as ``g``.
    UpstreamW : int, default 300
        Forwarded as ``l``; with ``s=True`` this is the upstream extension.
    DownstreamW : int, default 0
        Forwarded as ``r``; with ``s=True`` this is the downstream extension.

    Returns
    -------
    Whatever ``bt.slop`` returns.
    """
    slop_options = {'s': True, 'l': UpstreamW, 'r': DownstreamW, 'g': fai}
    return bt.slop(**slop_options)

## Get intervals

In [7]:
# Collapse every region to its 3' end, convert the table to a BedTool,
# sort it and write it out.
btquantseq = pbt.BedTool.from_dataframe(Get3UtrEnds(BedRegions))
btquantseq = btquantseq.sort()
btquantseq.saveas(f'{outpath}/naive3PEnds.bed.gz')

# For each configured window, extend the 3' end intervals upstream
# (strand-aware), re-sort, and write one file per window size.
for UpstreamW in windows:
    btquantseqSlop = StrandedSlop(btquantseq, fai=fai, UpstreamW=UpstreamW)
    btquantseqSlop = btquantseqSlop.sort()
    btquantseqSlop.saveas(
        f'{outpath}/Btnaive3PEndsExtended{UpstreamW}Upstream.bed.gz'
    )