In [1]:
import os
from glob import glob

import pandas as pd
import pybedtools as pbt

In [2]:
def readBed(f):
    """Load a tab-separated, header-less table (e.g. a BED file) into a DataFrame.

    Parameters
    ----------
    f : path or file-like object accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        One column per field, labelled with integers starting at 0.
    """
    bed_table = pd.read_csv(f, header=None, sep='\t')
    return bed_table

def GetStarts(df):
    """Collapse each interval to a 1-nt window at its strand-aware start.

    Column 5 holds the strand, column 1 the start and column 2 the end.
    For '+' rows the end becomes ``start + 1``; for '-' rows the start
    becomes ``end - 1``. Rows with any other strand value are left as they are.

    The frame is modified in place and the same object is returned.
    """
    on_plus = df[5] == '+'
    on_minus = df[5] == '-'
    df.loc[on_plus, 2] = df.loc[on_plus, 1] + 1
    df.loc[on_minus, 1] = df.loc[on_minus, 2] - 1
    return df

def StrandedSlop(bt, fai, UpstreamW=0, DownstreamW=300):
    """Extend intervals with ``slop`` in strand-aware mode and return them sorted.

    Parameters
    ----------
    bt : object exposing ``slop(...)`` (a pybedtools BedTool in this notebook).
    fai : genome file passed to ``slop`` as ``g``.
    UpstreamW : value passed as ``l`` (default 0).
    DownstreamW : value passed as ``r`` (default 300).

    ``s=True`` asks bedtools to interpret ``l``/``r`` relative to strand.
    """
    extended = bt.slop(g=fai, l=UpstreamW, r=DownstreamW, s=True)
    return extended.sort()

In [37]:
# --- Parameters ---
# Window around each PAS: upstream / downstream extension
upW = 200
downW = 50

# Minimum 3'UTR length required for a gene to be kept
minLen = 300

# --- Input files ---
# Terminal PAS BED files, one per gene group.
# The first entry (presumably the terminal PAS of the full 3'UTR set) is disabled.
files = [
    # '../../../data/TerminalPas/terminal_pa_signals.bed.gz',
    '../../../data/TerminalPas/terminal_pa_signals_UP.bed.gz',
    '../../../data/TerminalPas/terminal_pa_signals_CONTROL.bed.gz',
    '../../../data/TerminalPas/terminal_pa_signals_DOWN.bed.gz',
]

# Fasta index
fai = '../../../data/genomes/Goodwright_m39/GRCm39.primary_assembly.genome.fa.fai'

# Quantseq 3'UTR loci: keep rows spanning at least minLen and the columns
# up to and including label 6; column 6 supplies the genes to plot.
dfUtrs = readBed('../../../results/GeneGroups_Quantseq/3UtrLoci/QuantseqGroups.bed')
dfUtrs['len'] = dfUtrs[2] - dfUtrs[1]
dfUtrs = dfUtrs.loc[dfUtrs['len'] >= minLen, :6]
genesToPlot = dfUtrs[6].tolist()

# Gene-level expression (TPM)
DfTpm = pd.read_csv('../../../data/MihaDeseq/GeneLevel_TPM_counts.csv')

# Naive gene ids
naive_ids = pd.read_csv('../../../data/general/NaiveGeneIds.csv', index_col=0)

# --- Output locations (created if missing) ---
out_pas_starts = '../../../data/TerminalPas/Starts'
out_regions = 'ProfileRegions'
out_combined_regions = '../../../results/Metaprofiles/PAS_Quantseq'
os.makedirs(out_pas_starts, exist_ok=True)
os.makedirs(out_regions, exist_ok=True)
os.makedirs(out_combined_regions, exist_ok=True)

In [4]:
# Per-condition mean of the gene-level TPMs.
# Index by gene id and add +1 to every value (per the original note: avoids
# 0-division errors).
DfTpm = DfTpm.set_index('stable_gene_id', drop=True) + 1

# A sample column is '_'-separated; fields 0 and 2 together give its condition.
# Columns are renamed to '<condition>.<original name>'.
sample_conditions = ['_'.join(c.split('_')[el] for el in (0, 2)) for c in DfTpm.columns]
colnames = [f'{cond}.{c}' for cond, c in zip(sample_conditions, DfTpm.columns)]
conditions = sorted(set(sample_conditions))
DfTpm.columns = colnames

# One '<condition> Mean TPM' column per condition, averaged across its samples.
DfTpmAveraged = pd.DataFrame({
    f'{cond} Mean TPM': DfTpm[[c for c in DfTpm.columns if c.split('.')[0] == cond]].mean(axis='columns')
    for cond in conditions
})
DfTpmAveraged.head()

Unnamed: 0_level_0,KO_2iL Mean TPM,KO_FCL Mean TPM,S200A_2iL Mean TPM,S200A_FCL Mean TPM,S200WT_2iL Mean TPM,S200WT_FCL Mean TPM,WT_2iL Mean TPM,WT_FCL Mean TPM
stable_gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ENSMUSG00000000001,7.67795,8.112323,6.88097,6.033886,8.122433,2.797935,4.333492,3.864102
ENSMUSG00000000003,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
ENSMUSG00000000028,11.692164,9.045459,10.77565,7.13384,14.348023,4.322841,15.10763,7.196186
ENSMUSG00000000031,1.093472,1.0,1.062915,1.569452,1.0,1.0,1.0,1.17864
ENSMUSG00000000037,1.14466,1.038239,1.305144,1.137961,1.196149,1.249172,1.057581,1.086607


In [5]:
# Build the profile regions around each terminal PAS, per gene group and condition.
cond_of_interest = ['KO_FCL', 'S200WT_FCL', 'S200A_FCL', 'S200WT_2iL']
for f in files:
    # Collapse each PAS interval to its 1-nt strand-aware start and save it.
    btStarts = pbt.BedTool().from_dataframe(GetStarts(readBed(f))).sort()
    fname = f.split('/')[-1].split('.')[0]
    btStarts.saveas(f'{out_pas_starts}/{fname}.bed.gz')
    # Extend the starts by upW / downW (strand-aware) and keep only regions whose
    # name (column 3) is among the genes selected for plotting.
    dfRegs = StrandedSlop(btStarts, fai=fai, UpstreamW=upW, DownstreamW=downW).to_dataframe(disable_auto_names=True, header=None)
    dfRegs = dfRegs.loc[dfRegs[3].isin(genesToPlot)]
    print(fname, len(dfRegs))
    # Introduce expression values into the score column (column 4).
    # Each condition starts from the same dfRegs; the per-condition frame is kept
    # separate so one condition's merge never feeds into the next.
    for c in cond_of_interest:
        tpm_col = f'{c} Mean TPM'
        dfCond = dfRegs.merge(DfTpmAveraged[tpm_col], left_on=3, right_index=True)
        if len(dfCond) != len(dfRegs):
            # The merge is an inner join, so regions without a TPM entry are dropped.
            print(f'  {fname} cond-{c}: row count changed from {len(dfRegs)} to {len(dfCond)} after joining TPMs')
        dfCond = dfCond[[0, 1, 2, 3, tpm_col, 5]].sort_values(by=[0, 1], ascending=True)
        dfCond.to_csv(f'{out_regions}/{fname}_up{upW}_down{downW}_cond-{c}.bed.gz', sep='\t', index=False, header=None, quoting=None)

terminal_pa_signals_UP 633
terminal_pa_signals_CONTROL 1739
terminal_pa_signals_DOWN 831


In [6]:
# For each condition merge UP and CONTROL
up_files = glob(f'{out_regions}/*_UP_*')
ctrl_files = glob(f'{out_regions}/*_CONTROL_*')
for c in cond_of_interest:
    # Match on the exact window/condition suffix rather than a substring of the
    # condition name, so files from other window sizes or from conditions whose
    # name contains `c` are never picked up.
    suffix = f'_up{upW}_down{downW}_cond-{c}.bed.gz'
    up_matches = [f for f in up_files if f.endswith(suffix)]
    ctrl_matches = [f for f in ctrl_files if f.endswith(suffix)]
    if len(up_matches) != 1 or len(ctrl_matches) != 1:
        raise ValueError(
            f'Expected exactly one UP and one CONTROL file ending in {suffix!r} in {out_regions!r}, '
            f'found UP={up_matches} CONTROL={ctrl_matches}'
        )
    up = pbt.BedTool(up_matches[0]).sort()
    ctrl = pbt.BedTool(ctrl_matches[0]).sort()
    # Plain concatenation (postmerge=False: overlapping intervals are not merged).
    concat = up.cat(ctrl, postmerge=False).sort()
    concat.saveas(f'{out_regions}/terminal_pa_signals_UP+CONTROL_up{upW}_down{downW}_cond-{c}.bed.gz')

In [7]:
# Save a file with all regions that are included in the metaprofiles.
# Regions are identical across conditions, so the files of the first condition
# are used. Only files for the current upW/downW window are taken, and the
# combined UP+CONTROL files (containing '+') are skipped.
region_suffix = f'_up{upW}_down{downW}_cond-{cond_of_interest[0]}.bed.gz'
allregs = sorted(
    f for f in glob(f'{out_regions}/terminal*{region_suffix}')
    if '+' not in f.split('/')[-1]
)
if not allregs:
    raise FileNotFoundError(f'No region files ending in {region_suffix!r} found in {out_regions!r}')
dfList = []
for f in allregs:
    # File names look like terminal_pa_signals_<GROUP>_up..., so field 3 is the group.
    group = f.split('/')[-1].split('_')[3]
    print(group)
    df = readBed(f)
    df[4] = 0  # replace the TPM score with a constant 0
    df[6] = group
    # Output column order: chrom, start, end, group, score, strand, original name (column 3)
    dfList.append(df[[0, 1, 2, 6, 4, 5, 3]])
dfCombinedRegs = pd.concat(dfList).sort_values(by=[0, 1])
dfCombinedRegs.to_csv(f'{out_combined_regions}/AllRegionsInMetaprofile.bed.gz', sep='\t', index=False, header=None, quoting=None)
dfCombinedRegs.head()

DOWN
UP
CONTROL


Unnamed: 0,0,1,2,6,4,5,3
0,chr1,4843404,4843655,CONTROL,0,-,ENSMUSG00000033845
0,chr1,4967906,4968157,DOWN,0,+,ENSMUSG00000033813
1,chr1,9617456,9617707,DOWN,0,+,ENSMUSG00000061024
1,chr1,10095023,10095274,CONTROL,0,-,ENSMUSG00000025917
2,chr1,10206771,10207022,CONTROL,0,+,ENSMUSG00000056763


In [8]:
# Number of regions per group (column 6), counted on column 0
dfCombinedRegs.groupby(6)[0].count()

6
CONTROL    1739
DOWN        831
UP          633
Name: 0, dtype: int64