In [1]:
import os
from glob import glob

import pandas as pd
import pybedtools as pbt

In [2]:
def readBed(f):
    """Load a headerless, tab-separated BED-like file into a DataFrame.

    Parameters
    ----------
    f : str, path object or file-like
        Anything accepted by the pandas text reader.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file; columns are labelled 0, 1, 2, ...
        because the file is read without a header row.
    """
    bed_table = pd.read_table(f, header=None)
    return bed_table

def Get3UtrEnds(df):
    """Collapse each interval to the single position at its strand-aware 3' end.

    Column 1 is treated as start, column 2 as end and column 5 as strand.
    For '+' rows the start becomes ``end - 1``; for '-' rows the end becomes
    ``start + 1``. Rows with any other strand value are returned unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with integer column labels that include 1, 2 and 5.

    Returns
    -------
    pandas.DataFrame
        A new table with the adjusted coordinates. The input is left
        untouched, so the caller's frame is not silently modified and pandas
        does not warn about writing into a slice of another frame.
    """
    ends = df.copy()
    # Evaluate the strand masks once; they do not depend on the coordinate edits below.
    on_plus = ends[5] == '+'
    on_minus = ends[5] == '-'
    ends.loc[on_plus, 1] = ends.loc[on_plus, 2] - 1
    ends.loc[on_minus, 2] = ends.loc[on_minus, 1] + 1
    return ends

def StrandedSlop(bt, fai, UpstreamW=0, DownstreamW=300):
    """Extend every interval with a strand-aware ``slop`` and return the sorted result.

    Parameters
    ----------
    bt : object exposing ``slop`` and ``sort`` (a pybedtools BedTool in this notebook)
        Intervals to extend.
    fai : str
        Passed to ``slop`` as ``g``, the chromosome-size file.
    UpstreamW : int, default 0
        Passed to ``slop`` as ``l``.
    DownstreamW : int, default 300
        Passed to ``slop`` as ``r``.

    Returns
    -------
    Whatever ``sort()`` returns on the extended intervals.

    Notes
    -----
    ``s=True`` asks bedtools to interpret ``l``/``r`` relative to the strand
    of each feature rather than to genomic left/right.
    """
    extended = bt.slop(g=fai, l=UpstreamW, r=DownstreamW, s=True)
    return extended.sort()

In [3]:
# Configuration: window parameters, input files and output folders

# Params
# Window (nt) around the 3'UTR end (PAS) used for the metaprofile regions
downW = 0
upW = 500

# Min 3'UTR length (nt) a locus must have to be kept
# NOTE(review): presumably upW + 1 so the upstream window stays inside the 3'UTR - confirm
minLen = 501

# Files
# Quantseq regs: 3'UTR loci; column 3 holds the group label (see the group counts below)
dfUtrs = readBed('../../../results/GeneGroups_Quantseq/3UtrLoci/QuantseqGroups.bed')
dfUtrs['len'] = dfUtrs[2] - dfUtrs[1]
# Keep sufficiently long loci and only columns 0..6 (label slice, inclusive), which drops
# the helper 'len' column. Bracket access is used for 'len' so the column can never be
# confused with a DataFrame attribute, and .copy() makes the result an independent frame,
# so later in-place edits do not write into a slice of the unfiltered table.
dfUtrs = dfUtrs.loc[dfUtrs['len'] >= minLen, :6].copy()

# fasta index
fai = '../../../data/genomes/Goodwright_m39/GRCm39.primary_assembly.genome.fa.fai'

# Expression: gene-level TPM table
DfTpm = pd.read_csv('../../../data/MihaDeseq/GeneLevel_TPM_counts.csv')

# Save to
out_regions = 'ProfileRegions'
out_combined_regions = '../../../results/Metaprofiles/UTR3_termini'
# Create both output folders up front; exist_ok keeps the cell re-runnable
os.makedirs(out_regions, exist_ok=True)
os.makedirs(out_combined_regions, exist_ok=True)

In [4]:
# Number of loci per group (column 3) that go into the metaprofile
dfUtrs.groupby(3)[0].count()

3
CONTROL    1760
DOWN        768
UP          623
Name: 0, dtype: int64

In [5]:
# Show the length-filtered 3'UTR table (pandas truncates the rendering of long frames)
dfUtrs

Unnamed: 0,0,1,2,3,4,5,6
0,chr10,24648303,24649802,DOWN,-6.899042,-,ENSMUSG00000019989
1,chr4,99546268,99546859,DOWN,-5.904965,+,ENSMUSG00000067261
4,chr13,114588825,114590341,DOWN,-4.188287,-,ENSMUSG00000021765
5,chr17,27781462,27782648,DOWN,-4.120237,+,ENSMUSG00000046711
6,chr7,80819357,80820164,DOWN,-4.100704,+,ENSMUSG00000025726
...,...,...,...,...,...,...,...
4865,chr8,84449879,84450504,UP,5.618269,-,ENSMUSG00000002885
4867,chr2,164796823,164797770,UP,5.854944,+,ENSMUSG00000017737
4868,chr15,99481920,99482428,UP,6.180839,+,ENSMUSG00000023013
4872,chr15,85419637,85421592,UP,7.345146,-,ENSMUSG00000022382


In [6]:
# Per-condition mean of the gene-level TPMs
# Index by gene ID and add a pseudocount of 1 to every value to avoid 0-division errors
DfTpm = DfTpm.set_index('stable_gene_id', drop=True) + 1

# Condition of a sample column = 1st and 3rd '_'-separated tokens of its name
sample_conds = []
for sample in DfTpm.columns:
    tokens = sample.split('_')
    sample_conds.append('_'.join((tokens[0], tokens[2])))

# Prefix every sample column with its condition: '<condition>.<original name>'
colnames = [f'{cond}.{sample}' for cond, sample in zip(sample_conds, DfTpm.columns)]
conditions = sorted(set(sample_conds))
DfTpm.columns = colnames

# Average, gene by gene, all columns whose prefix (text before the first '.') is the condition
col_prefix = pd.Index([name.split('.')[0] for name in DfTpm.columns])
DfTpmAveraged = pd.DataFrame({
    f'{cond} Mean TPM': DfTpm.loc[:, col_prefix == cond].mean(axis='columns')
    for cond in conditions
})
DfTpmAveraged.head()

Unnamed: 0_level_0,KO_2iL Mean TPM,KO_FCL Mean TPM,S200A_2iL Mean TPM,S200A_FCL Mean TPM,S200WT_2iL Mean TPM,S200WT_FCL Mean TPM,WT_2iL Mean TPM,WT_FCL Mean TPM
stable_gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ENSMUSG00000000001,7.67795,8.112323,6.88097,6.033886,8.122433,2.797935,4.333492,3.864102
ENSMUSG00000000003,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
ENSMUSG00000000028,11.692164,9.045459,10.77565,7.13384,14.348023,4.322841,15.10763,7.196186
ENSMUSG00000000031,1.093472,1.0,1.062915,1.569452,1.0,1.0,1.0,1.17864
ENSMUSG00000000037,1.14466,1.038239,1.305144,1.137961,1.196149,1.249172,1.057581,1.086607


In [7]:
cond_of_interest = ['KO_FCL', 'S200WT_FCL', 'S200A_FCL', 'S200WT_2iL']

# Reduce every 3'UTR to its terminal position and store those positions
utr_ends = Get3UtrEnds(dfUtrs)
btEnds = pbt.BedTool.from_dataframe(utr_ends).sort()
btEnds.saveas(f'{out_combined_regions}/3UTR_ends_included_in_metaprofile.bed.gz')

# Extend the ends into strand-aware windows and store the full set of regions
btRegions = StrandedSlop(btEnds, fai=fai, UpstreamW=upW, DownstreamW=downW)
dfRegions = btRegions.to_dataframe(disable_auto_names=True, header=None)
dfRegions.to_csv(f'{out_combined_regions}/AllRegionsInMetaprofile.bed.gz', sep='\t', index=False, header=None, quoting=None)

# One BED file per (group, condition); the score column carries the condition's mean TPM
for group, group_regions in dfRegions.groupby(3):
    print(group_regions.head())
    for condition in cond_of_interest:
        tpm_col = f'{condition} Mean TPM'
        # Match regions to expression through the gene ID in column 6
        with_tpm = group_regions.merge(DfTpmAveraged[tpm_col], left_on=6, right_index=True)
        bed_like = with_tpm[[0, 1, 2, 3, tpm_col, 5]]
        bed_like = bed_like.sort_values(by=[0, 1], ascending=True)
        bed_like.to_csv(f'{out_regions}/UTR3ends_{group}_up{upW}_down{downW}_cond-{condition}.bed.gz', sep='\t', index=False, header=None, quoting=None)

       0         1         2        3         4  5                   6
0   chr1   4843433   4843934  CONTROL -0.221081  -  ENSMUSG00000033845
3   chr1  10206492  10206993  CONTROL -0.109736  +  ENSMUSG00000056763
5   chr1  13634921  13635422  CONTROL  0.255102  -  ENSMUSG00000025935
10  chr1  21299890  21300391  CONTROL  0.473224  +  ENSMUSG00000025933
11  chr1  23884705  23885206  CONTROL -0.324667  -  ENSMUSG00000026155
      0         1         2     3         4  5                   6
1  chr1   4967631   4968132  DOWN -1.709470  +  ENSMUSG00000033813
2  chr1   9617179   9617680  DOWN -1.120703  +  ENSMUSG00000061024
4  chr1  12930915  12931416  DOWN -2.002582  +  ENSMUSG00000016918
6  chr1  14823569  14824070  DOWN -2.025445  -  ENSMUSG00000025930
7  chr1  15913775  15914276  DOWN -0.920995  +  ENSMUSG00000025925
       0         1         2   3         4  5                   6
15  chr1  34918160  34918661  UP  1.696404  +  ENSMUSG00000026123
17  chr1  36461145  36461646  UP  0.9539

In [8]:
# For each condition, concatenate the UP and CONTROL regions into a single sorted BED file.
# The input paths are rebuilt from the naming pattern used when the per-group files were
# written (UTR3ends_<group>_up<upW>_down<downW>_cond-<condition>.bed.gz). Listing the
# folder and taking the first name that merely contains the condition string would pick an
# arbitrary file whenever outputs from a run with other window sizes are still present, or
# when one condition name is a substring of another.
for c in cond_of_interest:
    suffix = f'up{upW}_down{downW}_cond-{c}.bed.gz'
    up = pbt.BedTool(f'{out_regions}/UTR3ends_UP_{suffix}').sort()
    ctrl = pbt.BedTool(f'{out_regions}/UTR3ends_CONTROL_{suffix}').sort()
    # postmerge=False keeps every interval as-is instead of merging overlapping ones
    concat = up.cat(ctrl, postmerge=False).sort()
    concat.saveas(f'{out_regions}/UTR3ends_UP+CONTROL_{suffix}')