In [1]:
import glob
import pandas as pd
import pyranges as pr
import os

In [2]:
# Output directory for the 3'UTR atlas files; created here if it does not exist yet.
outpath = '../../data/3UtrAtlas'
os.makedirs(name=outpath, exist_ok=True)

In [3]:
# Annotation: gene IDs will be intersected with this mm39 GTF.
# as_df=True makes read_gtf return a plain pandas DataFrame.
gtf_path = '../../data/genomes/Goodwright_m39/gencode.vM28.primary_assembly.annotation.gencode_utr_fix.gtf.gz'
gtf = pr.read_gtf(gtf_path, as_df=True)

In [4]:
# List the distinct feature types present in the annotation.
gtf['Feature'].unique()

array(['gene', 'transcript', 'exon', 'CDS', 'start_codon', 'stop_codon',
       'three_prime_utr', 'five_prime_utr', 'Selenocysteine'],
      dtype=object)

In [5]:
# Keep only the 3'UTR records of the annotation.
# The explicit .copy() makes Utrs an independent DataFrame, so adding columns
# to it later does not raise pandas' SettingWithCopyWarning.
Utrs = gtf.loc[gtf.Feature == 'three_prime_utr'].copy()
Utrs.head()

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_type,...,transcript_support_level,tag,protein_id,exon_number,exon_id,mgi_id,havana_gene,havana_transcript,ont,ccdsid
15,GL456210.1,ENSEMBL,three_prime_utr,149704,149707,.,+,.,ENSMUSG00000094799.2,protein_coding,...,5.0,basic,ENSMUSP00000111591.3,2,ENSMUSE00001005202.2,,,,,
24,GL456210.1,ENSEMBL,three_prime_utr,9123,9659,.,-,.,ENSMUSG00000079800.3,protein_coding,...,1.0,basic,ENSMUSP00000094625.5,2,ENSMUSE00000627465.5,,,,,
34,GL456210.1,ENSEMBL,three_prime_utr,108389,108392,.,-,.,ENSMUSG00000095092.2,protein_coding,...,5.0,basic,ENSMUSP00000135921.2,2,ENSMUSE00001028134.2,,,,,
50,GL456211.1,ENSEMBL,three_prime_utr,67323,67326,.,+,.,ENSMUSG00000096100.2,protein_coding,...,,basic,ENSMUSP00000136818.2,1,ENSMUSE00000992279.2,,,,,
73,GL456211.1,ENSEMBL,three_prime_utr,196305,196478,.,+,.,ENSMUSG00000079190.4,protein_coding,...,1.0,basic,ENSMUSP00000106991.3,7,ENSMUSE00000687549.3,,,,,


In [6]:
# Show every column name available on the 3'UTR table.
list(Utrs.columns)

['Chromosome',
 'Source',
 'Feature',
 'Start',
 'End',
 'Score',
 'Strand',
 'Frame',
 'gene_id',
 'gene_type',
 'gene_name',
 'level',
 'transcript_id',
 'transcript_type',
 'transcript_name',
 'transcript_support_level',
 'tag',
 'protein_id',
 'exon_number',
 'exon_id',
 'mgi_id',
 'havana_gene',
 'havana_transcript',
 'ont',
 'ccdsid']

In [7]:
# Locate the relevant quant files (S200WT, 2iL samples), in sorted order.
quant_pattern = '../../data/MihaDeseq/salmon_quantfiles/*S200WT*2iL*'
quantfiles = sorted(glob.glob(quant_pattern))

In [8]:
# Display the quant file paths matched by the glob above.
quantfiles

['../../data/MihaDeseq/salmon_quantfiles/S200WT_1_2iL_1.quant.sf',
 '../../data/MihaDeseq/salmon_quantfiles/S200WT_1_2iL_2.quant.sf',
 '../../data/MihaDeseq/salmon_quantfiles/S200WT_2_2iL_2.quant.sf',
 '../../data/MihaDeseq/salmon_quantfiles/S200WT_2_2iL_3.quant.sf']

In [9]:
# Build one table of TPM values: rows are transcripts, columns are samples.
if not quantfiles:
    # Fail loudly instead of silently carrying an empty table downstream.
    raise FileNotFoundError('No quant files found; check the glob pattern used to build `quantfiles`.')

tpm_by_sample = {}
for quant_path in quantfiles:
    # Sample name = file name up to its first '.', e.g. 'S200WT_1_2iL_1'.
    sample = os.path.basename(quant_path).split('.')[0]
    # The first column of each quant file becomes the index (transcript IDs).
    tpm_by_sample[sample] = pd.read_csv(quant_path, sep='\t', index_col=0)['TPM']

# One column per sample, aligned on the transcript index. A transcript missing
# from some file gets NaN there instead of being dropped.
dfTpm = pd.concat(tpm_by_sample, axis=1)
dfTpm.index.name = 'transcript_id'
# Row-wise mean over the sample columns; computed before any other column is added.
dfTpm['Mean S200WT_2iL TPM'] = dfTpm.mean(axis='columns')
dfTpm.head()

Unnamed: 0_level_0,S200WT_1_2iL_1,S200WT_1_2iL_2,S200WT_2_2iL_2,S200WT_2_2iL_3,Mean S200WT_2iL TPM
transcript_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
magic_mihaels_magnificent_transgene,65.715048,125.926661,131.433109,118.527797,110.400654
ENSMUST00000082908.1,0.0,0.0,0.0,0.0,0.0
ENSMUST00000162897.1,0.0,0.0,0.0,0.0,0.0
ENSMUST00000159265.1,0.0,0.0,0.0,0.0,0.0
ENSMUST00000070533.4,0.0,0.0,0.067422,0.0,0.016855


In [10]:
# Number of rows (transcripts) in the TPM table.
dfTpm.shape[0]

140658

In [11]:
# Derive version-less IDs by keeping the text before the first '.', so the TPM
# table and the annotation can be joined on them.
# assign() returns new DataFrames, which avoids SettingWithCopyWarning, and the
# .str accessor does the split without a Python-level loop.
dfTpm = dfTpm.assign(stable_tx_id=dfTpm.index.str.split('.').str[0])
Utrs = Utrs.assign(
    stable_tx_id=Utrs.transcript_id.str.split('.').str[0],
    stable_gene_id=Utrs.gene_id.str.split('.').str[0],
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [12]:
# Attach gene info and 3'UTR coordinates to each transcript's mean TPM.
# The inner join keeps only transcripts present in both tables; a transcript
# with more than one three_prime_utr record yields one row per record.
tpm_cols = ['Mean S200WT_2iL TPM', 'stable_tx_id']
utr_cols = ['stable_tx_id', 'stable_gene_id', 'gene_name', 'Chromosome', 'Start', 'End', 'Strand']
dfTpm = pd.merge(dfTpm[tpm_cols], Utrs[utr_cols], on='stable_tx_id', how='inner')

In [13]:
# Row count after joining TPMs with the 3'UTR records.
dfTpm.shape[0]

89551

In [14]:
# Preview the first five rows of the joined table.
dfTpm.head(n=5)

Unnamed: 0,Mean S200WT_2iL TPM,stable_tx_id,stable_gene_id,gene_name,Chromosome,Start,End,Strand
0,0.016855,ENSMUST00000070533,ENSMUSG00000051951,Xkr4,chr1,3284704,3286247,-
1,0.0,ENSMUST00000208660,ENSMUSG00000025900,Rp1,chr1,4069779,4069782,-
2,0.0,ENSMUST00000027032,ENSMUSG00000025900,Rp1,chr1,4414368,4414825,-
3,0.0,ENSMUST00000027035,ENSMUSG00000025902,Sox17,chr1,4561153,4561941,-
4,0.013559,ENSMUST00000195555,ENSMUSG00000025902,Sox17,chr1,4561472,4561941,-


In [15]:
# Length of each 3'UTR record as End minus Start.
# NOTE(review): this is a base count only if Start is 0-based and End exclusive - confirm for the pyranges version used.
dfTpm['Length'] = dfTpm['End'].sub(dfTpm['Start'])

In [16]:
# Spot-check: all rows for the gene Cnbp.
dfTpm[dfTpm['gene_name'] == 'Cnbp']

Unnamed: 0,Mean S200WT_2iL TPM,stable_tx_id,stable_gene_id,gene_name,Chromosome,Start,End,Strand,Length
28978,66.771401,ENSMUST00000204653,ENSMUSG00000030057,Cnbp,chr6,87819596,87821023,-,1427
28979,0.0,ENSMUST00000032138,ENSMUSG00000030057,Cnbp,chr6,87819596,87821023,-,1427
28980,0.0,ENSMUST00000204890,ENSMUSG00000030057,Cnbp,chr6,87820063,87821023,-,960
28981,0.0129,ENSMUST00000113619,ENSMUSG00000030057,Cnbp,chr6,87820063,87821023,-,960
28982,0.0,ENSMUST00000113617,ENSMUSG00000030057,Cnbp,chr6,87820868,87821023,-,155


In [16]:
# Keep one row per gene: the one with the highest mean TPM, ties broken by the
# longer 3'UTR. Rows are ranked best-first, then only the first row of each
# gene is kept.
# NOTE(review): Length is per three_prime_utr record, so if a transcript has
# several records only its longest one survives - confirm this is intended.
ranked = dfTpm.sort_values(by=['Mean S200WT_2iL TPM', 'Length'], ascending=False)
dfTpm = ranked.drop_duplicates(subset=['stable_gene_id'], keep='first')

In [17]:
# Write the result as a headerless, tab-separated BED-like file:
# chromosome, start, end, gene ID, mean TPM, strand, then gene name and
# transcript ID as extra columns.
bed_columns = ['Chromosome', 'Start', 'End', 'stable_gene_id', 'Mean S200WT_2iL TPM', 'Strand', 'gene_name', 'stable_tx_id']
dfTpmBed = dfTpm.loc[:, bed_columns]
bed_path = f'{outpath}/ThreePrimeUtrsOfMostExpressedTxInS200WT2iL.bed'
dfTpmBed.to_csv(bed_path, quoting=None, sep='\t', header=False, index=False)