In [20]:
%matplotlib inline


import numpy as np
from collections import OrderedDict, Counter, defaultdict
import pandas as pd

import Bio
from Bio import SeqIO
from Bio.Seq import Seq

import seaborn as sns
import matplotlib.pyplot as plt

import glob

import subprocess
from subprocess import call

import re

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas

import pickle

# Metadata

In [13]:
metadata_pc_g25 = pd.read_csv('../tmp_res/metadata_pc_g25.txt', sep='\t')
# open metadata with scores and metrics
meta = pd.read_csv('../tmp_res/local_and_global_df_g25ovlp_g35ovlp_g38ovlp_refseqovlp_strand_FRAMES_PhyloCSF.txt', sep='\t')

# open SET1 (PhyloSET) and SET2 (RiboSET)
PhyloSET = pd.read_csv('../tmp_res/SET1.txt', sep='\t')
RiboSET = pd.read_csv('../tmp_res/SET2.txt', sep='\t')
RiboSET = RiboSET[(~RiboSET['tr_id'].str.contains('PAR')) & (~RiboSET['Riboseq_Summary'].str.contains('ATG'))]
RiboSET.shape[0]

390

In [77]:
# ~4k RiboSET ext with non-zero CDS cov in my data 
RiboSET_EXT_cds_cov_nonzero = pd.read_csv('tmp_res/RiboSET_EXT_cds_cov_nonzero_theor_len_20codons_3663genes.txt', sep='\t')
RiboSET_EXT_cds_cov_nonzero.shape[0]

3663

In [76]:
pairs_4k_final = pd.read_csv('tmp_res/No_Upstream_Translation_CDS_match_4k.txt', sep='\t')
pairs_392_final = pd.read_csv('tmp_res/No_Upstream_Translation_CDS_match_392.txt', sep='\t')

# SignalP

## RiboSET

In [24]:
RiboSET['start_codon_local_coo_start'] = [int(x.split('; ')[1].split('-')[0])-1 for x in RiboSET['Riboseq_Summary'].tolist()]

riboset_fasta = open('../data/tmhmm_and_phobius_data/riboset.fasta', 'w')

li = []

for row in RiboSET[['tr_id', 'gene', 'N_term_start1', 'N_term_end1', 'start_codon_local_coo_start']].to_numpy():
    tr_id  = row[0]
    gene = row[1]
    N_term_start = row[2]
    N_term_stop = row[3]
    N_term_start_pred = row[-1] 
    tr_seq = metadata_pc_g25[metadata_pc_g25['tr_id'] == tr_id].iloc[0].transcript_seq 
    cds_stop = metadata_pc_g25[metadata_pc_g25['tr_id'] == tr_id].iloc[0].cds_stop_pos
    cds_start = metadata_pc_g25[metadata_pc_g25['tr_id'] == tr_id].iloc[0].cds_start_pos
    
    nte_seq_nt1 = tr_seq[N_term_start:cds_stop]
    nte_seq_nt2 = tr_seq[N_term_start_pred:cds_stop]
    
    nte_seq_aa1 = str(Seq(nte_seq_nt1).translate())
    nte_seq_aa2 = 'M'+str(Seq(nte_seq_nt2).translate())[1:]
    
    # position of CDS in NTE_aa: 
    # 5'    N_term_start_pred    cds_start_pos
    # 
    cds_start_aa_in_ext2 = (cds_start - N_term_start_pred) // 3
    
    li.append([tr_id, gene, nte_seq_aa1, nte_seq_aa2, cds_start_aa_in_ext2, 
              cds_start_aa_in_ext2])
    
    riboset_fasta.write('>'+tr_id+'_'+gene+'\n')
    riboset_fasta.write(nte_seq_aa2[:-1]+'\n')
  
riboset_fasta.close()
        
NTE_aa_df_ribo = pd.DataFrame(li, columns = ['tr_id', 'gene', 'nte_seq_aa1', 'nte_seq_aa2',
                                       'cds_start_aa_in_ext2', 'ext_len'])


In [25]:
!tail ../data/tmhmm_and_phobius_data/riboset.fasta

>ENST00000304661.5_C1GALT1C1
MVRSVTEWCANVRGNPCAAALSCPQAVLDAGKMLSESSSFLKGVMLGSIFCALITMLGHIRIGHGNRMHHHEHHHLQAPNKEDILKISEDERMELSKSFRVYCIILVKPKDVSLWAAVKETWTKHCDKAEFFSSENVKVFESINMDTNDMWLMMRKAYKYAFDKYRDQYNWFFLARPTTFAIIENLKYFLLKKDPSQPFYLGHTIKSGDLEYVGMEGGIVLSVESMKRLNSLLNIPEKCPEQGGMIWKISEDKQLAVCLKYAGVFAENAEDADGKDVFNTKSVGLSIKEAMTYHPNQVVEGCCSDMAVTFNGLTPNQMHVMMYGVYRLRAFGHIFNDALVFLPPNGSDND
>ENST00000371122.8_SMARCA1
MQPLPPPRSPRPLPCPPTRSRSHGMEQDTAAVAATVAAADATATIVVIEDEQPGPSTSQEEGAAAAATEATAATEKGEKKKEKNVSSFQLKLAAKAPKSEKEMDPEYEEKMKADRAKRFEFLLKQTELFAHFIQPSAQKSPTSPLNMKLGRPRIKKDEKQSLISAGDYRHRRTEQEEDEELLSESRKTSNVCIRFEVSPSYVKGGPLRDYQIRGLNWLISLYENGVNGILADEMGLGKTLQTIALLGYLKHYRNIPGPHMVLVPKSTLHNWMNEFKRWVPSLRVICFVGDKDARAAFIRDEMMPGEWDVCVTSYEMVIKEKSVFKKFHWRYLVIDEAHRIKNEKSKLSEIVREFKSTNRLLLTGTPLQNNLHELWALLNFLLPDVFNSADDFDSWFDTKNCLGDQKLVERLHAVLKPFLLRRIKTDVEKSLPPKKEIKIYLGLSKMQREWYTKILMKDIDVLNSSGKMDKMRLLNILMQLRKCCNHPYLFDGAEPGPPYTTDEHIVSNSGKMVVLDKLLAKLKEQGSRVLIFSQMTRLLDILEDYCMWRGYEYCRLDGQTPHEEREDKFLEVEFLGQREAIEAFNAPNSS

In [27]:
signalP_df_r = pd.read_csv('../data/SignalP_RiboSET_output_protein_type.txt', sep='\t', comment='#', header=None)
signalP_df_r.columns = ['id', 'class', 'SP_score', 'Other_score', 'SP_seq']
signalP_df_r['tr_id'] = [x.split('_')[0] for x in signalP_df_r['id'].tolist()]
signalP_df_r['gene'] = [x.split('_')[-1] for x in signalP_df_r['id'].tolist()]

# exclude AUGs 
signalP_df_r = signalP_df_r[~signalP_df_r['gene'].isin(['STIM2', 'AP3S1', 'PTPRJ'])]

In [28]:
tmp = signalP_df_r[(signalP_df_r['class'] != 'OTHER') & (signalP_df_r['SP_seq'] != 'CS pos: ?. Probable protein fragment')]
tmp['SP_end_pos'] = [int(x.split('CS pos: ')[1].split('-')[0]) for x in tmp['SP_seq'].tolist()]
tmp = tmp.merge(NTE_aa_df_ribo[['tr_id', 'gene', 'cds_start_aa_in_ext2']], on=['tr_id', 'gene'], how='inner')
tmp['SP_start_pos'] = tmp['SP_end_pos'] - 16
tmp[0:2]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,id,class,SP_score,Other_score,SP_seq,tr_id,gene,SP_end_pos,cds_start_aa_in_ext2,SP_start_pos
0,ENST00000399728.5_STMN1,SP(Sec/SPI),0.769171,0.230829,CS pos: 19-20. ASS-DI. Pr: 0.6043,ENST00000399728.5,STMN1,19,15,3
1,ENST00000370109.7_DPH5,SP(Sec/SPI),0.687048,0.312952,CS pos: 25-26. GDA-KD. Pr: 0.5493,ENST00000370109.7,DPH5,25,13,9


### CS before CDS start

In [29]:
tmp[tmp['cds_start_aa_in_ext2'] >= tmp['SP_end_pos']]

Unnamed: 0,id,class,SP_score,Other_score,SP_seq,tr_id,gene,SP_end_pos,cds_start_aa_in_ext2,SP_start_pos
23,ENST00000395841.6_RAE1,SP(Sec/SPI),0.564221,0.435779,CS pos: 16-17. ARA-GS. Pr: 0.3038,ENST00000395841.6,RAE1,16,40,0


### genes with CS-16 position before CDS 

In [30]:
tmp[tmp['cds_start_aa_in_ext2'] >= tmp['SP_start_pos']]['gene'].tolist()

['STMN1', 'DPH5', 'ADAM15', 'ZNF622', 'SUPT4H1', 'RAE1']

# Run same for matched set 

In [35]:
a = pairs_392_final[['tr_id']]
a.columns = ['tr_id1']

b= a.merge(meta, on=['tr_id1'], how='inner')

b.shape

(390, 30)

In [36]:
b[0:2]

Unnamed: 0,tr_id1,tr_id,N_term_start,N_term_end,len,N_term_start1,N_term_end1,len_codons,gene,50len_flag,...,max_sum_overlap_g38_all_exons_strandless,max_sum_overlap_refseq_all_exons,max_sum_overlap_refseq_all_exons_strandless,ovlp,ovlp_strandless,ovlp2,ovlp2_strandless,PhyloCSF120score,number_of_records,records_names
0,ENST00000377269,ENST00000377269.3,719,779,60,719,779,20.0,UBA1,719,...,0,0,0,0;0;0,0;0;0,0;0;0;0,0;0;0;0,-267.1708,31,Human; Chimp; Bonobo; Gorilla; Orangutan; Gibb...
1,ENST00000360708,ENST00000360708.9,0,8,8,2,8,2.0,ZC3HC1,2,...,0,0,0,0;0;0,0;0;0,0;0;0;0,0;0;0;0,-57.5643,117,Human; Chimp; Bonobo; Gorilla; Orangutan; Gibb...


In [39]:
match_fasta = open('tmp_res/match_390.fasta', 'w')

li = []

for row in b[['tr_id', 'gene', 'N_term_start1', 'N_term_end1']].to_numpy():
    tr_id  = row[0]
    gene = row[1]
    N_term_start = row[2]
    N_term_stop = row[3]
    
    tr_seq = metadata_pc_g25[metadata_pc_g25['tr_id'] == tr_id].iloc[0].transcript_seq 
    cds_stop = metadata_pc_g25[metadata_pc_g25['tr_id'] == tr_id].iloc[0].cds_stop_pos
    cds_start = metadata_pc_g25[metadata_pc_g25['tr_id'] == tr_id].iloc[0].cds_start_pos
    
    nte_seq_nt1 = tr_seq[N_term_start:cds_stop]  
    nte_seq_aa1 = str(Seq(nte_seq_nt1).translate())
    
    cds_start_aa_in_ext2 = (cds_start - N_term_start) // 3
    
    li.append([tr_id, gene, nte_seq_aa1, cds_start_aa_in_ext2])
    
    match_fasta.write('>'+tr_id+'_'+gene+'\n')
    match_fasta.write(nte_seq_aa1+'\n')
  
match_fasta.close()
        
match_aa_df = pd.DataFrame(li, columns = ['tr_id', 'gene', 'nte_seq_aa1', 
                                       'cds_start_aa_in_ext2'])


In [40]:
!tail tmp_res/match_390.fasta

>ENST00000343304.10_LRRC41
RSAGDTLGLAHLPKMAAPEAWRARSCWFCEVAAATTMEATSREAAPAKSSASGPNAPPALFELCGRAVSAHMGVLESGVWALPGPILQSILPLLNIYYLERIEETALKKGLSTQAIWRRLWDELMKTRPSSLESVTCWRAKFMEAFFSHVLRGTIDVSSDRRLCDQRFSPLLHSSRHVRQLTICNMLQGATELVAEPNRRVLETLASSLHTLKFRHLLFSDVAAQQSLRQLLHQLIHHGAVSQVSLYSWPVPESALFILILTMSAGFWQPGPGGPPCRLCGEASRGRAPSRDEGSLLLGSRRPRRDAAERCAAALMASRRKSEAKQMPRAAPATRVTRRSTQESLTAGGTDLKRELHPPATSHEAPGTKRSPSAPAATSSASSSTSSYKRAPASSAPQPKPLKRFKRAAGKKGARTRQGPGAESEDLYDFVFIVAGEKEDGEEMEIGEVACGALDGSDPSCLGLPALEASQRFRSISTLELFTVPLSTEAALTLCHLLSSWVSLESLTLSYNGLGSNIFRLLDSLRALSGQAGCRLRALHLSDLFSPLPILELTRAIVRALPLLRVLSIRVDHPSQRDNPGVPGNAGPPSHIIGDEEIPENCLEQLEMGFPRGAQPAPLLCSVLKASGSLQQLSLDSATFASPQDFGLVLQTLKEYNLALKRLSFHDMNLADCQSEVLFLLQNLTLQEITFSFCRLFEKRPAQFLPEMVAAMKGNSTLKGLRLPGNRLGNAGLLALADVFSEDSSSSLCQLDISSNCIKPDGLLEFAKRLERWGRGAFGHLRLFQNWLDQDAVTAREAIRRLRATCHVVSDSWDSSQAFADYVSTM*
>ENST00000394066.6_KLC2
ARRPPARTLVLTDATAMAMMVFPREEKLSQDEIVLGTKAVIQGLETLRGEHRALLAPLVAPEAGEAEPGSQERCILLRRSLEAIELGLGEAQEEKGDVPKDTLDDLFPNEDEQSPAPS

In [59]:
signalP_df_m = pd.read_csv('tmp_res/SignalP_matched_390.txt', sep='\t', comment='#', header=None)

print (signalP_df_m.shape[0])

signalP_df_m.columns = ['id', 'class', 'SP_score', 'Other_score', 'SP_seq']
signalP_df_m['tr_id'] = [x.split('_')[0] for x in signalP_df_m['id'].tolist()]
signalP_df_m['gene'] = [x.split('_')[-1] for x in signalP_df_m['id'].tolist()]

390


In [60]:
tmp = signalP_df_m[(signalP_df_r['class'] != 'OTHER') & (signalP_df_m['SP_seq'].notna())]
tmp['SP_end_pos'] = [int(x.split('CS pos: ')[1].split('-')[0]) for x in tmp['SP_seq'].tolist()]
tmp = tmp.merge(match_aa_df[['tr_id', 'gene', 'cds_start_aa_in_ext2']], on=['tr_id', 'gene'], how='inner')
tmp['SP_start_pos'] = tmp['SP_end_pos'] - 16
tmp[0:2]

  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,id,class,SP_score,Other_score,SP_seq,tr_id,gene,SP_end_pos,cds_start_aa_in_ext2,SP_start_pos
0,ENST00000529905.5_PPT1,SP,0.000524,0.999461,CS pos: 22-23. Pr: 0.9806,ENST00000529905.5,PPT1,22,2,6
1,ENST00000546933.5_PPP1CC,SP,0.000258,0.9997,CS pos: 23-24. Pr: 0.9777,ENST00000546933.5,PPP1CC,23,1,7


In [64]:
match_aa_df.sort_values(by='cds_start_aa_in_ext2')

Unnamed: 0,tr_id,gene,nte_seq_aa1,cds_start_aa_in_ext2
108,ENST00000370683.5,FHL1,AMASHRHSGPSSYKVGTMAEKFDCHYCRDPLQGKKYVQKDGHHCCL...,1
121,ENST00000515119.5,LRPAP1,KMAPRRVRSFLRGLPALLLLLLFLGPWPAASHGGKYSREKNQPKPS...,1
167,ENST00000453466.1,PMPCB,EMAAAAARVVLSSAARRRLWGFSESLLIRGAAGRSLYFGENRLRST...,1
277,ENST00000532402.5,GANAB,KMAAVAAVAARRRRSWASLVLAFLGVCLGITLAVDRSNFKTCEESS...,1
41,ENST00000536961.6,CINP,QMNGTIYANECQQIRHPNSKTLGTVTPRKPVLSVSARKIKDNAADW...,1
...,...,...,...,...
171,ENST00000471855.1,RPL36A,SDGFKEVNVPKTRRTFCKKCGKHQPHKVTQYKKGKDSLYAQGKRRY...,86
14,ENST00000358794.9,STIP1,DPHPDARAKGWGPVPLAIPGTAWNSRHSGARGKATQRPRSRRRHSY...,95
174,ENST00000391910.7,STRN4,QPTWPAIREDGATAGRPFSQSKSNCPGEWCLGVEGVRLFLWGEVGG...,146
298,ENST00000396359.1,FABP5,ERARLLPARGPQDSGGGPPRGPPTSASFPSPKRQPFRIASCPLARA...,156


In [66]:
match_aa_df[match_aa_df['cds_start_aa_in_ext2'] > 10]

Unnamed: 0,tr_id,gene,nte_seq_aa1,cds_start_aa_in_ext2
0,ENST00000377269.3,UBA1,VAGITGICHHAQLLFVFLVEMGFHHVGQAGLEPLTSGDPSALASQS...,20
2,ENST00000407537.5,COMT,RVCDLNPWASASGLLCSALCLLGPLTRLLSGSQATCLEGHTRRISN...,60
4,ENST00000425345.1,EPDR1,FSQRDLSRGGSLGDSMVRDWEGRCAHRGRPAGGGLSNTQRGGGRLF...,15
5,ENST00000591305.5,PTPN2,PALSPDRAGPEPLRRRRLCSRQLAPAAMPTTIEREFEELDTQRRWQ...,27
6,ENST00000549190.5,PPHLN1,PAGGAGVAAVAAAASGAGAASPRCSPVAPLVPPAAGDCLALQSPSE...,57
...,...,...,...,...
384,ENST00000587806.5,ELOF1,WASSVLNGVLLCCPGWSAMVRSRLTAVSASWVQAHPPADMGRRKSK...,18
385,ENST00000343304.10,LRRC41,RSAGDTLGLAHLPKMAAPEAWRARSCWFCEVAAATTMEATSREAAP...,14
386,ENST00000394066.6,KLC2,ARRPPARTLVLTDATAMAMMVFPREEKLSQDEIVLGTKAVIQGLET...,16
387,ENST00000356861.9,TNPO2,LGDVARIGLPCAMDWQPDEQGLQQVLQLLKDSQSPNTATQRIVQDK...,12


### CS before CDS start

In [61]:
tmp[tmp['cds_start_aa_in_ext2'] >= tmp['SP_end_pos']]

Unnamed: 0,id,class,SP_score,Other_score,SP_seq,tr_id,gene,SP_end_pos,cds_start_aa_in_ext2,SP_start_pos


### genes with CS-16 position before CDS 

In [62]:
tmp[tmp['cds_start_aa_in_ext2'] >= tmp['SP_start_pos']]['gene'].tolist()

[]

## top-5000 RiboSET 

In [69]:
RiboSET_EXT_cds_cov_nonzero.shape[0]

4341

In [74]:
RiboSET_EXT_cds_cov_nonzero['start_codon_local_coo_start'] = [int(x.split('; ')[1].split('-')[0])-1 for x in RiboSET_EXT_cds_cov_nonzero['Riboseq_Summary'].tolist()]

riboset_fasta = open('tmp_res/riboset_ext.fasta', 'w')

li = []

for row in RiboSET_EXT_cds_cov_nonzero[['tr_id', 'gene', 'N_term_start1', 'N_term_end1', 'start_codon_local_coo_start']].to_numpy():
    tr_id  = row[0]
    gene = row[1]
    N_term_start = row[2]
    N_term_stop = row[3]
    N_term_start_pred = row[-1] 
    tr_seq = metadata_pc_g25[metadata_pc_g25['tr_id'] == tr_id].iloc[0].transcript_seq 
    cds_stop = metadata_pc_g25[metadata_pc_g25['tr_id'] == tr_id].iloc[0].cds_stop_pos
    cds_start = metadata_pc_g25[metadata_pc_g25['tr_id'] == tr_id].iloc[0].cds_start_pos
    
    nte_seq_nt1 = tr_seq[N_term_start:cds_stop]
    nte_seq_nt2 = tr_seq[N_term_start_pred:cds_stop]
    
    nte_seq_aa1 = str(Seq(nte_seq_nt1).translate())
    nte_seq_aa2 = 'M'+str(Seq(nte_seq_nt2).translate())[1:]
    
    # position of CDS in NTE_aa: 
    # 5'    N_term_start_pred    cds_start_pos
    # 
    cds_start_aa_in_ext2 = (cds_start - N_term_start_pred) // 3
    
    li.append([tr_id, gene, nte_seq_aa1, nte_seq_aa2, cds_start_aa_in_ext2, 
              cds_start_aa_in_ext2])
    
    riboset_fasta.write('>'+tr_id+'_'+gene+'\n')
    riboset_fasta.write(nte_seq_aa2[:-1]+'\n')
  
riboset_fasta.close()
        
NTE_aa_EXT_df_ribo = pd.DataFrame(li, columns = ['tr_id', 'gene', 'nte_seq_aa1', 'nte_seq_aa2',
                                       'cds_start_aa_in_ext2', 'ext_len'])


NameError: name 'RiboSET_EXT_cds_cov_nonzero' is not defined

In [71]:
!tail tmp_res/riboset_ext.fasta

>ENST00000304661.5_C1GALT1C1
MVRSVTEWCANVRGNPCAAALSCPQAVLDAGKMLSESSSFLKGVMLGSIFCALITMLGHIRIGHGNRMHHHEHHHLQAPNKEDILKISEDERMELSKSFRVYCIILVKPKDVSLWAAVKETWTKHCDKAEFFSSENVKVFESINMDTNDMWLMMRKAYKYAFDKYRDQYNWFFLARPTTFAIIENLKYFLLKKDPSQPFYLGHTIKSGDLEYVGMEGGIVLSVESMKRLNSLLNIPEKCPEQGGMIWKISEDKQLAVCLKYAGVFAENAEDADGKDVFNTKSVGLSIKEAMTYHPNQVVEGCCSDMAVTFNGLTPNQMHVMMYGVYRLRAFGHIFNDALVFLPPNGSDND
>ENST00000371122.8_SMARCA1
MQPLPPPRSPRPLPCPPTRSRSHGMEQDTAAVAATVAAADATATIVVIEDEQPGPSTSQEEGAAAAATEATAATEKGEKKKEKNVSSFQLKLAAKAPKSEKEMDPEYEEKMKADRAKRFEFLLKQTELFAHFIQPSAQKSPTSPLNMKLGRPRIKKDEKQSLISAGDYRHRRTEQEEDEELLSESRKTSNVCIRFEVSPSYVKGGPLRDYQIRGLNWLISLYENGVNGILADEMGLGKTLQTIALLGYLKHYRNIPGPHMVLVPKSTLHNWMNEFKRWVPSLRVICFVGDKDARAAFIRDEMMPGEWDVCVTSYEMVIKEKSVFKKFHWRYLVIDEAHRIKNEKSKLSEIVREFKSTNRLLLTGTPLQNNLHELWALLNFLLPDVFNSADDFDSWFDTKNCLGDQKLVERLHAVLKPFLLRRIKTDVEKSLPPKKEIKIYLGLSKMQREWYTKILMKDIDVLNSSGKMDKMRLLNILMQLRKCCNHPYLFDGAEPGPPYTTDEHIVSNSGKMVVLDKLLAKLKEQGSRVLIFSQMTRLLDILEDYCMWRGYEYCRLDGQTPHEEREDKFLEVEFLGQREAIEAFNAPNSS

In [72]:
!grep '>' tmp_res/riboset_ext.fasta | wc -l

390


## ext match 

In [None]:
a = pairs_4k_final[['tr_id']]
a.columns = ['tr_id1']

b= a.merge(meta, on=['tr_id1'], how='inner')

b.shape

In [None]:
match_fasta = open('tmp_res/match_4k.fasta', 'w')

li = []

for row in b[['tr_id', 'gene', 'N_term_start1', 'N_term_end1']].to_numpy():
    tr_id  = row[0]
    gene = row[1]
    N_term_start = row[2]
    N_term_stop = row[3]
    
    tr_seq = metadata_pc_g25[metadata_pc_g25['tr_id'] == tr_id].iloc[0].transcript_seq 
    cds_stop = metadata_pc_g25[metadata_pc_g25['tr_id'] == tr_id].iloc[0].cds_stop_pos
    cds_start = metadata_pc_g25[metadata_pc_g25['tr_id'] == tr_id].iloc[0].cds_start_pos
    
    nte_seq_nt1 = tr_seq[N_term_start:cds_stop]  
    nte_seq_aa1 = str(Seq(nte_seq_nt1).translate())
    
    cds_start_aa_in_ext2 = (cds_start - N_term_start) // 3
    
    li.append([tr_id, gene, nte_seq_aa1, cds_start_aa_in_ext2])
    
    match_fasta.write('>'+tr_id+'_'+gene+'\n')
    match_fasta.write(nte_seq_aa1+'\n')
  
match_fasta.close()
        
match_aa_df = pd.DataFrame(li, columns = ['tr_id', 'gene', 'nte_seq_aa1', 
                                       'cds_start_aa_in_ext2'])


## PhyloSET

In [41]:
li = []

f = open('../data/tmhmm_and_phobius_data/phyloset.fasta', 'w')

for row in PhyloSET[['tr_id', 'gene', 'N_term_start1', 'N_term_end1']].to_numpy():
    tr_id  = row[0]
    gene = row[1]
    N_term_start = row[2]
    N_term_stop = row[3]
    tr_seq = metadata_pc_g25[metadata_pc_g25['tr_id'] == tr_id].iloc[0].transcript_seq 
    cds_stop = metadata_pc_g25[metadata_pc_g25['tr_id'] == tr_id].iloc[0].cds_stop_pos
    cds_start = metadata_pc_g25[metadata_pc_g25['tr_id'] == tr_id].iloc[0].cds_start_pos
    
    nte_seq_nt1 = tr_seq[N_term_start:cds_stop]
    
    nte_seq_aa1 = str(Seq(nte_seq_nt1).translate())
    
    # position of CDS in NTE_aa: 
    # 5'    N_term_start_pred    cds_start_pos
    # 
    cds_start_aa_in_ext2 = (cds_start - N_term_start) // 3
    
    li.append([tr_id, gene, nte_seq_aa1, cds_start_aa_in_ext2])
    
    f.write('>'+tr_id+'_'+gene+'\n')
    f.write(nte_seq_aa1[:-1]+'\n')
    
NTE_aa_df = pd.DataFrame(li, columns = ['tr_id', 'gene', 'nte_seq_aa1', 
                                       'cds_start_aa_in_ext2'])

f.close()

In [42]:
!tail ../data/tmhmm_and_phobius_data/phyloset.fasta

>ENST00000373519.1_PABPC1L2A
ADADADAKVAAEVAAEVAAAAAAADADADETLGDCEGNPDFQMASLYVGDLHPEVTEAMLYEKFSPAGPILSIRICRDKITRRSLGYAYVNYQQPVDAKRALETLNFDVIKGRPVRIMWSQRDPSLRKSGVGNVFIKNLGKTIDNKALYNIFSAFGNILSCKVACDEKGPKGYGFVHFQKQESAERAIDVMNGMFLNYRKIFVGRFKSHKEREAERGAWARQSTSADVKDFEEDTDEEATLR
>ENST00000371558.6_UBE2A
TACPALDVPPPSLCSQVGSAPVYTGVVLSRRQTDPRLRRGSAVPLGASASPSPASPASSASSPAAGTRDPSVCPTPDPARDMSTPARRRLMRDFKRLQEDPPAGVSGAPSENNIMVWNAVIFGPEGTPFEDGTFKLTIEFTEEYPNKPPTVRFVSKMFHPNVYADGSICLDILQNRWSPTYDVSSILTSIQSLLDEPNPNSPANSQAAQLYQENKREYEKRVSAIVEQSWRDC
>ENST00000346330.6_UBE2A
TGVVLSRRQTDPRLRRGSAVPLGASASPSPASPASSASSPAAGTRDPSVCPTPDPARDMSTPARRRLMRDFKRLQEDPPAGVSGAPSENNIMVWNAVIFGPEGTPFEDGTFKLTIEFTEEYPNKPPTVRFVSKMFHPNDGSICLDILQNRWSPTYDVSSILTSIQSLLDEPNPNSPANSQAAQLYQENKREYEKRVSAIVEQSWRDC
>ENST00000625938.2_UBE2A
VLSRRQTDPRLRRGSAVPLGASASPSPASPASSASSPAAGTRDPSVCPTPDPARDMSTPARRRLMRDFKRLQEDPPAGVSGAPSENNIMVWNAVIFGPEGTPFEDVYADGSICLDILQNRWSPTYDVSSILTSIQSLLDEPNPNSPANSQAAQLYQENKREYEKRVSAIVEQSWRDC
>ENST00000631185.2_UBE2A

In [44]:
signalP_df = pd.read_csv('../data/SignalP_PCSF.txt', sep='\t', comment='#', header=None)
signalP_df.columns = ['id', 'class', 'SP_score', 'Other_score', 'SP_seq']
signalP_df['tr_id'] = [x.split('_')[0] for x in signalP_df['id'].tolist()]
signalP_df['gene'] = [x.split('_')[-1] for x in signalP_df['id'].tolist()]

In [45]:
print ('# of genes with pred signalP from PhyloSET')
signalP_df[(signalP_df['class'] != 'OTHER') & 
           (signalP_df['SP_seq'] != 'CS pos: ?. Probable protein fragment')].shape[0]

# of genes with pred signalP from PhyloSET


0

# TargetP 

## RiboSET 

In [46]:
targetP_riboset = pd.read_csv('../data/TargetP_RiboSET.txt', sep='\t', comment='#', header=None)
targetP_riboset.columns = ['ids', 'class', 'score_other', 'score_sp', 'score_m', 'seq_m']

targetP_riboset['tr_id'] = [x.split('_')[0] for x in targetP_riboset['ids'].tolist()]
targetP_riboset['gene'] = [x.split('_')[-1] for x in targetP_riboset['ids'].tolist()]

targetP_riboset = targetP_riboset[~targetP_riboset['gene'].isin(['STIM2', 'AP3S1', 'PTPRJ'])]

print ('genes from RiboSET with mito signal: ')

print (targetP_riboset[(targetP_riboset['seq_m'].notna()) & (targetP_riboset['class'] == 'mTP')].gene.nunique())

genes from RiboSET with mito signal: 
16


In [48]:
tmp = targetP_riboset[(targetP_riboset['seq_m'].notna()) & 
                      (targetP_riboset['class'] == 'mTP')].merge(NTE_aa_df_ribo, on=['gene', 'tr_id'], how='inner')

print (tmp.shape[0])

tmp['CS_pos'] = [int(x.split('CS pos:')[1].split('-')[0]) for x in tmp['seq_m'].tolist()]
tmp['m_start'] = tmp['CS_pos']-20

16


### check if CS-n < cds 

In [51]:
tmp[tmp['cds_start_aa_in_ext2'] >= tmp['CS_pos']].gene.nunique()

5

### check if CS-n < m_start 

In [52]:
# NTE start ---- m_start -----  CS_pos ------ CDS start  
tmp[tmp['cds_start_aa_in_ext2'] >= tmp['m_start']].gene.nunique()

11

## PhyloSET

In [54]:
targetP_phyloset = pd.read_csv('../data/TargetP_PhyloSET.txt', sep='\t', comment='#', header=None)
targetP_phyloset.columns = ['ids', 'class', 'score_other', 'score_sp', 'score_m', 'seq_m']

targetP_phyloset['tr_id'] = [x.split('_')[0] for x in targetP_phyloset['ids'].tolist()]
targetP_phyloset['gene'] = [x.split('_')[-1] for x in targetP_phyloset['ids'].tolist()]

print ('genes from RiboSET with mito signal: ')

print (targetP_phyloset[(targetP_phyloset['seq_m'].notna()) & (targetP_phyloset['class'] == 'mTP')].gene.nunique())

genes from RiboSET with mito signal: 
0


## Match RiboSET 

In [None]:
targetP_m = pd.read_csv('tmp_res/X.txt', sep='\t', comment='#', header=None)
targetP_m.columns = ['ids', 'class', 'score_other', 'score_sp', 'score_m', 'seq_m']

targetP_m['tr_id'] = [x.split('_')[0] for x in targetP_m['ids'].tolist()]
targetP_m['gene'] = [x.split('_')[-1] for x in targetP_m['ids'].tolist()]

print ('genes from matched set 390 with mito signal: ')
print (targetP_m[(targetP_m['seq_m'].notna()) & (targetP_m['class'] == 'mTP')].gene.nunique())

In [None]:
tmp = targetP_m[(targetP_m['seq_m'].notna()) & 
                      (targetP_m['class'] == 'mTP')].merge(match_aa_df, on=['gene', 'tr_id'], how='inner')

print (tmp.shape[0])

tmp['CS_pos'] = [int(x.split('CS pos:')[1].split('-')[0]) for x in tmp['seq_m'].tolist()]
tmp['m_start'] = tmp['CS_pos']-20

### check if CS-n < cds 

In [None]:
tmp[tmp['cds_start_aa_in_ext2'] >= tmp['CS_pos']].gene.nunique()

### check if CS-n < m_start 

In [None]:
# NTE start ---- m_start -----  CS_pos ------ CDS start  
tmp[tmp['cds_start_aa_in_ext2'] >= tmp['m_start']].gene.nunique()

## RiboSET ext 

# Phobius

# TMHMM