In [10]:
%matplotlib inline

import numpy as np
import collections
from collections import OrderedDict, Counter, defaultdict
import pandas as pd

import Bio
from Bio import SeqIO

import seaborn as sns
import matplotlib.pyplot as plt

import glob

import subprocess
from subprocess import call

import re

import pickle

from gtfparse import read_gtf

# PMID: 24623590

In [11]:
ext_df = pd.read_csv('data/Lee_PMC4014282_extensions.txt', sep='\t', header=None)

ext_df = ext_df[[12, 13, 14, 8, 9, 16]]

ext_df.columns = ['tr_id', 'gene', 'start_codon', 'peptide', 'pos_relative_to_TIS', 'type']

In [12]:
ext_df[ext_df['type'] == 'n-term-ext'].to_csv('plots/final_S/SupY_PMC4014282_17extensions_Nterminomics_Riboseq.txt', sep='\t', index=False)

In [13]:
ext_df.tr_id.tolist()

['NM_000034',
 'NM_001199697',
 'NM_001143985',
 'NM_004396',
 'NM_004461',
 'NM_004860',
 'NM_004494',
 'NM_004494',
 'NM_003510',
 'NM_006805',
 'NM_001199155',
 'NM_199187',
 'NM_004539',
 'NM_000269',
 'NM_145886',
 'NM_002675',
 'NM_001008709',
 'NM_006267',
 'NM_001098426',
 'NM_001040011',
 'NM_032796',
 'NM_014501',
 'NM_001166221']

# Compare with PhyloSET and RiboSET

In [14]:
# open SET1 (PhyloSET) and SET2 (RiboSET)
SET1 = pd.read_csv('tmp_res/SET1.txt', sep='\t')
SET2 = pd.read_csv('tmp_res/SET2.txt', sep='\t')

In [15]:
', '.join(list(set(SET1.gene.tolist()).intersection(ext_df.gene.tolist())))

'FXR2, HNRNPA0'

In [16]:
', '.join(list(set(SET2.gene.tolist()).intersection(ext_df.gene.tolist())))

'SYAP1, KAT7, HNRNPA0, HDGF, BAG6, FXR2, NARS'

In [17]:
ext_df

Unnamed: 0,tr_id,gene,start_codon,peptide,pos_relative_to_TIS,type
0,NM_000034,ALDOA,CTG,MDFQGR,-57,n-term-ext
1,NM_001199697,BAG6,ACG,MEVAVGGR,-36,n-term-ext
2,NM_001143985,BANF1,GTG,MEESSSGLR,-42,n-term-ext
3,NM_004396,DDX5,ATT,MDAMSGYSSDR,-9,n-term-ext
4,NM_004461,FARSA,CTG,MEGVMADGQVAELLLR,-12,n-term-ext
5,NM_004860,FXR2,GTG,AETKAAAADGER,-219,n-term-ext
6,NM_004494,HDGF,GTG,AAPELGPGATIEAGAAR,-150,n-term-ext
7,NM_004494,HDGF,ATT,MEFQTQTTAR,-255,n-term-ext
8,NM_003510,HIST1H2AK,CTG,MELAGNAAR,190,n-term-trunc
9,NM_006805,HNRNPA0,ACG,AATAKPR,-61,n-term-ext


# Comprehensive analysis of human protein N-termini enables assessment of various protein forms


In [22]:
# open metadata with scores and metrics
meta = pd.read_csv('tmp_res/METADATA_CURRENT.txt', sep='\t')

# gencode 25
metadata_pc_g25 = pd.read_csv('tmp_res/metadata_pc_g25.txt', sep='\t')


In [26]:
nterm_df = pd.read_csv('data/PMID28747677_nterm.txt', sep='\t', header=None)

nterm_df.columns = ['chr', 'start', 'strand', 'tr_id1', 
                   'uniprot_id', 'pos_to_CDS', 'aa_seq', 'nt_seq']

print (nterm_df.shape[0], nterm_df.tr_id1.nunique())

nterm_df[0:4]

534 394


Unnamed: 0,chr,start,strand,tr_id1,uniprot_id,pos_to_CDS,aa_seq,nt_seq
0,chr17,1400113,-1,ENST00000571732,P62258-2,-102,AMDDREDLVYQAKLAEQAER,GCTATGGATGATCGAGAGGATCTGGTGTACCAGGCGAAGCTGGCCG...
1,chr17,1400113,-1,ENST00000573026,I3L0W5,-3,AMDDREDLVYQAKLAEQAER,GCTATGGATGATCGAGAGGATCTGGTGTACCAGGCGAAGCTGGCCG...
2,chr17,1400113,-1,ENST00000264335,P62258,-3,AMDDREDLVYQAKLAEQAER,GCTATGGATGATCGAGAGGATCTGGTGTACCAGGCGAAGCTGGCCG...
3,chr17,1400113,-1,ENST00000573196,B4DJF2,-3,AMDDREDLVYQAKLAEQAER,GCTATGGATGATCGAGAGGATCTGGTGTACCAGGCGAAGCTGGCCG...


In [28]:
metadata_pc_g25['tr_id1'] = [x.split('.')[0] for x in metadata_pc_g25['tr_id'].tolist()]

nterm_df2 = nterm_df.merge(metadata_pc_g25[['tr_id1', 'gene']], on='tr_id1', how='left')

In [29]:
nterm_df2.shape[0], nterm_df2.drop_duplicates().shape[0]

(534, 534)

In [47]:
nterm_genes = list(nterm_df2[nterm_df2['pos_to_CDS'] <= -3].gene.unique())

print (len(nterm_genes))

171


In [55]:
# phyloset 

print (len(set(SET1.gene.tolist()).intersection(nterm_genes))), set(SET1.gene.tolist()).intersection(nterm_genes)

1


(None, {'ENY2'})

In [46]:
# riboset

print (len(set(SET2.gene.tolist()).intersection(nterm_genes)))

17


In [52]:
nterm_df2

Unnamed: 0,chr,start,strand,tr_id1,uniprot_id,pos_to_CDS,aa_seq,nt_seq,gene
0,chr17,1400113,-1,ENST00000571732,P62258-2,-102,AMDDREDLVYQAKLAEQAER,GCTATGGATGATCGAGAGGATCTGGTGTACCAGGCGAAGCTGGCCG...,YWHAE
1,chr17,1400113,-1,ENST00000573026,I3L0W5,-3,AMDDREDLVYQAKLAEQAER,GCTATGGATGATCGAGAGGATCTGGTGTACCAGGCGAAGCTGGCCG...,YWHAE
2,chr17,1400113,-1,ENST00000264335,P62258,-3,AMDDREDLVYQAKLAEQAER,GCTATGGATGATCGAGAGGATCTGGTGTACCAGGCGAAGCTGGCCG...,YWHAE
3,chr17,1400113,-1,ENST00000573196,B4DJF2,-3,AMDDREDLVYQAKLAEQAER,GCTATGGATGATCGAGAGGATCTGGTGTACCAGGCGAAGCTGGCCG...,YWHAE
4,chr17,1400113,-1,ENST00000575977,I3L3T1,-3,AMDDREDLVYQAKLAEQAER,GCTATGGATGATCGAGAGGATCTGGTGTACCAGGCGAAGCTGGCCG...,YWHAE
...,...,...,...,...,...,...,...,...,...
529,chrX,102463273,-1,ENST00000618881,Q5JRM6,-1668,LTHLAPPAAWRVHFQAGQTLLMDRIRLQELAFQLHQL,CTGACTCACCTAGCTCCTCCTGCAGCATGGCGTGTTCACTTCCAAG...,NXF2B
530,chrX,155065142,-1,ENST00000369479,P56277,-216,AIEPLMDIIGWTWDLMNPTKVSTFYRTDALITKDLVLI,GCCATAGAACCCTTAATGGACATCATCGGCTGGACCTGGGATCTGA...,CMC4
531,chrX,21374865,1,ENST00000279451,Q8WXI2-2,-33,LCALHGTDPVPMALIMEPVSKWSPSQVVD,CTCTGCGCTCTGCACGGAACCGACCCCGTACCCATGGCTCTGATAA...,CNKSR2
532,chrX,21374865,1,ENST00000425654,Q8WXI2-5,-33,LCALHGTDPVPMALIMEPVSKWSPSQVVD,CTCTGCGCTCTGCACGGAACCGACCCCGTACCCATGGCTCTGATAA...,CNKSR2


In [57]:
nterm_df2[nterm_df2['gene'].isin(SET1.gene.tolist()+SET2.gene.tolist())][['tr_id1', 'pos_to_CDS', 'gene', 'nt_seq']].to_csv('tmp_res/comparison_with_PMID28747677.txt', sep='\t', index=False)