In [1]:
%matplotlib inline

import numpy as np
import collections
from collections import OrderedDict, Counter, defaultdict
import pandas as pd

import Bio
from Bio import SeqIO

import seaborn as sns
import matplotlib.pyplot as plt
from bokeh.plotting import figure, output_file, show 
import matplotlib.patches as patches    
from matplotlib import colors


import glob

import subprocess
from subprocess import call

import re

import pickle

from Bio.SubsMat import MatrixInfo
import Bio.Data.CodonTable

from scipy import stats


from Bio.SubsMat import MatrixInfo
import Bio.Data.CodonTable

import joypy

  import pandas.util.testing as tm


In [2]:
# Metadata table with scores and metrics
meta = pd.read_csv('tmp_res/METADATA_CURRENT.txt', sep='\t')

# gencode 25 metadata
metadata_pc_g25 = pd.read_csv('tmp_res/metadata_pc_g25.txt', sep='\t')

# The two transcript sets; only R_Set gets the parsed Ribo-seq columns below
P_Set = pd.read_csv('tmp_res/SET1.txt', sep='\t')
R_Set = pd.read_csv('tmp_res/SET2.txt', sep='\t')

# Riboseq_Summary looks like 'ATC; 98-152; Rank: 288; cov: 53.85':
# field 0 is the codon, field 1 is a 'start-end' range whose start is stored minus one.
riboseq_fields = [summary.split('; ') for summary in R_Set['Riboseq_Summary'].tolist()]
R_Set['pos_of_codon'] = [int(fields[1].split('-')[0]) - 1 for fields in riboseq_fields]
R_Set['codon'] = [fields[0] for fields in riboseq_fields]

R_Set.head(2)

Unnamed: 0,tr_id,gene,N_term_start1,N_term_end1,len_codons,global_coo_50_and_less,strand,global_coo_primary,PhyloCSF120score,number_of_records,ovlp,Riboseq_Summary,Coverage_value_ext,Proteomics_count_ext,tag,transcript_type,CDS_ratio,pos_of_codon,codon
0,ENST00000379389.4,ISG15,1,151,50.0,chr1:1013424-1013573,+,chr1:1013424-1013573,-2443.9971,111,0;0;0,ATC; 98-152; Rank: 288; cov: 53.85,53.846154,0.0,"basic,appris_principal_1,CCDS",protein_coding,0.32724,97,ATC
1,ENST00000349431.10,UBE2J2,28,220,64.0,chr1:1273666-1273815,-,chr1:1273666-1273857,-1285.8186,78,0;0;0,ACG; 173-221; Rank: 328; cov: 90.91,90.909091,0.0,"basic,appris_principal_1,CCDS",protein_coding,0.762814,172,ACG


# Helper functions for reading, re-indexing and slicing alignments

In [3]:
def get_aln(fasta_input):
    """Read a FASTA file into per-character rows plus the matching record ids.

    Parameters
    ----------
    fasta_input : str
        Path of the FASTA file to parse.

    Returns
    -------
    tuple (aln_col, ids_li)
        ``aln_col`` holds one list of single-character strings per record;
        ``ids_li`` holds the record ids as strings, in the same order.
    """
    with open(fasta_input, 'r') as fasta_handle:
        parsed = [(str(rec.id), str(rec.seq)) for rec in SeqIO.parse(fasta_handle, "fasta")]

    ids_li = [rec_id for rec_id, _ in parsed]
    # list(str) splits a sequence into its individual characters
    aln_col = [list(rec_seq) for _, rec_seq in parsed]
    return aln_col, ids_li

def re_index_with_gaps(aln_col):
    """Label every alignment column with a position in the reference sequence.

    The reference is the first row of ``aln_col``. Each non-gap character of
    that row receives the next 1-based position. A gap column ('-' or '.')
    repeats the position of the nearest reference character to its left, or 0
    when no reference character has been seen yet.

    Parameters
    ----------
    aln_col : sequence of rows
        Alignment rows (each a sequence of single-character strings), with the
        reference row first.

    Returns
    -------
    list of int
        One index per column of the reference row; ``[]`` for an empty alignment.
    """
    if len(aln_col) == 0:
        return []

    new_col_ind = []
    counter_pos = 0  # reference characters seen so far (gaps are not counted)

    # Only the reference row is needed, so read it directly rather than
    # converting the whole alignment into a DataFrame first.
    for col in aln_col[0]:
        if col not in ('-', '.'):
            counter_pos += 1
        new_col_ind.append(counter_pos)

    return new_col_ind


def slice_aln(aln_col, new_col_ind, a, b):
    """Keep only the alignment columns whose index lies in the closed range [a, b].

    Parameters
    ----------
    aln_col : list of rows
        Alignment rows, each a list of single-character strings.
    new_col_ind : list of int
        Per-column index, paired position by position with every row.
    a, b : int
        Inclusive lower and upper bounds on the column index.

    Returns
    -------
    list of list
        One list per alignment row containing the selected characters.
    """
    alignment_rows = pd.DataFrame(aln_col).to_numpy()
    return [
        [residue for ref_pos, residue in zip(new_col_ind, aligned_row) if a <= ref_pos <= b]
        for aligned_row in alignment_rows
    ]


In [4]:
# Conservation of each start codon across the sequences of its alignment.
li = []   # one summary record per transcript; turned into conservation_df afterwards

kek = {}  # tr_id -> [codon slice, full alignment, TIS slice, TIS +/- 15 slice, sequence ids]

# ATG plus its nine single-nucleotide variants (near-cognate start codons)
non_cognates = ['ATG', 'CTG', 'TTG', 'GTG', 'ATC', 'ATT', 'ATA', 'AGG', 'AAG', 'ACG']

for i, el in enumerate(R_Set[['tr_id', 'gene', 'pos_of_codon']].merge(meta[['tr_id', 'N_term_start1', 'global_coo_primary']], on='tr_id', how='inner').to_numpy()):
    # merged column order: tr_id, gene, pos_of_codon, N_term_start1, global_coo_primary
    tr_id = el[0]
    chrom = el[-1].split(':')[0]
    nte_start = el[-2]
    pos_of_codon = el[2]

    # first and last reference index (re_index_with_gaps numbering) of the codon
    a = pos_of_codon - nte_start + 1
    b = a + 2

    fasta_input = 'data/nte_full_aln/out/%s/fasta3/%s.fasta' % (chrom, tr_id)

    aln_col, ids_li = get_aln(fasta_input)
    new_col_ind = re_index_with_gaps(aln_col)
    sliced_array = slice_aln(aln_col, new_col_ind, a, b)

    # wider windows: codon start +/- 4 positions, and that window padded by 15 on each side
    a1 = a - 4
    b1 = a + 4
    sliced_array_TIS = slice_aln(aln_col, new_col_ind, a1, b1)
    sliced_array_TIS_and_surroundings = slice_aln(aln_col, new_col_ind, a1-15, b1+15)

    number_of_species = len(aln_col)

    # gap-stripped codon of every aligned sequence, in the same order as ids_li;
    # row 0 is the reference sequence
    codons = [''.join(row).replace('-', '') for row in sliced_array]
    ref_codon = codons[0]

    # fraction of sequences carrying exactly the reference codon
    ref_codon_count = codons.count(ref_codon)

    # count of all near-cognate codons + ATG
    non_cognates_codon_count = sum(codons.count(codon) for codon in non_cognates)

    ref_codon_fraction = ref_codon_count / number_of_species
    near_cognate_codong_fraction = non_cognates_codon_count / number_of_species

    counter_codon = Counter(codons)

    if i % 10 == 0:
        print (i)

    kek[tr_id] = [sliced_array, aln_col,sliced_array_TIS,sliced_array_TIS_and_surroundings,ids_li]

    # Pair every non-reference sequence with its OWN codon. ids_li[0] and codons[0]
    # both belong to the reference, so both lists must be offset by one; zipping
    # ids_li[1:] with the unsliced codons shifted every codon onto the next species.
    dict_animals = dict(zip(ids_li[1:], codons[1:]))

    li.append([tr_id, el[2], nte_start, ref_codon, number_of_species, ref_codon_count, ref_codon_fraction, counter_codon,
              non_cognates_codon_count, near_cognate_codong_fraction, dict_animals, ids_li])

0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390


In [5]:
# Assemble the per-transcript conservation records into a table
conservation_df = pd.DataFrame(li, columns = ['tr_id', 'codon_start', 'nte_start', 'ref_codon', 
                                  'number_of_species', 'ref_codon_count', 
                                  'ref_codon_fraction', 'counter_codon', 
                                 'near_cognate_codon_count', 'near_cognate_codon_fraction',
                                             'specie_codon_dict', 'animals_li'])

# attach gene names (inner join: transcripts missing from metadata_pc_g25 are dropped)
conservation_df = conservation_df.merge(metadata_pc_g25[['tr_id', 'gene']], on='tr_id', how='inner')

# exclude ATG extensions
# The codon itself is stored in 'ref_codon'; 'codon_start' holds the integer
# position, so comparing it with 'ATG' never removed any row.
conservation_df = conservation_df[conservation_df['ref_codon'] != 'ATG']

print (conservation_df.shape)
conservation_df[0:2]

(395, 13)


Unnamed: 0,tr_id,codon_start,nte_start,ref_codon,number_of_species,ref_codon_count,ref_codon_fraction,counter_codon,near_cognate_codon_count,near_cognate_codon_fraction,specie_codon_dict,animals_li,gene
0,ENST00000379389.4,97,1,ATC,111,33,0.297297,"{'ATC': 33, 'TTC': 2, '': 7, 'GTCTTCGCCCGACCGG...",36,0.324324,"{'Chimp': 'ATC', 'Bonobo': 'ATC', 'Gorilla': '...","[Human, Chimp, Bonobo, Gorilla, Orangutan, Gib...",ISG15
1,ENST00000349431.10,172,28,ACG,82,58,0.707317,"{'ACG': 58, 'ACA': 1, 'AGG': 2, 'GCA': 1, 'GCG...",60,0.731707,"{'Chimp': 'ACG', 'Bonobo': 'ACG', 'Gorilla': '...","[Human, Chimp, Bonobo, Gorilla, Orangutan, Gib...",UBE2J2


In [6]:
# Preview the reference codon and per-species codon mapping for the first five rows
conservation_df.loc[:, ['tr_id', 'ref_codon', 'specie_codon_dict', 'gene']].head(5)

Unnamed: 0,tr_id,ref_codon,specie_codon_dict,gene
0,ENST00000379389.4,ATC,"{'Chimp': 'ATC', 'Bonobo': 'ATC', 'Gorilla': '...",ISG15
1,ENST00000349431.10,ACG,"{'Chimp': 'ACG', 'Bonobo': 'ACG', 'Gorilla': '...",UBE2J2
2,ENST00000338370.7,ATT,"{'Chimp': 'ATT', 'Bonobo': 'ATT', 'Gorilla': '...",AURKAIP1
3,ENST00000235310.7,GTG,"{'Chimp': 'GTG', 'Bonobo': 'GTG', 'Gorilla': '...",MAD2L2
4,ENST00000356634.7,GTG,"{'Chimp': 'GTG', 'Bonobo': 'GTG', 'Gorilla': '...",KDM1A


# multiple sequence alignment

In [14]:
# All P_Set rows annotated with the gene LRP5L
P_Set.loc[P_Set['gene'] == 'LRP5L']

Unnamed: 0,tr_id,gene,N_term_start1,N_term_end1,len_codons,global_coo_50_and_less,strand,global_coo_primary,PhyloCSF120score,number_of_records,ovlp,Riboseq_Summary,Coverage_value_ext,Proteomics_count_ext,tag,transcript_type
68,ENST00000610821.4,LRP5L,2366,2465,33.0,chr22:25360093-25360191,-,chr22:25360093-25360191,1669.2062,110,0;0;0,ACG; 2367-2466; Rank: 3720; cov: 14.29,14.285714,0.0,"basic,appris_principal_2,CCDS",protein_coding
69,ENST00000402859.6,LRP5L,425,524,33.0,chr22:25360093-25360191,-,chr22:25360093-25360191,1669.2062,110,0;0;0,,,,"basic,appris_principal_2,CCDS",protein_coding
70,ENST00000444995.7,LRP5L,602,701,33.0,chr22:25360093-25360191,-,chr22:25360093-25360191,1669.2062,110,0;0;0,,,,"basic,appris_alternative_2",protein_coding
71,ENST00000402785.2,LRP5L,1,97,32.0,chr22:25360093-25360188,-,chr22:25360093-25360188,1554.5767,110,0;0;0,,,,"basic,appris_principal_2,CCDS",protein_coding


In [None]:
# Alignment view of the theoretical LRP5L extension (the interval includes the last 3' in-frame stop codon and the annotated ATG):
# https://data.broadinstitute.org/compbio1/cav.php?controlsState=show&intervals=chr22%3A25360090-25360194&strand=-&alnset=hg38_241mammals&spliceSites=human

In [7]:
# Unpack the alignment views stored for one transcript, in the order they were
# saved in kek: codon slice, full alignment, TIS slice, TIS with surroundings, ids.
# NOTE(review): the preceding cells discuss LRP5L, but this transcript id is not
# one of the LRP5L ids shown there; confirm it is the intended transcript.
(codon_aln,
 full_aln,
 tis_aln,
 sliced_array_TIS_and_surroundings,
 ids_li) = kek['ENST00000357214.5']

In [9]:
# Show the full alignment as a table: one row per sequence, one column per alignment position
pd.DataFrame(data=full_aln)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,142,143,144,145,146,147,148,149,150,151
0,G,C,C,G,C,C,T,G,T,G,...,-,-,-,-,-,-,-,-,-,-
1,G,C,C,G,C,C,T,G,T,G,...,-,-,-,-,-,-,-,-,-,-
2,G,C,C,G,C,C,T,G,T,G,...,-,-,-,-,-,-,-,-,-,-
3,G,C,C,G,C,C,T,G,T,G,...,-,-,-,-,-,-,-,-,-,-
4,G,C,C,G,C,C,T,G,T,G,...,-,-,-,-,-,-,-,-,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110,G,C,C,G,C,C,T,G,T,G,...,-,-,-,-,-,-,-,-,-,-
111,G,C,C,G,C,C,T,G,T,G,...,-,-,-,-,-,-,-,-,-,-
112,G,G,G,G,C,T,T,G,T,G,...,-,-,-,-,-,-,-,-,-,-
113,G,G,T,G,C,T,T,G,T,G,...,-,-,-,-,-,-,-,-,-,-
