In [22]:
%matplotlib inline

import numpy as np
import collections
from collections import OrderedDict, Counter, defaultdict
import pandas as pd

import Bio
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqUtils import molecular_weight
from Bio.Alphabet import generic_dna, generic_rna, generic_protein

import seaborn as sns
import matplotlib.pyplot as plt

import glob

import subprocess
from subprocess import call

import re

import pickle

import sys                                                                         
from gffutils.iterators import DataIterator  
from gtfparse import read_gtf

In [23]:
# gencode 25 metadata 
metadata_pc_g25 = pd.read_csv('tmp_res/metadata_pc_g25.txt', sep='\t')

# gencode 35 metadata 
metadata_pc_g35 = pd.read_csv('tmp_res/metadata_pc_g35.txt', sep='\t')

# open metadata with scores and metrics
meta = pd.read_csv('tmp_res/local_and_global_df_g25ovlp_g35ovlp_g38ovlp_refseqovlp_strand_FRAMES_PhyloCSF.txt', sep='\t')

In [24]:
PhyloSET = pd.read_csv('tmp_res/SET1.txt', sep='\t')

RiboSET = pd.read_csv('tmp_res/SET2.txt', sep='\t')

In [25]:
PhyloSET[['tr_id', 'gene', 'global_coo_primary', 'strand']].to_csv('../../Isabel_Sesaldo/PhyloSET_genes.txt', sep='\t', index=False)

# RiboSET: start-codon local coordinates, all NTE-coordinates 

In [26]:
RiboSET['start_codon_local_coo_start'] = [int(x.split(';')[1].split('-')[0]) for x in RiboSET['Riboseq_Summary'].tolist()]

RiboSET['start_codon'] = [x.split(';')[0] for x in RiboSET['Riboseq_Summary'].tolist()]

In [27]:
tmp = RiboSET[['tr_id', 'gene', 'start_codon_local_coo_start', 'start_codon', 'N_term_end1']]

tmp['start_codon_local_coo_stop'] = tmp['start_codon_local_coo_start'] + 3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [28]:
tmp.to_csv('tmp_res/RiboSET_local_coo_pred_start.txt', sep='\t', index=False)

# Global coo 

In [29]:
def prepare_global_coo(path_to_file_with_global_coo, colname):
    global_coo_g25 = pd.read_csv(path_to_file_with_global_coo, sep='\t')
    global_coo_g25_f = global_coo_g25[global_coo_g25['hit'] == True]
    global_coo_g25_f = global_coo_g25_f.sort_values(by=['seqnames', 'start'])
    global_coo_g25_f['global_coo'] = global_coo_g25_f['seqnames']+':'+global_coo_g25_f['start'].astype(str) +'-'+global_coo_g25_f['end'].astype(str)
    global_coo_g25_f = global_coo_g25_f[['group_name', 'strand', 'global_coo']].groupby(['group_name', 'strand']).agg('+'.join).reset_index()
    global_coo_g25_f.columns = ['tr_id', 'strand', colname]
    return global_coo_g25_f

In [30]:
start_codon_global = prepare_global_coo(path_to_file_with_global_coo='tmp_res/RiboSET_global_coo_pred_start.txt',
                                   colname = 'global_coo_start_codon')

ext_global = prepare_global_coo(path_to_file_with_global_coo='tmp_res/RiboSET_global_coo_pred_start_ext.txt',
                                   colname = 'global_coo_ext')


In [31]:
start_codon_global[0:5]

Unnamed: 0,tr_id,strand,global_coo_start_codon
0,ENST00000014914.5,+,chr12:12891636-12891638
1,ENST00000054950.3,+,chr11:32091140-32091142
2,ENST00000166345.7,+,chr5:892927-892929
3,ENST00000176643.10,+,chr17:19648849-19648851
4,ENST00000203001.6,-,chr20:5950487-5950489


In [32]:
ext_global[0:5]

Unnamed: 0,tr_id,strand,global_coo_ext
0,ENST00000014914.5,+,chr12:12891636-12891664+chr12:12908243-12908249
1,ENST00000054950.3,+,chr11:32091140-32091196
2,ENST00000166345.7,+,chr5:892927-892998
3,ENST00000176643.10,+,chr17:19648849-19648971
4,ENST00000203001.6,-,chr20:5950406-5950489


In [33]:
RiboSET_coo = RiboSET.merge(ext_global[['tr_id', 'global_coo_ext']].merge(start_codon_global[['tr_id', 'global_coo_start_codon']], on='tr_id', how='inner'), on='tr_id', how='inner')

ENST00000379389.4, chr1:1013520-1013523, + 

In [34]:
tmp[tmp['tr_id'] == 'ENST00000014914.5']

Unnamed: 0,tr_id,gene,start_codon_local_coo_start,start_codon,N_term_end1,start_codon_local_coo_stop
234,ENST00000014914.5,GPRC5A,855,AAG,890,858


In [35]:
metadata_pc_g25[metadata_pc_g25['tr_id'] == 'ENST00000014914.5'].iloc[0].transcript_seq[854:890]+'_'+metadata_pc_g25[metadata_pc_g25['tr_id'] == 'ENST00000014914.5'].iloc[0].transcript_seq[890:893]

'AAGTTCACGGCCAACGCCTTGGCACTAGGGTCCAGA_ATG'

In [36]:
RiboSET_coo.columns

Index(['tr_id', 'gene', 'N_term_start1', 'N_term_end1', 'len_codons',
       'global_coo_50_and_less', 'strand', 'global_coo_primary',
       'PhyloCSF120score', 'number_of_records', 'ovlp', 'Riboseq_Summary',
       'Coverage_value_ext', 'Proteomics_count_ext', 'tag', 'transcript_type',
       'CDS_ratio', 'start_codon_local_coo_start', 'start_codon',
       'global_coo_ext', 'global_coo_start_codon'],
      dtype='object')

In [37]:
#RiboSET_coo[['tr_id', 'gene', 'global_coo_ext', 'global_coo_start_codon', 'strand', 'Riboseq_Summary']].to_csv('../../MYSTERIOUS_UNDEGRAD_PROJECT/NTE_riboseq_predictions.txt', sep='\t', index=False)

In [38]:
RiboSET_coo[['tr_id', 'gene', 'global_coo_ext', 'global_coo_start_codon', 'strand', 'Riboseq_Summary']][0:2]

Unnamed: 0,tr_id,gene,global_coo_ext,global_coo_start_codon,strand,Riboseq_Summary
0,ENST00000379389.4,ISG15,chr1:1013520-1013573,chr1:1013520-1013522,+,ATC; 98-152; Rank: 288; cov: 53.85
1,ENST00000349431.10,UBE2J2,chr1:1273666-1273713,chr1:1273711-1273713,-,ACG; 173-221; Rank: 328; cov: 90.91


In [39]:
RiboSET_coo[RiboSET_coo['gene'] == 'CCDC8'][['tr_id', 'gene', 'global_coo_ext', 'global_coo_start_codon', 'strand', 'Riboseq_Summary']]

Unnamed: 0,tr_id,gene,global_coo_ext,global_coo_start_codon,strand,Riboseq_Summary
346,ENST00000307522.3,CCDC8,chr19:46412811-46413026,chr19:46413024-46413026,-,ATC; 559-775; Rank: 400; cov: 79.1


In [40]:
 0.05/2768

1.8063583815028903e-05

In [41]:
2768 
2

2768! / (2768-2)! * 2!

SyntaxError: invalid syntax (<ipython-input-41-a9868954c0f9>, line 4)

In [None]:
1*2*...*2766*2767*2768 / (1*2*....*2768) * 2 =


In [None]:
2767*2768/2

In [None]:
0.5/3829528

In [None]:
0.5/2768

-------

# Parse Clinvar DB

In [None]:
clinvar = pd.read_csv('data/CLinvar/clinvar.vcf', sep='\t', comment='#', header=None)

clinvar['chr'] = 'chr'+clinvar[0].astype(str)

clinvar.columns = ['chr_', 'start_pos', 'clinvar_ID', 'ref', 'alt', 'qual', 'filter', 'info', 'chr']

# filter normal chromosomes 
clinvar_f = clinvar[clinvar['chr'].isin(['chr'+str(i) for i in range(1, 23)]+['chrY', 'chrX', 'chrM'])]

print (clinvar.shape, clinvar_f.shape)

clinvar_f = clinvar_f[clinvar_f['info'].str.contains('CLNSIG=')]

print (clinvar.shape, clinvar_f.shape)

In [None]:
clinvar_f['Clin_sign'] = [x.split('CLNSIG=')[1].split(';')[0] for x in clinvar_f['info'].tolist()]

clinvar_f['location_type'] = [x.split('|')[1].split(';')[0].split(',')[0] if x.find('|') != -1 else None for x in clinvar_f['info'].tolist()]

clinvar_f['review_status'] = [x.split('CLNREVSTAT')[1].split(';')[0] if x.find('|') != -1 else None for x in clinvar_f['info'].tolist()]

#review_status_filter = ['=criteria_provided,_single_submitter', '=criteria_provided,_multiple_submitters,_no_conflicts', 
#                       '=reviewed_by_expert_panel', '=practice_guideline']

# Let's overlap full db and then choose what we have

In [None]:
clinvar['ann'] = 'ref:'+clinvar['ref']+';alt:'+clinvar['alt']+';info:'+clinvar['info']

clinvar['len_alt'] = clinvar['alt'].str.len()

clinvar['stop_pos'] = clinvar['start_pos'] + clinvar['len_alt'] - 1

clinvar[['chr', 'start_pos', 'stop_pos', 'ann']].to_csv('tmp_res/clinvarVCF.bed', sep='\t', index=False, header=None)

### check long variations

In [None]:
tmp = clinvar[['chr', 'start_pos', 'stop_pos', 'ann']]

tmp[tmp['stop_pos'] - tmp['start_pos'] > 1]

### Make bed files for extensions and start codons 

In [None]:
f = open('tmp_res/RiboSET_pred_start_codon.bed', 'w')

f2 = open('tmp_res/RiboSET_pred_ext.bed', 'w')

for el in RiboSET_coo.to_numpy():
    tr_id = el[0]
    start_codon_coo = el[-1].split('+')
    ext_coo = el[-2].split('+')
    
    for coo in start_codon_coo:
        f.write(coo.replace(':', '\t').replace('-', '\t')+'\t'+tr_id+'\n')
    
    for coo in ext_coo:
        f2.write(coo.replace(':', '\t').replace('-', '\t')+'\t'+tr_id+'\n')
    
f.close()
f2.close()

In [None]:
! bedtools intersect -wao -a tmp_res/RiboSET_pred_start_codon.bed -b tmp_res/clinvarVCF.bed > tmp_res/RiboSET_pred_start_codon_clinvar.bed

----------

### Parse bedtools ovlp file: START CODON VARIATION

In [42]:
res_start = pd.read_csv('tmp_res/RiboSET_pred_start_codon_clinvar.bed', sep='\t', header=None)

res_start.columns = ['chr_codon', 'start_codon', 'stop_codon', 'tr_id', 'chr_var', 'start_var', 'stop_var',
                    'ann_var', '-1']

In [43]:
res_start[res_start['start_var'] != -1].to_csv('Supplementary_tables_and_plots/tables/RSET_start_ClinVar.txt', sep='\t', index=False)
res_start[res_start['start_var'] != -1]

Unnamed: 0,chr_codon,start_codon,stop_codon,tr_id,chr_var,start_var,stop_var,ann_var,-1
11,chr1,32817272,32817274,ENST00000373477.8,chr1,32817272,32817272,ref:C;alt:T;info:ALLELEID=365223;CLNDISDB=MedG...,-1
37,chr1,169485878,169485880,ENST00000236137.9,chr1,169485880,169485880,ref:G;alt:A;info:AF_TGP=0.00719;ALLELEID=27713...,-1
114,chr5,80654847,80654849,ENST00000439211.6,chr5,80654848,80654848,ref:A;alt:C;info:ALLELEID=1043665;CLNDISDB=Med...,0
115,chr5,80654847,80654849,ENST00000439211.6,chr5,80654848,80654848,ref:A;alt:G;info:ALLELEID=634061;CLNDISDB=MedG...,0
116,chr5,80654847,80654849,ENST00000439211.6,chr5,80654849,80654849,ref:C;alt:A;info:ALLELEID=634062;CLNDISDB=MOND...,-1
117,chr5,80654847,80654849,ENST00000439211.6,chr5,80654849,80654849,ref:C;alt:G;info:ALLELEID=634063;CLNDISDB=MedG...,-1
118,chr5,80654847,80654849,ENST00000439211.6,chr5,80654849,80654849,ref:C;alt:T;info:AF_EXAC=0.00001;ALLELEID=6340...,-1
276,chr16,14630192,14630194,ENST00000437198.6,chr16,14630192,14630192,ref:A;alt:G;info:AF_TGP=0.00060;ALLELEID=62453...,-1


In [44]:
for el in res_start.to_numpy():
    info = el[-2]
    tr_id = el[3]
    if info.find('CLNSIG=') != -1:
        Cl_sign = info.split('CLNSIG=')[1].split(';')[0]
        
        #if (Cl_sign.find('Pathogenic') != -1) | (Cl_sign.find('pathogen') != -1):
        print (tr_id, Cl_sign, info.split(';')[0], info.split(';')[1])
        print (info)
        print ()

ENST00000373477.8 Likely_benign ref:C alt:T
ref:C;alt:T;info:ALLELEID=365223;CLNDISDB=MedGen:CN169374;CLNDN=not_specified;CLNHGVS=NC_000001.11:g.32817272C>T;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Likely_benign;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;GENEINFO=YARS1:8565|S100PBP:64766;ORIGIN=1;RS=1057524574

ENST00000236137.9 Conflicting_interpretations_of_pathogenicity ref:G alt:A
ref:G;alt:A;info:AF_TGP=0.00719;ALLELEID=277134;CLNDISDB=Human_Phenotype_Ontology:HP:0004860,MedGen:C0271972|MONDO:MONDO:0009575,MedGen:C0342287,OMIM:249270,Orphanet:ORPHA49827,SNOMED_CT:237617006;CLNDN=Thiamine-responsive_megaloblastic_anemia|Megaloblastic_anemia,_thiamine-responsive,_with_diabetes_mellitus_and_sensorineural_deafness;CLNHGVS=NC_000001.11:g.169485880G>A;CLNREVSTAT=criteria_provided,_conflicting_interpretations;CLNSIG=Conflicting_interpretations_of_pathogenicity;CLNSIGCONF=Likely_benign(1),Uncertain_significance(1);CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;CLN

SLC19A2, ENST00000236137.9 there is Conflicting_interpretations_of_pathogenicity, Thiamine-responsive_megaloblastic_anemia|Megaloblastic_anemia,_thiamine-responsive,_with_diabetes_mellitus_and_sensorineural_deafness

In [45]:
RiboSET[RiboSET['tr_id'] == 'ENST00000236137.9']

Unnamed: 0,tr_id,gene,N_term_start1,N_term_end1,len_codons,global_coo_50_and_less,strand,global_coo_primary,PhyloCSF120score,number_of_records,ovlp,Riboseq_Summary,Coverage_value_ext,Proteomics_count_ext,tag,transcript_type,CDS_ratio,start_codon_local_coo_start,start_codon
37,ENST00000236137.9,SLC19A2,78,237,53.0,chr1:169485767-169485916,-,chr1:169485767-169485925,-2215.7487,99,0;0;0,CTG; 124-238; Rank: 464; cov: 39.39,39.393939,0.0,"basic,appris_principal_1,CCDS",protein_coding,0.739155,124,CTG


In [46]:
# if strand = '-' => if ovlp at the end e.g. codon ATT [14630192, 14630194]
# and var = [14630192] => var at the 3rd letter, T

In [47]:
res_start[res_start['-1'] != 0].iloc[2].ann_var, res_start[res_start['-1'] != 0].iloc[2].tr_id

('ref:C;alt:A;info:ALLELEID=634062;CLNDISDB=MONDO:MONDO:0015356,MedGen:C0027672,Orphanet:ORPHA140162,SNOMED_CT:699346009|MedGen:CN517202;CLNDN=Hereditary_cancer-predisposing_syndrome|not_provided;CLNHGVS=NC_000005.10:g.80654849C>A;CLNREVSTAT=criteria_provided,_multiple_submitters,_no_conflicts;CLNSIG=Uncertain_significance;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;GENEINFO=DHFR:1719|MSH3:4437;MC=SO:0001583|missense_variant,SO:0001619|non-coding_transcript_variant,SO:0001623|5_prime_UTR_variant;ORIGIN=1;RS=763928973',
 'ENST00000439211.6')

In [48]:
print (RiboSET_coo[RiboSET_coo['tr_id'] == 'ENST00000439211.6'].iloc[0].global_coo_ext)
print (RiboSET_coo[RiboSET_coo['tr_id'] == 'ENST00000439211.6'].iloc[0].strand)

chr5:80654490-80654849
-


In [49]:
RiboSET_coo[RiboSET_coo['tr_id'] == 'ENST00000439211.6']

Unnamed: 0,tr_id,gene,N_term_start1,N_term_end1,len_codons,global_coo_50_and_less,strand,global_coo_primary,PhyloCSF120score,number_of_records,...,Riboseq_Summary,Coverage_value_ext,Proteomics_count_ext,tag,transcript_type,CDS_ratio,start_codon_local_coo_start,start_codon,global_coo_ext,global_coo_start_codon
114,ENST00000439211.6,DHFR,2,494,164.0,chr5:80654490-80654639,-,chr5:80654490-80654981,-2881.5796,107,...,GTG; 135-495; Rank: 291; cov: 73.91,73.913043,0.0,"basic,appris_principal_1,CCDS",protein_coding,1.090317,135,GTG,chr5:80654490-80654849,chr5:80654847-80654849


# Let's check all variants in extensions then.... 

In [50]:
! bedtools intersect -wao -a tmp_res/RiboSET_pred_ext.bed -b tmp_res/clinvarVCF.bed > tmp_res/RiboSET_pred_ext_clinvar.bed

In [51]:
res_ext = pd.read_csv('tmp_res/RiboSET_pred_ext_clinvar.bed', sep='\t', header=None)

res_ext.columns = ['chr_ext', 'start_ext', 'stop_ext', 'tr_id', 'chr_var', 'start_var', 'stop_var',
                    'ann_var', '-1']

res_ext[res_ext['start_var'] != -1][['chr_ext', 'start_ext', 'stop_ext', 'tr_id', 'chr_var', 'start_var', 'stop_var',
                    'ann_var']].to_csv('Supplementary_tables_and_plots/tables/RSET_ext_ClinVar.txt', sep='\t', index=False)

res_ext[res_ext['start_var'] != -1][['chr_ext', 'start_ext', 'stop_ext', 'tr_id', 'chr_var', 'start_var', 'stop_var',
                    'ann_var']]

Unnamed: 0,chr_ext,start_ext,stop_ext,tr_id,chr_var,start_var,stop_var,ann_var
15,chr1,32817245,32817274,ENST00000373477.8,chr1,32817272,32817272,ref:C;alt:T;info:ALLELEID=365223;CLNDISDB=MedG...
24,chr1,55039628,55039837,ENST00000302118.5,chr1,55039638,55039638,ref:C;alt:A;info:ALLELEID=282656;CLNDISDB=MOND...
25,chr1,55039628,55039837,ENST00000302118.5,chr1,55039653,55039653,ref:A;alt:G;info:ALLELEID=282659;CLNDISDB=MOND...
26,chr1,55039628,55039837,ENST00000302118.5,chr1,55039658,55039658,ref:T;alt:G;info:ALLELEID=281414;CLNDISDB=MOND...
27,chr1,55039628,55039837,ENST00000302118.5,chr1,55039670,55039670,ref:C;alt:A;info:ALLELEID=434133;CLNDISDB=MOND...
...,...,...,...,...,...,...,...,...
634,chrX,120469170,120469229,ENST00000371335.4,chrX,120469183,120469183,ref:AGGCGGCGAC;alt:A;info:ALLELEID=45103;CLNDI...
635,chrX,120469170,120469229,ENST00000371335.4,chrX,120469184,120469184,ref:GGCGGCGAC;alt:G;info:ALLELEID=549818;CLNDI...
636,chrX,120469170,120469229,ENST00000371335.4,chrX,120469196,120469196,ref:G;alt:A;info:AF_EXAC=0.00005;ALLELEID=1417...
637,chrX,120469170,120469229,ENST00000371335.4,chrX,120469202,120469202,ref:G;alt:A;info:AF_ESP=0.00085;AF_EXAC=0.0002...


In [52]:
for el in res_ext.to_numpy():
    info = el[-2]
    tr_id = el[3]
    if info.find('CLNSIG=') != -1:
        Cl_sign = info.split('CLNSIG=')[1].split(';')[0]
        
        if (Cl_sign.find('Pathogenic') != -1) | (Cl_sign.find('pathogen') != -1):
            print (tr_id, RiboSET[RiboSET['tr_id'] == tr_id].iloc[0].gene, Cl_sign, info.split(';')[0], info.split(';')[1])

ENST00000302118.5 PCSK9 Conflicting_interpretations_of_pathogenicity ref:C alt:T
ENST00000302118.5 PCSK9 Conflicting_interpretations_of_pathogenicity ref:T alt:C
ENST00000236137.9 SLC19A2 Conflicting_interpretations_of_pathogenicity ref:G alt:C
ENST00000236137.9 SLC19A2 Conflicting_interpretations_of_pathogenicity ref:G alt:A
ENST00000325404.2 SOX2 Pathogenic ref:GGCCGGGCCCGCGCACAGCGCCCGCATGTACAACATGATGGAGACGGAGCTGAAGCC alt:G
ENST00000439211.6 DHFR Pathogenic ref:GA alt:G
ENST00000439211.6 DHFR Pathogenic ref:C alt:T
ENST00000248553.6 HSPB1 Conflicting_interpretations_of_pathogenicity ref:G alt:A
ENST00000248553.6 HSPB1 Conflicting_interpretations_of_pathogenicity ref:C alt:T
ENST00000369085.7 BAG3 Conflicting_interpretations_of_pathogenicity ref:G alt:A
ENST00000369085.7 BAG3 Conflicting_interpretations_of_pathogenicity ref:G alt:A
ENST00000369085.7 BAG3 Conflicting_interpretations_of_pathogenicity ref:C alt:T
ENST00000322344.7 PNKP Conflicting_interpretations_of_pathogenicity ref:A a

In [53]:
PCSK9 (2 SNPs), SLC19A2 (2 SNPs), HSPB1 (2 SNPs), BAG3 (3 SNPs), PNKP (1 SNP), LAMP2 (2 SNPs)

SOX2 (deletion of 56nt), DHFR (1 SNP, 1 deletion of 1nt)

SyntaxError: invalid syntax (<ipython-input-53-20c73fa68843>, line 1)

In [None]:
len('GGCCGGGCCCGCGCACAGCGCCCGCATGTACAACATGATGGAGACGGAGCTGAAGCC')

In [None]:
res_ext[(res_ext['tr_id'] == 'ENST00000369085.7') & (res_ext['ann_var'].str.contains('pathogen'))].iloc[2]

# Check whether variants impact Kozak context or create ATG or disrupt STOp etc

## SOX2: pathogenic variant, large deletion 

In [None]:
print (RiboSET_coo[RiboSET_coo['gene'] == 'SOX2'].iloc[0].Riboseq_Summary)

print (RiboSET_coo[RiboSET_coo['gene'] == 'SOX2'].iloc[0].global_coo_start_codon)

print (RiboSET_coo[RiboSET_coo['gene'] == 'SOX2'].iloc[0].global_coo_ext)

print ('del start: chr3:181712336')

# this del probably cut part of potential extension
# 

In [None]:
res_ext[(res_ext['tr_id'] == 'ENST00000325404.2')].iloc[1].ann_var

In [None]:
len('GGCCGGGCCCGCGCACAGCGCCCGCATGTACAACATGATGGAGACGGAGCTGAAGCC')

# 57nt -> 1nt, let's check sequences 

In [None]:
tr_seq = metadata_pc_g25[metadata_pc_g25['tr_id'] == 'ENST00000325404.2'].iloc[0].transcript_seq

N_term_start1 = RiboSET_coo[RiboSET_coo['gene'] == 'SOX2'].iloc[0].start_codon_local_coo_start-1

N_term_end1 = RiboSET_coo[RiboSET_coo['gene'] == 'SOX2'].iloc[0].N_term_end1 

ext_seq = tr_seq[N_term_start1:N_term_end1+3]

print (len(ext_seq) % 3)

ext_seq

In [None]:
ext_seq.find('G GCC GGG CCC GCG CAC AGC GCC CGC ATG TAC AAC ATG ATG GAG ACG GAG CTG AAG CC')

deletion in SOX2 affect 3' part of the predicted extension (8 codons) and 5'part of CDS (11 codons) 
thus completely removing cdsATG. The absence of this protein leads to a 
Anophthalmia/microphthalmia esophageal atresia syndrome which is a rare disorder characterized by 
abnormal development of the eyes and other parts of the body.


## BAG3: 3 variants 

In [None]:
gene = 'BAG3'
tr_id = RiboSET_coo[RiboSET_coo['gene'] == gene].iloc[0].tr_id


In [None]:
print (RiboSET_coo[RiboSET_coo['gene'] == gene].iloc[0].Riboseq_Summary)
print ()

print ('start codon coo', RiboSET_coo[RiboSET_coo['gene'] == gene].iloc[0].global_coo_start_codon)
print ()

print ('ext coo', RiboSET_coo[RiboSET_coo['gene'] == gene].iloc[0].global_coo_ext)

print ()
print (RiboSET_coo[RiboSET_coo['gene'] == gene].iloc[0].Riboseq_Summary)

In [None]:
res_ext[(res_ext['tr_id'] == tr_id) & (res_ext['ann_var'].str.contains('Conflicting_interpretations_of_pathogenicity'))]

In [None]:
119651658-119651415, 119651675-119651658, G->A

In [None]:
119651659-119651415, 119651675-119651659, G->A

In [None]:
119651672-119651415, 119651675-119651672, C->T

In [None]:
tr_seq = metadata_pc_g25[metadata_pc_g25['tr_id'] == tr_id].iloc[0].transcript_seq

N_term_start1 = RiboSET_coo[RiboSET_coo['gene'] == gene].iloc[0].start_codon_local_coo_start-1

N_term_end1 = RiboSET_coo[RiboSET_coo['gene'] == gene].iloc[0].N_term_end1 

ext_seq = tr_seq[N_term_start1:N_term_end1+3]

print (len(ext_seq) % 3)

print ()

ext_seq

In [None]:
ext_seq[242:]   #243, G->A

In [None]:
ext_seq[243:]   #244, G->A ????


In [None]:
ext_seq[256:]  #C-> T

CTGGACCAGAAGTTTCTAGCCGGCCAGTTGCTACCTCCCTTTATCTCCTCCTTCCCCTCTGGCAGCGAGGAGGCTATTTCCAGACACTTCCACCCCTCTCTGGCCACGTCACCCCCGCCTTTAATTCATAAAGGTGCCCGGCGCCGGCTTCCCGGACACGTCGGCGGCGGAGAGGGGCCCACGGCGGCGGCCCGGCCAGAGACTCGGCGCCCGGAGCCAGCGCCCCGCACC CGC GCC CCA GCG GGC AGA CCC CAA CCC AGC ATG
                    **                    *  
                    AA                    T  
                    
                GCG AGC AGA         CCC AGT ATG
                    
                GCG GAC AGA

In [None]:
all 3 SNPs in predicted extension of BAG3 gene do not create new AUGs or STOPs in any frames. 

In [None]:
(chr10:119651658, G->A),  GGC -> AGC (Gly->Ser)
(chr10:119651659, G->A), GGC -> GAC  (Gly->Asp) ??
(chr10:119651672, C->T), AGC -> AGT  (Ser->Ser)

In [None]:
; G->A   G->A   C->T

## PCSK9 - 2 variations with Conflicting_interpretations_of_pathogenicity

In [None]:
gene = 'PCSK9'
tr_id = RiboSET_coo[RiboSET_coo['gene'] == gene].iloc[0].tr_id

print (RiboSET_coo[RiboSET_coo['gene'] == gene].iloc[0].Riboseq_Summary)
print ()

print ('start codon coo', RiboSET_coo[RiboSET_coo['gene'] == gene].iloc[0].global_coo_start_codon)
print ()

print ('ext coo', RiboSET_coo[RiboSET_coo['gene'] == gene].iloc[0].global_coo_ext)

print ()
print (RiboSET_coo[RiboSET_coo['gene'] == gene].iloc[0].Riboseq_Summary) 

print ()
print (tr_id)

In [None]:
res_ext[(res_ext['tr_id'] == tr_id) & (res_ext['ann_var'].str.contains('Conflicting_interpretations_of_pathogenicity'))]

In [None]:
tr_seq = metadata_pc_g25[metadata_pc_g25['tr_id'] == tr_id].iloc[0].transcript_seq

N_term_start1 = RiboSET_coo[RiboSET_coo['gene'] == gene].iloc[0].start_codon_local_coo_start-1

N_term_end1 = RiboSET_coo[RiboSET_coo['gene'] == gene].iloc[0].N_term_end1 

ext_seq = tr_seq[N_term_start1:N_term_end1+3]

print (len(ext_seq) % 3)

print ()

ext_seq

GTG AGA CTG GCT CGG GCG GGC CGG GAC GCG TCG TTG CAG CAG CGG CTC CCA GCT CCC AGC CAG GAT TCC GCG CGC CCC TTC ACG
                                                                                             T
                                                                                        TCC GTG CGC
                                                                                         

CGC CCT GCT CCT GAA CTT CAG CTC CTG CAC AGT CCT CCC CAC CGC AAG GCT CAA GGC GCC GCC GGC GTG GAC CGC GCA CGG CCT CTA GGT CTC CTC GCC AGG ACA GCA ACC TCT CCC CTG GCC CTC ATG
                                             C
                                        CCC CCG GCC

One variant (chr1:55039698; C->T) in PCSK9 gene changes extension codon GCG to GTG (Ala -> Val) while another one (chr1:55039830) changes CTG to CCG (Leu -> Pro). 

In [None]:
no peak in ribo-seq at these positions 

change aa? 

## SLC19A2 - 2 variations with Conflicting_interpretations_of_pathogenicity

In [None]:
gene = 'SLC19A2'
tr_id = RiboSET_coo[RiboSET_coo['gene'] == gene].iloc[0].tr_id

print (RiboSET_coo[RiboSET_coo['gene'] == gene].iloc[0].Riboseq_Summary)
print ()

print ('start codon coo', RiboSET_coo[RiboSET_coo['gene'] == gene].iloc[0].global_coo_start_codon)
print ()

print ('ext coo', RiboSET_coo[RiboSET_coo['gene'] == gene].iloc[0].global_coo_ext)

print ()
print (RiboSET_coo[RiboSET_coo['gene'] == gene].iloc[0].Riboseq_Summary) 

print ()
print (tr_id)

In [None]:
res_ext[(res_ext['tr_id'] == tr_id) & (res_ext['ann_var'].str.contains('Conflicting_interpretations_of_pathogenicity'))]

In [None]:
SLC19A2 has 1 variant. 

(chr1:169485778, G->C which is on '-' strand, so on in mRNA it is C->G), CTC->GTC, Leu->Val

In [None]:
tr_seq = metadata_pc_g25[metadata_pc_g25['tr_id'] == tr_id].iloc[0].transcript_seq

N_term_start1 = RiboSET_coo[RiboSET_coo['gene'] == gene].iloc[0].start_codon_local_coo_start-1

N_term_end1 = RiboSET_coo[RiboSET_coo['gene'] == gene].iloc[0].N_term_end1 

ext_seq = tr_seq[N_term_start1:N_term_end1+3]

print (len(ext_seq) % 3)

print ()

ext_seq

CTG GCC TTA CAG GGA GAA GGC GTC ACT CGC GGT TAC AAG TGC CTG ACC CTC ACT CCA GTT GGC GGA GGA GGA GAA GGA AGG GGC 
T
TTG


CGG GCC GGG TCC CCT CCC CTC GCG CCC CGG ATG
                        G
                    CCC GTC GCG

## HSPB1 - 2 variations with Conflicting_interpretations_of_pathogenicity

In [None]:
gene = 'HSPB1'
tr_id = RiboSET_coo[RiboSET_coo['gene'] == gene].iloc[0].tr_id

print (RiboSET_coo[RiboSET_coo['gene'] == gene].iloc[0].Riboseq_Summary)
print ()

print ('start codon coo', RiboSET_coo[RiboSET_coo['gene'] == gene].iloc[0].global_coo_start_codon)
print ()

print ('ext coo', RiboSET_coo[RiboSET_coo['gene'] == gene].iloc[0].global_coo_ext)

print ()
print (RiboSET_coo[RiboSET_coo['gene'] == gene].iloc[0].Riboseq_Summary) 

print ()
print (tr_id)

In [None]:
res_ext[(res_ext['tr_id'] == tr_id) & (res_ext['ann_var'].str.contains('Conflicting_interpretations_of_pathogenicity'))]

In [None]:
tr_seq = metadata_pc_g25[metadata_pc_g25['tr_id'] == tr_id].iloc[0].transcript_seq

N_term_start1 = RiboSET_coo[RiboSET_coo['gene'] == gene].iloc[0].start_codon_local_coo_start-1

N_term_end1 = RiboSET_coo[RiboSET_coo['gene'] == gene].iloc[0].N_term_end1 

ext_seq = tr_seq[N_term_start1:N_term_end1+3]

print (len(ext_seq) % 3)

print ()

ext_seq

CTG GAG GAG CAT AAA AGC GCA GCC GAG CCC AGC GCC CCG CAC TTT TCT GAG CAG ACG TCC AGA GCA GAG TCA GCC AGC ATG
                                                                                    A             T
                                                                                    ACA         GCT

In [None]:
chr7:76302698, G->A, GCA->ACA (Ala->Thr)
chr7:76302709, C->T, GCC->GCT (Ala->Ala), it creates out-of-frame stop codon (UAG), 
    however, no corresponding AUG (or CUG) is in sequence already. 

In [None]:
s = 'CTGGAGGAGCATAAAAGCGCAGCCGAGCCCAGCGCCCCGCACTTTT CTG AGC AGA CGT CCA GAG CAG AGT CAG CTA GCATG'

s.find('CTG')
                              

In [None]:
s[78:]

## LAMP2 - 2 variations with Conflicting_interpretations_of_pathogenicity

In [None]:
gene = 'LAMP2'
tr_id = RiboSET_coo[RiboSET_coo['gene'] == gene].iloc[0].tr_id

print (RiboSET_coo[RiboSET_coo['gene'] == gene].iloc[0].Riboseq_Summary)
print ()

print ('start codon coo', RiboSET_coo[RiboSET_coo['gene'] == gene].iloc[0].global_coo_start_codon)
print ()

print ('ext coo', RiboSET_coo[RiboSET_coo['gene'] == gene].iloc[0].global_coo_ext)

print ()
print (RiboSET_coo[RiboSET_coo['gene'] == gene].iloc[0].Riboseq_Summary) 

print ()
print (tr_id)

In [None]:
res_ext[(res_ext['tr_id'] == tr_id) & (res_ext['ann_var'].str.contains('Conflicting_interpretations_of_pathogenicity'))]

In [None]:
tr_seq = metadata_pc_g25[metadata_pc_g25['tr_id'] == tr_id].iloc[0].transcript_seq

N_term_start1 = RiboSET_coo[RiboSET_coo['gene'] == gene].iloc[0].start_codon_local_coo_start-1

N_term_end1 = RiboSET_coo[RiboSET_coo['gene'] == gene].iloc[0].N_term_end1 

ext_seq = tr_seq[N_term_start1:N_term_end1+3]

print (len(ext_seq) % 3)

print ()

ext_seq

In [None]:
chrX:120469173, C->G or G->C on mRNA; GGG->GGC (Gly->Gly)
    
chrX:120469179, G->A or C->T on mRNA; CTC->CTT (Leu->Leu)

In [None]:
GTG TTG CAG CTG TTG TTG TAC CGC CGC CGT CGC CGC CGT CGC CGC CTG CTC TGC GGG GTC ATG
                                                                  T   T   C

In [None]:
there is another one


chrX:120469176, C->T or G->A on mRNA; TGC->TGT (Cys->Cys)

## PNKP - 1 variation with Conflicting_interpretations_of_pathogenicity

In [None]:
gene = 'PNKP'
tr_id = RiboSET_coo[RiboSET_coo['gene'] == gene].iloc[0].tr_id

print (RiboSET_coo[RiboSET_coo['gene'] == gene].iloc[0].Riboseq_Summary)
print ()

print ('start codon coo', RiboSET_coo[RiboSET_coo['gene'] == gene].iloc[0].global_coo_start_codon)
print ()

print ('ext coo', RiboSET_coo[RiboSET_coo['gene'] == gene].iloc[0].global_coo_ext)

print ()
print (RiboSET_coo[RiboSET_coo['gene'] == gene].iloc[0].Riboseq_Summary) 

print ()
print (tr_id)

In [None]:
res_ext[(res_ext['tr_id'] == tr_id) & (res_ext['ann_var'].str.contains('Conflicting_interpretations_of_pathogenicity'))]

In [None]:
tr_seq = metadata_pc_g25[metadata_pc_g25['tr_id'] == tr_id].iloc[0].transcript_seq

N_term_start1 = RiboSET_coo[RiboSET_coo['gene'] == gene].iloc[0].start_codon_local_coo_start-1

N_term_end1 = RiboSET_coo[RiboSET_coo['gene'] == gene].iloc[0].N_term_end1 

ext_seq = tr_seq[N_term_start1:N_term_end1+3]

print (len(ext_seq) % 3)

print ()

ext_seq

In [None]:
GTG GCC GTG AGC CCA AGC CGC GGT CCC GGG CCG GCA CCC AGG ATG
         C
        GCG

In [None]:
chr19:49867490, A->G, GTG->GCG (Val->Ala), which disrupt out-of-frame stop codon (UGA->CGA), no corresponding uORFs to it. 

# dbSNP 

# Cancer implicated genes from COSMIC 

In [54]:
!ls data/Census_cancer_implicated_genes.csv

data/Census_cancer_implicated_genes.csv


In [55]:
cosmic = pd.read_csv('data/Census_cancer_implicated_genes.csv', sep='\t')
cosmic[0:2]

Unnamed: 0,Gene Symbol,Name,Entrez GeneId,Genome Location,Tier,Hallmark,Chr Band,Somatic,Germline,Tumour Types(Somatic),Tumour Types(Germline),Cancer Syndrome,Tissue Type,Molecular Genetics,Role in Cancer,Mutation Types,Translocation Partner,Other Germline Mut,Other Syndrome,Synonyms
0,A1CF,APOBEC1 complementation factor,29974.0,10:50799421-50885675,2,,11.23,yes,,melanoma,,,E,,oncogene,Mis,,,,"29974,A1CF,ACF,ACF64,ACF65,APOBEC1CF,ASP,ENSG0..."
1,ABI1,abl-interactor 1,10006.0,10:26746593-26860935,1,Yes,12.1,yes,,AML,,,L,Dom,"TSG, fusion",T,KMT2A,,,"10006,ABI-1,ABI1,E3B1,ENSG00000136754.17,Q8IZP..."


In [56]:
cosmic['Gene Symbol'].nunique()

723

In [57]:
cosmic_explode = cosmic.assign(Synonyms=cosmic['Synonyms'].str.split(',')).explode('Synonyms')

In [58]:
cosmic_explode[cosmic_explode['Gene Symbol'] == cosmic_explode['Synonyms']].shape[0], cosmic.shape[0]

(713, 723)

In [59]:
set(cosmic_explode['Gene Symbol'].tolist()) - set(cosmic_explode[cosmic_explode['Gene Symbol'] == cosmic_explode['Synonyms']]['Gene Symbol'].tolist())

{'DUX4L1',
 'HMGN2P46',
 'IGH',
 'IGK',
 'IGL',
 'MALAT1',
 'MDS2',
 'TRA',
 'TRB',
 'TRD'}

In [60]:
cosmic_explode[cosmic_explode['Gene Symbol'].isin(['DUX4L1',
 'HMGN2P46',
 'IGH',
 'IGK',
 'IGL',
 'MALAT1',
 'MDS2',
 'TRA',
 'TRB',
 'TRD'])][['Gene Symbol', 'Synonyms']]

Unnamed: 0,Gene Symbol,Synonyms
186,DUX4L1,
299,HMGN2P46,
317,IGH,
318,IGK,
319,IGL,
381,MALAT1,
394,MDS2,
679,TRA,
681,TRB,
682,TRD,


### RiboSet ovlp 

In [61]:
len(set(RiboSET.gene.tolist()).intersection(cosmic['Gene Symbol'].tolist()))

20

In [62]:
len(set(RiboSET.gene.tolist()).intersection(cosmic_explode.Synonyms.tolist()))

21

In [63]:
set(RiboSET.gene.tolist()).intersection(cosmic_explode.Synonyms.tolist())-set(RiboSET.gene.tolist()).intersection(cosmic['Gene Symbol'].tolist())

{'RAD1'}

21 genes from RiboSET are known to be causally implicated in various tumours (mostly in various skin tumours and lymphomas) according to The Cancer Gene Census (CGC) [PMID: 30293088]. 

In [64]:
cosmic_explode[(cosmic_explode['Synonyms'] == 'RAD1')]

Unnamed: 0,Gene Symbol,Name,Entrez GeneId,Genome Location,Tier,Hallmark,Chr Band,Somatic,Germline,Tumour Types(Somatic),Tumour Types(Germline),Cancer Syndrome,Tissue Type,Molecular Genetics,Role in Cancer,Mutation Types,Translocation Partner,Other Germline Mut,Other Syndrome,Synonyms
211,ERCC4,excision repair cross-complementing rodent rep...,2072.0,16:13920157-13952345,1,Yes,13.12,,yes,,"skin basal cell, skin squamous cell, melanoma",xeroderma pigmentosum (F),E,Rec,TSG,"Mis, N, F",,,,RAD1


In [65]:
riboset_cosmic = cosmic_explode[(cosmic_explode['Gene Symbol'].isin(list(set(RiboSET.gene.tolist()).intersection(cosmic['Gene Symbol'].tolist())))) | 
               (cosmic_explode['Synonyms'] == 'RAD1')][['Gene Symbol', 'Cancer Syndrome', 'Tumour Types(Germline)',
                                                        'Tumour Types(Somatic)',
                                                       'Role in Cancer', 'Tissue Type', 'Mutation Types']].drop_duplicates()

riboset_cosmic

Unnamed: 0,Gene Symbol,Cancer Syndrome,Tumour Types(Germline),Tumour Types(Somatic),Role in Cancer,Tissue Type,Mutation Types
67,BLM,Bloom syndrome,"leukaemia, lymphoma, skin squamous cell, other...",,TSG,"L, E","Mis, N, F"
211,ERCC4,xeroderma pigmentosum (F),"skin basal cell, skin squamous cell, melanoma",,TSG,E,"Mis, N, F"
224,EZR,,,NSCLC,fusion,E,T
301,HNRNPA2B1,,,prostate,"oncogene, fusion",E,T
311,HSP90AA1,,,NHL,fusion,L,T
335,JUN,,,sarcoma,oncogene,M,A
338,KAT7,,,CCRCC,oncogene,E,Mis
344,KDSR,,,B-NHL,fusion,L,T
360,LASP1,,,AML,fusion,L,T
372,LPP,,,"lipoma, leukaemia","oncogene, fusion","L, M",T


In [66]:
riboseq_cancer_list = [x.split(', ') for x in ['leukaemia, lymphoma, skin squamous cell, other tumour types',
 'skin basal cell, skin squamous cell, melanoma','NSCLC',
 'prostate',
 'NHL',
 'sarcoma',
 'CCRCC',
 'B-NHL',
 'AML',
 'lipoma, leukaemia',
 'NHL, APL, AML',
 'papillary renal',
 'Spitzoid tumour',
 'pancreas acinar carcinoma',
 'NSCLC, oesophageal squamous carcinoma',
 'medulloblastoma',
 'colorectal',
 'SCC, melanoma',
 'AML*',
 'endometrial stromal sarcoma',
 'ALL']]


len(riboseq_cancer_list)

21

In [67]:
flat_list = [item for sublist in riboseq_cancer_list for item in sublist]
Counter(flat_list)

Counter({'leukaemia': 2,
         'lymphoma': 1,
         'skin squamous cell': 2,
         'other tumour types': 1,
         'skin basal cell': 1,
         'melanoma': 2,
         'NSCLC': 2,
         'prostate': 1,
         'NHL': 2,
         'sarcoma': 1,
         'CCRCC': 1,
         'B-NHL': 1,
         'AML': 2,
         'lipoma': 1,
         'APL': 1,
         'papillary renal': 1,
         'Spitzoid tumour': 1,
         'pancreas acinar carcinoma': 1,
         'oesophageal squamous carcinoma': 1,
         'medulloblastoma': 1,
         'colorectal': 1,
         'SCC': 1,
         'AML*': 1,
         'endometrial stromal sarcoma': 1,
         'ALL': 1})

In [68]:
Counter(riboset_cosmic['Role in Cancer'].tolist())

Counter({'TSG': 3,
         'fusion': 7,
         'oncogene, fusion': 5,
         'oncogene': 4,
         'TSG, fusion': 2})

## PhyloSET

In [69]:
set(PhyloSET.gene.tolist()).intersection(cosmic['Gene Symbol'].tolist())

{'FBXW7', 'MAF', 'SFPQ'}

In [70]:
len(set(PhyloSET.gene.tolist()).intersection(cosmic_explode.Synonyms.tolist()))

3

In [71]:
phyloset_cosmic = cosmic_explode[(cosmic_explode['Gene Symbol'].isin(['FBXW7', 'MAF', 'SFPQ']))][['Gene Symbol', 'Cancer Syndrome', 'Tumour Types(Germline)',
                                                        'Tumour Types(Somatic)',
                                                       'Role in Cancer', 'Tissue Type', 'Mutation Types']].drop_duplicates()

phyloset_cosmic

Unnamed: 0,Gene Symbol,Cancer Syndrome,Tumour Types(Germline),Tumour Types(Somatic),Role in Cancer,Tissue Type,Mutation Types
240,FBXW7,,,"colorectal, endometrial, T-ALL",TSG,"E, L","Mis, N, D, F"
379,MAF,,,MM,"oncogene, fusion",L,T
594,SFPQ,,,papillary renal,"TSG, fusion",E,T


# COSMIC variants in non-coding regions: bedtools intersect 

In [72]:
cosmic_var = pd.read_csv('data/CosmicNCV.tsv', sep='\t')

In [73]:
cosmic_var.shape[0]

18306185

In [74]:
cosmic_var.columns

Index(['Sample name', 'ID_SAMPLE', 'ID_tumour', 'Primary site',
       'Site subtype 1', 'Site subtype 2', 'Site subtype 3',
       'Primary histology', 'Histology subtype 1', 'Histology subtype 2',
       'Histology subtype 3', 'GENOMIC_MUTATION_ID', 'LEGACY_MUTATION_ID',
       'zygosity', 'GRCh', 'genome position', 'Mutation somatic status',
       'WT_SEQ', 'MUT_SEQ', 'SNP', 'FATHMM_MKL_NON_CODING_SCORE',
       'FATHMM_MKL_NON_CODING_GROUPS', 'FATHMM_MKL_CODING_SCORE',
       'FATHMM_MKL_CODING_GROUPS', 'Whole_Genome_Reseq', 'Whole_Exome',
       'ID_STUDY', 'PUBMED_PMID', 'HGVSG'],
      dtype='object')

In [75]:
cosmic_var_f = cosmic_var[['HGVSG', 'Primary site', 'Primary histology', 'Histology subtype 1', 
           'genome position', 'GRCh']]


# make a bed file 

bed = open('tmp_res/cosmic_mut.bed', 'w')


for row in cosmic_var_f.to_numpy():
    if row[-1] == 38:
        pos = row[-2]
        chrom = 'chr'+pos.split(':')[0]
        start = pos.split(':')[1].split('-')[0]
        stop = pos.split(':')[1].split('-')[1]
        
        if int(stop) > int(start): #insertion, take the 1st coo?? + 1
            stop = str(int(start)+1)
        elif int(stop) == int(start):
            stop = str(int(start)+1)
        else:
            continue
            
        score=str(1)
        strand='+'
        name = row[0]
            
        bed.write(chrom+'\t'+start+'\t'+stop+'\t'+name+'\t'+score+'\t'+strand+'\n')

bed.close()

In [76]:
bed = pd.read_csv('tmp_res/cosmic_mut.bed', sep='\t', header=None).drop_duplicates()
bed.shape[0]

15620360

In [77]:
bed.to_csv('tmp_res/cosmic_mut_no_dup.bed', sep='\t', header=None, index=False)

In [78]:
! sort -k1,1V -k2,2n -k3,3n tmp_res/cosmic_mut_no_dup.bed > tmp_res/cosmic_mut_no_dup_sorted.bed

In [79]:
bed_srt = pd.read_csv('tmp_res/cosmic_mut_no_dup_sorted.bed', sep='\t', header=None)
bed_srt[0:20]

Unnamed: 0,0,1,2,3,4,5
0,chr1,10108,10109,1:g.10108C>T,1,+
1,chr1,10151,10152,1:g.10151T>A,1,+
2,chr1,10175,10176,1:g.10175T>A,1,+
3,chr1,10181,10182,1:g.10181A>T,1,+
4,chr1,10237,10238,1:g.10237A>C,1,+
5,chr1,10257,10258,1:g.10257del,1,+
6,chr1,10273,10274,1:g.10273_10277del,1,+
7,chr1,10281,10282,1:g.10275_10281dup,1,+
8,chr1,10333,10334,1:g.10333C>T,1,+
9,chr1,10341,10342,1:g.10341_10345del,1,+


In [80]:
! bedtools intersect -wao -a tmp_res/RiboSET_pred_ext.bed -b tmp_res/cosmic_mut_no_dup_sorted.bed > tmp_res/RiboSET_pred_ext_cosmic.bed

In [81]:
! bedtools intersect -wao -a tmp_res/RiboSET_pred_start_codon.bed -b tmp_res/cosmic_mut_no_dup_sorted.bed > tmp_res/RiboSET_pred_start_codon_cosmic.bed

## Start codon ovlps

In [92]:
res_start = pd.read_csv('tmp_res/RiboSET_pred_start_codon_cosmic.bed', sep='\t', header=None)

#res_start.columns = ['chr_codon', 'start_codon', 'stop_codon', 'tr_id', 'chr_var', 'start_var', 'stop_var',
                    #'ann_var', '-1']

#res_start[res_start['start_var'] != -1].to_csv('Supplementary_tables_and_plots/tables/RSET_start_ClinVar.txt', sep='\t', index=False)
#res_start[res_start['start_var'] != -1]

sel = res_start[res_start[5] != -1]

sel.columns = ['chr_codon', 'start_codon', 'stop_codon', 'tr_id', 'chr_var', 'start_var', 'stop_var',
                   'HGVSG', '1', 'strand', 'ovlp']

sel[0:2]

Unnamed: 0,chr_codon,start_codon,stop_codon,tr_id,chr_var,start_var,stop_var,HGVSG,1,strand,ovlp
11,chr1,32817272,32817274,ENST00000373477.8,chr1,32817272,32817273,1:g.32817272C>A,1,+,1
44,chr1,235128851,235128853,ENST00000366607.4,chr1,235128851,235128852,1:g.235128851C>T,1,+,1


In [93]:
sel_ann = sel.merge(cosmic_var_f, on=['HGVSG'], how='inner')

In [98]:
metadata_pc_g25[metadata_pc_g25['tr_id'] == 'ENST00000273158.8']

Unnamed: 0,tr_id,gene,gene_tr,transcript_seq,5UTR_start_seq,CDS_start,cds_seq,cds_start_codon,cds_stop_codon,cds_start_pos,cds_stop_pos,utr5_start,utr5_end,record_id
15116,ENST00000273158.8,SLC25A38,SLC25A38-001,GAAGCGAAATCTCCCCTTCTACAGAGTTCCTCCGGCGCTTCCTCCA...,GAAGCGAAATCTCCCCTTCTACAGAGTTCCTCCGGCGCTTCCTCCA...,CCA,ATGATTCAGAACTCACGTCCGTCGCTGCTGCAACCCCAAGATGTCG...,ATG,TGA,377,1292,0,377,ENST00000273158.8|ENSG00000144659.10|OTTHUMG00...


In [None]:
(1) add genes and strand 
(2) add start codons sequence
(3) global change -> local change 
e.g. '-' strand, [start=C, middle=T, stop=G], change at the start C>A => ATG 


(4) for extensions 

In [94]:
* ENST00000373477.8, YARS, '-', CTG, 72C>A

   272 273 274
'+' C   A   G
'-' G   T   C
   274 273 272


   272 273 274
'+' C   A   T
'-' G   T   A
   274 273 272
    
CTG -> ATG



* ENST00000366607.4, TOMM20, '-', GTG, 
* ENST00000307296.7, PDCD6IP, '+', CTG, T>A, CAG 
* ENST00000273158.8, SLC25A38, '', 

Unnamed: 0,chr_codon,start_codon,stop_codon,tr_id,chr_var,start_var,stop_var,HGVSG,1,strand,ovlp,Primary site,Primary histology,Histology subtype 1,genome position,GRCh
0,chr1,32817272,32817274,ENST00000373477.8,chr1,32817272,32817273,1:g.32817272C>A,1,+,1,urinary_tract,carcinoma,NS,1:32817272-32817272,38
1,chr1,235128851,235128853,ENST00000366607.4,chr1,235128851,235128852,1:g.235128851C>T,1,+,1,skin,malignant_melanoma,NS,1:235128851-235128851,38
2,chr3,33798684,33798686,ENST00000307296.7,chr3,33798685,33798686,3:g.33798685T>A,1,+,1,liver,carcinoma,NS,3:33798685-33798685,38
3,chr3,39383692,39383694,ENST00000273158.8,chr3,39383692,39383693,3:g.39383692C>A,1,+,1,cervix,carcinoma,squamous_cell_carcinoma,3:39383692-39383692,38
4,chr4,105895605,105895607,ENST00000379987.6,chr4,105895605,105895606,4:g.105895605C>A,1,+,1,endometrium,carcinoma,endometrioid_carcinoma,4:105895605-105895605,38
5,chr5,69167221,69167223,ENST00000256442.9,chr5,69167221,69167222,5:g.69167221C>T,1,+,1,cervix,carcinoma,squamous_cell_carcinoma,5:69167221-69167221,38
6,chr5,131359392,131359394,ENST00000505065.1,chr5,131359392,131359393,5:g.131359392T>C,1,+,1,liver,other,neoplasm,5:131359392-131359392,38
7,chr6,85642947,85642949,ENST00000369622.7,chr6,85642948,85642949,6:g.85642948A>T,1,+,1,lung,carcinoma,NS,6:85642948-85642948,38
8,chr6,138404385,138404387,ENST00000607197.5,chr6,138404385,138404386,6:g.138404385C>T,1,+,1,pancreas,carcinoma,ductal_carcinoma,6:138404385-138404385,38
9,chr7,137343468,137343470,ENST00000348225.6,chr7,137343468,137343469,7:g.137343468C>T,1,+,1,lung,carcinoma,squamous_cell_carcinoma,7:137343468-137343468,38


# Extension overlaps 

In [99]:
res_start = pd.read_csv('tmp_res/RiboSET_pred_ext_cosmic.bed', sep='\t', header=None)

#res_start.columns = ['chr_codon', 'start_codon', 'stop_codon', 'tr_id', 'chr_var', 'start_var', 'stop_var',
                    #'ann_var', '-1']

#res_start[res_start['start_var'] != -1].to_csv('Supplementary_tables_and_plots/tables/RSET_start_ClinVar.txt', sep='\t', index=False)
#res_start[res_start['start_var'] != -1]

res_start[res_start[5] != -1]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,chr1,1013520,1013573,ENST00000379389.4,chr1,1013528,1013529,1:g.1013528C>T,1,+,1
1,chr1,1013520,1013573,ENST00000379389.4,chr1,1013531,1013532,1:g.1013531T>C,1,+,1
2,chr1,1013520,1013573,ENST00000379389.4,chr1,1013559,1013560,1:g.1013559C>T,1,+,1
4,chr1,1374757,1375053,ENST00000338370.7,chr1,1374760,1374761,1:g.1374760C>A,1,+,1
5,chr1,1374757,1375053,ENST00000338370.7,chr1,1374801,1374802,1:g.1374801A>G,1,+,1
...,...,...,...,...,...,...,...,...,...,...,...
1096,chr22,41620840,41620905,ENST00000263256.6,chr22,41620874,41620875,22:g.41620874C>G,1,+,1
1099,chr22,42070298,42070330,ENST00000396398.7,chr22,42070303,42070304,22:g.42070303T>A,1,+,1
1100,chr22,42070298,42070330,ENST00000396398.7,chr22,42070305,42070306,22:g.42070305G>C,1,+,1
1101,chr22,42070298,42070330,ENST00000396398.7,chr22,42070319,42070320,22:g.42070319G>T,1,+,1


In [None]:
# transform global coo into local coo 

# Alphafold 

In [91]:
gene = 'SFPQ'


transcript_seq = metadata_pc_g25[metadata_pc_g25['tr_id'] == 'ENST00000357214.5'].iloc[0].transcript_seq
cds_start_pos = metadata_pc_g25[metadata_pc_g25['tr_id'] == 'ENST00000357214.5'].iloc[0].cds_start_pos
cds_stop_pos = metadata_pc_g25[metadata_pc_g25['tr_id'] == 'ENST00000357214.5'].iloc[0].cds_stop_pos
N_start = RiboSET[RiboSET['gene'] == gene].iloc[0].start_codon_local_coo_start-1



NTE_proteoform = transcript_seq[N_start:cds_stop_pos]
CDS_proteoform = transcript_seq[cds_start_pos:cds_stop_pos]

NTE_proteoform_aa = str(Seq(NTE_proteoform).translate())

In [92]:
'M'+NTE_proteoform_aa[1:]

'MASTFPERLLRFCLDRPLTTDMSRDRFRSRGGGGGGFHRRGGGGGRGGLHDFRSPPPGMGLNQNRGPMGPGPGQSGPKPPIPPPPPHQQQQQPPPQQPPPQQPPPHQPPPHPQPHQQQQPPPPPQDSSKPVVAQGPGPAPGVGSAPPASSSAPPATPPTSGAPPGSGPGPTPTPPPAVTSAPPGAPPPTPPSSGVPTTPPQAGGPPPPPAAVPGPGPGPKQGPGPGGPKGGKMPGGPKPGGGPGLSTPGGHPKPPHRGGGEPRGGRQHHPPYHQQHHQGPPPGGPGGRSEEKISDSEGFKANLSLLRRPGEKTYTQRCRLFVGNLPADITEDEFKRLFAKYGEPGEVFINKGKGFGFIKLESRALAEIAKAELDDTPMRGRQLRVRFATHAAALSVRNLSPYVSNELLEEAFSQFGPIERAVVIVDDRGRSTGKGIVEFASKPAARKAFERCSEGVFLLTTTPRPVIVEPLEQLDDEDGLPEKLAQKNPMYQKERETPPRFAQHGTFEYEYSQRWKSLDEMEKQQREQVEKNMKDAKDKLESEMEDAYHEHQANLLRQDLMRRQEELRRMEELHNQEMQKRKEMQLRQEEERRRREEEMMIRQREMEEQMRRQREESYSRMGYMDPRERDMRMGGGGAMNMGDPYGSGGQKFPPLGGGGGIGYEANPGVPPATMSGSMMGSDMRTERFGQGGAGPVGGQGPRGMGPGTPAGYGRGREEYEGPNKKPRF*'

In [None]:
len('GTGGCCTCCACGTTTCCTGAGCGTCTTCTTCGCTTTTGCCTCGACCGCCCCTTGACCACAGAC') % 3

In [None]:
len(CDS_proteoform) % 3