In [1]:
import re
import numpy as np
import pandas as pd
import seaborn as sns

from pathlib import Path
from scipy import stats
from veliadb import base, settings
from veliadb import benchling_orm as bo
from veliadb.base import (Session, Orf, OrfXref, Transcript, Gene, 
                          TranscriptOrf, SequenceRegionXref, Protein, 
                          ProteinXref, Dataset, ProteinOrf)

from sqlalchemy.sql.expression import func, and_, or_

from dashboard import data_load
import pyarrow.parquet as pq
from dashboard.etl import CACHE_DIR, TPM_DESEQ2_FACTOR, DATA_DIR

pd.options.display.max_columns = 50
pd.options.display.max_colwidth = 100



In [2]:
session = base.Session()

In [3]:
session.query(Orf).count()

16282850

In [4]:
dashboard_orfs = session.query(Orf).filter(Orf.vtx_id != '').all()

In [5]:
len(dashboard_orfs)

22820

In [7]:
session.query(Dataset).all()

[Data Set (3): CCDS,
 Data Set (4): HGNC_ID,
 Data Set (5): HGNC_SYMBOL,
 Data Set (6): swissprot,
 Data Set (7): trembl,
 Data Set (10): RefSeqFE,
 Data Set (12): BestRefSeq%2CGnomon,
 Data Set (17): CHESS,
 Data Set (18): havana,
 Data Set (9): Gnomon,
 Data Set (11): cmsearch,
 Data Set (21): ensembl,
 Data Set (13): BestRefSeq,
 Data Set (23): FANTOM,
 Data Set (24): ensembl_havana,
 Data Set (2): HAVANA,
 Data Set (1): ENSEMBL,
 Data Set (14): Curated Genomic,
 Data Set (8): RefSeq,
 Data Set (29): StringTie,
 Data Set (16): tRNAscan-SE,
 Data Set (31): openprot,
 Data Set (81): velia_phase6_viral_sORF,
 Data Set (33): velia_phase1_secreted_smORFs,
 Data Set (34): velia_phase1_Prensner,
 Data Set (37): velia_phase2_Chothani2022_SignalP,
 Data Set (38): velia_phase2_lncRNA_Jen,
 Data Set (40): velia_phase1_Chen,
 Data Set (41): velia_phase5_autoimmune lncRNA,
 Data Set (43): velia_phase2_Mudge2022_SignalP,
 Data Set (96): velia_phase7_tcga-DE_conserved_signalp+,
 Data Set (97): vel

In [11]:
score_df = pd.DataFrame(session.query(Orf.vtx_id, OrfXref.xref).join(OrfXref).filter(and_(OrfXref.xref_dataset_id == 145,
                                                           OrfXref.type == 'score',)).all())

In [30]:
low_vtx = list(score_df[score_df['xref'].astype(float) < .95]['vtx_id'])

In [31]:
len(set([d.vtx_id for d in dashboard_orfs]))

22820

In [33]:
with open('../cache_updates/all_vtx_240606.txt', 'w') as outfile:
    for orf in dashboard_orfs:
        if orf.vtx_id in low_vtx:
            continue
        else:
            outfile.write(f'{orf.vtx_id}\n')


### Swissprot

In [35]:
swissprot_query = \
    session.query(Protein, Orf)\
           .join(ProteinXref, ProteinXref.protein_id == Protein.id)\
           .join(Dataset, Dataset.id == ProteinXref.xref_dataset_id)\
           .join(ProteinOrf, ProteinOrf.protein_id == Protein.id)\
           .join(Orf, Orf.id == ProteinOrf.orf_id)\
           .filter(Dataset.name == 'swissprot')\
           .filter(func.length(Protein.aa_seq) < 150)\
           .distinct(ProteinXref.protein_id)



In [32]:
alan_df = pd.read_csv('../../data/orfs_under_150codons_with_function-2.csv')

In [37]:
with open('../cache_updates/swissprot_sORF_150aa_vtx.txt', 'w') as outfile:
    for prot, orf in swissprot_query.all():
        outfile.write(f'VTX-{orf.id:07d}\n')
    

In [28]:
prot, orf = swissprot_query.first()

In [19]:
prot.

SyntaxError: invalid syntax (2621462385.py, line 1)

In [15]:
prot.ensembl_protein_id

'ENSP00000319240.2'

In [14]:
orf.ensembl_protein_id

'ENSP00000319240.2'

In [12]:
session.query(ProteinOrf).count()

187319

In [None]:
from dashboard.etl.dashboard_etl import parse_deepsig, parse_phobius, parse_signalp41, par

In [None]:
with open(f'{OUTPUT_PREFIX}/phobius.results.txt', 'r') as fopen:
    phobius_data = ''.join(fopen.readlines())
phobius_results = parse_phobius(phobius_data, seqs)
deep_tmhmm_results = parse_deeptmhmm(f'{OUTPUT_PREFIX}/biolib_results/predicted_topologies.3line')
signalp_6_results = parse_signalp6(f'{OUTPUT_PREFIX}/output.json', seqs)
signalp_5_results = parse_signalp5(f'{OUTPUT_PREFIX}/results_summary.signalp5', seqs)
with open(f'{OUTPUT_PREFIX}/signalp41.results.txt', 'r') as fopen:
    signalp41_data = ''.join(fopen.readlines())
signalp_41_results = parse_signalp41(signalp41_data, seqs)
deepsig_results = parse_deepsig(f'{OUTPUT_PREFIX}/deepsig.results', seqs)

deepsig_results = pd.DataFrame(deepsig_results).T
signalp_6_results = pd.DataFrame(signalp_6_results).T
signalp_5_results = pd.DataFrame(signalp_5_results).T
signalp_41_results = pd.DataFrame(signalp_41_results).T
phobius_results = pd.DataFrame(phobius_results).T
deep_tmhmm_results = pd.DataFrame(deep_tmhmm_results).T
string_representations = pd.DataFrame()
string_representations['Deepsig'] = deepsig_results['string_rep']
string_representations['SignalP 6slow'] = signalp_6_results['string_rep']
string_representations['SignalP 5b'] = signalp_5_results['string_rep']
string_representations['SignalP 4.1'] = signalp_41_results['string_rep']
string_representations['Phobius'] = phobius_results['string_rep']
string_representations['DeepTMHMM'] = deep_tmhmm_results['string_rep']

vtx_ids = []
df_ordr = []
for k, v in seqs.items():
    vtx_ids.append(k)
    df_ordr.append(v)
string_representations = string_representations.loc[df_ordr].copy()
string_representations.index = vtx_ids
string_representations['Sequence'] = [seqs[i] for i in string_representations.index]

string_representations.to_csv(f'{OUTPUT_PREFIX}/sequence_features_strings.csv')

In [43]:
df = data_load.load_sorf_df_conformed()

In [44]:
metaorf_score_df = pd.DataFrame(session.query(OrfXref.orf_id, Orf.vtx_id, OrfXref.xref).\
                join(Orf, Orf.id == OrfXref.orf_id).\
                filter(Orf.vtx_id.in_(df.index)).\
                filter(and_(OrfXref.xref_dataset_id == 145, OrfXref.type == 'score')).all())
metaorf_score_df = metaorf_score_df.groupby('vtx_id').first()
metaorf_score_df['MetaORF v1.0 Score'] = metaorf_score_df['xref'].astype(float)
df = df.merge(metaorf_score_df, left_index=True, right_index=True, how='left')

In [45]:
df

Unnamed: 0_level_0,show_details,vtx_id,aa_length,ucsc_track,source,protein_xrefs,gene_xrefs,transcripts_exact,screening_phase_id,uniprot_annotation_score,aa,nonsignal_seqs,blastp_subject,blastp_hit_description,blastp_align_length,blastp_align_identity,nonsig_blastp_align_identity,tblastn_hit_id,tblastn_description,tblastn_align_length,tblastn_align_identity,nonsig_tblastn_align_identity,Deepsig_score,SignalP 6slow_score,SignalP 5b_score,...,translated_mean,secreted_mean,secreted_hibit,translated_hibit,phylocsf_58m_avg,phylocsf_58m_max,ESMFold plddt 90th percentile,swissprot_isoform,ensembl_isoform,refseq_isoform,spdis_ot,consequences_ot,trait_ot,coding_variant_ot,spdis_gb,consequences_gb,trait_gb,coding_variant_gb,chr,start,end,Ribo-Seq sORF,orf_id,xref,MetaORF v1.0 Score
vtx_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1
VTX-0869911,False,VTX-0869911,65,chr19:40849973-40850426,[MetaORF v1.0],"[155940, CYP2A6, ENSP00000472905.1, M0R2Z4]",1548;2610;CYP2A6;CYP2A6;ENSG00000255974.8;gene-CYP2A6;HGNC:2610,[ENST00000600495.1],-1,1.0,MLASGMLLVALLVCLTVMVLMSVWQQRKSKGKLPPGPTPLPFIGNYLQLNTEQMYNSLMKVSQDQ,SVWQQRKSKGKLPPGPTPLPFIGNYLQLNTEQMYNSLMKVSQD,gnl|BL_ORD_ID|28300,NP_031838.2 cytochrome P450 2A5 [Mus musculus],40.0,0.475000,88.095,NM_009997.2,"NM_009997.2 Mus musculus cytochrome P450, family 2, subfamily a, polypeptide 4 (Cyp2a4), mRNA",63.0,77.778,88.095,0.00,0.0000,0.528297,...,,,False,,,,71.328249,['ENSP00000472905.1'],,,,,,False,,,,False,chr19,40849973,40850426,True,7241839.0,0.9795820393021725,0.979582
VTX-0868159,False,VTX-0868159,143,chr2:669580-676639,[MetaORF v1.0],"[96278, B5MBX8, ENSP00000384347.3, TMEM18]",129787;25257;ENSG00000151353.15;gene-TMEM18;HGNC:25257;TMEM18;TMEM18,"[CHS.28558.15, ENST00000405941.3, rna-NM_001352681.1, NM_001352681.1]",-1,1.0,MGTRWGLGRKPCSPHLRASGLLQTDWTEPWLMGLATFHALCVLLTCLSSRSYRLQIGHFLCLVILVYCAEYINEAAAMNWRLFSKYQYFDSRGMFI...,,gnl|BL_ORD_ID|53388,NP_742046.2 transmembrane protein 18 [Mus musculus],110.0,0.781818,,NM_172049.2,"NM_172049.2 Mus musculus transmembrane protein 18 (Tmem18), mRNA",110.0,78.182,,0.00,0.0000,0.003723,...,,,False,,,,79.875378,['ENSP00000384347.3'],,['NP_001339610.1'],,,,False,,,,False,chr2,669580,676639,True,3505104.0,0.999329127655237,0.999329
VTX-0868065,False,VTX-0868065,184,chr4:47031652-47032799,[MetaORF v1.0],"[91099, B4DJD0, NaN]",ENSG00000163288.14;GABRB1;HGNC:4081,[ENST00000381582.3],-1,1.0,MWTVQNRESLGLLSFPVMITMVCCAHSTNEPSNMSYVKETVDRLLKGYDIRLRPDFGGPPVDVGMRIDVASIDMVSEVNMVSGLPRGPAVRLTQMG...,TNEPSNMSYVKETVDRLLKGYDIRLRPDFGGPPVDVGMRIDVASIDMVSEVNMVSGLPRGPAVRLTQMGNGQVPLPSAFHWRSPRPWPLRSHSAPA...,gnl|BL_ORD_ID|74510,NP_032095.1 gamma-aminobutyric acid receptor subunit beta-1 isoform 1 precursor [Mus musculus],84.0,0.392857,28.477,NM_008069.5,"NM_008069.5 Mus musculus gamma-aminobutyric acid (GABA) A receptor, subunit beta 1 (Gabrb1), tra...",80.0,96.250,44.086,0.97,0.9998,0.573732,...,,,False,,,,52.184468,['ENSP00000295454.3' 'ENSP00000426753.1'],['P18505'],['XP_016863475.1' 'NP_000803.2' 'XP_024309744.1' 'XP_024309745.1'],,,,False,,,,False,chr4,47031652,47032799,True,3085778.0,0.7319417441907309,0.731942
VTX-0870183,False,VTX-0870183,99,chr2:69013500-69044817,[MetaORF v1.0],[],ANTXR1;ENSG00000169604.21;HGNC:21014,[ENST00000681568.1],-1,-1.0,MATAERRALGIGFQWLSLATLVLICAGQGGRREDGGPACYGGFDLYFILDKSGSVLHHWNEIYYFVEQLAHKFISPQLRMSFIVFSTRGTTLMKLTEDR,QGGRREDGGPACYGGFDLYFILDKSGSVLHHWNEIYYFVEQLAHKFISPQLRMSFIVFSTRGTTLMKLTED,gnl|BL_ORD_ID|65999,NP_473382.1 anthrax toxin receptor 1 precursor [Mus musculus],53.0,0.377358,91.549,NM_054041.2,"NM_054041.2 Mus musculus anthrax toxin receptor 1 (Antxr1), mRNA",91.0,89.011,41.429,0.97,0.9998,0.768687,...,,,False,,,,66.896906,['ENSP00000505171.1' 'ENSP00000430776.2' 'ENSP00000301945.4'\n 'ENSP00000505578.1'],['Q9H6X2'],['XP_016860566.1' 'NP_060623.2' 'XP_016860565.1' 'NP_001397769.1'\n 'NP_444262.1' 'NP_115584.1'],,,,False,,,,False,chr2,69013500,69044817,True,9322037.0,0.9992817032398345,0.999282
VTX-0870826,False,VTX-0870826,87,chr1:156591805-156592186,[MetaORF v1.0],[],128240;18453;gene-NAXE;NAXE,"[CHS.3338.10, CHS.3338.9]",-1,-1.0,MSRLRALLGLGLLVAGSRVPRIKSQTIACRSGPTWWGPQRLNSGGRWDSEVMASTVVKYLRRRPRPWTRSYLTNTSSAWTNLWNWPG,QTIACRSGPTWWGPQRLNSGGRWDSEVMASTVVKYLRRRPRPWTRSYLTNTSSAWTNLWNWP,gnl|BL_ORD_ID|26088,NP_001396707.1 NAD(P)H-hydrate epimerase isoform 11 precursor [Mus musculus],44.0,0.500000,38.000,NM_001409770.1,"NM_001409770.1 Mus musculus NAD(P)HX epimerase (Naxe), transcript variant 4, mRNA",28.0,67.857,38.000,0.99,0.8332,0.723234,...,,,False,,,,39.781020,['ENSP00000505883.1' 'ENSP00000505907.1' 'ENSP00000357216.3'\n 'ENSP00000357217.3' 'ENSP00000506...,['Q8NCW5'],['NP_658985.2'],,,,False,,,,False,chr1,156591805,156592186,True,15710174.0,0.952741322243183,0.952741
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
VTX-0850328,False,VTX-0850328,72,chr22:50061583-50064177,"[Benchling - PurifiedProtein, velia_phase1_secreted_smORFs]","[20-021, 6xhis-HSA-HRV3C-sORF368537_01, smORF368537, smORF368537, smORF368537-his-HSA_01, sORF36...",ENSG00000100427.16;HGNC:17082;MLC1,"[rna-NM_001376482.1, CHS.36227.20, CHS.36227.16, rna-NM_001376472.1, CHS.36227.38, CHS.36227.4, ...",-1,-1.0,LLLLLVLLLQAGLNTGTAIQCVRFKVSARLQGASWDTQNGPQERLAGEVARSPLKEFDKEKAWRAVVVQMAQ,IQCVRFKVSARLQGASWDTQNGPQERLAGEVARSPLKEFDKEKAWRAVVVQMA,gnl|BL_ORD_ID|63397,NP_573504.1 membrane protein MLC1 isoform 1 [Mus musculus],63.0,0.857143,84.906,NM_001364855.1,NM_001364855.1 Mus musculus megalencephalic leukoencephalopathy with subcortical cysts 1 homolog...,63.0,85.714,84.906,0.93,0.9994,0.914312,...,135158.333333,199775.0,True,True,3.926375,10.098,46.765108,['ENSP00000379216.2'],['A0A024R4V4'],['NP_001363413.1' 'NP_001363411.1' 'NP_001363412.1' 'NP_001363410.1'\n 'NP_001363409.1' 'NP_0013...,,,,False,,,,False,chr22,50061583,50064177,False,,,
VTX-0539925,False,VTX-0539925,30,chr18:51079867-51079959,"[openprot, velia_phase3_nan]","[NaN, SMAD4/ENST00000342988.7/mRNA/3'UTR, SMAD4/ENST00000398417.6/mRNA/3'UTR, SMAD4/NM_005359.5/...",ENSG00000141646.15;HGNC:6770;SMAD4,"[ENST00000398417.6, ENST00000688574.1, CHS.24657.5, ENST00000342988.8, rna-NM_005359.6]",-1,-1.0,MTGNSQFYWAALFFSLTLYRDFPMDIVYHV,RDFPMDIVYH,,,,,,NM_008540.3,"NM_008540.3 Mus musculus SMAD family member 4 (Smad4), transcript variant 1, mRNA",16.0,75.000,,0.73,0.0000,0.118051,...,3303.333333,230.0,False,True,-7.763167,-4.359,70.168698,,,,"chr18:51055547:C:T,chr18:51055547:C:T,chr18:51055547:C:T,chr18:51055547:C:T,chr18:51055547:C:T,c...",intergenic_variant;intergenic_variant,"Eczema,Hay fever and/or eczema,Eosinophil counts,Eosinophil counts,Eosinophill count,Asthma,Eosi...",False,"chr18:51054948:C:G;chr18:51054956:A:G,chr18:51054956:A:G;chr18:51054960:C:T;chr18:51054977:C:A,c...",intergenic_variant;intergenic_variant;intergenic_variant;intergenic_variant;intergenic_variant;i...,L29.4 Endarterectomy of carotid artery and patch repair of carotid artery;neoclarityn 5mg tablet...,False,chr18,51079867,51079959,False,,,
VTX-0850293,False,VTX-0850293,35,chr1:1335338-1335445,[velia_phase1_secreted_smORFs],[smORF32821],DVL1;ENSG00000107404.21;HGNC:3084,"[NM_004421.3, CHS.81.2, ENST00000378891.9, rna-XM_047448090.1, CHS.81.16, rna-XM_005244732.5, rn...",-1,-1.0,VHTLRAHIAAFLASLAPTCLCCRYCIKVPAFRWLT,CRYCIKVPAFRWL,gnl|BL_ORD_ID|83137,XP_006508316.1 transmembrane channel-like protein 5 isoform X2 [Mus musculus],17.0,0.529412,,NM_053147.4,"NM_053147.4 Mus musculus protocadherin beta 22 (Pcdhb22), mRNA",31.0,45.161,,0.86,0.0000,0.522772,...,513.333333,445.0,False,False,-8.252333,-3.351,68.224702,,,,"chr1:1332270:G:A,chr1:1332270:G:A;chr1:1332778:C:T,chr1:1332778:C:T;chr1:1334173:T:C,chr1:133417...",downstream_gene_variant;downstream_gene_variant;downstream_gene_variant;intron_variant;intron_va...,"Eosinophil counts,Eosinophil percentage of white cells;Systolic blood pressure,Systolic blood pr...",False,,,,False,chr1,1335338,1335445,False,,,
VTX-0294322,False,VTX-0294322,85,chr1:15491927-15492184,"[openprot, velia_phase9_tcgaDE_esmPhylocsf]",[],CASP9;ENSG00000132906.18;HGNC:1511,"[rna-NM_001278054.2, rna-NR_102733.2, rna-XM_047432034.1, rna-NM_032996.3, ENST00000333868.10, C...",Phase_9_137,-1.0,MFFWGLAMLPRLVSNSWAQVTLPSSSYYRRAPLHPAPFIHVLFHLILIPSQEGNKGENPVLREVRSLVQGKTIIQRRDLARTWVP,QVTLPSSSYYRRAPLHPAPFIHVLFHLILIPSQEGNKGENPVLREVRSLVQGKTIIQRRDLARTWV,gnl|BL_ORD_ID|84594,XP_006511917.1 inositol hexakisphosphate kinase 2 isoform X2 [Mus musculus],31.0,0.451613,44.828,NM_001001335.2,"NM_001001335.2 Mus musculus pleckstrin homology domain containing, family A (phosphoinositide bi...",36.0,52.778,,0.00,0.0000,0.276696,...,,,False,,,,39.764525,,,,"chr1:15487690:C:T,chr1:15487690:C:T,chr1:15487690:C:T,chr1:15487690:C:T,chr1:15487690:C:T,chr1:1...",intergenic_variant;downstream_gene_variant;intron_variant;intron_variant;intron_variant;intron_v...,"Red blood cell count,Urinary metabolite levels in chronic kidney disease [beta-guanidinopropanoa...",False,,,,False,chr1,15491927,15492184,False,,,
