In [3]:
import re
import numpy as np
import pandas as pd
import seaborn as sns

from pathlib import Path
from scipy import stats
from veliadb import base, settings
from veliadb import benchling_orm as bo
from veliadb.base import (Session, Orf, OrfXref, Transcript, Gene, 
                          TranscriptOrf, SequenceRegionXref, Protein, 
                          ProteinXref, Dataset)

from sqlalchemy.sql.expression import func, and_, or_

from dashboard import data_load
import pyarrow.parquet as pq
from dashboard.etl import CACHE_DIR, TPM_DESEQ2_FACTOR, DATA_DIR

pd.options.display.max_columns = 50
pd.options.display.max_colwidth = 100

In [2]:
session = base.Session()

In [80]:
df = pd.read_parquet(CACHE_DIR.joinpath('sorf_df.parq'))

In [81]:
df['Ribo-Seq sORF'] = ((df['source'].str.contains('velia_phase2_Chang_Saghatelian')) | \
            (df['source'].str.contains('Chothani')) | \
            (df['source'].str.contains('Prensner')) | \
            (df['source'].str.contains('Slavoff')) | \
            (df['source'].apply(lambda x: 'velia_phase1_Chen' in x)) | \
            (df['source'].str.contains('Rat')) | \
            (df['source'].str.contains('Mudge')) | \
            (df['source'].str.contains('gencode_riboseq')) | \
            (df['source'].str.contains('Bona')) |
            (df['source'].str.contains('bona')) |
            (df['source'].str.contains('orfrater')) |
            (df['source'].str.contains('mass_spec')) |
            (df['source'].str.contains('PBMC')) |
            (df['screening_phase'] == 'Not Screened') |
            (df['orf_xrefs'].str.contains('RibORF')))

df[df['Ribo-Seq sORF']].shape

(7313, 59)

In [84]:
set([item for sublist in df['source'].values for item in sublist])

{'ENSEMBL',
 'RefSeq',
 'gencode_riboseq',
 'openprot',
 'velia_phase1_Bona fide',
 'velia_phase1_Chen',
 'velia_phase1_Prensner',
 'velia_phase1_secreted_smORFs',
 'velia_phase2_83',
 'velia_phase2_Bianca_Chen',
 'velia_phase2_Bonafide_Bianca',
 'velia_phase2_Cao_Slavoff_MINAS60',
 'velia_phase2_Chang_Saghatelian',
 'velia_phase2_Chothani2022_SignalP',
 'velia_phase2_Mudge2022_SignalP',
 'velia_phase2_Rat_Cardiac_Huang',
 'velia_phase2_Seung',
 'velia_phase2_lncRNA_Jen',
 'velia_phase3_nan',
 'velia_phase4_nan',
 'velia_phase5_Blume_Mudge',
 'velia_phase5_autoimmune lncRNA',
 'velia_phase5_bona fide',
 'velia_phase5_uniprot-tremble',
 'velia_phase6_autoimmune_gwas',
 'velia_phase6_plasma_mass_spec',
 'velia_phase6_public_mass_spec',
 'velia_phase6_viral_sORF',
 'velia_phase7_Ribo-seq_PBMC_LPS_R848',
 'velia_phase7_tcga-DE_conserved_signalp+',
 'velia_phase9_Bona fide',
 'velia_phase9_Li et al VSMC',
 'velia_phase9_Olsen',
 'velia_phase9_orfrater',
 'velia_phase9_tcgaDE_esmPhylocsf'}

In [3]:
sorf_df = data_load.load_sorf_df_conformed()

2023-12-05 06:00:56.063 
  command:

    streamlit run /opt/conda/envs/veliadash_updated/lib/python3.11/site-packages/ipykernel_launcher.py [ARGUMENTS]
2023-12-05 06:00:56.064 No runtime found, using MemoryCacheStorageManager


In [4]:
vtx_fasta = Path('/home/ubuntu/repos/dashboard/cache/protein_data/protein_tools_input.fasta')

In [5]:
!cp $vtx_fasta isoform_data

### Swissprot

In [5]:
swissprot_query = \
    session.query(Protein)\
           .join(ProteinXref, ProteinXref.protein_id == Protein.id)\
           .join(Dataset, Dataset.id == ProteinXref.xref_dataset_id)\
           .filter(Dataset.name == 'swissprot')\
           .distinct(ProteinXref.protein_id)

fasta_file = Path('isoform_data/swissprot_proteins.fa')

with open(fasta_file, 'w') as outfile:
    for protein in swissprot_query.all():
        outfile.write(f'>{protein.uniprot_id}\n{protein.aa_seq}\n')
        
query_db = f'/root/{vtx_fasta.name}'
target_db = f'/root/{fasta_file.name}'
output_file = f'/root/{vtx_fasta.stem}_{fasta_file.stem}_alignments.m8'
options = '--format-output query,target,evalue,qstart,qend,qlen,qcov,gapopen,pident,fident,alnlen,raw,bits'

base_cmd = "docker run --rm -it -v ${PWD}/isoform_data:/root soedinglab/mmseqs2 mmseqs easy-search" 
full_cmd = f'{base_cmd} {options} {query_db} {target_db} {output_file} /root/tmp'

In [6]:
#!$full_cmd &>/dev/null

In [7]:
isoform_df = pd.read_csv(f'isoform_data/{vtx_fasta.stem}_{fasta_file.stem}_alignments.m8', sep='\t', names=options.split()[1].split(','))
isoform_df = isoform_df[(isoform_df['pident'] == 100.0)]
isoform_df.drop_duplicates(inplace=True)
swissprot_isoform_df = isoform_df.groupby('query').aggregate(list)
swissprot_isoform_df.rename(columns={'target': 'swissprot_isoform'}, inplace=True)
swissprot_isoform_df.shape

(1165, 12)

### Ensembl

In [12]:
fasta_file = Path('isoform_data/ensembl_proteins.fa')

with open(fasta_file, 'w') as outfile:
    for protein in session.query(Protein).filter(Protein.ensembl_protein_id.ilike('ENSP%')).all():
        outfile.write(f'>{protein.ensembl_protein_id}\n{protein.aa_seq}\n')
        
query_db = f'/root/{vtx_fasta.name}'
target_db = f'/root/{fasta_file.name}'
output_file = f'/root/{vtx_fasta.stem}_{fasta_file.stem}_alignments.m8'
options = '--format-output query,target,evalue,qstart,qend,qlen,qcov,gapopen,pident,fident,alnlen,raw,bits'

base_cmd = "docker run --rm -it -v ${PWD}/isoform_data:/root soedinglab/mmseqs2 mmseqs easy-search" 
full_cmd = f'{base_cmd} {options} {query_db} {target_db} {output_file} /root/tmp'

In [13]:
#!$full_cmd &>/dev/null

In [14]:
isoform_df = pd.read_csv(f'isoform_data/{vtx_fasta.stem}_{fasta_file.stem}_alignments.m8', sep='\t', names=options.split()[1].split(','))
isoform_df = isoform_df[(isoform_df['pident'] == 100.0)]
isoform_df.drop_duplicates(inplace=True)
ensembl_isoform_df = isoform_df.groupby('query').aggregate(list)
ensembl_isoform_df.rename(columns={'target': 'ensembl_isoform'}, inplace=True)
ensembl_isoform_df.shape

(1006, 12)

### Refseq

In [15]:
fasta_file = Path('/home/ubuntu/repos/dashboard/dashboard/etl/isoform_data/GRCh38_latest_protein.faa')

query_db = f'/root/{vtx_fasta.name}'
target_db = f'/root/{fasta_file.name}'
output_file = f'/root/{vtx_fasta.stem}_{fasta_file.stem}_alignments.m8'
options = '--format-output query,target,evalue,qstart,qend,qlen,qcov,gapopen,pident,fident,alnlen,raw,bits'

base_cmd = "docker run --rm -it -v ${PWD}/isoform_data:/root soedinglab/mmseqs2 mmseqs easy-search" 
full_cmd = f'{base_cmd} {options} {query_db} {target_db} {output_file} /root/tmp'

In [16]:
#!$full_cmd &>/dev/null

In [17]:
isoform_df = pd.read_csv(f'isoform_data/{vtx_fasta.stem}_{fasta_file.stem}_alignments.m8', sep='\t', names=options.split()[1].split(','))
isoform_df = isoform_df[(isoform_df['pident'] == 100.0)]
isoform_df.drop_duplicates(inplace=True)
refseq_isoform_df = isoform_df.groupby('query').aggregate(list)
refseq_isoform_df.rename(columns={'target': 'refseq_isoform'}, inplace=True)
refseq_isoform_df.shape

(944, 12)

In [18]:
sorf_df.drop(columns=['swissprot_isoform', 
                      'ensembl_isoform', 
                      'refseq_isoform'], inplace=True)

In [30]:
swissprot_isoform_df.to_csv('../../cache/protein_data/swissprot_isoform.csv')
ensembl_isoform_df.to_csv('../../cache/protein_data/ensembl_isoform.csv')
refseq_isoform_df.to_csv('../../cache/protein_data/refseq_isoform.csv')

In [19]:
output_df = sorf_df.merge(swissprot_isoform_df[['swissprot_isoform']], how='left', left_index=True, right_index=True)
output_df = output_df.merge(ensembl_isoform_df[['ensembl_isoform']], how='left', left_index=True, right_index=True)
output_df = output_df.merge(refseq_isoform_df[['refseq_isoform']], how='left', left_index=True, right_index=True)

output_df.replace(pd.NA, '', inplace=True)

In [23]:
output_df[(output_df['swissprot_isoform'] == '') & \
          (output_df['ensembl_isoform'] == '') & \
          (output_df['refseq_isoform'] == '')]

Unnamed: 0_level_0,show_details,vtx_id,aa_length,screening_phase_id,screening_phase,ucsc_track,source,orf_xrefs,protein_xrefs,gene_xrefs,transcript_xrefs,transcripts_exact,aa,nucl,index_copy,genscript_id,chr,strand,start,end,chrom_starts,block_sizes,phases,blastp_score,blastp_query_coverage,...,SignalP 4.1_score,Deepsig_cut,SignalP 6slow_cut,SignalP 5b_cut,SignalP 4.1_cut,Phobius,DeepTMHMM,trans1,trans2,trans3,sec1,sec2,sec3,translated_mean,secreted_mean,secreted,translated,phylocsf_58m_avg,phylocsf_58m_max,phylocsf_58m_min,phylocsf_vals,ESMFold plddt 90th percentile,swissprot_isoform,ensembl_isoform,refseq_isoform
vtx_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1
VTX-0415419,False,VTX-0415419,44,Phase 4_98,Phase 4,chr12:92185302-92185436,"[velia_phase4_nan, openprot]","[IP_760623, IP_760623, Phase 4_98, U7722HJ240-218]","[IP_760623, Phase 4_98, sORF498]",BTG1-DT;ENSG00000245904.6;HGNC:55600;101928617;55600;BTG1-DT;gene-BTG1-DT,ENSE00002312836.1;exon:ENST00000540035.5:11;ENSE00003630041.1;ENSE00003636989.1;exon-CHS.142684....,[ENST00000499685.2],MTFAALFLLEATTSSLYSWEGNDTRCGYQEAEIFESCFKSNLTH,ATGACTTTTGCTGCTCTCTTCTTGTTAGAAGCCACTACATCCAGCCTCTACTCATGGGAAGGAAATGACACAAGGTGTGGGTATCAGGAGGCAGAA...,VTX-0415419,U7722HJ240-218,chr12,+,92185302,92185436,92185302,135,0,,,...,0.515,18,15,-1,18,-1,11,7610.0,5710.0,5300.0,1100.0,1400,1930,6206.666667,1100.0,False,True,-7.974182,-5.397,-11.394,"[-9.598999977111816, -9.598999977111816, -9.598999977111816, -8.060999870300293, -8.060999870300...",-1.0,,,
VTX-0850041,False,VTX-0850041,19,smORF133083,Phase 1,chr12:7130661-7130720,[velia_phase1_secreted_smORFs],"[smORF133083, U4958HF130-83]",[smORF133083],CLSTN3;ENSG00000139182.15;HGNC:18371;18371;9746;CLSTN3;gene-CLSTN3,ENSG00000242686.4;HGNC:40438;PDE6B-AS1;ENSE00001664775.1;exon-CHS.48914.5;exon:ENST00000416384.5...,[],LLPLLLASLLASCSCNKGE,CTGCTGCCCCTTCTGCTGGCCTCTCTGCTCGCGTCCTGCTCCTGTAACAAAGGTGAG,VTX-0850041,U4958HF130-83,chr12,+,7130661,7130720,7130661,60,0,,,...,0.361,-1,-1,15,-1,-1,14,150.0,195.0,335.0,790.0,1275,1660.0,226.666667,790.0,False,False,1.405526,3.638,-0.848,"[2.1500000953674316, 2.1500000953674316, 2.1500000953674316, 2.0940001010894775, 2.0940001010894...",-1.0,,,
VTX-0652562,False,VTX-0652562,81,U738YHG260-514,Phase 3,chr3:30631457-30631702,"[velia_phase3_nan, openprot]","[IP_2373599, TGFBR2/XM_017007106.1/mRNA/5'UTR, U738YHG260-514]","[NaN, TGFBR2/XM_017007106.1/mRNA/5'UTR, U738YHG260-514]",ENSG00000163513.19;HGNC:11773;TGFBR2;11773;7048;gene-TGFBR2;TGFBR2,100873976;39986;CNTN4-AS2;CNTN4-AS2;ENSG00000227588.2;gene-CNTN4-AS2;HGNC:39986;ENSG00000118961....,[ENST00000672866.1],MPRKGEDQPYTFSSNCTVRKQAKLPSLRADSQILFQISCPLLSFLEALTILATSFFFFFFFFYFLCLKNMKGEVSKCLVLS,ATGCCCAGGAAAGGCGAAGATCAACCTTACACTTTTTCATCTAACTGCACTGTGAGAAAACAAGCAAAATTGCCCAGTTTAAGGGCAGATTCTCAG...,VTX-0652562,U738YHG260-514,chr3,+,30631457,30631702,30631457,246,0,,,...,0.114,-1,-1,-1,-1,-1,-1,3380.0,3640.0,3300.0,150.0,420.0,480.0,3440.000000,150.0,False,True,-7.225975,-3.121,-13.379,"[-9.371999740600586, -9.371999740600586, -9.371999740600586, -9.38700008392334, -9.3870000839233...",-1.0,,,
VTX-0778657,False,VTX-0778657,29,Phase_5_1026,Phase 5,chr7:117604791-117604880,"[openprot, velia_phase5_autoimmune lncRNA]","[IP_579589, IP_579589|chr7:117604790-117604880-, OpenProt_ORF414409, IP_579589, Phase_5_1026, U7...","[IP_579589|chr7:117604790-117604880-;OpenProt_ORF414409;IP_579589, Phase_5_1026, sORF51026]",,ENSE00003540810.1;ENSE00003677734.1;exon-CHS.11878.19;exon-CHS.11878.19;exon-CHS.11878.19;exon:E...,[ENST00000456270.1],MSDSSRNVGKSPAFCGSQISIKNKINGST,ATGTCTGACTCTTCTAGAAATGTGGGCAAATCCCCTGCCTTCTGTGGGTCTCAGATTTCCATAAAAAATAAAATCAATGGATCAACT,VTX-0778657,U7849HL230-96,chr7,-,117604791,117604880,117604791,90,0,,,...,0.108,-1,-1,-1,-1,-1,-1,20.0,50.0,30.0,80.0,190.0,180.0,33.333333,80.0,False,False,-9.034138,-6.512,-12.881,"[-8.032999992370605, -8.032999992370605, -8.032999992370605, -6.835000038146973, -6.835000038146...",-1.0,,,
VTX-0551313,False,VTX-0551313,39,Phase 2_265,Phase 2,chr19:13841730-13841849,"[openprot, velia_phase2_lncRNA_Jen]","[AC020916.1/ENST00000587762.2/ncRNA/-, IP_3430617, Phase 2_265, U9096HI220-220]","[AC020916.1/ENST00000587762.2/ncRNA/-, Phase 2_265, sORF2265]",ENSG00000267519.6;HGNC:27620;MIR23AHG;27620;284454;gene-MIR23AHG;MIR23AHG,ENSE00003532284.1;ENSE00003567668.1;exon-CHS.28650.1;exon-CHS.28650.1;exon-CHS.28650.1;exon:ENST...,[ENST00000587762.2],MFWLCSWLLVTCPLPNLPQLAASDSWSMRVRLGAGGPLA,ATGTTTTGGCTGTGCAGCTGGCTCCTTGTCACCTGCCCCCTTCCCAACCTTCCCCAGCTGGCAGCCTCTGACTCCTGGTCTATGAGGGTGAGGCTG...,VTX-0551313,U9096HI220-220,chr19,-,13841730,13841849,13841730,120,0,52.0,1.0,...,0.553,23,-1,22,23,-1,38,10690.0,10900.0,11830.0,9940.0,9700,9940,11140.000000,9940.0,False,True,-7.534820,-3.534,-12.420,"[-6.308000087738037, -6.308000087738037, -6.308000087738037, -4.8420000076293945, -4.84200000762...",-1.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
VTX-0850859,False,VTX-0850859,40,Phase_5_1075,Phase 5,chr10:52300806-52301350,[velia_phase5_autoimmune lncRNA],"[IP_190571|chr10:52300805-52301350-, OpenProt_ORF168664, IP_190571, Phase_5_1075, U7849HL230-358]","[IP_190571|chr10:52300805-52301350-;OpenProt_ORF168664;IP_190571, Phase_5_1075, sORF51075]",ENSG00000236671.9;HGNC:45029;PRKG1-AS1;100506939;45029;gene-PRKG1-AS1;PRKG1-AS1,ENSG00000261540;ENSG00000261540.1;ARL8A;ENSG00000143862.8;HGNC:25192;ENSE00002117121.1;exon:ENST...,[ENST00000420193.1],MSNIISTGLFASHALVPVLLNLKVFMQIAQLTGQPLLRKC,ATGTCTAACATAATTTCTACTGGCCTCTTTGCTTCTCATGCTTTAGTACCAGTGCTTCTGAATTTGAAAGTCTTCATGCAAATTGCCCAGCTGACA...,VTX-0850859,U7849HL230-358,chr10,-,52300806,52301350,52300806;52301261,33;90,0;0,,,...,0.303,-1,-1,-1,-1,-1,32,2200.0,1790.0,1990.0,250.0,190.0,300.0,1993.333333,250.0,False,False,-7.455400,-5.268,-11.043,"[-6.815000057220459, -6.815000057220459, -6.815000057220459, -5.26800012588501, -5.2680001258850...",-1.0,,,
VTX-0756026,False,VTX-0756026,44,U738YHG260-52,Phase 3,chr6:157167790-157167924,"[velia_phase3_nan, openprot]","[ARID1B/ENST00000636426.1/ncRNA/-, ARID1B/ENST00000637722.1/ncRNA/-, IP_3422098, U738YHG260-52]","[ARID1B/ENST00000636426.1/ncRNA/-, ARID1B/ENST00000637722.1/ncRNA/-, NaN, U738YHG260-52]",ARID1B;ENSG00000049618.25;HGNC:18040;18040;57492;ARID1B;gene-ARID1B,18464;392862;ENSG00000215045.9;gene-GRID2IP;GRID2IP;GRID2IP;HGNC:18464;ENSG00000275413;ENSG00000...,"[ENST00000636426.1, ENST00000637722.1]",MCGKYQECLRLCISPMILDSSRPLNPLLRFRNYLKTMSTTFYIK,ATGTGTGGAAAATATCAAGAGTGTCTGAGACTGTGCATATCTCCCATGATCTTAGACTCTTCAAGACCCTTGAACCCCCTCCTCAGGTTCCGTAAC...,VTX-0756026,U738YHG260-52,chr6,+,157167790,157167924,157167790,135,0,52.0,1.0,...,0.137,-1,-1,-1,-1,-1,-1,1460.0,1040.0,1160.0,40.0,70.0,110.0,1220.000000,40.0,False,True,-8.405273,-3.406,-14.116,"[-8.274999618530273, -8.274999618530273, -8.274999618530273, -9.930999755859375, -9.930999755859...",-1.0,,,
VTX-0316974,False,VTX-0316974,70,U738YHG260-420,Phase 3,chr1:119994151-119994363,"[velia_phase3_nan, openprot]","[IP_679014, NOTCH2/ENST00000602566.5/mRNA/3'UTR, U738YHG260-420]","[NaN, NOTCH2/ENST00000602566.5/mRNA/3'UTR, U738YHG260-420]",ENSG00000134250.21;HGNC:7882;NOTCH2;4853;7882;gene-NOTCH2;NOTCH2,ENSG00000268836;ENSG00000268836.1;ENSE00003737120.1;ENSE00003752171.1;exon-CHS.45017.3;exon:ENST...,[],MLFIQFVRFHAHCSYVPGGVQKIFTSFPQREIVPRDGFLLKSPFHNAVPFLPLPFIPLLPMLVKRLKCYL,ATGTTATTTATACAGTTTGTAAGATTTCATGCTCATTGTTCGTATGTCCCAGGTGGAGTTCAGAAAATATTCACTTCATTTCCACAAAGGGAAATA...,VTX-0316974,U738YHG260-420,chr1,-,119994151,119994363,119994151,213,0,57.0,1.0,...,0.184,-1,-1,-1,-1,-1,-1,,,,,,,,,False,False,-8.525957,-5.181,-15.000,"[-5.181000232696533, -5.181000232696533, -5.181000232696533, -5.211999893188477, -5.211999893188...",-1.0,,,
VTX-0745409,False,VTX-0745409,31,Phase 2_222,Phase 2,chr6:71408901-71408996,"[openprot, velia_phase2_lncRNA_Jen]","[18-214, 6xhis-HSA-HRV3C-sORF2222_3_02, IP_2347036, LINC00472/NR_026807.2/ncRNA/-, LINC00472/NR_...","[18-214, 6xhis-HSA-HRV3C-sORF2222_3_02, LINC00472/NR_026807.2/ncRNA/-, LINC00472/NR_121614.1/ncR...",ENSG00000269966;ENSG00000269966.1;ENSG00000233237.8;HGNC:21380;LINC00472;21380;79940;gene-LINC00...,ENSG00000126524.12;HGNC:19440;SBDS;ENSG00000232723.1;HGNC:35796;RPS17P7;ENSE00001703363.1;exon:E...,[ENST00000651660.1],MCLPTLAKALAGLWTVLIDHRIGQGGATSSC,ATGTGTCTTCCTACCTTGGCAAAAGCATTGGCTGGTCTCTGGACAGTTCTCATAGACCACAGGATTGGGCAGGGGGGAGCCACATCTTCATGT,VTX-0745409,U9096HI220-118,chr6,-,71408901,71408996,71408901,96,0,52.0,1.0,...,0.435,27,-1,-1,-1,-1,18,,,,,,,,,True,True,-8.786517,-5.038,-15.000,"[-15.0, -15.0, -15.0, -7.531000137329102, -7.531000137329102, -7.531000137329102, -6.73000001907...",-1.0,,,


### BLASTp

In [15]:
data_path = Path('isoform_data').absolute()
blast_db_path = Path('/efs/databases/blast')
blast_db = '-db mouse.protein.genbank.faa'
output_fmt = '6 qaccver saccver stitle bitscore qcovs length pident gaps evalue'
options = f'-outfmt "{output_fmt}" -num_threads 8'
query = f'-query /blast/data/{vtx_fasta.name}'
output = f'-out /blast/data/{vtx_fasta.stem}.blastp.out'

In [16]:
base_cmd = f'docker run --rm -it -v {blast_db_path}:/blast/blastdb -v {data_path}:/blast/data ncbi/blast'
full_cmd = f'{base_cmd} blastp {options} {blast_db} {query} {output}'

In [17]:
!$full_cmd

In [18]:
header = ['vtx_id', 'blastp_hit_id', 'blastp_description', 'blastp_score',
          'blastp_query_coverage', 'blastp_align_length', 'blastp_align_identity', 
          'blastp_gaps', 'blastp_evalue']

blastp_df = pd.read_csv(f'isoform_data/{vtx_fasta.stem}.blastp.out', sep='\t', names=header)
bdf = blastp_df.sort_values(by='blastp_score', ascending=False).groupby('vtx_id').first()

### tBLASTn

In [24]:
data_path = Path('isoform_data').absolute()
blast_db_path = Path('/efs/databases/blast')
blast_db = '-db mouse.rna.fna'
output_fmt = '6 qaccver saccver stitle score qcovs length pident gaps evalue'
options = f'-outfmt "{output_fmt}" -num_threads 8'
query = f'-query /blast/data/{vtx_fasta.name}'
output = f'-out /blast/data/{vtx_fasta.stem}.tblastn.out'

In [25]:
base_cmd = f'docker run --rm -it -v {blast_db_path}:/blast/blastdb -v {data_path}:/blast/data ncbi/blast'
full_cmd = f'{base_cmd} tblastn {options} {blast_db} {query} {output}'

In [21]:
#!$full_cmd



In [26]:
header = ['vtx_id', 'tblastn_hit_id', 'tblastn_description', 'tblastn_score',
          'tblastn_query_coverage', 'tblastn_align_length', 'tblastn_align_identity', 
          'tblastn_gaps', 'tblastn_evalue']

tblastn_df = pd.read_csv(f'isoform_data/{vtx_fasta.stem}.tblastn.out', sep='\t', names=header)
tdf = tblastn_df.sort_values(by='tblastn_score', ascending=False).groupby('vtx_id').first()

In [28]:
tdf.to_csv('../../cache/protein_data/tblastn.csv')

In [25]:
swissprot_isoform_df

Unnamed: 0_level_0,swissprot_isoform,evalue,qstart,qend,qlen,qcov,gapopen,pident,fident,alnlen,raw,bits
query,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
VTX-0001233,[Q99622],[4.28e-75],[1],[126],[126],[1.0],[0],[100.0],[1.0],[126],[607],[244]
VTX-0002499,[Q9BYQ0],[1.102e-108],[1],[159],[159],[1.0],[0],[100.0],[1.0],[159],[855],[342]
VTX-0006906,"[Q08648, Q6PDA7]","[3.7739999999999997e-41, 3.7739999999999997e-41]","[1, 1]","[71, 71]","[82, 82]","[0.866, 0.866]","[0, 0]","[100.0, 100.0]","[1.0, 1.0]","[71, 71]","[351, 351]","[143, 143]"
VTX-0007082,[A0A024R3A3],[4.263e-26],[1],[59],[59],[1.0],[0],[100.0],[1.0],[59],[239],[99]
VTX-0007146,[A6NF36],[2.555e-93],[1],[153],[153],[1.0],[0],[100.0],[1.0],[153],[744],[298]
...,...,...,...,...,...,...,...,...,...,...,...,...
VTX-0860402,[A5PLN9],[1.412e-09],[1],[22],[38],[0.579],[0],[100.0],[1.0],[22],[115],[50]
VTX-0860431,[P27701],[2.484e-11],[1],[26],[46],[0.565],[0],[100.0],[1.0],[26],[131],[56]
VTX-0860435,[Q6UWT4],[1.735e-51],[1],[87],[87],[1.0],[0],[100.0],[1.0],[87],[427],[173]
VTX-0860437,[A0A0C5B5G6],[2.145e-06],[1],[16],[16],[1.0],[0],[100.0],[1.0],[16],[90],[40]


In [34]:
session.query(OrfXref.orf_id).\
        join(Dataset, Dataset.id == OrfXref.xref_dataset_id).\
        filter(OrfXref.xref_dataset_id == 104).count()

7264