In [47]:
import pandas as pd
from pathlib import Path
from ast import literal_eval

from dashboard.data_load import load_sorf_df_conformed, load_protein_feature_string_representations
from dashboard.etl import CACHE_DIR, DATA_DIR

In [54]:
df = load_sorf_df_conformed()

## BLASTp

In [7]:
with open('isoform_data/nonsignal_seq_aa.fa', 'w') as outfile:
    for i, row in df[df['nonsignal_seqs'] != ''].iterrows():
        outfile.write(f">{row.name}\n{row['nonsignal_seqs']}\n")

In [8]:
vtx_fasta = Path('isoform_data/nonsignal_seq_aa.fa').absolute()

In [12]:
data_path = Path('isoform_data').absolute()
blast_db_path = Path('/home/ubuntu/velia-data-dev/VDC_004_annotation/primary/blast/')
blast_db = '-db mouse.protein.genbank.faa'
output_fmt = '6 qaccver saccver stitle bitscore qcovs length pident gaps evalue'
options = f'-outfmt "{output_fmt}" -num_threads 8'
query = f'-query /blast/data/{vtx_fasta.name}'
output = f'-out /blast/data/{vtx_fasta.stem}.blastp.out'

base_cmd = f'docker run --rm -it -v {blast_db_path}:/blast/blastdb -v {data_path}:/blast/data ncbi/blast'
full_cmd = f'{base_cmd} blastp {options} {blast_db} {query} {output}'

In [14]:
!$full_cmd

In [15]:
header = ['vtx_id', 'blastp_hit_id', 'blastp_description', 'blastp_score',
          'blastp_query_coverage', 'blastp_align_length', 'blastp_align_identity', 
          'blastp_gaps', 'blastp_evalue']

blastp_df = pd.read_csv(f'isoform_data/{vtx_fasta.stem}.blastp.out', sep='\t', names=header)
bdf = blastp_df.sort_values(by='blastp_score', ascending=False).groupby('vtx_id').first()


## tBLASTn

In [17]:
data_path = Path('isoform_data').absolute()
blast_db_path = Path('/home/ubuntu/velia-data-dev/VDC_004_annotation/primary/blast/')
blast_db = '-db mouse.rna.fna'
output_fmt = '6 qaccver saccver stitle score qcovs length pident gaps evalue'
options = f'-outfmt "{output_fmt}" -num_threads 8'
query = f'-query /blast/data/{vtx_fasta.name}'
output = f'-out /blast/data/{vtx_fasta.stem}.tblastn.out'

base_cmd = f'docker run --rm -it -v {blast_db_path}:/blast/blastdb -v {data_path}:/blast/data ncbi/blast'
full_cmd = f'{base_cmd} tblastn {options} {blast_db} {query} {output}'

In [18]:
!$full_cmd



In [19]:
header = ['vtx_id', 'tblastn_hit_id', 'tblastn_description', 'tblastn_score',
          'tblastn_query_coverage', 'tblastn_align_length', 'tblastn_align_identity', 
          'tblastn_gaps', 'tblastn_evalue']

tblastn_df = pd.read_csv(f'isoform_data/{vtx_fasta.stem}.tblastn.out', sep='\t', names=header)
tdf = tblastn_df.sort_values(by='tblastn_score', ascending=False).groupby('vtx_id').first()

In [42]:
with open('../cache_updates/all_vtx_gencode_231218.txt', 'w') as outfile:
    for i, row in df.iterrows():
        outfile.write(f"{row.name}\n")



In [55]:
isoform_data_path = Path('/home/ubuntu/dashboard/scripts/notebooks/isoform_data/')
vtx_fasta = isoform_data_path.joinpath('nonsignal_seq_aa.fa')

header = [
    'vtx_id', 'blastp_refseq_id', 'nonsig_blastp_hit_id', 'nonsig_blastp_description', 'blastp_score',
    'nonsig_blastp_align_length', 'nonsig_blastp_align_identity', 'blastp_gaps', 'nonsig_blastp_evalue']

blastp_df = pd.read_csv(isoform_data_path.joinpath(f'{vtx_fasta.stem}.blastp.out'), sep='\t', names=header)
bdf = blastp_df.sort_values(by='blastp_score', ascending=False).groupby('vtx_id').first()

header = [
    'vtx_id', 'tblastn_refseq_id', 'nonsig_tblastn_hit_id', 'nonsig_tblastn_description', 'tblastn_score',
    'nonsig_tblastn_align_length', 'nonsig_tblastn_align_identity', 'tblastn_gaps', 'nonsig_tblastn_evalue']

tblastn_df = pd.read_csv(isoform_data_path.joinpath(f'{vtx_fasta.stem}.tblastn.out'), sep='\t', names=header)
tdf = tblastn_df.sort_values(by='tblastn_score', ascending=False).groupby('vtx_id').first()

df = df.merge(bdf, left_index=True, right_index=True, how='left')
df = df.merge(tdf, left_index=True, right_index=True, how='left')

#return df


In [58]:
df

Unnamed: 0_level_0,show_details,vtx_id,aa_length,ucsc_track,source,screening_phase_id,screening_phase,genscript_id,orf_xrefs,protein_xrefs,...,blastp_gaps,nonsig_blastp_evalue,tblastn_refseq_id,nonsig_tblastn_hit_id,nonsig_tblastn_description,tblastn_score,nonsig_tblastn_align_length,nonsig_tblastn_align_identity,tblastn_gaps,nonsig_tblastn_evalue
vtx_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
VTX-0852017,False,VTX-0852017,54,chr1:-1--1,[velia_phase1_Bona fide],-1,Phase 1,,"[NaN, PIGBOS1 NT control ]",[PIGBOS1 NT control ],...,,,,,,,,,,
VTX-0852018,False,VTX-0852018,53,chr1:-1--1,[velia_phase1_Bona fide],-1,Phase 1,,"[HOXB-AS3 NT control , NaN]","[HOXB-AS3 NT control , HOXB-AS3 NT control , N...",...,,,,,,,,,,
VTX-0852019,False,VTX-0852019,45,chr1:-1--1,[velia_phase2_Bonafide_Bianca],Phase 2_154,Phase 2,U9096HI220-282,"[Phase 2_154, U9096HI220-282, ZBTB37]","[Phase 2_154, Phase 4_352, sORF2154, sORF4352,...",...,,,,,,,,,,
VTX-0852020,False,VTX-0852020,42,chr1:-1--1,[velia_phase2_Bianca_Chen],Phase 2_155,Phase 2,U9096HI220-246,"[17-242, 6xhis-HSA-HRV3C-sORF2155_1_02, Phase ...","[17-242, 57729, 6xhis-HSA-HRV3C-sORF2155_1_02,...",...,,,,,,,,,,
VTX-0852021,False,VTX-0852021,59,chr1:-1--1,[velia_phase2_Bianca_Chen],Phase 2_156,Phase 2,U312HHH170-182,"[Phase 2_156, U312HHH170-182, ZFAND2A_1199767_...","[c7riboseqorf5, Phase 2_156, Phase 4_727, sORF...",...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
VTX-0851963,False,VTX-0851963,79,chr1:120005395-120029951,[velia_phase3_nan],Phase 3_94,Phase 3,U738YHG260-500,"[NOTCH2/ENST00000579475.7/mRNA/5'UTR, NOTCH2/N...","[NOTCH2/ENST00000579475.7/mRNA/5'UTR, NOTCH2/N...",...,,,,,,,,,,
VTX-0851964,False,VTX-0851964,81,chr12:6388403-6388942,[velia_phase3_nan],U738YHG260-508,Phase 3,U738YHG260-508,"[LTBR/ENST00000543542.1/ncRNA/-, U738YHG260-508]","[LTBR/ENST00000543542.1/ncRNA/-, NaN, U738YHG2...",...,4.0,2.1,XM_036165897.1,XM_036165897.1 PREDICTED: Mus musculus lymphot...,93.0,39.0,23.0,73.913,0.0,0.000129
VTX-0851965,False,VTX-0851965,81,chr4:125487463-125490005,[velia_phase3_nan],U738YHG260-510,Phase 3,U738YHG260-510,"[FAT4/NM_001291285.1/mRNA/CDS, U738YHG260-510]","[FAT4/NM_001291285.1/mRNA/CDS, NaN, U738YHG260...",...,,,,,,,,,,
VTX-0851966,False,VTX-0851966,82,chr6:15246540-15452073,[velia_phase3_nan],U738YHG260-516,Phase 3,U738YHG260-516,"[JARID2/XM_017010834.2/mRNA/-, U738YHG260-516]","[JARID2/XM_017010834.2/mRNA/-, NaN, U738YHG260...",...,,,,,,,,,,
