In [1]:
import re
import numpy as np
import pandas as pd
import seaborn as sns

from pathlib import Path
from scipy import stats
from veliadb import base, settings
from veliadb import benchling_orm as bo
from veliadb.base import (Session, Orf, OrfXref, Transcript, Gene, 
                          TranscriptOrf, SequenceRegionXref, Protein, 
                          ProteinXref, Dataset, ProteinOrf)

from sqlalchemy.sql.expression import func, and_, or_

from dashboard import data_load
import pyarrow.parquet as pq
from dashboard.etl import CACHE_DIR, TPM_DESEQ2_FACTOR, DATA_DIR

# Notebook-wide pandas display limits: show up to 50 columns and up to
# 100 characters per cell when rendering frames.
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_colwidth", 100)



In [3]:
# Open the database session shared by every query in this notebook.
# `Session` is the same factory as `base.Session` (imported from veliadb.base).
session = Session()

In [3]:
# Count all rows returned by an unfiltered query over Orf.
session.query(Orf).count()

16282850

In [4]:
# Load every Orf whose vtx_id is not the empty string.
dashboard_orfs = (
    session.query(Orf)
    .filter(Orf.vtx_id != '')
    .all()
)

In [5]:
# Number of Orf objects held in dashboard_orfs.
len(dashboard_orfs)

22820

In [7]:
# Display every Dataset record (the IDs shown here are what xref_dataset_id filters refer to).
session.query(Dataset).all()

[Data Set (3): CCDS,
 Data Set (4): HGNC_ID,
 Data Set (5): HGNC_SYMBOL,
 Data Set (6): swissprot,
 Data Set (7): trembl,
 Data Set (10): RefSeqFE,
 Data Set (12): BestRefSeq%2CGnomon,
 Data Set (17): CHESS,
 Data Set (18): havana,
 Data Set (9): Gnomon,
 Data Set (11): cmsearch,
 Data Set (21): ensembl,
 Data Set (13): BestRefSeq,
 Data Set (23): FANTOM,
 Data Set (24): ensembl_havana,
 Data Set (2): HAVANA,
 Data Set (1): ENSEMBL,
 Data Set (14): Curated Genomic,
 Data Set (8): RefSeq,
 Data Set (29): StringTie,
 Data Set (16): tRNAscan-SE,
 Data Set (31): openprot,
 Data Set (81): velia_phase6_viral_sORF,
 Data Set (33): velia_phase1_secreted_smORFs,
 Data Set (34): velia_phase1_Prensner,
 Data Set (37): velia_phase2_Chothani2022_SignalP,
 Data Set (38): velia_phase2_lncRNA_Jen,
 Data Set (40): velia_phase1_Chen,
 Data Set (41): velia_phase5_autoimmune lncRNA,
 Data Set (43): velia_phase2_Mudge2022_SignalP,
 Data Set (96): velia_phase7_tcga-DE_conserved_signalp+,
 Data Set (97): vel

In [11]:
# Fetch (vtx_id, xref) pairs for ORF xrefs of type 'score' in xref dataset 145.
# NOTE(review): 145 is a hard-coded Dataset id — confirm which dataset it refers to.
score_query = (
    session.query(Orf.vtx_id, OrfXref.xref)
    .join(OrfXref)
    .filter(and_(OrfXref.xref_dataset_id == 145, OrfXref.type == 'score'))
)
score_df = pd.DataFrame(score_query.all())

In [30]:
# VTX IDs whose xref value, cast to float, is below 0.95.
is_low_score = score_df['xref'].astype(float) < .95
low_vtx = score_df.loc[is_low_score, 'vtx_id'].tolist()

In [31]:
# Number of distinct vtx_id values among the dashboard ORFs.
len({orf_row.vtx_id for orf_row in dashboard_orfs})

22820

In [33]:
# Write one vtx_id per line for every dashboard ORF that is not listed in
# `low_vtx`.
# `low_vtx` is a list, so testing `in low_vtx` inside the loop rescanned it for
# each of the ~22k ORFs; a set makes each membership test constant-time while
# producing exactly the same file.
excluded_vtx = set(low_vtx)
with open('../cache_updates/all_vtx_240606.txt', 'w') as outfile:
    for orf in dashboard_orfs:
        if orf.vtx_id not in excluded_vtx:
            outfile.write(f'{orf.vtx_id}\n')


### Swissprot

In [35]:
# (Protein, Orf) pairs for proteins that carry an xref in the 'swissprot'
# dataset and whose aa_seq is shorter than 150 characters, de-duplicated on
# ProteinXref.protein_id.
# NOTE(review): no ORDER BY is given, so which Orf is kept per protein is up to
# the database — confirm that is acceptable.
swissprot_query = (
    session.query(Protein, Orf)
    .join(ProteinXref, ProteinXref.protein_id == Protein.id)
    .join(Dataset, Dataset.id == ProteinXref.xref_dataset_id)
    .join(ProteinOrf, ProteinOrf.protein_id == Protein.id)
    .join(Orf, Orf.id == ProteinOrf.orf_id)
    .filter(Dataset.name == 'swissprot')
    .filter(func.length(Protein.aa_seq) < 150)
    .distinct(ProteinXref.protein_id)
)



In [32]:
# Load a CSV which, per its filename, lists ORFs under 150 codons with a known function.
# NOTE(review): alan_df is not referenced by any other visible cell — confirm it is still needed.
alan_df = pd.read_csv('../../data/orfs_under_150codons_with_function-2.csv')

In [37]:
# Write one ID per (Protein, Orf) row of swissprot_query, formatted as 'VTX-'
# followed by Orf.id zero-padded to 7 digits.
# NOTE(review): the ID is built from orf.id rather than read from orf.vtx_id —
# confirm that is intended.
with open('../cache_updates/swissprot_sORF_150aa_vtx.txt', 'w') as outfile:
    for prot, orf in swissprot_query.all():
        outfile.write(f'VTX-{orf.id:07d}\n')
    

In [28]:
# Grab the first (Protein, Orf) pair for interactive inspection.
# NOTE(review): .first() returns None when the query yields no rows, in which
# case this tuple unpacking raises TypeError.
prot, orf = swissprot_query.first()

In [19]:
# List the public attributes available on the Protein record. This cell used to
# hold an unfinished `prot.` expression, which raised SyntaxError and would stop
# a Restart-and-Run-All.
[attr for attr in dir(prot) if not attr.startswith('_')]

In [15]:
# Ensembl protein ID stored on the Protein record.
prot.ensembl_protein_id

'ENSP00000319240.2'

In [14]:
# Ensembl protein ID stored on the paired Orf record (identical to the Protein's in the recorded output).
orf.ensembl_protein_id

'ENSP00000319240.2'

In [12]:
# Count all ProteinOrf rows.
session.query(ProteinOrf).count()

187319

In [None]:
# Parsers for the predictor outputs consumed by the next cell.
# NOTE(review): the original import list was cut off after the fragment `par`.
# parse_deeptmhmm, parse_signalp5 and parse_signalp6 are added because the next
# cell calls them — confirm they are exported by dashboard.etl.dashboard_etl.
from dashboard.etl.dashboard_etl import (
    parse_deepsig,
    parse_deeptmhmm,
    parse_phobius,
    parse_signalp41,
    parse_signalp5,
    parse_signalp6,
)

In [None]:
# Build a single table holding each predictor's 'string_rep' column (Phobius,
# DeepTMHMM, SignalP 6/5/4.1, DeepSig), re-index it by VTX ID, append the
# sequence itself, and save it as CSV under OUTPUT_PREFIX.
# NOTE(review): OUTPUT_PREFIX and seqs are not defined in any visible cell —
# confirm they are set before this cell runs.

# --- Parse each tool's raw output -------------------------------------------
# Phobius and SignalP 4.1 parsers take the file contents; the others take a path.
phobius_data = Path(f'{OUTPUT_PREFIX}/phobius.results.txt').read_text()
phobius_results = parse_phobius(phobius_data, seqs)
deep_tmhmm_results = parse_deeptmhmm(f'{OUTPUT_PREFIX}/biolib_results/predicted_topologies.3line')
signalp_6_results = parse_signalp6(f'{OUTPUT_PREFIX}/output.json', seqs)
signalp_5_results = parse_signalp5(f'{OUTPUT_PREFIX}/results_summary.signalp5', seqs)
signalp41_data = Path(f'{OUTPUT_PREFIX}/signalp41.results.txt').read_text()
signalp_41_results = parse_signalp41(signalp41_data, seqs)
deepsig_results = parse_deepsig(f'{OUTPUT_PREFIX}/deepsig.results', seqs)

# --- One row per parsed entry -------------------------------------------------
# Each parser result is turned into a frame and transposed so that the
# 'string_rep' field becomes a column.
deepsig_results = pd.DataFrame(deepsig_results).T
signalp_6_results = pd.DataFrame(signalp_6_results).T
signalp_5_results = pd.DataFrame(signalp_5_results).T
signalp_41_results = pd.DataFrame(signalp_41_results).T
phobius_results = pd.DataFrame(phobius_results).T
deep_tmhmm_results = pd.DataFrame(deep_tmhmm_results).T

# --- Collect the string representations, one column per tool ---------------
# Columns are assigned one at a time (in this order) so that every column is
# aligned to the index established by the first one, as before.
string_rep_by_tool = {
    'Deepsig': deepsig_results,
    'SignalP 6slow': signalp_6_results,
    'SignalP 5b': signalp_5_results,
    'SignalP 4.1': signalp_41_results,
    'Phobius': phobius_results,
    'DeepTMHMM': deep_tmhmm_results,
}
string_representations = pd.DataFrame()
for tool_name, tool_results in string_rep_by_tool.items():
    string_representations[tool_name] = tool_results['string_rep']

# --- Re-key rows from sequence to VTX ID --------------------------------------
# seqs yields (vtx_id, sequence) pairs; rows are selected by sequence, so the
# parser results are presumably keyed by sequence — TODO confirm.
vtx_ids = [vtx_id for vtx_id, _ in seqs.items()]
df_ordr = [sequence for _, sequence in seqs.items()]
string_representations = string_representations.loc[df_ordr].copy()
string_representations.index = vtx_ids
# Rows are now in seqs order, so the sequences can be attached directly.
string_representations['Sequence'] = df_ordr

string_representations.to_csv(f'{OUTPUT_PREFIX}/sequence_features_strings.csv')