In [35]:
import pandas as pd
from pathlib import Path
from ast import literal_eval

from dashboard.data_load import load_sorf_df_conformed

from dashboard.etl import CACHE_DIR as cache_dir
from dashboard.etl import DATA_DIR

In [36]:
# Local cache directory (relative path) that the sORF-table cell reads its
# parquet/CSV inputs from.
# NOTE(review): this uppercase name is distinct from the lowercase `cache_dir`
# imported from dashboard.etl, which the feature-table cell reads from —
# confirm both are meant to point at the intended cache.
CACHE_DIR = Path('cache_20231213_v_1_1')

In [21]:
# Load the per-residue sequence-feature strings from the dashboard cache.
features_csv = cache_dir.joinpath('protein_data', 'sequence_features_strings.csv')
feature_df = pd.read_csv(features_csv)


In [22]:
# Show the column labels of the loaded feature table.
feature_df.columns

Index(['Unnamed: 0', 'Deepsig', 'SignalP 6slow', 'SignalP 5b', 'SignalP 4.1',
       'Phobius', 'DeepTMHMM', 'Sequence'],
      dtype='object')

In [23]:
# Derive `nonsignal_seqs`: each sequence with its predicted signal peptide
# removed. The predictors are consulted in the order listed in `signal_cols`;
# the first one whose annotation string starts with 'S' decides how many
# leading residues are trimmed. Rows where no predictor starts with 'S' get ''.
signal_cols = ['SignalP 6slow', 'SignalP 4.1', 'SignalP 5b', 'Deepsig']


def _leading_signal_length(annotation):
    """Return the length of the run of 'S' characters at the start of ``annotation``.

    Only the leading run is measured, so an 'S' appearing later in the
    annotation string cannot inflate the trimmed length.
    """
    return len(annotation) - len(annotation.lstrip('S'))


nonsignal_seqs = []
# itertuples yields plain (Sequence, <one annotation per signal_cols entry>)
# tuples, avoiding the per-row Series construction of iterrows().
for sequence, *annotations in feature_df[['Sequence'] + signal_cols].itertuples(index=False, name=None):
    nonsignal_aa = ''
    for annotation in annotations:
        if annotation.startswith('S'):
            signal_len = _leading_signal_length(annotation)
            # NOTE(review): the ':-1' also drops the final character of the
            # sequence. Kept as in the original — confirm whether that is
            # intended (e.g. a trailing terminator) or an off-by-one.
            nonsignal_aa = sequence[signal_len:-1]
            break
    nonsignal_seqs.append(nonsignal_aa)

feature_df['nonsignal_seqs'] = nonsignal_seqs




In [24]:
# Compare one predictor's annotation with the original and trimmed sequences.
feature_df.loc[:, ['SignalP 5b', 'Sequence', 'nonsignal_seqs']]

Unnamed: 0,SignalP 5b,Sequence,nonsignal_seqs
0,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,MEQQQYCLLTEYITEILSTLLRTSVALMCLLYHLEEFHPIKHIEMP...,
1,SSSSSSSSSSSSSSSSSSSSSOOOOOOOOOOOOOOOOOOOOOOOOO...,MRWVATTLVLTIQLMGCQIWGMPRTSWMGCLQVTAISWPGLILTCK...,MPRTSWMGCLQVTAISWPGLILTCKSSFSLSLFIQLYCLNLHCDWP...
2,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,MYEVIIASFSIGRSLPHITDVSWRLEYQIKTNQLHRMYRPAYLVTL...,
3,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,MQKPLQGSGNHAQRAVWCPKHKKGLPLGVCPLVSRRTSQPAPSVGG...,
4,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,MTVPGVPIALMVVSAWIGLEATVVAACLALLGSVVRETSTSASPTP...,
...,...,...,...
13524,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,MKECVLPTTMAQDTANVQKASWGNIVNIETPVRRTAARMVGLVWPR...,
13525,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,MLMLAVLLPLAFFLLLATVFSCIWKSHPSLCRKLGSLLKRRPQVMA...,CIWKSHPSLCRKLGSLLKRRPQVMAGAEKAARRGRGDEGTRWSRQE...
13526,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,MEQQQYCLLTEYITEILSTLLRTSVALMCLLYHLEEFHPIKHIEMP...,
13527,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,MSKERPKRNIIQKKYDDSDGIPWSEERVVRKVLYLSLKEFKNSQKR...,


In [29]:
# Keep only the rows whose DeepTMHMM annotation string contains an 'M'.
has_m_label = feature_df['DeepTMHMM'].map(lambda topology: 'M' in topology)
tm_df = feature_df[has_m_label]

In [31]:
# Look up a single entry by the identifier stored in the 'Unnamed: 0' column.
is_target = tm_df['Unnamed: 0'] == 'VTX-0086083'
tm_df.loc[is_target]

Unnamed: 0.1,Unnamed: 0,Deepsig,SignalP 6slow,SignalP 5b,SignalP 4.1,Phobius,DeepTMHMM,Sequence,nonsignal_seqs
6773,VTX-0086083,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOMMMMMMMMMMMM...,OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOMMMMMMMMM...,MMEAHFKYHGNLTGRAHFPTLATEVDTSSDKYSNLYMYVGLFLSLL...,


In [38]:
# Build the de-duplicated sORF table `df` from the cached parquet, the
# supporting-evidence spreadsheets in DATA_DIR and the cached protein
# annotation CSVs in CACHE_DIR.

# Kept local to this cell so the cell still runs on its own.
from dashboard.tabs.riboseq_atlas import get_average_coverage

isoform_cols = ['swissprot_isoform', 'ensembl_isoform', 'refseq_isoform']

# --- Base table -------------------------------------------------------------
df = pd.read_parquet(CACHE_DIR.joinpath('sorf_df.parq'))

# Optional length filter, currently disabled:
#df = df[df['aa_length'] <= 150].copy()

# --- Evidence flags ---------------------------------------------------------
# TODO remove this as temp addition
manual_ribo_df = pd.read_excel(DATA_DIR.joinpath('Secreted_mP_Riboseq_SAF.xlsx'))
ribo_vtx = set(manual_ribo_df.loc[manual_ribo_df['manual_check'] == 1, 'vtx_id'])
ccle_df = pd.read_excel(DATA_DIR.joinpath('SummaryIdentification_CCLE_strongerConfidence.xlsx'), index_col=0)
gtex_df = pd.read_excel(DATA_DIR.joinpath('SummaryIdentification_GTEX_strongerConfidence.xlsx'), index_col=0)

ccle_vtx = set(ccle_df['vtx_id'])
gtex_vtx = set(gtex_df['vtx_id'])

ms_vtx = gtex_vtx.union(ccle_vtx)
support_vtx = ms_vtx.union(ribo_vtx)

# Vectorised membership tests on the 'vtx_id' column (same boolean result as a
# row-wise apply, without building a Series per row).
df['manual_riboseq'] = df['vtx_id'].isin(ribo_vtx)
df['MS_evidence'] = df['vtx_id'].isin(ms_vtx)
df['MS or Riboseq'] = df['vtx_id'].isin(support_vtx)

# --- Replace the isoform columns with the cached versions -------------------
df = df.drop(columns=isoform_cols)

swissprot_isoform_df = pd.read_csv(CACHE_DIR.joinpath('protein_data', 'swissprot_isoform.csv'), index_col=0)
ensembl_isoform_df = pd.read_csv(CACHE_DIR.joinpath('protein_data', 'ensembl_isoform.csv'), index_col=0)
refseq_isoform_df = pd.read_csv(CACHE_DIR.joinpath('protein_data', 'refseq_isoform.csv'), index_col=0)

df = df.merge(swissprot_isoform_df[['swissprot_isoform']], how='left', left_index=True, right_index=True)
df = df.merge(ensembl_isoform_df[['ensembl_isoform']], how='left', left_index=True, right_index=True)
df = df.merge(refseq_isoform_df[['refseq_isoform']], how='left', left_index=True, right_index=True)
# Missing values become the string 'None'; the isoform columns are parsed with
# literal_eval at the end of this cell, which turns 'None' into None.
df = df.replace(pd.NA, 'None')

# --- tblastn annotations ----------------------------------------------------
# Merged after the 'None' replacement above, so rows without a tblastn hit keep
# their missing values.
tblastn_df = pd.read_csv(CACHE_DIR.joinpath('protein_data', 'tblastn.csv')).set_index('vtx_id')
tblastn_cols = ['tblastn_hit_id', 'tblastn_description',
                'tblastn_score', 'tblastn_query_coverage', 'tblastn_align_length',
                'tblastn_align_identity', 'tblastn_gaps', 'tblastn_evalue']
df = df.merge(tblastn_df[tblastn_cols], how='left', left_index=True, right_index=True)
df = df.drop('phylocsf_vals', axis=1)

# --- Ribo-Seq coverage support ----------------------------------------------
# A row of the coverage frame counts as supported when its row sum exceeds 50
# and its row maximum exceeds 10.
# NOTE(review): presumably one column per sample/study — confirm the units.
coverage_df = get_average_coverage()
vtx_with_any_support = coverage_df[(coverage_df.sum(axis=1) > 50) & (coverage_df.max(axis=1) > 10)].index
# Stored as the strings 'True'/'False' (not booleans), as in the original.
df['Ribo-Seq RPKM Support'] = ['True' if vtx in vtx_with_any_support else 'False' for vtx in df.index]

df.index.name = 'vtx_id'
df['vtx_id'] = df.index

# --- Flag Ribo-Seq sORFs ----------------------------------------------------
# A row is flagged when its 'source' contains any of these tags, or it is
# 'Not Screened', or its 'orf_xrefs' mention 'RibORF'.
# 'velia_phase7_Ribo-seq_PBMC_LPS_R848' was listed twice in the original
# expression; the repeat is dropped (no effect on the result).
# NOTE(review): confirm a different source was not intended for the repeat.
riboseq_source_tags = (
    'gencode_riboseq',
    'velia_phase1_Bona fide',
    'velia_phase1_Chen',
    'velia_phase1_Prensner',
    'velia_phase2_Chang_Saghatelian',
    'velia_phase2_Chothani2022_SignalP',
    'velia_phase2_Bianca_Chen',
    'velia_phase2_Bonafide_Bianca',
    'velia_phase2_Cao_Slavoff_MINAS60',
    'velia_phase2_Rat_Cardiac_Huang',
    'velia_phase2_Mudge2022_SignalP',
    'velia_phase5_Blume_Mudge',
    'velia_phase5_bona fide',
    'velia_phase6_plasma_mass_spec',
    'velia_phase6_public_mass_spec',
    'velia_phase7_Ribo-seq_PBMC_LPS_R848',
    'velia_phase9_orfrater',
)

df['Ribo-Seq sORF'] = (
    df['source'].apply(lambda src: any(tag in src for tag in riboseq_source_tags))
    | (df['screening_phase'] == 'Not Screened')
    | df['orf_xrefs'].astype(str).str.contains('RibORF')
)

# --- De-duplicate by amino-acid sequence ------------------------------------
# Among Ribo-Seq sORFs that share the same 'aa' value keep a single vtx_id:
# the last one whose screening_phase contains 'phase' (case-insensitive),
# otherwise the first one in the group.
ribo_df = df[df['Ribo-Seq sORF']].copy()
aa_groups = ribo_df.groupby('aa')[['vtx_id', 'screening_phase']].aggregate(list)

vtx_to_keep = []
for _, group in aa_groups.iterrows():
    group_vtx_ids = group['vtx_id']
    chosen_vtx = group_vtx_ids[0]
    if len(group_vtx_ids) > 1:
        for candidate, phase in zip(group_vtx_ids, group['screening_phase']):
            if 'phase' in phase.lower():
                chosen_vtx = candidate  # no break: the last match wins
    vtx_to_keep.append(chosen_vtx)

ribo_df = ribo_df[ribo_df['vtx_id'].isin(vtx_to_keep)].copy()

ribo_aa = set(ribo_df['aa'])

# Non-Ribo-Seq rows are kept only when their 'aa' is not already represented
# by a retained Ribo-Seq row.
non_ribo_df = df[~df['Ribo-Seq sORF']].copy()
non_ribo_df = non_ribo_df[~non_ribo_df['aa'].isin(ribo_aa)]

df = pd.concat([ribo_df, non_ribo_df])

# Parse the isoform columns from their string form into Python objects.
for isoform_col in isoform_cols:
    df[isoform_col] = df[isoform_col].map(literal_eval)

In [42]:
# Write every index label of `df` (the vtx ids) to a text file, one per line.
# Iterating the index directly avoids building a Series for every row just to
# read its name.
with open('../cache_updates/all_vtx_gencode_231218.txt', 'w') as outfile:
    outfile.writelines(f"{vtx_id}\n" for vtx_id in df.index)



'VTX-0851967'