In [1]:
import pandas as pd
from Bio.Seq import Seq
from veliadb.base import Session, Assembly

In [2]:
metaorfs = pd.read_table('s3://velia-piperuns-dev/VPR_orfcalling_20240307222241_iPSC-rep2_SRR9113065/output/VPR_orfcalling_20240307222241_iPSC-rep2_SRR9113065_orf_features.csv')

In [3]:
session = Session()
ucsc_style2assembly_id = {}
for a in session.query(Assembly).all():
    if a.ucsc_style_name != 'na':
        if "_" in a.ucsc_style_name:
            chrom_name = a.genbank_accession
        else:
            chrom_name = a.ucsc_style_name        
        ucsc_style2assembly_id[chrom_name] = a.id

In [4]:
def exon_block_to_veliadb_style(exon_blocks, exon_delimiter = '|'):
    block_sizes = []
    chrom_starts = []
    for e in exon_blocks.split(exon_delimiter):
        e_start, e_end = e.split('-')
        block_sizes.append(str(int(e_end)-int(e_start)))
        chrom_starts.append(str(int(e_start)+1)) # Add 1 to starts to convert to 1-indexed
    return ';'.join(chrom_starts), ';'.join(block_sizes)

def metaorf_call_to_veliadb_hash_string(row, chrom_id_to_assembly_id):
    assembly_id = chrom_id_to_assembly_id[row['chrom_id']]
    chrom_starts, block_sizes = exon_block_to_veliadb_style(row['exon_blocks'])
    chrom_start = int(row['orf_start'])+1 # Add 1 to start to convert to 1-indexed
    hash_string = f"{assembly_id}_{chrom_start}_{row['orf_end']}_{row['strand']}_{chrom_starts}_{block_sizes}"
    return hash_string

In [5]:
metaorfs.index = metaorfs.apply(lambda x: metaorf_call_to_veliadb_hash_string(x, ucsc_style2assembly_id), axis=1)

In [6]:
metaorfs.index

Index(['1_67686378_67687774_+_67686378;67687661_210;114',
       '1_54741739_54741798_+_54741739_60',
       '1_26169984_26170716_+_26169984;26170418;26170575_200;63;142',
       '1_26227802_26227846_+_26227802_45',
       '1_26276712_26277293_+_26276712;26277133_49;161',
       '1_26280798_26281123_+_26280798;26281058_135;66',
       '1_19907648_19907746_+_19907648_99',
       '1_20589130_20618568_+_20589130;20604928;20613842;20618452_154;112;58;117',
       '1_78013078_78013221_+_78013078_144',
       '1_78013392_78016247_+_78013392;78016014_228;234',
       ...
       '2_96013730_96019499_+_96013730;96015109;96019201;96019481_242;217;74;19',
       '7_96121191_96121995_-_96121191;96121655;96121839_187;91;157',
       '3_180604901_180607795_+_180604901;180605766;180606253;180606481;180607648_107;72;120;123;148',
       '3_180602262_180604868_+_180602262;180602891;180603124;180604225;180604852_161;125;200;85;17',
       '3_180607734_180608814_+_180607734;180608718_32;97',
       '1_22

In [18]:
metaorfs.index[0]

'1_67686378_67687774_+_67686378;67687661_210;114'

In [8]:
metaorfs

Unnamed: 0,chrom_id,orf_start,orf_end,strand,exon_blocks,orf_sequence,mean,sum,std,n_reads_orf_vs_transcript,...,cds_utr_vs_stop_codon_mean,cds_utr_vs_stop_codon_max,dist_neg_100,dist_neg_150,dist_pos_100,dist_pos_150,price,ribotish,ribocode,tis_transformer_score
1_67686378_67687774_+_67686378;67687661_210;114,chr1,67686377,67687774,+,67686377-67686587|67687660-67687774,CTGGCGGCGGACGAGGACGACGACAGAGATGTGGCTCTGCAGATCC...,0.006314,2.045621,0.021704,1.000000,...,0.898725,0.397940,-0.000000,-0.000000,-0.000000,-0.000000,999.000000,0.0,0.000000,0.000000
1_54741739_54741798_+_54741739_60,chr1,54741738,54741798,+,54741738-54741798,ATGATGTCCTTGGACTCCATCGCTAAAGGGACCATCTGCTGCAGTT...,0.004158,0.249466,0.012193,1.000000,...,0.301030,-0.301030,-0.000000,-0.000000,-0.778151,-1.778151,2.664101,0.0,0.000000,0.000000
1_26169984_26170716_+_26169984;26170418;26170575_200;63;142,chr1,26169983,26170716,+,26169983-26170183|26170417-26170480|26170574-2...,ATGGGTCGCTCCCGCCGGACAGGCGCGCACCGAGCGCACTCTCTAG...,0.037882,15.342155,0.098024,1.000000,...,1.023878,0.212089,-1.778151,-1.778151,-1.778151,-1.778151,999.000000,0.0,18.059895,0.323213
1_26227802_26227846_+_26227802_45,chr1,26227801,26227846,+,26227801-26227846,ATGGCCATCTTTCTTCTGATAGCAATAAGCTGGATGCTGGGATAG,0.009979,0.449039,0.027022,1.000000,...,0.698970,0.096910,-1.278754,-1.778151,-0.301030,-0.301030,2.210314,0.0,0.000000,0.000074
1_26276712_26277293_+_26276712;26277133_49;161,chr1,26276711,26277293,+,26276711-26276760|26277132-26277293,AGGGCCCAGGGCCATGACCCCAATCTCTCCCTGCTCCTGGGCATTC...,0.006652,1.397009,0.021575,1.000000,...,0.778151,0.176091,-1.653213,-1.778151,-1.079181,-1.079181,12.209820,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1_229271355_229297595_+_229271355;229286486;229288729;229295848;229297520_16;81;115;63;76,chr1,229271354,229297595,+,229271354-229271370|229286485-229286566|229288...,ATGTCCGAAACCTACGATTTTTTGTTTAAGTTCTTGGTTATTGGAA...,0.009453,3.317897,0.027047,0.847134,...,0.477121,-0.301030,-0.301030,-0.301030,-0.477121,-0.954243,0.000000,0.0,3.177935,0.010213
1_230867958_230868503_-_230867958_546,chr1,230867957,230868503,-,230867957-230868503,ATGGCGGCGGCGATCGCGGCTTCGCGCTCGGCGGTCATGAGCGGGA...,0.004478,2.444766,0.018504,1.000000,...,-3.000000,-3.000000,-0.000000,-0.000000,-0.000000,-0.000000,0.000000,0.0,6.236236,0.009422
2_232850277_232856860_+_232850277;232856793_133;68,chr2,232850276,232856860,+,232850276-232850409|232856792-232856860,ATGAACCACAGTACACTCCATTCAGTATTTCAGACCAATCAAAGCA...,0.008564,1.721315,0.027107,1.000000,...,0.176091,-0.698970,-1.113943,-1.230449,-0.954243,-1.079181,0.000000,0.0,5.379650,0.000044
2_241722991_241725031_+_241722991;241723210;241724989_84;62;43,chr2,241722990,241725031,+,241722990-241723074|241723209-241723271|241724...,ATGCCCGTGGACCCAAACGAACCCACGTACTGCCTGTGCCACCAGG...,0.015839,2.993591,0.031209,1.000000,...,-3.000000,-3.000000,-1.146128,-1.778151,-0.301030,-1.079181,0.000000,0.0,6.422757,0.000065


In [9]:
bp_df = pd.read_csv('s3://velia-data-dev/VDC_004_annotation/big_prot/v0.8.1_minlen_15_maxlen_150/orfset_v0.8.1_minlen_15_maxlen_150_orfs.csv.gz')

In [10]:
bp_df = bp_df.set_index("orf.orf_idx_str")

In [11]:
bp_df

Unnamed: 0_level_0,orf.aa_seq,orf.assembly_id,orf.attrs,orf.benchling_id,orf.block_sizes,orf.chrom_starts,orf.end,orf.ensembl_protein_id,orf.exon_frames,orf.id,orf.nt_seq,orf.openprot_id,orf.orf_idx,orf.phases,orf.refseq_protein_id,orf.secondary_orf_id,orf.start,orf.strand,orf.uniprot_id,orf.velia_id
orf.orf_idx_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
7_127591572_127591637_+_127591572_66,LGFLFFFCFGCTLGARLGGGR*,7,{},,66,127591572,127591637,,2,1,CTGGGTTTCCTTTTTTTTTTCTGTTTTGGGTGTACTCTAGGGGCCA...,,842842347331e38ee344f45331e1a7b745d55ab73660f7...,0,,,127591572,+,,-1
7_127589083_127589127_+_127589083_45,LAWMRLARPQSCTN*,7,{},,45,127589083,127589127,,0,2,TTGGCTTGGATGCGGCTGGCAAGACCACAATCCTGTACAAACTGA,,d5cf4631945895faf913e408c7550693d113fc9d785a7b...,0,,,127589083,+,,-1
7_127591259_127591321_+_127591259_63,MVWTGCPTSCQSANQPGAGP*,7,{},,63,127591259,127591321,,1,3,ATGGTCTGGACTGGCTGTCCCACGAGCTGTCAAAGCGCTAACCAGC...,,d2c1cc63c6d79def523734bc79ab83555ca7a66fed2e08...,0,,,127591259,+,,-1
7_127590078_127591299_+_127590078;127590963;127591213_60;126;87,VVDSNDRERVQESADELQKMLQEDELRDAVLLVFANKQDMPNAMPV...,7,{},,60;126;87,127590078;127590963;127591213,127591299,,2;2;0,4,GTGGTGGACAGTAATGACCGGGAGCGGGTCCAAGAATCTGCTGATG...,,36bdbb7d2f653e5e4136b1721c9483cc2adff5cdb7cddf...,0;0;0,,,127590078,+,,-1
7_127591038_127591299_+_127591038;127591213_51;87,VSELTDKLGLQHLRSRTWYVQATCATQGTGLYDGLDWLSHELSKR*,7,{},,51;87,127591038;127591213,127591299,,2;0,5,GTGAGCGAGCTGACTGACAAGCTGGGGCTACAGCACTTACGCAGCC...,,4e3d7ea91ec116a8e17496d0bedb5ff2c78f72ddd4bdc3...,0;0,,,127591038,+,,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11_112768714_112768830_+_112768714_117,LFSPIFRSYLEWVELTFSHLTMMLLFLCSRTTTVMVPV*,11,{},,117,112768714,112768830,,0,11992546,CTGTTTTCACCTATTTTTCGAAGTTACCTCGAGTGGGTAGAGCTCA...,,74ba2a30de74bdec5a566af80209a40ce3dbe56fe1b39e...,0,,,112768714,+,,-1
11_112768473_112768571_+_112768473_99,VARSRLTTTSTSWAPAILLPQPLSSWDYRLKS*,11,{},,99,112768473,112768571,,2,11992547,GTGGCGCGATCTCGACTCACCACAACCTCCACCTCCTGGGCTCCAG...,,3e3ab5bb58a6e13f8b8437006e36f6c40b902e497ff22f...,0,,,112768473,+,,-1
11_112768786_112768830_+_112768786_45,LFLCSRTTTVMVPV*,11,{},,45,112768786,112768830,,0,11992548,TTGTTTCTATGCAGCCGGACCACCACCGTCATGGTACCCGTTTGA,,d4ed423b3b24e69ac6890545d017bba62b3caede838d2c...,0,,,112768786,+,,-1
11_112768655_112768753_+_112768655_99,MTFIKLRPSILPNYLLISTLCFHLFFEVTSSG*,11,{},,99,112768655,112768753,,1,11992549,ATGACCTTCATAAAGTTAAGGCCTTCTATACTTCCTAATTACCTGC...,,265b0b8f5e5d585f8993fd2583e86c41a06159dbc7defc...,0,,,112768655,+,,-1


In [36]:
count = 0
count_total = 0
for i in range(len(metaorfs.index)):
    if (metaorfs.iloc[i]["orf_sequence"][:3] in ["ATG", "CTG", "GTG", "TTG"]
        and metaorfs.iloc[i]["orf_sequence"][-3:] in ["TTA", "TTG", "TGA"]
        and len(metaorfs.iloc[i]["orf_sequence"]) >= 45
        and len(metaorfs.iloc[i]["orf_sequence"]) <= 450):
        
        count_total += 1
        if metaorfs.index[i] not in bp_df.index:
            count += 1
            if count < 20:
                print(i, metaorfs.index[i], metaorfs.iloc[i]["orf_sequence"][:3])
print(count, count_total)

10767 17_1676308_1677155_-_1676308;1676505;1676976_63;207;180 GTG
10778 17_1771853_1776603_+_1771853;1775058;1776532_223;143;72 CTG
11154 11_3360417_3361887_-_3360417;3361743_149;145 ATG
11157 11_3360453_3366910_-_3360453;3366691_113;220 GTG
11162 11_3368821_3373591_-_3368821;3370975;3371577;3373577_35;96;127;15 ATG
11176 16_3476228_3483385_+_3476228;3479471;3482502;3483363_110;130;97;23 ATG
11750 2_8858859_8860764_-_8858859;8860613_46;152 ATG
11880 2_10122992_10123852_+_10122992;10123387;10123736_66;144;117 CTG
12233 12_12975317_12975427_-_12975317_111 CTG
12315 12_14423916_14424092_+_14423916_177 ATG
12545 8_17242994_17243510_-_17242994;17243442_192;69 GTG
12921 2_20701576_20774911_-_20701576;20739971;20774810_77;235;102 CTG
13332 18_22936831_22968809_+_22936831;22946444;22949618;22968806_130;43;96;4 GTG
13435 16_23557952_23559529_+_23557952;23559468_328;62 ATG
13604 1_25982367_25998032_-_25982367;25983946;25984460;25998025_111;142;69;8 TTG
13732 10_27145420_27145530_-_27145420_111 C

In [13]:
len(metaorfs.index)

117730

In [50]:
for row in bp_df.index:
    if row.startswith("16_23557952"):
        print(row)

In [15]:
metaorfs.index[0]

'1_67686378_67687774_+_67686378;67687661_210;114'

In [51]:
metaorfs.loc["16_23557952_23559529_+_23557952;23559468_328;62"]

chrom_id                                                                    chr16
orf_start                                                                23557951
orf_end                                                                  23559529
strand                                                                          +
exon_blocks                                   23557951-23558279|23559467-23559529
orf_sequence                    ATGGAGGAACCTGGCATGGACACGGAGGCCGAGACTGTGGCTACTG...
mean                                                                     0.032239
sum                                                                     12.573083
std                                                                      0.070136
n_reads_orf_vs_transcript                                                     1.0
pos_1_vs_0                                                                   10.0
pos_2_vs_0                                                                   10.0
frames_1_vs_0   

In [49]:
len("ATGGAGGAACCTGGCATGGACACGGAGGCCGAGACTGTGGCTACTGAGGCTCCCGCGCGGCCCGTCAACTGCCTGGAGGCTGAAGCCGCGGCGGGGGCGGCGGCCGAGGACTCCGGCGCCGCACGAGGCAGCCTGCAGCCGGCCCCGGCCCAGCCCCCTGGGGACCCCGCAGCCCAGGCCTCGGTCAGCAACGGCGAAGACGCGGGCGGCGGCGCGGGCAGGGAGCTGGTGGACTTG")

237

In [17]:
bp_df.loc[metaorfs.index[1]]

orf.aa_seq                                             MMSLDSIAKGTICCSYHSN*
orf.assembly_id                                                           1
orf.attrs                                                                {}
orf.benchling_id                                                        NaN
orf.block_sizes                                                          60
orf.chrom_starts                                                   54741739
orf.end                                                            54741798
orf.ensembl_protein_id                                                  NaN
orf.exon_frames                                                           0
orf.id                                                              1912343
orf.nt_seq                ATGATGTCCTTGGACTCCATCGCTAAAGGGACCATCTGCTGCAGTT...
orf.openprot_id                                                         NaN
orf.orf_idx               be51412d9d359c4a5b810be3eea4a804880c09d9245acd...
orf.phases  