In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt
import pickle
import warnings

from Bio.Seq import Seq
from pathlib import Path

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 500)

warnings.filterwarnings('ignore')

In [2]:
data_dir = Path('../data').absolute()

qc_samples = []
with open('../data/StringentQC_samples.txt', 'r') as infile:
    for line in infile.readlines():
        qc_samples.append(line.rstrip('\n').replace('_', '-'))

In [3]:
dataset_names = []
for feature_file in data_dir.glob('*orf_features.csv'):
    name = '-'.join(feature_file.name.split('_')[3:-2])
    dataset_names.append(feature_file.name)

In [4]:
print(dataset_names[:10])
print(len(dataset_names))

['VPR_orfcalling_20240307222241_MDA-MB-231_rep3_orf_features.csv', 'VPR_orfcalling_20240307222241_K562-HiRes-rep1_SRR8449579_orf_features.csv', 'VPR_orfcalling_20240307222241_HeLaS3-HiRes-rep1_SRR8449577_orf_features.csv', 'VPR_orfcalling_20240307222241_HEK293T-TG-rep1_SRR8449573_orf_features.csv', 'VPR_orfcalling_20240307222208_YL5_R1_01_orf_features.csv', 'VPR_orfcalling_20240307222241_iPSC-rep3_SRR9113066_orf_features.csv', 'VPR_orfcalling_20240307222241_iPSC-rep2_SRR9113065_orf_features.csv', 'VPR_orfcalling_20240308012528_SRX3884294_orf_features.csv', 'VPR_orfcalling_20240308012528_SRX3884298_orf_features.csv', 'VPR_orfcalling_20240308012528_SRX3884312_orf_features.csv']
190


In [5]:
with open('../data/top_model_all_gb.pkl', 'rb') as file:
    ds = pickle.load(file)
    ds.model = ds.model.fit(ds.X.drop(columns=['chrom_id']), ds.y)

In [6]:
def load_features(data_dir, dataset):
    feature_df = pd.read_csv(data_dir.joinpath(f'{dataset}'), sep='\t')
    feature_df['dataset'] = '-'.join(dataset.split('_')[:-2])
    feature_df['orf_id'] = feature_df.apply(lambda x: f'{x.chrom_id}_{x.orf_start}_{x.orf_end}_{x.strand}_{x.exon_blocks}_{x.dataset}', axis=1)
    feature_df['orf_idx_str'] =  feature_df.apply(lambda x: f'{x.chrom_id}_{x.orf_start}_{x.orf_end}_{x.strand}_{x.exon_blocks}', axis=1)
    feature_df.set_index('orf_id', inplace=True)
    return feature_df

In [7]:
pred_df_list = []
for dataset in dataset_names:
    feature_all_df = load_features(data_dir, dataset)
    drop_cols=['orf_start', 'orf_end']
    feature_df = feature_all_df.drop(columns=drop_cols)
    feature_df = feature_df.select_dtypes(include='number')
    prediction_proba = ds.model.predict_proba(feature_df)
    feature_df['prediction_proba'] = prediction_proba[:,1]
    
    pred_df = feature_df.copy()
    pred_df['chrom'] = pred_df.apply(lambda x: x.name.split('_')[0], axis=1)
    pred_df['dataset'] = pred_df.apply(lambda x: x.name.split('_')[-1], axis=1)
    pred_df['orf_idx'] = pred_df.apply(lambda x: '_'.join(x.name.split('_')[:-1]), axis=1)
    str_cols = ['orf_idx_str', 'chrom_id', 'orf_start', 'orf_end', 'strand', 'exon_blocks',
                'orf_sequence', 'bigprot_id']
    pred_all_df = pred_df.merge(feature_all_df[str_cols], left_index=True, right_on='orf_id', how='left')
    pred_all_df['aa'] = pred_all_df.apply(lambda x: str(Seq(x.orf_sequence).translate())[:-1], axis=1)
    pred_all_df['length'] = pred_all_df.apply(lambda x: len(x.aa), axis=1)
    pred_all_df = pred_all_df[pred_all_df['length'] >= 15]
    pred_all_df = pred_all_df[(pred_all_df['orf_sequence'].str.startswith('ATG')) | \
                 (pred_all_df['orf_sequence'].str.startswith('CTG')) | \
                 (pred_all_df['orf_sequence'].str.startswith('GTG')) | \
                 (pred_all_df['orf_sequence'].str.startswith('TTG'))]
    pred_df_list.append(pred_all_df)

In [8]:
pred_df = pd.concat(pred_df_list)

In [9]:
print(len(set(pred_df['orf_sequence'])))
print(len(pred_df))

375026
8103354


In [10]:
pred_df.to_csv('../data/top_orfs_gb-all_241101.csv')
pred_df.to_parquet('../data/top_orfs_gb-all_241101.parquet.gzip', compression='gzip') 

In [28]:
pred_df_conf = pred_df[pred_df["prediction_proba"] > 0.95]
pred_df_conf

Unnamed: 0_level_0,mean,sum,std,n_reads_orf_vs_genome,pos_1_vs_0,pos_2_vs_0,frames_1_vs_0,frames_2_vs_0,periodicity_first_60_1_vs_0,periodicity_first_60_2_vs_0,periodicity_last_60_1_vs_0,periodicity_last_60_2_vs_0,n_empty_codons,longest_empty_length_whole,longest_empty_length_first_30,longest_empty_length_last_30,five_utr_vs_cds_mean,five_utr_vs_cds_max,five_utr_vs_start_codon_mean,five_utr_vs_start_codon_max,cds_utr_vs_start_codon_mean,cds_utr_vs_start_codon_max,three_utr_vs_cds_mean,three_utr_vs_cds_max,three_utr_vs_stop_codon_mean,three_utr_vs_stop_codon_max,cds_utr_vs_stop_codon_mean,cds_utr_vs_stop_codon_max,dist_neg_100,dist_neg_150,dist_pos_100,dist_pos_150,five_utr_periodicity,five_utr_in_frame_bins,three_utr_periodicity,three_utr_in_frame_bins,price,ribotish,ribocode,tis_transformer_score,size_peak_frac,size_gini,periodicity_score,prediction_proba,chrom,dataset,orf_idx,orf_idx_str,chrom_id,orf_start,orf_end,strand,exon_blocks,orf_sequence,bigprot_id,aa,length
orf_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1
chr1_161314405_161362433_+_161314405-161314425|161323613-161323670|161328395-161328497|161340593-161340655|161356676-161356840|161362328-161362433_VPR-orfcalling-20240307222241-MDA-MB-231-rep3,1.052492,-1.655078,0.203009,0.999421,0.040000,0.000000,0.167029,0.334928,0.185714,0.221429,0.137255,0.286275,-1.000000,-1.041393,-0.698970,-0.602060,1.366991,1.397940,1.730603,1.397940,0.470590,0.119186,1.482191,1.469822,0.907246,0.176091,-0.592300,-1.293731,-1.778151,-1.778151,-1.778151,-1.778151,0.0,0.0,0.00000,0.1,999.0,4.371552,28.671937,0.938041,0.568773,0.738213,0.621080,0.999981,chr1,VPR-orfcalling-20240307222241-MDA-MB-231-rep3,chr1_161314405_161362433_+_161314405-161314425...,chr1_161314405_161362433_+_161314405-161314425...,chr1,161314405,161362433,+,161314405-161314425|161323613-161323670|161328...,ATGGCTGCGCTGTTGCTGAGACACGTTGGTCGTCATTGCCTCCGAG...,1_161314406_161362433_+_161314406;161323614;16...,MAALLLRHVGRHCLRAHFSPQLCIRNAVPLGTTAKEEMERFWNKNI...,169
chr1_2395790_2403124_+_2395790-2395871|2397115-2397220|2399414-2399514|2400856-2400935|2402206-2402342|2403034-2403124_VPR-orfcalling-20240307222241-MDA-MB-231-rep3,1.342105,-1.429482,0.076072,0.998056,0.315789,0.289474,0.258014,0.347928,0.370787,0.247191,0.192308,0.394231,-1.079181,-1.431364,-1.361728,-0.845098,1.192623,0.880814,1.843025,0.880814,1.025598,0.625541,2.252981,1.255272,2.600973,1.146128,0.378690,-0.109144,-1.778151,-1.778151,-1.778151,-1.778151,2.0,0.1,0.00000,0.0,999.0,32.698025,22.372865,0.967530,0.568773,0.738213,0.621080,0.999998,chr1,VPR-orfcalling-20240307222241-MDA-MB-231-rep3,chr1_2395790_2403124_+_2395790-2395871|2397115...,chr1_2395790_2403124_+_2395790-2395871|2397115...,chr1,2395790,2403124,+,2395790-2395871|2397115-2397220|2399414-239951...,ATGTCTGAAGGGGACAGTGTGGGAGAATCCGTCCATGGGAAACCTT...,1_2395791_2403124_+_2395791;2397116;2399415;24...,MSEGDSVGESVHGKPSVVYRFFTRLGQIYQSWLDKSTPYTAVRWVV...,196
chr1_160211544_160213486_+_160211544-160211716|160213109-160213265|160213421-160213486_VPR-orfcalling-20240307222241-MDA-MB-231-rep3,0.846575,-1.747818,0.337865,0.999766,0.090674,0.059585,0.161556,0.247939,0.133236,0.102489,0.128898,0.340956,-0.301030,-1.041393,-1.041393,-0.602060,2.610711,2.586587,3.000000,2.586587,1.071979,0.486217,2.109170,1.146128,2.430559,1.146128,0.348672,0.038918,-1.778151,-1.778151,-1.778151,-1.778151,2.0,0.0,2.00000,0.0,999.0,30.751160,21.737999,0.942205,0.568773,0.738213,0.621080,0.999998,chr1,VPR-orfcalling-20240307222241-MDA-MB-231-rep3,chr1_160211544_160213486_+_160211544-160211716...,chr1_160211544_160213486_+_160211544-160211716...,chr1,160211544,160213486,+,160211544-160211716|160213109-160213265|160213...,ATGGCTGAGTACGGGACCCTCCTGCAAGACCTGACCAACAACATCA...,1_160211545_160213486_+_160211545;160213110;16...,MAEYGTLLQDLTNNITLEDLEQLKSACKEDIPSEKSEEITTGSAWF...,130
chr1_232950710_232978231_+_232950710-232950744|232955556-232955719|232956346-232956443|232969908-232970118|232978162-232978231_VPR-orfcalling-20240307222241-MDA-MB-231-rep3,1.727459,-1.030696,0.035630,0.996355,0.000000,0.000000,0.220676,0.409543,0.136364,0.151515,0.132530,0.325301,-1.556303,-1.544068,-1.113943,-0.845098,1.226047,0.819544,1.715446,0.819544,0.674932,0.372386,2.111652,1.342423,2.100371,0.602060,-0.011899,-0.740363,-1.778151,-1.778151,-1.778151,-1.778151,2.0,0.0,0.00000,0.0,999.0,22.504551,11.688273,0.971736,0.568773,0.738213,0.621080,0.999999,chr1,VPR-orfcalling-20240307222241-MDA-MB-231-rep3,chr1_232950710_232978231_+_232950710-232950744...,chr1_232950710_232978231_+_232950710-232950744...,chr1,232950710,232978231,+,232950710-232950744|232955556-232955719|232956...,ATGGCCCGGCACGTGTTCCTAACGGGGCCCCCAGGAGTTGGAAAAA...,1_232950711_232978231_+_232950711;232955557;23...,MARHVFLTGPPGVGKTTLIHKASEVLKSSGVPVDGFYTEEVRQGGR...,190
chr1_7962785_7985054_+_7962785-7962875|7965323-7965425|7969344-7969404|7970893-7970963|7977651-7977738|7984893-7985054_VPR-orfcalling-20240307222241-MDA-MB-231-rep3,0.811624,-1.944251,0.323104,0.998811,0.529412,0.019608,0.206936,0.336472,0.253298,0.171504,0.237908,0.180392,-0.000000,-0.903090,-0.903090,-0.477121,1.846236,1.537819,1.947760,1.406540,0.121017,-0.131279,2.597753,2.618048,3.000000,2.618048,1.215654,0.644920,-1.778151,-1.778151,-1.477121,-1.778151,0.0,0.2,0.00000,0.1,999.0,29.303547,31.090939,0.930311,0.568773,0.738213,0.621080,0.999995,chr1,VPR-orfcalling-20240307222241-MDA-MB-231-rep3,chr1_7962785_7985054_+_7962785-7962875|7965323...,chr1_7962785_7985054_+_7962785-7962875|7965323...,chr1,7962785,7985054,+,7962785-7962875|7965323-7965425|7969344-796940...,ATGGCTTCCAAAAGAGCTCTGGTCATCCTGGCTAAAGGAGCAGAGG...,1_7962786_7985054_+_7962786;7965324;7969345;79...,MASKRALVILAKGAEEMETVIPVDVMRRAGIKVTVAGLAGKDPVQC...,189
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
chr11_63974680_63976320_+_63974680-63974794|63976224-63976320_VPR-orfcalling-20241101000923-SRX24843303,0.435059,-1.887160,0.697614,1.000000,0.470588,0.078431,0.328257,0.209391,0.419192,0.207071,0.345650,0.182360,-0.301030,-1.000000,-0.477121,-0.602060,0.307387,0.648260,0.244931,0.017374,-0.072044,-0.630886,0.980395,-0.006531,1.470481,-0.006531,0.543713,0.030669,-1.778151,-1.778151,-1.079181,-1.079181,2.0,0.1,0.08209,0.1,0.0,0.000000,13.250686,0.562227,0.573426,0.708974,0.605196,0.972052,chr11,VPR-orfcalling-20241101000923-SRX24843303,chr11_63974680_63976320_+_63974680-63974794|63...,chr11_63974680_63976320_+_63974680-63974794|63...,chr11,63974680,63976320,+,63974680-63974794|63976224-63976320,ATGTCCGTCCTGACGCCGCTGCTGCTGCGGGGCTTGACAGGCTCGG...,11_63974681_63976320_+_63974681;63976225_114;96,MSVLTPLLLRGLTGSARRLPVPRAKIHSLPPEGKLGIMELAVGLTS...,69
chr10_73782075_73783187_+_73782075-73782199|73782322-73782441|73783073-73783187_VPR-orfcalling-20241101000923-SRX24843303,1.150466,-1.402202,0.121493,0.995816,10.000000,1.000000,0.325490,0.230065,0.537037,0.185185,0.452055,0.315068,-1.041393,-1.301030,-0.845098,-1.000000,3.000000,3.000000,3.000000,3.000000,-0.094270,-0.414973,1.250965,0.753328,1.118099,0.221849,-0.139179,-0.531479,-0.000000,-0.000000,-0.000000,-0.000000,0.0,0.0,0.00000,0.2,0.0,0.000000,11.918380,0.445287,0.573426,0.708974,0.605196,0.998130,chr10,VPR-orfcalling-20241101000923-SRX24843303,chr10_73782075_73783187_+_73782075-73782199|73...,chr10_73782075_73783187_+_73782075-73782199|73...,chr10,73782075,73783187,+,73782075-73782199|73782322-73782441|73783073-7...,ATGGCGACACCCAGCCTGCGGGGTCGTCTGGCGCGGTTTGGGAACC...,10_73782076_73783187_+_73782076;73782323;73783...,MATPSLRGRLARFGNPRKPVLKPNKPLILANRVGERRREKGEATCI...,118
chr15_90230483_90233885_-_90230483-90230505|90230933-90231022|90231094-90231213|90231356-90231507|90232218-90232327|90233668-90233703|90233834-90233885_VPR-orfcalling-20241101000923-SRX24843303,0.982134,-1.778288,0.166378,0.987779,1.000000,1.000000,0.373066,0.248138,0.473214,0.343750,0.368421,0.157895,-1.000000,-1.204120,-0.903090,-1.000000,0.853288,0.569875,0.000000,-0.669007,0.000000,-1.238882,1.204834,0.903090,1.585461,0.477121,0.415852,-0.425969,-0.000000,-0.000000,-0.000000,-0.000000,2.0,0.0,0.00000,0.1,0.0,0.000000,27.441197,0.843217,0.573426,0.708974,0.605196,0.999857,chr15,VPR-orfcalling-20241101000923-SRX24843303,chr15_90230483_90233885_-_90230483-90230505|90...,chr15_90230483_90233885_-_90230483-90230505|90...,chr15,90230483,90233885,-,90230483-90230505|90230933-90231022|90231094-9...,ATGGGGGGCTCGGGCAGTCGCCTGTCCAAGGAGCTGCTGGCCGAGT...,15_90230484_90233885_-_90230484;90230934;90231...,MGGSGSRLSKELLAEYQDLTFLTKQEILLAHRRFCELLPQEQRSVE...,191
chr8_143020939_143021789_+_143020939-143020991|143021313-143021433|143021565-143021789_VPR-orfcalling-20241101000923-SRX24843303,0.239026,-2.358669,1.345287,0.999072,0.674699,0.192771,0.387315,0.160938,0.360998,0.139872,0.976974,0.131579,-0.000000,-1.000000,-0.602060,-0.698970,0.204858,0.409878,-0.066832,-0.468312,-0.304111,-0.878189,2.063274,2.086360,2.711385,2.086360,0.740445,0.433147,-1.531479,-1.633468,-0.954243,-1.079181,2.0,0.3,0.00000,0.1,0.0,0.000000,30.011928,0.961352,0.573426,0.708974,0.605196,0.999852,chr8,VPR-orfcalling-20241101000923-SRX24843303,chr8_143020939_143021789_+_143020939-143020991...,chr8_143020939_143021789_+_143020939-143020991...,chr8,143020939,143021789,+,143020939-143020991|143021313-143021433|143021...,ATGAAGATCTTCTTGCCAGTGCTGCTGGCTGCCCTTCTGGGTGTGG...,8_143020940_143021789_+_143020940;143021314;14...,MKIFLPVLLAALLGVERASSLMCFSCLNQKSNLYCLKPTICSDQDN...,131


In [11]:
pred_df.iloc[0]

mean                                                                      1.480379
sum                                                                      -0.376953
std                                                                       0.089622
n_reads_orf_vs_genome                                                     0.994536
pos_1_vs_0                                                                0.145833
pos_2_vs_0                                                                0.083333
frames_1_vs_0                                                             0.155556
frames_2_vs_0                                                             0.192593
periodicity_first_60_1_vs_0                                               0.150794
periodicity_first_60_2_vs_0                                               0.150794
periodicity_last_60_1_vs_0                                                 0.12963
periodicity_last_60_2_vs_0                                                0.407407
n_em

In [12]:
orf_idx_str_high_conf = set(pred_df['orf_idx_str'])

In [13]:
merged_df = pd.read_csv(data_dir.joinpath("merged_orfs_found_by_any_caller.csv"), sep='\t', index_col=[0])
merged_df["key"] = merged_df.apply(lambda x: f'{x.chrom_id}_{x.orf_start}_{x.orf_end}_{x.strand}_{x.exon_blocks}', axis=1)
merged_df_high_conf = merged_df[merged_df["key"].isin(orf_idx_str_high_conf)]
merged_df_high_conf.reset_index(drop=True, inplace=True)

In [14]:
merged_df_high_conf.to_csv('../data/top_unique_orfs_gb-all_241101.csv')