In [1]:
import os
import copy
import numpy as np
import pandas as pd
import seaborn as sns
from collections import defaultdict
import pickle
import warnings

from metaorf.modeling.etl import generate_orf_id, load_features
from metaorf.modeling.ensemble import Dataset

from pathlib import Path

warnings.filterwarnings('ignore')

In [2]:
def load_truth_datasets(truth_df, data_dir, overwrite, dataset_names=['iPSC', 'MB1', 'Gaertner']):
    """Build (or load from cache) one labelled ``Dataset`` per truth-set source.

    For every name in ``dataset_names`` the file ``<data_dir>/<name>.txt`` is
    read as a list of experiment names, one per line.  Each experiment
    contributes the feature file ``<experiment>_orf_features.csv``; the files
    are loaded with ``load_features`` and inner-joined to ``truth_df``
    (feature column ``orf_idx_str`` against truth column ``orf_id``).  The
    truth column ``score.(<name>)`` is binarised (values > 0 become 1) and
    stored both as the ``label`` column of the merged frame and as the
    dataset's target array.

    An additional ``'all'`` entry concatenates every per-source dataset, and
    the resulting dict is pickled to ``<data_dir>/datasets.pkl``.

    Parameters
    ----------
    truth_df : pandas.DataFrame
        Truth table; needs an ``orf_id`` column and one ``score.(<name>)``
        column for every requested dataset name.
    data_dir : pathlib.Path or str
        Directory holding the ``<name>.txt`` lists and the ``datasets.pkl``
        cache; also passed on to ``load_features``.
    overwrite : bool
        When false and ``datasets.pkl`` exists, the cached dict is returned
        as-is.  NOTE: the cache is not checked against ``dataset_names``, so
        pass ``overwrite=True`` after changing the requested names.
    dataset_names : sequence of str
        Truth-set sources to build.  The default list is never mutated.

    Returns
    -------
    dict
        Maps each dataset name, plus ``'all'``, to a ``Dataset``.

    Raises
    ------
    ValueError
        If the datasets have to be built and ``dataset_names`` is empty.
    """
    # Accept plain strings as well as Path objects.
    data_dir = Path(data_dir)
    cache_path = data_dir.joinpath('datasets.pkl')

    if not overwrite and cache_path.exists():
        # SECURITY: unpickling can execute arbitrary code; only point
        # data_dir at a cache file this function wrote itself.
        with open(cache_path, 'rb') as file:
            return pickle.load(file)

    dataset_names = list(dataset_names)
    if not dataset_names:
        raise ValueError("dataset_names must contain at least one dataset name")

    datasets = {}
    X_dfs = []
    y_arrays = []
    orf_ids = []

    for dataset_name in dataset_names:
        with open(data_dir.joinpath(f'{dataset_name}.txt'), 'r') as infile:
            exp_names = [line.strip() for line in infile]
        # Blank lines (e.g. a trailing empty line) would otherwise turn into
        # the bogus file name "_orf_features.csv".
        dataset_file_names = [
            f"{exp_name}_orf_features.csv" for exp_name in exp_names if exp_name
        ]

        truth_label = f'score.({dataset_name})'
        feature_df = load_features(data_dir, dataset_file_names)

        # Inner join: only ORFs present in both the features and the truth
        # table are kept.
        merged_df = feature_df.merge(truth_df, left_on='orf_idx_str', right_on='orf_id')

        # Binarise the score: any positive value counts as a positive label.
        y = merged_df[truth_label].copy()
        y[y > 0] = 1
        merged_df["label"] = y
        y = y.values
        X = merged_df

        X_dfs.append(X)
        y_arrays.append(y)
        orf_ids.append(merged_df['orf_id'])

        datasets[dataset_name] = Dataset(X, y, dataset_name, merged_df['orf_id'])

    datasets['all'] = Dataset(pd.concat(X_dfs), np.concatenate(y_arrays), 'all',  pd.concat(orf_ids))
    # The concatenated frame carries repeated per-source row labels; renumber
    # it after construction, exactly as before (the orf_id series is left
    # with its original labels).
    datasets['all'].X.reset_index(inplace=True, drop=True)

    with open(cache_path, 'wb') as file:
        pickle.dump(datasets, file)

    return datasets

In [3]:
# Read the tab-separated truth set from S3, then derive one orf_id per row.
truth_df = pd.read_csv(
    "s3://velia-piperuns-dev/summary/truthset_batch1to4_240411_2.txt",
    sep='\t',
)
truth_df['orf_id'] = truth_df.apply(generate_orf_id, axis=1)

In [4]:
# Inspect the truth table, including the derived orf_id column.
truth_df

Unnamed: 0,orf.name,ORF.type,score.(Agg),score.(iPSC),score.(MB1),score.(Gaertner),coverage.(Agg),coverage.(iPSC),coverage.(MB1),coverage.(Gaertner),...,orf.name.1,score,strand,thickStart,thickEnd,itemRgb,blockCount,blockSizes,blockStarts,orf_id
0,truthset0001,batch1_PE1,3.0,3,2,2,3.0,2.0,1.0,2.0,...,truthset0001,0,-,119298778,119300535,000,2,1200,01557,chr4_119298778_119300535_-_119298778-119298779...
1,truthset0002,batch1_PE1,2.0,2,0,1,2.0,2.0,0.0,1.0,...,truthset0002,0,+,164877331,164879008,000,2,7689,01588,chr4_164877331_164879008_+_164877331-164877407...
2,truthset0003,batch1_PE1,1.0,0,0,0,2.0,1.0,1.0,0.0,...,truthset0003,0,+,211391639,211426222,000,4,581506512,0187512734571,chr1_211391639_211426222_+_211391639-211391697...
3,truthset0004,batch1_PE1,2.0,2,1,2,3.0,2.0,1.0,2.0,...,truthset0004,0,-,103951522,103956738,000,2,108108,05108,chr12_103951522_103956738_-_103951522-10395163...
4,truthset0005,batch1_PE1,3.0,0,0,1,3.0,0.0,0.0,1.0,...,truthset0005,0,+,4795734,4796231,000,3,1248642,0213455,chr16_4795734_4796231_+_4795734-4795858|479594...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6253,top_orfs8964,batch4_error_mode_new,3.0,1,1,3,,,,,...,top_orfs8964,0,-,170290703,170290802,000,1,99,0,chr6_170290703_170290802_-_170290703-170290802
6254,top_orfs90,batch4_error_mode_new,2.0,2,2,2,,,,,...,top_orfs90,0,-,55686425,55693823,000,3,1561161,012197237,chr2_55686425_55693823_-_55686425-55686440|556...
6255,top_orfs904,batch4_error_mode_new,3.0,3,0,3,,,,,...,top_orfs904,0,+,119818812,119819133,000,1,321,0,chr10_119818812_119819133_+_119818812-119819133
6256,top_orfs9139,batch4_error_mode_C-term ext,0.0,0,0,0,,,,,...,top_orfs9139,0,-,74072196,74084394,000,4,168756348,0340275812150,chr14_74072196_74084394_-_74072196-74072364|74...


In [5]:
# Resolve the data directory against the current working directory and
# rebuild every dataset (overwrite=True bypasses the datasets.pkl cache).
data_dir = Path('../data').absolute()
datasets = load_truth_datasets(
    truth_df=truth_df,
    data_dir=data_dir,
    overwrite=True,
    dataset_names=['iPSC', 'MB1', 'Gaertner'],
)

In [6]:
# Select the combined dataset and deep-copy it, so later changes to `ds`
# do not touch the entry held in `datasets`.
dataset_name = 'all'
ds = copy.deepcopy(datasets[dataset_name])

In [9]:
# Inspect the merged feature/truth frame of the selected dataset.
ds.X

Unnamed: 0,chrom_id,orf_start,orf_end,strand_x,exon_blocks,orf_sequence,mean,sum,std,n_reads_orf_vs_genome,...,score,strand_y,thickStart,thickEnd,itemRgb,blockCount,blockSizes,blockStarts,orf_id,label
0,chr1,145096847,145111597,+,145096847-145096999|145098987-145099065|145103...,ATGTCCACCAACATTTGTAGTTTCAAGGACAGGTGCGTGTCCATCC...,1.762145,-0.891068,0.038937,1.000000,...,0,+,145096846,145111597,000,4,1527812595,02140615914655,chr1_145096847_145111597_+_145096847-145096999...,0
1,chr1,89821052,89821259,+,89821052-89821259,ATGTGCTGCTGCTCCCGTCGCCGCTGCTGCCGCTGCCGCTGCCGCC...,1.799196,-0.516774,0.035301,1.000000,...,0,+,89821052,89821259,000,1,207,0,chr1_89821052_89821259_+_89821052-89821259,1
2,chr1,19597045,19626401,+,19597045-19597109|19622099-19622147|19623473-1...,ATGTCTGAGTCGGAGCTCGGCAGGAAGTGGGACCGGTGTCTGGCGG...,0.544504,-1.830245,0.587380,0.464327,...,0,+,19597044,19626401,000,4,644811015,0250542642829341,chr1_19597045_19626401_+_19597045-19597109|196...,0
3,chr1,156054889,156058371,+,156054889-156054957|156055262-156055425|156057...,ATGCTGCGCCCCAAGGCTTTGACCCAGGTGCTAAGCCAAGCCAACA...,1.280714,-1.296777,0.097380,0.986014,...,0,+,156054889,156058371,000,4,681639057,037330883425,chr1_156054889_156058371_+_156054889-156054957...,1
4,chr1,2586870,2589414,+,2586870-2586948|2587090-2587295|2587740-258779...,CTGGCAGCGGCCGCCATGAGCACGGTGGACCTTGCTCGCGTGGGCG...,1.806684,-0.946899,0.035862,0.972222,...,0,+,2586870,2589414,000,7,78205526476875,02208701519167920832539,chr1_2586870_2589414_+_2586870-2586948|2587090...,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153387,chr5,176392773,176410289,-,176392773-176392945|176397606-176397718|176397...,ATGGGGACCACAGTCAATGGAGATGTGTTTCAGGAGGCCAACGGTC...,1.559996,-1.078494,0.059143,0.974619,...,0,-,176392773,176410289,000,4,17211211833,04833515617483,chr5_176392773_176410289_-_176392773-176392945...,0
153388,chr15,79845209,79845422,-,79845209-79845422,ATGCCAGGCCTTGGGTTTGACAAACATGGCAACCGACTGGGGAGGG...,1.641093,-0.687287,0.043169,1.000000,...,0,-,79845209,79845422,000,1,213,0,chr15_79845209_79845422_-_79845209-79845422,0
153389,chr15,79845209,79889300,-,79845209-79845442|79889092-79889300,ATGCAAGATGAAATTGAGACAGAAGAGATCATCAAGGACATTTTCC...,1.553679,-1.090759,0.064475,0.997475,...,0,-,79845209,79889300,000,2,233208,043883,chr15_79845209_79889300_-_79845209-79845442|79...,0
153390,chr16,85804977,85806874,+,85804977-85805104|85805732-85805864|85806737-8...,ATGGATCGGCGTGACCACCCCTTGCCGGAGGTGGCCCATGTCAAGC...,0.516946,-2.080750,0.709536,0.998706,...,0,+,85804977,85806874,000,3,127132137,07551760,chr16_85804977_85806874_+_85804977-85805104|85...,0


In [8]:
# Export the merged feature table next to the input data; rooting the path at
# data_dir keeps the output location in step with where the inputs were read.
ds.X.to_csv(data_dir.joinpath('truth_set_features_240414.csv'))