In [1]:
import pandas as pd

from Bio.Seq import Seq
from pathlib import Path
from veliadb import base

import dask.dataframe as dd
from dask.distributed import Client, LocalCluster

import pandas as pd

In [2]:
# Start a local Dask cluster and connect a client to it; creating the Client
# makes this cluster the scheduler used by the Dask computations below.
# NOTE(review): per the Dask docs, memory_limit applies to each worker rather
# than to the cluster as a whole — confirm 50GB per worker is intended.
cluster = LocalCluster(memory_limit='50GB')  # Adjust as per your system's capacity
client = Client(cluster)

In [3]:
# Open a veliadb session; it is used below to query base.Assembly records.
session = base.Session()

In [4]:
# Directory holding the big_prot v0.8.1 ORF set (read further down as bp_df).
# NOTE(review): absolute, machine-specific path — consider making it configurable.
bp_dir = Path('/home/ec2-user/velia-data-dev/VDC_004_annotation/big_prot/v0.8.1_minlen_15_maxlen_150/')
# Local data directory, made absolute relative to the current working directory.
data_dir = Path('../data').absolute()

In [5]:
# Build a lookup from assembly database id to a display name: the UCSC-style
# name when it starts with 'chr' and is shorter than 6 characters, otherwise
# the GenBank accession.
assembly_ids = {}

for assembly in session.query(base.Assembly).all():
    ucsc_name = assembly.ucsc_style_name
    is_short_chr_name = ucsc_name.startswith('chr') and len(ucsc_name) < 6
    assembly_ids[assembly.id] = ucsc_name if is_short_chr_name else assembly.genbank_accession

In [6]:
# Columns that are always parsed as float64 instead of relying on Dask's
# sample-based dtype inference.
# NOTE(review): presumably these columns contain missing or mixed int/float
# values that otherwise break inference — confirm.
dtypes = {'ribocode': 'float64',
          'ribotish': 'float64',
          'three_utr_vs_stop_codon_max': 'float64',
          'three_utr_vs_stop_codon_mean': 'float64',
          'three_utr_vs_cds_max': 'float64',
          'three_utr_vs_cds_mean': 'float64'}

# Path.glob yields files in arbitrary, filesystem-dependent order; sorting makes
# the concatenation order (and therefore the row order of the result)
# reproducible between runs and machines.
dataset_names = sorted(feature_file.name for feature_file in data_dir.glob('*orf_features.csv'))
if not dataset_names:
    # Fail with an actionable message instead of an opaque error from dd.concat.
    raise FileNotFoundError(f'No *orf_features.csv files found in {data_dir}')

# One lazy Dask dataframe per tab-separated feature file.
feature_dds = [
    dd.read_csv(data_dir.joinpath(dataset), sep='\t', dtype=dtypes)
    for dataset in dataset_names
]

# Stack all datasets row-wise into a single lazy dataframe.
feature_dd = dd.concat(feature_dds, axis=0)

#for column in feature_dd.columns:
#    if feature_dd[column].dtype != 'object':  # Assuming 'object' dtype for strings
#        feature_dd[column] = feature_dd[column].astype('float64')

#feature_df['orf_id'] = feature_df.apply(lambda x: f'{x.chrom_id}_{x.orf_start}_{x.orf_end}_{x.strand}_{x.exon_blocks}', axis=1)
#feature_df.set_index('orf_id', inplace=True)
#feature_df = feature_df.select_dtypes(include='number')
#feature_df = feature_df.groupby('orf_id').mean()

In [8]:
def translate_nt(row):
    """Translate a row's nucleotide ORF sequence into an amino-acid string.

    Parameters
    ----------
    row : object
        Any object exposing an ``orf_sequence`` attribute (e.g. a dataframe row
        passed by ``apply(..., axis=1)``) holding the nucleotide sequence.

    Returns
    -------
    str
        The result of ``Bio.Seq.Seq(...).translate()`` as a plain string, or
        ``''`` when the sequence is missing or cannot be translated.
    """
    try:
        nt_sequence = row.orf_sequence
        seq = str(Seq(nt_sequence).translate())
    except Exception:
        # Best-effort: an untranslatable or missing sequence yields an empty
        # string. Catching Exception (not a bare except) lets KeyboardInterrupt
        # and SystemExit propagate so a long-running apply can still be stopped.
        seq = ''

    return seq

In [9]:
# Add an 'aa' column by applying translate_nt to every row; meta tells Dask the
# resulting column is named 'aa' and has object dtype.
feature_dd['aa'] = feature_dd.apply(
    translate_nt,
    axis=1,
    meta=('aa', 'object'),
)

In [10]:
# Execute the Dask graph and collect the result as an in-memory pandas DataFrame.
feature_df = feature_dd.compute()

In [11]:
# Persist the combined features with pandas' default CSV settings
# (comma-separated, index written as the first column).
# NOTE(review): the index after concatenating several Dask frames is likely
# not unique — confirm it is worth writing out.
feature_df.to_csv('../data/features_all_aa.csv')

In [18]:
#feature_df['length'] = feature_df.apply(lambda x: len(x['orf_sequence']), axis=1)
#feature_df[(feature_df['length'] > 45) & (feature_df['length'] < 451)]

In [12]:
# Load the gzipped big_prot ORF table; its 'orf.aa_seq' column is the join key
# used in the merges below.
bp_df = pd.read_csv(bp_dir.joinpath('orfset_v0.8.1_minlen_15_maxlen_150_orfs.csv.gz'))

In [13]:
# Join features to big_prot ORFs on identical amino-acid strings. merge defaults
# to an inner join, so feature rows without an exact match are dropped.
feature_bp_df = feature_df.merge(bp_df, left_on='aa', right_on='orf.aa_seq')

In [None]:
# Remove rows that are exact duplicates across all columns, keeping the first
# occurrence of each.
feature_bp_df = feature_bp_df.drop_duplicates()

In [15]:
# Write the feature/big_prot join to disk with pandas' default CSV settings
# (comma-separated, index written as the first column).
feature_bp_df.to_csv('../data/features_all_aa-bp.csv')

In [69]:
# Look up each ORF's assembly display name from assembly_ids. Mapping over the
# single id column avoids building a row object per record (as a row-wise
# DataFrame.apply does). A callable is used instead of passing the dict so that
# an unknown assembly id still raises KeyError rather than silently becoming NaN.
bp_df['orf.ucsc_style_name'] = bp_df['orf.assembly_id'].map(
    lambda assembly_id: assembly_ids[assembly_id]
)

In [70]:
# Load a features table from disk, indexed by its 'orf_id' column.
# NOTE(review): this rebinds feature_df, replacing the frame computed from
# feature_dd in an earlier cell — confirm that is intended, or use a distinct name.
feature_df = pd.read_csv('../data/orf_features_all.csv', index_col='orf_id')

In [76]:
# Derive a per-row index string by calling generate_ribo_style_idx_str on each
# row of bp_df.
# NOTE(review): generate_ribo_style_idx_str is not defined in the visible cells
# of this notebook — make sure it is defined or imported before this cell runs.
bp_df['orf.orf_ribo_idx_str'] = bp_df.apply(generate_ribo_style_idx_str, axis=1)

#### Custom lists

In [8]:
# Read the tab-separated negatives list; there is no header row, so columns are
# labelled by position (0, 1, ...).
negatives_lvn_df = pd.read_csv('../data/negatives_to_map.txt', header=None, sep='\t')
# Append '*' to column 1 so its values can match bp_df['orf.aa_seq'] exactly.
# NOTE(review): presumably column 1 holds amino-acid sequences without the
# trailing stop symbol — confirm against the input file.
negatives_lvn_df[1] = negatives_lvn_df[1] + '*'
# Inner join: only entries with an identical sequence in bp_df are kept.
neg_df = pd.merge(negatives_lvn_df, bp_df, left_on=1, right_on='orf.aa_seq')
# index=None is falsy, so the row index is not written to the file.
neg_df.to_csv('../data/negatives_bp-v081_240313.csv', index=None)

In [14]:
# Read the tab-separated list2 file; there is no header row, so columns are
# labelled by position (0, 1, ...).
list2_df = pd.read_csv('../data/list2.txt', header=None, sep='\t')
# Append '*' to column 1 so its values can match bp_df['orf.aa_seq'] exactly.
# NOTE(review): presumably column 1 holds amino-acid sequences without the
# trailing stop symbol — confirm against the input file.
list2_df[1] = list2_df[1] + '*'
# Inner join: only entries with an identical sequence in bp_df are kept.
list2_bp_df = pd.merge(list2_df, bp_df, left_on=1, right_on='orf.aa_seq')
# index=None is falsy, so the row index is not written to the file.
list2_bp_df.to_csv('../data/list2_bp-v081_240313.csv', index=None)

In [20]:
# Read the tab-separated protein list; there is no header row, so columns are
# labelled by position (0, 1, ...).
prot_df = pd.read_csv('../data/prot_to_map.txt', header=None, sep='\t')
# Append '*' to column 1 so its values can match bp_df['orf.aa_seq'] exactly.
# NOTE(review): presumably column 1 holds amino-acid sequences without the
# trailing stop symbol — confirm against the input file.
prot_df[1] = prot_df[1] + '*'
# Inner join: only entries with an identical sequence in bp_df are kept.
prot_bp_df = pd.merge(prot_df, bp_df, left_on=1, right_on='orf.aa_seq')
# index=None is falsy, so the row index is not written to the file.
prot_bp_df.to_csv('../data/prot2map_bp-v081_240313.csv', index=None)