In [None]:
# library imports
import pandas as pd
import itertools
from scipy import sparse
import torch_geometric.transforms as T
from rna_struct_utils import codonidx_to_ntsequence, mergeNTGraphToCodonGraph, getRNASS
from graph_processing_utils import RiboDataset

In [None]:
# Codon vocabulary: enumerate all 64 nucleotide triplets (A, T, C, G product
# order) and keep both lookup directions (index -> codon, codon -> index).
id_to_codon = dict(enumerate(map(''.join, itertools.product('ATCG', repeat=3))))
codon_to_id = {codon: idx for idx, codon in id_to_codon.items()}

# locations of the training and testing tables
df_train_path = 'ribogl/src/data/train.csv'
df_test_path = 'ribogl/src/data/test.csv'

# read both splits into DataFrames
df_train = pd.read_csv(df_train_path)
df_test = pd.read_csv(df_test_path)

# derive the nt_sequence column by running codonidx_to_ntsequence over every
# entry of the training 'sequence' column
df_train['nt_sequence'] = df_train['sequence'].apply(codonidx_to_ntsequence)

In [None]:
# 1. generate mRNA secondary structures and convert them to graphs
# 2. convert nucleotide graph into a codon graph
# 3. save the graphs for all the genes


def _codon_ss_graphs(seqs):
    """Build one sparse codon-level secondary-structure graph per sequence.

    Each sequence is passed to ``getRNASS``; the result is collapsed with
    ``mergeNTGraphToCodonGraph`` and stored as a ``scipy.sparse.csr_matrix``.

    Parameters
    ----------
    seqs : list
        Sequences, in the same order as the rows of the DataFrame they came from.

    Returns
    -------
    list of scipy.sparse.csr_matrix
        One matrix per input sequence, in input order.
    """
    graphs = []
    total = len(seqs)
    for i, seq in enumerate(seqs):
        # progress indicator: current index and total count
        print(i, total)
        ss_adj = getRNASS(seq)
        codon_ss_graph = mergeNTGraphToCodonGraph(ss_adj)
        graphs.append(sparse.csr_matrix(codon_ss_graph))
    return graphs


# get sequences
# NOTE(review): the derived 'nt_sequence' column is not used here; getRNASS is
# fed the raw 'sequence' column. Confirm that getRNASS expects this
# representation rather than the nucleotide string.
train_seqs = df_train['sequence'].tolist()
test_seqs = df_test['sequence'].tolist()

# train secondary structures
ss_vecs_sparse_train = _codon_ss_graphs(train_seqs)
df_train['codon_RNA_SS'] = ss_vecs_sparse_train

# test secondary structures
ss_vecs_sparse_test = _codon_ss_graphs(test_seqs)
# Previously the test graphs were computed but never attached to df_test, so
# the saved test pickle was missing the 'codon_RNA_SS' column.
df_test['codon_RNA_SS'] = ss_vecs_sparse_test

# persist both splits, now carrying the same 'codon_RNA_SS' column
df_train.to_pickle('ribogl/src/data/Train_RNA_SS.pkl')
df_test.to_pickle('ribogl/src/data/Test_RNA_SS.pkl')

print("Made mRNA secondary structures for train and test")

In [None]:
# Turn the saved graphs into torch_geometric data objects, once per split.
feature_folder = 'ribogl/src/data/'
data_folder = 'ribogl/src/data/'
out_folder = 'ribogl/src/data/LiverGraphs/'

# every graph gets a random-walk positional encoding of this walk length
random_walk_length = 32
transforms = T.Compose([T.AddRandomWalkPE(walk_length=random_walk_length)])

# Build the train split first, then the test split; `dat` ends up bound to
# the test dataset, as the last one constructed.
for split, banner in (('train', "Train Process"), ('test', "Test Process")):
    print(banner)
    dat = RiboDataset(split, feature_folder, data_folder, transforms, out_folder=out_folder)