In [None]:
# Mount Google Drive at /content/drive (Google Colab only); the input CSV and the
# saved .npy arrays in this notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import itertools
import numpy as np
import os
from sklearn.model_selection import train_test_split

In [None]:
# Location of the protein -> synthetic DNA mapping table on the mounted Drive.
MAPPING_CSV = "/content/drive/MyDrive/codone_mapping.csv"
df = pd.read_csv(MAPPING_CSV)

In [None]:
# Preview the first rows to check the columns that were loaded.
df.head()

Unnamed: 0,protein_sequence,synthetic_dna
0,MKRISTTITTTITITTGNGAG,ATGAAGCGAATTTCTACGACAATTACCACAACCATAACTATTACAA...
1,MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNH...,ATGCGGGTTCTAAAGTTCGGTGGAACTTCCGTCGCAAATGCAGAGC...
2,MVKVYAPASSANMSVGFDVLGAAVTPVDGALLGDVVTVEAAETFSL...,ATGGTTAAAGTTTACGCTCCAGCTTCCTCAGCCAATATGTCCGTGG...
3,MKLYNLKDHNEQVSFAQAVTQGLGKNQGLFFPHDLPEFSLTEIDEM...,ATGAAGCTCTACAATTTGAAGGACCACAACGAGCAAGTTAGCTTTG...
4,MKKMQSIVLALSLVLVAPMAAQAAEITLVPSVKLQIGDRDNRGYYW...,ATGAAAAAGATGCAAAGTATAGTTCTAGCACTGTCTCTTGTATTGG...


In [None]:
# Pull the paired protein / DNA columns out of the frame as plain Python lists
# (same row order in both, so index i of one matches index i of the other).
proteins, dnas = (
    df[column].tolist() for column in ("protein_sequence", "synthetic_dna")
)


In [None]:
# Amino-acid vocabulary: the 20 one-letter residue codes take ids 0-19 in the
# order listed below; ids 20 and 21 are the padding and start-of-sequence tokens.
amino_acids = list("ACDEFGHIKLMNPQRSTVWY")
amino_vocab = dict(zip(amino_acids, range(len(amino_acids))))
amino_vocab.update({"<PAD>": 20, "<SOS>": 21})

In [None]:
# DNA alphabet. The order here matters: codons are enumerated from it, so it
# determines which integer id each codon receives.
nucleotides = list("ATCG")

In [None]:
# Every 3-letter combination of the nucleotides, in itertools.product order
# (last position varies fastest).
codons = list(map("".join, itertools.product(nucleotides, repeat=3)))

In [None]:
# Codon vocabulary: each codon is numbered by its position in `codons` (0-63),
# then the padding / start / end special tokens are appended as 64, 65, 66.
codon_vocab = dict(zip(codons, range(len(codons))))
codon_vocab.update({"<PAD>": 64, "<SOS>": 65, "<EOS>": 66})

In [None]:
def tokenize_protein(seq):
    """Translate a protein string into a list of amino-acid token ids.

    Each element of ``seq`` is looked up in ``amino_vocab``. Elements that are
    not keys of the vocabulary are skipped silently, so the returned list can
    be shorter than ``seq``.

    Args:
        seq: Iterable of residue symbols (normally a one-letter-code string).

    Returns:
        list[int]: Token ids in the same order as the recognised residues.
    """
    token_ids = []
    for residue in seq:
        if residue in amino_vocab:
            token_ids.append(amino_vocab[residue])
    return token_ids

In [None]:
def tokenize_dna_to_codons(seq):
    """Convert a DNA string into codon token ids framed by <SOS> and <EOS>.

    The sequence is cut into consecutive, non-overlapping 3-character windows
    starting at index 0. Windows that are not keys of ``codon_vocab`` (for
    example a trailing fragment shorter than three characters, or a window
    containing a character outside the nucleotide alphabet) are skipped.

    Args:
        seq: DNA sequence as a string.

    Returns:
        list[int]: ``[<SOS> id] + codon ids + [<EOS> id]``.
    """
    # Look the framing ids up in the vocabulary instead of hard-coding 65/66,
    # so this function cannot drift out of sync with codon_vocab.
    sos_id = codon_vocab["<SOS>"]
    eos_id = codon_vocab["<EOS>"]

    windows = (seq[start:start + 3] for start in range(0, len(seq), 3))
    codon_ids = [codon_vocab[window] for window in windows if window in codon_vocab]
    return [sos_id] + codon_ids + [eos_id]


In [None]:

# Tokenize every (protein, DNA) pair; X holds protein token ids, Y holds the
# matching codon token ids, index-aligned with each other.
tokenized_pairs = [
    (tokenize_protein(protein), tokenize_dna_to_codons(dna_seq))
    for protein, dna_seq in zip(proteins, dnas)
]
X = [protein_ids for protein_ids, _ in tokenized_pairs]
Y = [codon_ids for _, codon_ids in tokenized_pairs]

In [None]:
# Fixed length (in tokens) that every sequence is padded or truncated to.
MAX_LEN = 1160
def pad_sequence(seq, max_len, pad_val):
    """Return a copy of ``seq`` that is exactly ``max_len`` items long.

    Args:
        seq: List of token ids.
        max_len: Target length.
        pad_val: Value appended to sequences shorter than ``max_len``.

    Returns:
        list: ``seq`` right-padded with ``pad_val`` when it is too short,
        otherwise its first ``max_len`` items (longer input is truncated).
    """
    shortfall = max_len - len(seq)
    if shortfall > 0:
        return seq + [pad_val] * shortfall
    return seq[:max_len]

In [None]:

# Bring every sequence to exactly MAX_LEN tokens, each side using its own
# vocabulary's <PAD> id.
# NOTE(review): targets longer than MAX_LEN are truncated, which cuts off their
# trailing <EOS> token -- confirm MAX_LEN covers the longest sequence.
protein_pad_id = amino_vocab["<PAD>"]
X = [pad_sequence(tokens, MAX_LEN, protein_pad_id) for tokens in X]
codon_pad_id = codon_vocab["<PAD>"]
Y = [pad_sequence(tokens, MAX_LEN, codon_pad_id) for tokens in Y]


In [None]:
# Hold out 20% of the pairs for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Persist the four splits to Drive as .npy arrays (one file per split).
os.makedirs("/content/drive/MyDrive/", exist_ok=True)
for out_path, split in (
    ("/content/drive/MyDrive/X_train_t.npy", X_train),
    ("/content/drive/MyDrive/Y_train_t.npy", Y_train),
    ("/content/drive/MyDrive/X_test_t.npy", X_test),
    ("/content/drive/MyDrive/Y_test_t.npy", Y_test),
):
    np.save(out_path, np.array(split))