<a href="https://colab.research.google.com/github/yanos84/Tutorials/blob/main/codon_translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Dense, MultiHeadAttention, LayerNormalization, Dropout
from tensorflow.keras.models import Model
import numpy as np

# Define DNA codons and their corresponding amino acids
codon_table = {
    'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
    # Add all the codon-to-amino acid mappings here (total 64 codons)
    'TAA': 'Stop', 'TAG': 'Stop', 'TGA': 'Stop'
}
amino_acids = list(set(codon_table.values()))

# Mapping from codons and amino acids to integer indices
codon_to_idx = {codon: i for i, codon in enumerate(codon_table.keys())}
amino_acid_to_idx = {aa: i for i, aa in enumerate(amino_acids)}

# Parameters
num_codons = len(codon_to_idx)
num_amino_acids = len(amino_acids)
embedding_dim = 64
num_heads = 4
ff_dim = 128
max_len = 256

# Positional encoding
class PositionalEncoding(tf.keras.layers.Layer):
    def __init__(self, max_len, d_model):
        super().__init__()
        self.positional_encoding = self.get_positional_encoding(max_len, d_model)

    def get_positional_encoding(self, max_len, d_model):
        positions = np.arange(max_len)[:, np.newaxis]
        dimensions = np.arange(d_model)[np.newaxis, :]
        angle_rates = 1 / np.power(10000, (2 * (dimensions // 2)) / np.float32(d_model))
        angle_rads = positions * angle_rates
        angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
        angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
        return tf.constant(angle_rads, dtype=tf.float32)

    def call(self, inputs):
        return inputs + self.positional_encoding[: tf.shape(inputs)[1], :]

# Transformer block
def transformer_block(inputs, head_size, ff_dim, dropout_rate):
    attention_output = MultiHeadAttention(num_heads=head_size, key_dim=inputs.shape[-1])(inputs, inputs)
    attention_output = Dropout(dropout_rate)(attention_output)
    attention_output = LayerNormalization(epsilon=1e-6)(inputs + attention_output)
    ff_output = Dense(ff_dim, activation="relu")(attention_output)
    ff_output = Dense(inputs.shape[-1])(ff_output)
    ff_output = Dropout(dropout_rate)(ff_output)
    return LayerNormalization(epsilon=1e-6)(attention_output + ff_output)

# Build the model
def build_model(max_len, num_codons, num_amino_acids, embedding_dim, num_heads, ff_dim):
    inputs = Input(shape=(max_len,))
    x = Embedding(input_dim=num_codons, output_dim=embedding_dim)(inputs)
    x = PositionalEncoding(max_len, embedding_dim)(x)

    # Add multiple Transformer blocks
    for _ in range(2):  # Stack 2 transformer blocks
        x = transformer_block(x, num_heads, ff_dim, dropout_rate=0.1)

    outputs = Dense(num_amino_acids, activation="softmax")(x)
    model = Model(inputs, outputs)
    return model

# Compile and summarize the model
model = build_model(max_len, num_codons, num_amino_acids, embedding_dim, num_heads, ff_dim)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
model.summary()


In [7]:
import numpy as np

# Example DNA sequence (length must not exceed `max_len`)
dna_sequence = "ATGTTTCTTAAA"

# Step 1: Split DNA into codons
def split_into_codons(sequence):
    return [sequence[i:i+3] for i in range(0, len(sequence), 3)]

# Step 2: Convert codons to indices
def codons_to_indices(codons, codon_to_idx):
    return [codon_to_idx[codon] for codon in codons if codon in codon_to_idx]

# Step 3: Map model predictions to amino acids
def predictions_to_amino_acids(predictions, amino_acid_to_idx):
    predicted_amino_acids = [list(amino_acid_to_idx.keys())[np.argmax(p)] for p in predictions]
    return predicted_amino_acids

# Preprocess DNA sequence
codons = split_into_codons(dna_sequence)
codon_indices = codons_to_indices(codons, codon_to_idx)

# Pad sequence to `max_len`
padded_indices = np.zeros(max_len, dtype=np.int32)
padded_indices[:len(codon_indices)] = codon_indices

# Predict using the model
predictions = model.predict(np.array([padded_indices]))

# Translate predictions to amino acids
translated_sequence = predictions_to_amino_acids(predictions[0], amino_acid_to_idx)

# Print translation result
print("DNA sequence:", dna_sequence)
print("Codons:", codons)
print("Translated protein sequence:", translated_sequence)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
DNA sequence: ATGTTTCTTAAA
Codons: ['ATG', 'TTT', 'CTT', 'AAA']
Translated protein sequence: ['F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'L', 'F', 'F', 'F', 'F', 'Stop', 'Stop', 'Stop', 'F', 'F', 'F', 'L', 'L', 'L', 'L', 'L', 'F', 'F', 'F', 'F', 'F', 'F', 'L', 'L', 'L', 'L', 'L', 'L', 'F', 'Stop', 'Stop', 'Stop', 'F', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'L', 'L', 'L', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'L', 'L', 'L', 'L', 'Stop', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'L', 'L', 'L', 'L', 'L', 'L', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'L', 'L', 'L', 'L', 'L', 'F', 'F', 'F', 'F', 'F', 'F', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L'