In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import keras_tuner as kt
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import SimpleRNN, LSTM, GRU, Dense, Embedding, Flatten, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam


In [None]:
def one_hot_encode_sequence(sequence, max_length=120):
    """One-hot encode a DNA sequence into a fixed-size ``(max_length, 4)`` array.

    Columns are ordered A, C, G, T. Sequences longer than ``max_length`` are
    truncated; shorter ones are zero-padded at the end. Any character that is
    not an upper-case ``A``/``C``/``G``/``T`` is left as an all-zero row.

    Args:
        sequence: String of nucleotide characters.
        max_length: Number of positions (rows) in the returned array.

    Returns:
        ``numpy.ndarray`` of shape ``(max_length, 4)`` filled with 0.0 / 1.0.
    """
    mapping = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    encoded = np.zeros((max_length, 4))
    for i, base in enumerate(sequence[:max_length]):
        if base in mapping:
            encoded[i, mapping[base]] = 1
    # Without this return the function yields None and every caller receives
    # an array of None instead of encoded features.
    return encoded


# The data-preparation cells call the encoder as `encode_DNA_sequence`;
# expose that spelling as an alias so both names resolve to the same function.
encode_DNA_sequence = one_hot_encode_sequence

In [None]:
# Load the training table; later cells read its 'Forward_Sequence' and 'Beta' columns.
train_csv_path = '/content/train.csv'
df1 = pd.read_csv(train_csv_path)

In [None]:
# Input dimensions shared by the model-building cells: time steps per sequence
# and features per time step.
sequence_length, embedding_dim = 120, 4

# Encode each forward sequence, then stack the per-sequence arrays into one array.
X = np.array(
    [
        encode_DNA_sequence(forward_seq, sequence_length)
        for forward_seq in df1['Forward_Sequence']
    ]
)
y = df1['Beta'].values

# 80/20 split with a fixed seed so the partition is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
def GRU_model(hyperparameters):
    """Build and compile a GRU binary classifier whose sizes are drawn from a search space.

    Hyperparameters are requested in this order: ``units``, ``dropout``,
    ``recurrent_dropout``, ``dense_units``, ``learning_rate``.

    Args:
        hyperparameters: Object providing ``Int``, ``Float`` and ``Choice``
            methods (this function is handed to ``kt.RandomSearch`` as the
            model builder).

    Returns:
        A compiled ``Sequential`` model (Adam, binary cross-entropy, accuracy).
    """
    hp = hyperparameters

    input_layer = tf.keras.layers.Input(shape=(sequence_length, embedding_dim))

    # Recurrent block: width and both dropout rates are searched.
    recurrent_layer = tf.keras.layers.GRU(
        units=hp.Int('units', min_value=64, max_value=256, step=64),
        activation='relu',
        dropout=hp.Float('dropout', min_value=0.2, max_value=0.4, step=0.1),
        recurrent_dropout=hp.Float('recurrent_dropout', min_value=0.1, max_value=0.3, step=0.1),
    )

    # Fully connected head ending in a single sigmoid unit.
    hidden_layer = Dense(
        units=hp.Int('dense_units', min_value=32, max_value=128, step=32),
        activation='relu',
    )
    output_layer = Dense(1, activation='sigmoid')

    model = Sequential([input_layer, recurrent_layer, hidden_layer, output_layer])

    step_size = hp.Choice('learning_rate', [0.001, 0.0005, 0.0001])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(step_size),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model

# Random search over the GRU_model search space, selecting on validation accuracy.
gru_tuner = kt.RandomSearch(
    GRU_model,
    objective='val_accuracy',
    max_trials=10,
    # Without an explicit project name every tuner in this notebook saves to, and
    # reloads from, the same default project directory, so a later search would
    # pick up this search's finished trials instead of running its own.
    project_name='gru_tuning',
)

# Stop a trial once val_accuracy has not improved for 5 epochs and roll back to
# the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy', patience=5, restore_best_weights=True, verbose=1
)

# NOTE(review): the held-out test split doubles as the validation set here, so the
# selected hyperparameters have already seen the data later used for evaluation.
gru_tuner.search(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=10,
    batch_size=32,
    callbacks=[early_stopping]
)

best_hyperparameters = gru_tuner.get_best_hyperparameters(num_trials=1)[0]
print(f"Best GRU Hyperparameters: {best_hyperparameters.values}")

In [None]:
def LSTM_model(hyperparameters):
    """Build and compile an LSTM binary classifier whose sizes are drawn from a search space.

    Hyperparameters are requested in this order: ``units``, ``dropout``,
    ``recurrent_dropout``, ``dense_units``, ``learning_rate``.

    Args:
        hyperparameters: Object providing ``Int``, ``Float`` and ``Choice``
            methods (this function is handed to ``kt.RandomSearch`` as the
            model builder).

    Returns:
        A compiled ``Sequential`` model (Adam, binary cross-entropy, accuracy).
    """
    hp = hyperparameters

    input_layer = tf.keras.layers.Input(shape=(sequence_length, embedding_dim))

    # Recurrent block: width and both dropout rates are searched.
    recurrent_layer = tf.keras.layers.LSTM(
        units=hp.Int('units', min_value=64, max_value=256, step=64),
        activation='relu',
        dropout=hp.Float('dropout', min_value=0.2, max_value=0.4, step=0.1),
        recurrent_dropout=hp.Float('recurrent_dropout', min_value=0.1, max_value=0.3, step=0.1),
    )

    # Fully connected head ending in a single sigmoid unit.
    hidden_layer = Dense(
        units=hp.Int('dense_units', min_value=32, max_value=128, step=32),
        activation='relu',
    )
    output_layer = Dense(1, activation='sigmoid')

    model = Sequential([input_layer, recurrent_layer, hidden_layer, output_layer])

    step_size = hp.Choice('learning_rate', [0.001, 0.0005, 0.0001])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(step_size),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model

# Random search over the LSTM_model search space, selecting on validation accuracy.
LSTM_tuner = kt.RandomSearch(
    LSTM_model,
    objective='val_accuracy',
    max_trials=10,
    # A dedicated project name keeps this search's saved trials separate from the
    # other tuners in this notebook; with the shared default name an existing
    # project is reloaded and this search would not run its own trials.
    project_name='lstm_tuning',
)

# Stop a trial once val_accuracy has not improved for 5 epochs and roll back to
# the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy', patience=5, restore_best_weights=True, verbose=1
)

# NOTE(review): the held-out test split doubles as the validation set here, so the
# selected hyperparameters have already seen the data later used for evaluation.
LSTM_tuner.search(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=10,
    batch_size=32,
    callbacks=[early_stopping]
)

best_hyperparameters = LSTM_tuner.get_best_hyperparameters(num_trials=1)[0]
print(f"Best LSTM Hyperparameters: {best_hyperparameters.values}")

In [None]:
def SimpleRNN_model(hyperparameters):
    """Build and compile a SimpleRNN binary classifier whose sizes are drawn from a search space.

    Hyperparameters are requested in this order: ``units``, ``dropout``,
    ``recurrent_dropout``, ``dense_units``, ``learning_rate``.

    Args:
        hyperparameters: Object providing ``Int``, ``Float`` and ``Choice``
            methods.

    Returns:
        A compiled ``Sequential`` model (Adam, binary cross-entropy, accuracy).
    """
    hp = hyperparameters

    input_layer = tf.keras.layers.Input(shape=(sequence_length, embedding_dim))

    # Recurrent block: width and both dropout rates are searched.
    recurrent_layer = tf.keras.layers.SimpleRNN(
        units=hp.Int('units', min_value=64, max_value=256, step=64),
        activation='relu',
        dropout=hp.Float('dropout', min_value=0.2, max_value=0.4, step=0.1),
        recurrent_dropout=hp.Float('recurrent_dropout', min_value=0.1, max_value=0.3, step=0.1),
    )

    # Fully connected head ending in a single sigmoid unit.
    hidden_layer = Dense(
        units=hp.Int('dense_units', min_value=32, max_value=128, step=32),
        activation='relu',
    )
    output_layer = Dense(1, activation='sigmoid')

    model = Sequential([input_layer, recurrent_layer, hidden_layer, output_layer])

    step_size = hp.Choice('learning_rate', [0.001, 0.0005, 0.0001])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(step_size),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model

# Random search over the SimpleRNN_model search space. This cell previously passed
# GRU_model, so the "Simple RNN" result was actually a second GRU search.
tuner = kt.RandomSearch(
    SimpleRNN_model,
    objective='val_accuracy',
    max_trials=10,
    # A dedicated project name keeps this search's saved trials separate from the
    # other tuners in this notebook; with the shared default name an existing
    # project is reloaded and this search would not run its own trials.
    project_name='simple_rnn_tuning',
)

# Stop a trial once val_accuracy has not improved for 5 epochs and roll back to
# the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy', patience=5, restore_best_weights=True, verbose=1
)

# NOTE(review): the held-out test split doubles as the validation set here, so the
# selected hyperparameters have already seen the data later used for evaluation.
tuner.search(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=10,
    batch_size=32,
    callbacks=[early_stopping]
)

best_hyperparameters = tuner.get_best_hyperparameters(num_trials=1)[0]
print(f"Best Simple RNN Hyperparameters: {best_hyperparameters.values}")

In [None]:
def SimpleRNN(sequence_length, embedding_dim, learning_rate=0.001, dropout_rate=0.3):
    """Build and compile the fixed-configuration SimpleRNN binary classifier.

    This function's name shadows the ``SimpleRNN`` layer class imported from
    ``tensorflow.keras.layers``. The layer is therefore referenced through
    ``tf.keras.layers`` below: the bare name would resolve to this function,
    call it recursively with layer arguments and raise ``TypeError``.

    Args:
        sequence_length: Number of time steps in each input sequence.
        embedding_dim: Number of features per time step.
        learning_rate: Accepted for interface compatibility but currently
            unused; the optimizer is fixed at 0.0001.
        dropout_rate: Accepted for interface compatibility but currently
            unused; both Dropout layers are fixed at 0.2.

    Returns:
        A compiled ``Sequential`` model (Adam, binary cross-entropy, accuracy).
    """
    model = Sequential([
        tf.keras.layers.Input(shape=(sequence_length, embedding_dim)),
        # Fully qualified on purpose; see the docstring.
        tf.keras.layers.SimpleRNN(192, activation='relu', return_sequences=False),
        Dropout(0.2),
        Dense(128, activation='relu'),
        Dropout(0.2),
        Dense(1, activation='sigmoid')
    ])
    optimizer = Adam(learning_rate=0.0001)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Input dimensions: time steps per sequence and features per time step.
sequence_length = 120
embedding_dim = 4

# Rebuild features/labels and the seeded 80/20 split so this cell can run on its own.
X = np.array([encode_DNA_sequence(seq, sequence_length) for seq in df1['Forward_Sequence']])
y = df1['Beta'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rnn_model = SimpleRNN(sequence_length=sequence_length, embedding_dim=embedding_dim)
history = rnn_model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=20, batch_size=32)

y_pred_probs = rnn_model.predict(X_test)
# The model ends in a single sigmoid unit, so predict() yields one probability per
# row; threshold at 0.5 and flatten to get the class labels the report expects.
# (`y_pred` was previously never assigned, so the report raised NameError.)
y_pred = (y_pred_probs > 0.5).astype(int).ravel()

report = classification_report(y_test, y_pred, target_names=['Unmethylated', 'Methylated'])
print("Classification Report:")
print(report)


Epoch 1/20
[1m727/727[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 14ms/step - accuracy: 0.6925 - loss: 0.6167 - val_accuracy: 0.6895 - val_loss: 0.6067
Epoch 2/20
[1m727/727[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 7ms/step - accuracy: 0.6997 - loss: 0.5973 - val_accuracy: 0.6895 - val_loss: 0.6041
Epoch 3/20
[1m727/727[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 6ms/step - accuracy: 0.6979 - loss: 0.5928 - val_accuracy: 0.6895 - val_loss: 0.6022
Epoch 4/20
[1m727/727[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step - accuracy: 0.6996 - loss: 0.5887 - val_accuracy: 0.6895 - val_loss: 0.6021
Epoch 5/20
[1m727/727[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.7005 - loss: 0.5881 - val_accuracy: 0.6897 - val_loss: 0.6019
Epoch 6/20
[1m727/727[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 10ms/step - accuracy: 0.6985 - loss: 0.5885 - val_accuracy: 0.6893 - val_loss: 0.5997
Epoch 7/20
[1m727/727[

In [None]:
def GRU(sequence_length, embedding_dim, learning_rate=0.001, dropout_rate=0.3):
    """Build and compile the fixed-configuration GRU binary classifier.

    This function's name shadows the ``GRU`` layer class imported from
    ``tensorflow.keras.layers``. The layer is therefore referenced through
    ``tf.keras.layers`` below: the bare name would resolve to this function,
    call it recursively with layer arguments and raise ``TypeError``.

    Args:
        sequence_length: Number of time steps in each input sequence.
        embedding_dim: Number of features per time step.
        learning_rate: Accepted for interface compatibility but currently
            unused; the optimizer is fixed at 0.001.
        dropout_rate: Accepted for interface compatibility but currently
            unused; the Dropout layers are fixed at 0.2 and 0.1.

    Returns:
        A compiled ``Sequential`` model (Adam, binary cross-entropy, accuracy).
    """
    model = Sequential([
        tf.keras.layers.Input(shape=(sequence_length, embedding_dim)),
        # Fully qualified on purpose; see the docstring.
        tf.keras.layers.GRU(192, activation='relu', return_sequences=False),
        Dropout(0.2),
        Dense(128, activation='relu'),
        Dropout(0.1),
        Dense(1, activation='sigmoid')
    ])

    optimizer = Adam(learning_rate=0.001)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Input dimensions: time steps per sequence and features per time step.
sequence_length = 120
embedding_dim = 4

# Rebuild features/labels and the seeded 80/20 split so this cell can run on its own.
X = np.array([encode_DNA_sequence(seq, sequence_length) for seq in df1['Forward_Sequence']])
y = df1['Beta'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

gru_model = GRU(sequence_length=sequence_length, embedding_dim=embedding_dim)
# Fit the GRU model itself. This line previously called rnn_model.fit, which left
# gru_model untrained while the report below scored its predictions.
history = gru_model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=20, batch_size=32)

y_pred_probs = gru_model.predict(X_test)
# The model ends in a single sigmoid unit, so predict() yields one probability per
# row; threshold at 0.5 and flatten to get the class labels the report expects.
# (`y_pred` was previously never assigned in this cell.)
y_pred = (y_pred_probs > 0.5).astype(int).ravel()

report = classification_report(y_test, y_pred, target_names=['Unmethylated', 'Methylated'])
print("Classification Report:")
print(report)


Epoch 1/20
[1m727/727[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 20ms/step - accuracy: 0.6964 - loss: 0.6096 - val_accuracy: 0.6895 - val_loss: 0.6040
Epoch 2/20
[1m727/727[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 25ms/step - accuracy: 0.7002 - loss: 0.5955 - val_accuracy: 0.6895 - val_loss: 0.6014
Epoch 3/20
[1m727/727[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 23ms/step - accuracy: 0.7008 - loss: 0.5905 - val_accuracy: 0.6895 - val_loss: 0.6023
Epoch 4/20
[1m727/727[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 21ms/step - accuracy: 0.6987 - loss: 0.5904 - val_accuracy: 0.6895 - val_loss: 0.6031
Epoch 5/20
[1m727/727[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 21ms/step - accuracy: 0.7003 - loss: 0.5901 - val_accuracy: 0.6895 - val_loss: 0.6025
Epoch 6/20
[1m727/727[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 21ms/step - accuracy: 0.6928 - loss: 0.5930 - val_accuracy: 0.6895 - val_loss: 0.6011
Epoch 7/20
[1m7

In [None]:
def LSTM(sequence_length, embedding_dim, learning_rate=0.001, dropout_rate=0.3):
    """Build and compile the fixed-configuration LSTM binary classifier.

    This function's name shadows the ``LSTM`` layer class imported from
    ``tensorflow.keras.layers``. The layer is therefore referenced through
    ``tf.keras.layers`` below: the bare name would resolve to this function,
    call it recursively with layer arguments and raise ``TypeError``.

    Args:
        sequence_length: Number of time steps in each input sequence.
        embedding_dim: Number of features per time step.
        learning_rate: Accepted for interface compatibility but currently
            unused; the optimizer is fixed at 0.0005.
        dropout_rate: Accepted for interface compatibility but currently
            unused; the Dropout layers are fixed at 0.2 and 0.1.

    Returns:
        A compiled ``Sequential`` model (Adam, binary cross-entropy, accuracy).
    """
    model = Sequential([
        tf.keras.layers.Input(shape=(sequence_length, embedding_dim)),
        # Fully qualified on purpose; see the docstring.
        tf.keras.layers.LSTM(64, activation='relu', return_sequences=False),
        Dropout(0.2),
        Dense(96, activation='relu'),
        Dropout(0.1),
        Dense(1, activation='sigmoid')
    ])

    optimizer = Adam(learning_rate=0.0005)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Input dimensions: time steps per sequence and features per time step.
sequence_length = 120
embedding_dim = 4

# Rebuild features/labels and the seeded 80/20 split so this cell can run on its own.
X = np.array([encode_DNA_sequence(seq, sequence_length) for seq in df1['Forward_Sequence']])
y = df1['Beta'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

lstm_model = LSTM(sequence_length=sequence_length, embedding_dim=embedding_dim)
# Fit the LSTM model itself. This line previously called rnn_model.fit, which left
# lstm_model untrained while the report below scored its predictions.
history = lstm_model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=20, batch_size=32)

y_pred_probs = lstm_model.predict(X_test)
# The model ends in a single sigmoid unit, so predict() yields one probability per
# row; threshold at 0.5 and flatten to get the class labels the report expects.
# (`y_pred` was previously never assigned in this cell.)
y_pred = (y_pred_probs > 0.5).astype(int).ravel()

report = classification_report(y_test, y_pred, target_names=['Unmethylated', 'Methylated'])
print("Classification Report:")
print(report)


Epoch 1/20
[1m727/727[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 6ms/step - accuracy: 0.6947 - loss: 0.6179 - val_accuracy: 0.6895 - val_loss: 0.6092
Epoch 2/20
[1m727/727[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.7021 - loss: 0.5970 - val_accuracy: 0.6895 - val_loss: 0.6068
Epoch 3/20
[1m727/727[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 6ms/step - accuracy: 0.7036 - loss: 0.5921 - val_accuracy: 0.6895 - val_loss: 0.6068
Epoch 4/20
[1m727/727[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.7005 - loss: 0.5958 - val_accuracy: 0.6895 - val_loss: 0.6041
Epoch 5/20
[1m727/727[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.6987 - loss: 0.5969 - val_accuracy: 0.6895 - val_loss: 0.6072
Epoch 6/20
[1m657/727[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 5ms/step - accuracy: 0.6992 - loss: 0.5944

KeyboardInterrupt: 