<a href="https://colab.research.google.com/github/yyyynwa/ToxDL/blob/master/PTP8_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

accuracy of 0.5000 and MCC of 0.0000>>
searching over more parameters, or using different hyperparameter optimization algorithm.>>
find out a better model architecture

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.metrics import accuracy_score, matthews_corrcoef
from hyperopt import hp, tpe, fmin, Trials

amino_acids = 'ACDEFGHIKLMNPQRSTVWYU'
amino_acid_to_index = {acid: index for index, acid in enumerate(amino_acids)}
num_classes = 1

data = pd.read_csv('/content/drive/MyDrive/Data_ToxIBTL/peptide.csv')

peptide_sequences = data['sequence'].tolist()
labels = data['label'].tolist()

def one_hot_encoding(sequence, max_length=1002):
    encoding = np.zeros((max_length, len(amino_acids)))
    for i, amino_acid in enumerate(sequence):
        try:
            encoding[i, amino_acid_to_index[amino_acid]] = 1
        except KeyError:
            encoding[i, amino_acid_to_index['U']] = 1
    return encoding

# 將資料集胜肽進行 one-hot encoding 並 zero-padding
max_sequence_length = max(len(seq) for seq in peptide_sequences)
encoded_sequences = [one_hot_encoding(seq, max_length=1002) for seq in peptide_sequences]

# 將資料集的標籤轉換成 numpy array
y = np.array(labels)

# 切分訓練集和測試集
X_train, X_test, y_train, y_test = train_test_split(encoded_sequences, y, test_size=0.15, random_state=66, stratify=y)

# 將 X_train 和 X_test 轉換為 numpy array
X_train = np.array(X_train)
X_test = np.array(X_test)

# Create the base model with variable hyperparameters
def create_model(params):
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Conv1D(filters=params['filters1'], kernel_size=params['kernel_size1'], activation=params['activation'], input_shape=(None, len(amino_acids))))
    model.add(tf.keras.layers.BatchNormalization())

    # Additional convolutional layers
    model.add(tf.keras.layers.Conv1D(filters=params['filters2'], kernel_size=params['kernel_size2'], activation=params['activation']))
    model.add(tf.keras.layers.BatchNormalization())

    model.add(tf.keras.layers.MaxPooling1D(pool_size=2))

    # Additional convolutional layers
    model.add(tf.keras.layers.Conv1D(filters=params['filters3'], kernel_size=params['kernel_size3'], activation=params['activation']))
    model.add(tf.keras.layers.BatchNormalization())

    model.add(tf.keras.layers.Conv1D(filters=params['filters4'], kernel_size=params['kernel_size4'], activation=params['activation']))
    model.add(tf.keras.layers.BatchNormalization())

    model.add(tf.keras.layers.MaxPooling1D(pool_size=2))
    model.add(tf.keras.layers.GlobalMaxPooling1D())

    model.add(tf.keras.layers.Dense(256, activation=params['activation']))
    model.add(tf.keras.layers.Dropout(params['dropout_rate']))

    model.add(tf.keras.layers.Dense(128, activation=params['activation']))
    model.add(tf.keras.layers.Dropout(params['dropout_rate']))

    model.add(tf.keras.layers.Dense(num_classes, activation='sigmoid'))  # Use sigmoid activation for binary classification

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Define the hyperparameter search space using hyperopt library
param_space = {
    'activation': hp.choice('activation', ['relu', 'sigmoid', 'tanh', 'elu']),
    'dropout_rate': hp.uniform('dropout_rate', 0.2, 0.5),
    'filters1': hp.choice('filters1', [32, 64, 128]),
    'kernel_size1': hp.choice('kernel_size1', [3, 5]),
    'filters2': hp.choice('filters2', [64, 128]),
    'kernel_size2': hp.choice('kernel_size2', [3, 5]),
    'filters3': hp.choice('filters3', [64, 128]),
    'kernel_size3': hp.choice('kernel_size3', [3, 5]),
    'filters4': hp.choice('filters4', [128, 256]),
    'kernel_size4': hp.choice('kernel_size4', [3, 5]),
}

# Objective function for hyperopt optimization
def objective(params):
    model = create_model(params)
    model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=0)
    y_pred_prob = model.predict(X_test)
    y_pred_classes = (y_pred_prob > 0.5).astype(int).flatten()
    accuracy = accuracy_score(y_test, y_pred_classes)
    return -accuracy  # We want to maximize accuracy, so we use a negative sign for minimization

# Bayesian optimization using TPE algorithm
trials = Trials()
best_hyperparams_tpe = fmin(
    fn=objective,
    space=param_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials
)

# Define a mapping function to convert integer choices back to their corresponding values
def map_hyperparams(hyperparams):
    return {
        'activation': ['relu', 'sigmoid', 'tanh', 'elu'][hyperparams['activation']],
        'dropout_rate': hyperparams['dropout_rate'],
        'filters1': [32, 64, 128][hyperparams['filters1']],
        'kernel_size1': [3, 5][hyperparams['kernel_size1']],
        'filters2': [64, 128][hyperparams['filters2']],
        'kernel_size2': [3, 5][hyperparams['kernel_size2']],
        'filters3': [64, 128][hyperparams['filters3']],
        'kernel_size3': [3, 5][hyperparams['kernel_size3']],
        'filters4': [128, 256][hyperparams['filters4']],
        'kernel_size4': [3, 5][hyperparams['kernel_size4']],
    }

# Retrieve the best hyperparameters found by TPE optimization and map them
best_hyperparams_tpe_mapped = map_hyperparams(best_hyperparams_tpe)

# Create and compile the best model based on the mapped hyperparameters
best_model_tpe = create_model(best_hyperparams_tpe_mapped)

# Train the best model on the full training set
best_model_tpe.fit(X_train, y_train, epochs=10, batch_size=32, verbose=0)

# Evaluate the best model on the test set and print performance metrics
y_pred_prob_tpe = best_model_tpe.predict(X_test)
y_pred_classes_tpe = (y_pred_prob_tpe > 0.5).astype(int).flatten()
accuracy_tpe = accuracy_score(y_test, y_pred_classes_tpe)
mcc_tpe = matthews_corrcoef(y_test, y_pred_classes_tpe)

print("Best Hyperparameters (TPE Optimization):")
print(best_hyperparams_tpe_mapped)
print(f"Accuracy: {accuracy_tpe:.4f}")
print(f"MCC: {mcc_tpe:.4f}")


 1/19 [>.............................] - ETA: 5s

 1/19 [>.............................] - ETA: 2s

 1/19 [>.............................] - ETA: 3s

 1/19 [>.............................] - ETA: 4s

 1/19 [>.............................] - ETA: 3s

 1/19 [>.............................] - ETA: 3s

 1/19 [>.............................] - ETA: 4s

 1/19 [>.............................] - ETA: 3s

 1/19 [>.............................] - ETA: 2s

 1/19 [>.............................] - ETA: 4s

 1/19 [>.............................] - ETA: 4s

 1/19 [>.............................] - ETA: 4s

 1/19 [>.............................] - ETA: 4s

 1/19 [>.............................] - ETA: 2s

 1/19 [>.............................] - ETA: 5s

 1/19 [>.............................] - ETA: 3s

 1/19 [>.............................] - ETA: 3s

 1/19 [>.............................] - ETA: 3s

 1/19 [>.............................] - ETA: 2s

 1/19 [>.............................] - ETA: 2s

