In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import mean_squared_error
from sklearn.neural_network import MLPRegressor
import optuna

In [2]:
df = pd.read_csv('Datasets/sars_cov_ML_dataset.csv')

X = df.drop(columns=['pIC50'])
y = df['pIC50']

selection = VarianceThreshold(threshold=(.8 * (1 - .8)))
X_var = selection.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_var, y, test_size=0.2, random_state=21)

In [3]:
def objective(trial):

    hidden_layer_sizes = trial.suggest_categorical('hidden_layer_sizes', [(50,), (100,), (50, 50), (100, 50), (100, 100)])
    activation = trial.suggest_categorical('activation', ['relu', 'tanh', 'logistic'])
    solver = trial.suggest_categorical('solver', ['adam', 'sgd'])
    alpha = trial.suggest_float('alpha', 1e-5, 1e-1, log=True)
    learning_rate = trial.suggest_categorical('learning_rate', ['constant', 'invscaling', 'adaptive'])
    
    model = MLPRegressor(
        hidden_layer_sizes=hidden_layer_sizes,
        activation=activation,
        solver=solver,
        alpha=alpha,
        learning_rate=learning_rate,
        random_state=42,
        max_iter=2000
    )
    
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    
    mse = mean_squared_error(y_test, y_pred, squared=False)
    
    return mse

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=200)

print("Best hyperparameters: ", study.best_params)
print("Best MSE: ", study.best_value)

[I 2024-06-24 18:15:55,245] A new study created in memory with name: no-name-4f6bbbed-396f-433d-93ba-a50bfb5b3899
[I 2024-06-24 18:15:56,179] Trial 0 finished with value: 0.6769650556467525 and parameters: {'hidden_layer_sizes': (100,), 'activation': 'tanh', 'solver': 'sgd', 'alpha': 0.0002411479802067281, 'learning_rate': 'constant'}. Best is trial 0 with value: 0.6769650556467525.
[I 2024-06-24 18:15:57,005] Trial 1 finished with value: 0.6849395414174472 and parameters: {'hidden_layer_sizes': (100, 100), 'activation': 'tanh', 'solver': 'sgd', 'alpha': 0.00026542688885353605, 'learning_rate': 'adaptive'}. Best is trial 0 with value: 0.6769650556467525.
[I 2024-06-24 18:15:57,180] Trial 2 finished with value: 1.027072804126722 and parameters: {'hidden_layer_sizes': (50,), 'activation': 'logistic', 'solver': 'sgd', 'alpha': 4.243780885091303e-05, 'learning_rate': 'adaptive'}. Best is trial 0 with value: 0.6769650556467525.
[I 2024-06-24 18:15:57,249] Trial 3 finished with value: 1.1147

Best hyperparameters:  {'hidden_layer_sizes': (50,), 'activation': 'relu', 'solver': 'adam', 'alpha': 0.0045045665226120715, 'learning_rate': 'adaptive'}
Best MSE:  0.5656037664723716


In [4]:
best_params = study.best_params

best_model = MLPRegressor(
    hidden_layer_sizes=best_params['hidden_layer_sizes'],
    activation=best_params['activation'],
    solver=best_params['solver'],
    alpha=best_params['alpha'],
    learning_rate=best_params['learning_rate'],
    random_state=42,
    max_iter=2000  # Increase iterations if needed
)

best_model.fit(X_train, y_train)
r2 = best_model.score(X=X_test, y=y_test)
y_pred = best_model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred, squared=False)

print("R2 with best hyperparameters: ", r2)
print("RMSE with best hyperparameters: ", rmse)

R2 with best hyperparameters:  0.7334078843458398
RMSE with best hyperparameters:  0.5656037664723716
