### Classical machine learning models

In [1]:
import classicalml as cml
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Load the pre-processed BBBP train and test splits from their CSV files;
# each read_data call is unpacked into a (features, labels) pair.
x_train, y_train = cml.read_data("BBBP_train_processed.csv")
x_test, y_test = cml.read_data("BBBP_test_processed.csv")

# Rescale both feature sets in one call.
# NOTE(review): presumably the scaler is fitted on the train split only -
# confirm in classicalml.scale.
x_train, x_test = cml.scale(x_train, x_test)

In [3]:
# Cast every label to a built-in int, keeping the labels as plain lists
y_train = list(map(int, y_train))
y_test = list(map(int, y_test))

In [None]:
# Classical machine-learning baselines, evaluated one after another while a
# single 8x6 figure is the current matplotlib figure.
methods = ["svm", "lda", "rf", "dt", "lr", "gmm"]

plt.figure(figsize=(8, 6))
for method in methods:
    print("Method: ", method)
    # NOTE(review): presumably trains the model and draws its labelled ROC
    # curve on the current axes - confirm in classicalml.classical.
    cml.classical(x_train, y_train, x_test, y_test, method)

# Dashed black diagonal as the reference line
plt.plot((0, 1), (0, 1), 'k--')
plt.gca().set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='Receiver Operating Characteristic (ROC)',
)
plt.legend(loc='lower right')

# Write the figure to disk before displaying it
plt.savefig("classical_roc.png")
plt.show()

### Deep learning model 

In [None]:
# import keras for deep learning
from tensorflow import keras
from keras.layers import Dense, Dropout
from keras.models import Sequential
import wandb
from wandb.keras import WandbCallback


In [8]:
# Convert the labels and features of both splits to NumPy arrays
y_train, y_test, x_train, x_test = (
    np.array(split) for split in (y_train, y_test, x_train, x_test)
)

In [10]:
# Log in to Weights & Biases (wandb) so that the sweep runs started further
# down in this notebook can be tracked.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mwtguo[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [11]:
# Sweep definition: the search strategy, the metric to optimise and the
# candidate values of every hyper-parameter that is searched.
# With 'grid', every combination is tried: 3 * 2 * 7 * 4 * 4 = 672 runs.
# NOTE(review): 'accuracy' is the metric name given to model.compile in
# train(); confirm whether 'val_accuracy' was intended as the sweep target.
sweep_config = dict(
    method='grid',  # grid, random
    metric=dict(name='accuracy', goal='maximize'),
    parameters={
        name: {'values': candidates}
        for name, candidates in (
            ('epochs', [200]),
            ('batch_size', [8, 16, 32]),
            ('dropout', [0.0]),
            ('hidden_layer', [2]),
            ('dense_units', [32, 64]),
            ('learning_rate', [1e-6, 1e-5, 1e-4, 3e-4, 1e-3, 5e-3, 0.1]),
            ('optimizer', ['adam', 'nadam', 'sgd', 'rmsprop']),
            ('activation', ['relu', 'elu', 'selu', 'softmax']),
        )
    },
)

In [None]:
# Register the sweep described by sweep_config with W&B under the given
# entity and project, and keep the returned id for the wandb.agent call.
sweep_id = wandb.sweep(sweep_config, entity="wtguo", project="bbbp")

In [14]:
# Training entry point: one call trains and tracks one model (one sweep run)
def train(input_size=None):
    """Build, train and track one fully connected binary classifier.

    Hyper-parameters are read from ``wandb.config``; the defaults defined
    below apply to every value the active sweep does not override. The model
    is fitted on the notebook-level ``x_train``/``y_train`` and validated on
    ``x_test``/``y_test``, with ``WandbCallback`` attached to the fit.

    Parameters
    ----------
    input_size : int, optional
        Number of input features. When omitted it is taken from
        ``x_train.shape[1]`` at call time, so the function always matches the
        data that is actually fitted.

    Raises
    ------
    ValueError
        If ``config.optimizer`` is not one of 'sgd', 'rmsprop', 'adam',
        'nadam'.
    """
    if input_size is None:
        input_size = x_train.shape[1]

    # Default values for the hyper-parameters we're going to sweep over
    config_defaults = {
        'epochs': 5,
        'batch_size': 8,
        'weight_decay': 0.0005,
        'learning_rate': 1e-3,
        'activation': 'relu',
        'optimizer': 'nadam',
        'hidden_layer': 16,
        'dense_units': 128,
        'dropout': 0.5,
        'momentum': 0.9,
        'seed': 42
    }

    # The context manager closes the run even if training raises, so a failed
    # run does not stay open when train() is called outside of wandb.agent.
    with wandb.init(config=config_defaults):
        config = wandb.config

        # Apply the configured seed so that runs are repeatable
        keras.utils.set_random_seed(config.seed)

        # Model: an input Dense block, `hidden_layer` further Dense blocks
        # (each followed by Dropout) and a single sigmoid output unit.
        model = Sequential()
        model.add(Dense(units=config.dense_units, activation=config.activation,
                        input_shape=(input_size,)))
        model.add(Dropout(config.dropout))

        for _ in range(config.hidden_layer):
            model.add(Dense(units=config.dense_units, activation=config.activation))
            model.add(Dropout(config.dropout))

        # Output layer for binary classification
        model.add(Dense(1, activation="sigmoid"))

        # Select the optimizer by name
        if config.optimizer == 'sgd':
            optimizer = keras.optimizers.legacy.SGD(
                learning_rate=config.learning_rate, decay=1e-5,
                momentum=config.momentum, nesterov=True)
        elif config.optimizer == 'rmsprop':
            optimizer = keras.optimizers.legacy.RMSprop(
                learning_rate=config.learning_rate, decay=1e-5)
        elif config.optimizer == 'adam':
            optimizer = keras.optimizers.legacy.Adam(
                learning_rate=config.learning_rate, beta_1=0.9, beta_2=0.999,
                clipnorm=1.0)
        elif config.optimizer == 'nadam':
            optimizer = keras.optimizers.legacy.Nadam(
                learning_rate=config.learning_rate, beta_1=0.9, beta_2=0.999,
                clipnorm=1.0)
        else:
            # Without this branch an unknown name left `optimizer` unbound and
            # only surfaced as an UnboundLocalError at compile time.
            raise ValueError(
                "Unsupported optimizer %r; expected one of "
                "'sgd', 'rmsprop', 'adam', 'nadam'" % (config.optimizer,))

        model.compile(loss="binary_crossentropy", optimizer=optimizer,
                      metrics=['accuracy'])

        # NOTE(review): the test split is used as validation data here, so the
        # validation metrics are measured on the test set.
        model.fit(x_train, y_train,
                  batch_size=config.batch_size,
                  epochs=config.epochs,
                  validation_data=(x_test, y_test),
                  callbacks=[WandbCallback(validation_data=(x_test, y_test))])

In [None]:
# Run an agent to execute the sweep: the agent is given the sweep id and the
# train function defined above. No run count is passed.
wandb.agent(sweep_id, train)