In [None]:
from optuna.visualization import plot_optimization_history, plot_intermediate_values

from qsar.utils import utils

import optuna
import pandas as pd

from sklearn.linear_model import Lasso

In [None]:
# Name of the regression target column shared by every table loaded below.
TARGET_COLUMN = "Log_MP_RATIO"


def _split_features_target(frame):
    """Return ``(X, y)``: all non-target columns of ``frame`` and its target column."""
    feature_mask = frame.columns != TARGET_COLUMN
    return frame.loc[:, feature_mask], frame[TARGET_COLUMN]


# Unfiltered train/test tables for the full, neutral and ionizable subsets.
full_train = pd.read_csv("../../data/full/train/full_train_unfiltered.csv")
full_test = pd.read_csv("../../data/full/test/full_test_unfiltered.csv")

neutral_train = pd.read_csv("../../data/neutral/train/neutral_train_unfiltered.csv")
neutral_test = pd.read_csv("../../data/neutral/test/neutral_test_unfiltered.csv")

ionizable_train = pd.read_csv("../../data/ionizable/train/ionizable_train_unfiltered.csv")
ionizable_test = pd.read_csv("../../data/ionizable/test/ionizable_test_unfiltered.csv")

# TRAIN
X_full_train, y_full_train = _split_features_target(full_train)
X_neutral_train, y_neutral_train = _split_features_target(neutral_train)
X_ionizable_train, y_ionizable_train = _split_features_target(ionizable_train)

# TEST
X_full_test, y_full_test = _split_features_target(full_test)
X_neutral_test, y_neutral_test = _split_features_target(neutral_test)
X_ionizable_test, y_ionizable_test = _split_features_target(ionizable_test)

In [None]:
def objective(trial, data):
    """Optuna objective: score a Lasso configuration on ``data``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``alpha``, ``tol`` and ``selection``.
    data : pandas.DataFrame
        Training table handed to ``utils.Utils``.

    Returns
    -------
    The value returned by ``utils.Utils(data).cross_value_score(clf)``,
    which is what the study optimizes.
    """
    alpha = trial.suggest_float("alpha", 1e-10, 1e10, log=True)

    # The range covers eight orders of magnitude; sampling it linearly would
    # put almost every draw in the top decade, so sample on a log scale
    # (as the other objectives in this notebook already do).
    tol = trial.suggest_float("tol", 1e-10, 1e-2, log=True)

    selection = trial.suggest_categorical("selection", ["cyclic", "random"])

    # random_state is fixed so that selection="random" stays repeatable.
    clf = Lasso(max_iter=1000000, alpha=alpha, random_state=0, tol=tol, selection=selection)

    estimator = utils.Utils(data)
    return estimator.cross_value_score(clf)


# Full

In [None]:
# Baseline on the full dataset: build the CV folds, then score a Lasso that
# only overrides max_iter.
test_utils = utils.Utils(full_train)
test_utils.create_cv_folds(display=True)

baseline_full_lasso = Lasso(max_iter=1000000)
test_utils.display_score(
    baseline_full_lasso, X_full_train, y_full_train, X_full_test, y_full_test
)

In [None]:
def func(trial):
    """Evaluate one trial of the shared objective on the full training table."""
    return objective(trial, full_train)


# Search for the configuration with the highest objective value.
study = optuna.create_study(direction='maximize')
study.optimize(
    func,
    n_trials=1000,
    n_jobs=-1,
    show_progress_bar=True,
)

# Report the winning trial: its score followed by its sampled parameters.
trial = study.best_trial
print(trial.value, trial.params)

In [None]:
# Rebuild the exact estimator configuration that was scored during tuning.
# `objective` fixes max_iter=1000000 and random_state=0; neither is part of
# study.best_params, so they must be supplied again here. Without them the
# evaluated model would use the default iteration cap and, when
# selection="random" wins, an unseeded coordinate order.
tuned_params = {**study.best_params, "max_iter": 1000000, "random_state": 0}

test_utils.display_score(Lasso(**tuned_params), X_full_train, y_full_train, X_full_test, y_full_test)
display(plot_optimization_history(study))

# Fit a separate instance for the predictions and the diagnostic graph.
rr = Lasso(**tuned_params).fit(X_full_train, y_full_train)
y_full_train_pred = rr.predict(X_full_train)
y_full_test_pred = rr.predict(X_full_test)

test_utils.display_graph(rr, X_full_train, X_full_test, y_full_train, y_full_test)

# Ionizable

In [None]:
# Baseline on the ionizable subset: build the CV folds, then score a Lasso
# with default hyper-parameters and a fixed seed.
test_utils = utils.Utils(ionizable_train)
test_utils.create_cv_folds(display=True)

baseline_ionizable_lasso = Lasso(random_state=0)
test_utils.display_score(
    baseline_ionizable_lasso,
    X_ionizable_train,
    y_ionizable_train,
    X_ionizable_test,
    y_ionizable_test,
)

In [None]:
import optuna


def objective(trial):
    """Optuna objective: score a Lasso configuration on ``ionizable_train``.

    NOTE: this redefines ``objective`` with a one-argument signature and so
    shadows the two-argument ``objective(trial, data)`` defined earlier in the
    notebook; re-running the "Full" study cell after this one would fail.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``alpha`` and ``tol``.

    Returns
    -------
    The value returned by ``utils.Utils(ionizable_train).cross_value_score(clf)``.
    """
    alpha = trial.suggest_float("alpha", 1e-10, 1e10, log=True)

    tol = trial.suggest_float("tol", 1e-10, 1e-2, log=True)

    # tol has to be forwarded to the estimator; otherwise the sampled value
    # never influences the score and the "best" tol reported by the study
    # is arbitrary.
    clf = Lasso(max_iter=100000, alpha=alpha, tol=tol, random_state=0)

    estimator = utils.Utils(ionizable_train)
    return estimator.cross_value_score(clf)


# Search for the configuration with the highest objective value on the
# ionizable subset.
study = optuna.create_study(direction='maximize')
study.optimize(
    objective,
    n_trials=1000,
    n_jobs=-1,
    show_progress_bar=True,
)

# Report the winning trial: its score followed by its sampled parameters.
trial = study.best_trial
print(trial.value, trial.params)

In [None]:
# Output recorded from an earlier run of the study:
# 0.3334568442254273 {'alpha': 1.029899802316505, 'tol': 8.492961565138823e-08}
test_utils = utils.Utils(ionizable_train)

# max_iter is fixed to 100000 inside the ionizable objective and is not part
# of study.best_params, so it is passed again here; otherwise the evaluated
# model would fall back to the default iteration cap and differ from the
# configuration that was tuned.
test_utils.display_score(Lasso(**study.best_params, max_iter=100000, random_state=0), X_ionizable_train,
                         y_ionizable_train, X_ionizable_test, y_ionizable_test)
study.best_params

In [None]:
# Spot check of one hand-picked parameter set: fit on the ionizable training
# data and show the estimator's score on that same training data.
manual_params = {'alpha': 14573.566055933752, 'tol': 0.002460006941927946}
manual_lasso = Lasso(**manual_params).fit(X_ionizable_train, y_ionizable_train)
manual_lasso.score(X_ionizable_train, y_ionizable_train)

# Neutral

In [None]:
# Baseline on the neutral subset: build the CV folds, then score a Lasso
# with default hyper-parameters and a fixed seed.
test_utils = utils.Utils(neutral_train)
test_utils.create_cv_folds(display=True)

baseline_neutral_lasso = Lasso(random_state=0)
test_utils.display_score(
    baseline_neutral_lasso,
    X_neutral_train,
    y_neutral_train,
    X_neutral_test,
    y_neutral_test,
)

In [None]:
import optuna


def objective(trial):
    """Optuna objective: score a Lasso configuration on ``neutral_train``.

    NOTE: this redefines ``objective`` with a one-argument signature and so
    shadows the two-argument ``objective(trial, data)`` defined earlier in the
    notebook; re-running the "Full" study cell after this one would fail.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``alpha`` and ``tol``.

    Returns
    -------
    The value returned by ``utils.Utils(neutral_train).cross_value_score(clf)``.
    """
    alpha = trial.suggest_float("alpha", 1e-10, 1e10, log=True)

    tol = trial.suggest_float("tol", 1e-10, 1e-2, log=True)

    # tol has to be forwarded to the estimator; otherwise the sampled value
    # never influences the score and the "best" tol reported by the study
    # is arbitrary.
    clf = Lasso(max_iter=100000, alpha=alpha, tol=tol, random_state=0)

    estimator = utils.Utils(neutral_train)
    return estimator.cross_value_score(clf)


# Search for the configuration with the highest objective value on the
# neutral subset.
study = optuna.create_study(direction='maximize')
study.optimize(
    objective,
    n_trials=1000,
    n_jobs=-1,
    show_progress_bar=True,
)

# Report the winning trial: its score followed by its sampled parameters.
trial = study.best_trial
print(trial.value, trial.params)

In [None]:
# Previously recorded output. NOTE(review): this line is identical to the one
# noted in the ionizable section, so it may be a copy-paste leftover; confirm
# it belongs to the neutral study.
# 0.3334568442254273 {'alpha': 1.029899802316505, 'tol': 8.492961565138823e-08}
test_utils = utils.Utils(neutral_train)

# Same fixed settings as the neutral objective, plus the tuned parameters.
tuned_neutral_lasso = Lasso(**study.best_params, max_iter=100000, random_state=0)
test_utils.display_score(
    tuned_neutral_lasso,
    X_neutral_train,
    y_neutral_train,
    X_neutral_test,
    y_neutral_test,
)
study.best_params

In [None]:
# Optimization history of the most recently run study.
history_figure = plot_optimization_history(study)
display(history_figure)

# NOTE(review): no objective in this notebook calls trial.report(), so this
# plot presumably has no intermediate values to show; confirm it is wanted.
plot_intermediate_values(study)