In [None]:
import numpy as np
import pandas as pd
import pywt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit
import pickle

import warnings
from sklearn.exceptions import ConvergenceWarning

# Silence solver non-convergence warnings raised during the hyperparameter search.
warnings.filterwarnings(action="ignore", category=ConvergenceWarning)
# Silence the feature-name mismatch warning for SVC (the message is matched as a prefix regex).
warnings.filterwarnings(
    action="ignore",
    message="X has feature names, but SVC was fitted without feature names",
)

# Load features and labels

In [None]:
# Load the per-step feature matrices (first CSV column is used as the index).
train_features_step_1, train_features_step_2, train_features_step_3 = [
    pd.read_csv(features_csv, index_col=0)
    for features_csv in (
        'data/top_5000_std_step_1_filtered.csv',
        'data/top_5000_std_step_2_filtered.csv',
        'data/top_5000_std_step_3_filtered.csv',
    )
]

# Load the matching per-step annotation tables holding the label columns.
# NOTE(review): features and labels are paired by row position downstream;
# presumably both files list samples in the same order - verify.
training_anno_step_1, training_anno_step_2, training_anno_step_3 = [
    pd.read_csv(anno_csv, index_col=0)
    for anno_csv in (
        'data/step_1_training_anno.csv',
        'data/step_2_training_anno.csv',
        'data/step_3_training_anno.csv',
    )
]


## Logistic Regression

In [None]:
import optuna
from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix
from sklearn.decomposition import PCA
from math import sqrt

# Optuna optimization function using LR
def objective_lr(trial, features, labels, trial_data):
    """Optuna objective: mean validation G-mean of a (PCA +) logistic-regression model.

    Samples hyperparameters from ``trial``, optionally projects ``features``
    with PCA, evaluates a ``LogisticRegression`` over 5 stratified shuffle
    splits (25 % validation each) and returns the G-mean averaged over splits.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        features: DataFrame of input features. ``.to_numpy()`` is called on it
            when ``n_components`` is 0, otherwise it is passed to ``PCA``.
        labels: Array-like of binary labels. Sensitivity and specificity are
            computed with 1 as the positive and 0 as the negative class.
        trial_data: List that receives one dict per completed trial with the
            sampled parameters and the averaged metrics.

    Returns:
        The G-mean (sqrt of sensitivity * specificity) averaged over the splits.

    Raises:
        optuna.TrialPruned: If the sampled solver does not support the sampled
            penalty.
    """
    # Every parameter is suggested before any pruning decision is taken.
    param_grid = {
        "n_components": trial.suggest_categorical("n_components", [3, 5, 7, 10, 13, 15, 20, 25]),
        "solver": trial.suggest_categorical("solver", ["liblinear", "saga", "lbfgs"]),
        "penalty": trial.suggest_categorical("penalty", ["l1", "l2", None]),
        "C": trial.suggest_float("C", 1e-4, 1e2, log=True),  # Inverse regularization strength
        "max_iter": trial.suggest_int("max_iter", 50, 1000, log=True),
        "class_weight": trial.suggest_categorical("class_weight", [None, "balanced"]),
    }

    # --- Solver/penalty compatibility ---
    # liblinear is only run with l1/l2, lbfgs only with l2/None; saga is run
    # with every sampled penalty. These two checks cover all invalid pairs.
    solver = param_grid["solver"]
    penalty = param_grid["penalty"]
    if solver == "liblinear" and penalty not in ["l1", "l2"]:
        raise optuna.TrialPruned()
    if solver == "lbfgs" and penalty not in ["l2", None]:
        raise optuna.TrialPruned()

    # --- Data preparation ---
    training_labels = np.array(labels)
    if param_grid['n_components'] == 0:
        training_features_pca = features.to_numpy()
    else:
        # NOTE(review): PCA is fit on all rows before the CV split, so the
        # validation rows influence the projection used to score them.
        pca_model_lr = PCA(n_components=param_grid['n_components'], random_state=42)
        pca_model_lr.fit(features)
        training_features_pca = pca_model_lr.transform(features)

    # --- Estimator configuration (identical for every fold) ---
    lr_model_params = {
        "solver": solver,
        "penalty": penalty,
        "max_iter": param_grid['max_iter'],
        "class_weight": param_grid['class_weight'],
        "random_state": 42,
        "n_jobs": 1,  # single-threaded fit; parallelism comes from the study
    }
    if penalty is not None:
        # C is only forwarded when a penalty is actually applied.
        lr_model_params["C"] = param_grid['C']

    # --- Cross-validation ---
    train_accuracy = 0
    validation_accuracy = 0
    gmean = 0

    kf = StratifiedShuffleSplit(n_splits=5, test_size=0.25, random_state=42)
    for fold, (train_index, val_index) in enumerate(kf.split(training_features_pca, training_labels)):
        X_train, X_val = training_features_pca[train_index], training_features_pca[val_index]
        y_train, y_val = training_labels[train_index], training_labels[val_index]

        model_fold_lr = LogisticRegression(**lr_model_params)
        model_fold_lr.fit(X_train, y_train)

        train_accuracy += accuracy_score(y_train, model_fold_lr.predict(X_train))

        validation_predictions = model_fold_lr.predict(X_val)
        validation_accuracy += accuracy_score(y_val, validation_predictions)

        try:
            tn, fp, fn, tp = confusion_matrix(y_val, validation_predictions, labels=[0, 1]).ravel()
            # Sensitivity: true-positive rate for class 1.
            sensitivity = tp / (tp + fn) if (tp + fn) != 0 else 0
            # Specificity: true-negative rate for class 0.
            specificity = tn / (tn + fp) if (tn + fp) != 0 else 0
            gmean += sqrt(max(0, sensitivity * specificity))
        except ValueError:
            # The fold contributes 0 to the G-mean sum when the matrix cannot be built.
            print(f"Warning: Could not compute confusion matrix for fold {fold}. Check class distribution in y_val.")

    # --- Average metrics over the splits actually configured ---
    n_splits = kf.get_n_splits()
    avg_training_accuracy = train_accuracy / n_splits
    avg_validation_accuracy = validation_accuracy / n_splits
    optimization_goal = gmean / n_splits

    # --- Log trial data ---
    trial_data.append(
        {
            "n_components": param_grid["n_components"],
            "solver": solver,
            "penalty": penalty,
            "C": param_grid["C"],  # logged even when penalty is None and C was not used
            "max_iter": param_grid["max_iter"],
            "class_weight": param_grid["class_weight"],
            "avg_training_accuracy": avg_training_accuracy,
            "avg_validation_accuracy": avg_validation_accuracy,
            "avg_gmean": optimization_goal,
            "optimization_goal": optimization_goal,
        }
    )

    return optimization_goal


## Support Vector Machine

In [None]:
from sklearn.svm import SVC # Import Support Vector Classifier

# Optuna optimization function using SVM
def objective_svm(trial, features, labels, trial_data):
    """Optuna objective: mean validation G-mean of a (PCA +) SVC model.

    Samples hyperparameters from ``trial``, optionally projects ``features``
    with PCA, evaluates an ``SVC`` over 5 stratified shuffle splits (25 %
    validation each) and returns the G-mean averaged over the splits.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        features: DataFrame of input features. ``.to_numpy()`` is called on it
            when ``n_components`` is 0, otherwise it is passed to ``PCA``.
        labels: Array-like of binary labels. Sensitivity and specificity are
            computed with 1 as the positive and 0 as the negative class.
        trial_data: List that receives one dict per completed trial with the
            sampled parameters and the averaged metrics.

    Returns:
        The G-mean (sqrt of sensitivity * specificity) averaged over the splits.
    """
    param_grid = {
        "n_components": trial.suggest_categorical("n_components", [0, 10, 11, 12, 13, 14, 15, 20, 25]),
        "kernel": trial.suggest_categorical("kernel", ["linear", "rbf", "poly", "sigmoid"]),
        "C": trial.suggest_float("C", 1e-3, 1e3, log=True),  # Regularization parameter
        "probability": trial.suggest_categorical("probability", [False, True]),
        "class_weight": trial.suggest_categorical("class_weight", [None, "balanced"]),
        # gamma_type and degree are always sampled but only used by some kernels.
        "gamma_type": trial.suggest_categorical("gamma_type", ["scale", "auto", "float"]),
        "degree": trial.suggest_int("degree", 2, 5),
    }
    kernel = param_grid["kernel"]
    kernel_uses_gamma = kernel in ["rbf", "poly", "sigmoid"]

    # --- Resolve the gamma actually handed to SVC ---
    actual_gamma = None
    gamma_float_value = None  # numeric gamma, sampled only when gamma_type == "float"
    if kernel_uses_gamma:
        if param_grid["gamma_type"] == "float":
            gamma_float_value = trial.suggest_float("gamma_float", 1e-4, 1e1, log=True)
            actual_gamma = gamma_float_value
        else:
            actual_gamma = param_grid["gamma_type"]  # 'scale' or 'auto'

    # --- Data preparation ---
    training_labels = np.array(labels)
    if param_grid['n_components'] == 0:
        training_features_pca = features.to_numpy()
    else:
        # NOTE(review): PCA is fit on all rows before the CV split, so the
        # validation rows influence the projection used to score them.
        pca_svm_model = PCA(n_components=param_grid['n_components'], random_state=42)
        pca_svm_model.fit(features)
        training_features_pca = pca_svm_model.transform(features)

    # --- Estimator configuration (identical for every fold) ---
    svm_model_params = {
        "C": param_grid['C'],
        "kernel": kernel,
        "probability": param_grid['probability'],
        "class_weight": param_grid['class_weight'],
        "random_state": 42,
    }
    if kernel_uses_gamma:
        svm_model_params["gamma"] = actual_gamma
    if kernel == "poly":
        svm_model_params["degree"] = param_grid['degree']

    # --- Cross-validation ---
    train_accuracy = 0
    validation_accuracy = 0
    gmean = 0

    kf = StratifiedShuffleSplit(n_splits=5, test_size=0.25, random_state=42)
    for fold, (train_index, val_index) in enumerate(kf.split(training_features_pca, training_labels)):
        X_train, X_val = training_features_pca[train_index], training_features_pca[val_index]
        y_train, y_val = training_labels[train_index], training_labels[val_index]

        svm_model = SVC(**svm_model_params)
        svm_model.fit(X_train, y_train)

        train_accuracy += accuracy_score(y_train, svm_model.predict(X_train))

        validation_predictions = svm_model.predict(X_val)
        validation_accuracy += accuracy_score(y_val, validation_predictions)

        try:
            tn, fp, fn, tp = confusion_matrix(y_val, validation_predictions, labels=[0, 1]).ravel()
            # Sensitivity: true-positive rate for class 1.
            sensitivity = tp / (tp + fn) if (tp + fn) != 0 else 0
            # Specificity: true-negative rate for class 0.
            specificity = tn / (tn + fp) if (tn + fp) != 0 else 0
            gmean += sqrt(max(0, sensitivity * specificity))
        except ValueError:
            # The fold contributes 0 to the G-mean sum when the matrix cannot be built.
            print(f"Warning: Could not compute confusion matrix for fold {fold}. Check class distribution in y_val.")

    # --- Average metrics over the splits actually configured ---
    n_splits = kf.get_n_splits()
    avg_training_accuracy = train_accuracy / n_splits
    avg_validation_accuracy = validation_accuracy / n_splits
    optimization_goal = gmean / n_splits

    # --- Log trial data ---
    trial_data.append({
            "n_components": param_grid["n_components"],
            "kernel": kernel,
            "C": param_grid["C"],
            "probability": param_grid["probability"],
            "class_weight": param_grid["class_weight"],
            "gamma_type": param_grid["gamma_type"],
            # The gamma really used by SVC; 0 marks kernels that take no gamma.
            "gamma": actual_gamma if actual_gamma is not None else 0,
            "gamma_float": gamma_float_value,
            "degree": param_grid['degree'],  # only used by the poly kernel
            "avg_training_accuracy": avg_training_accuracy,
            "avg_validation_accuracy": avg_validation_accuracy,
            'avg_gmean': optimization_goal,
            "optimization_goal": optimization_goal
        }
    )

    return optimization_goal


## Train and save the optimal models

In [None]:
def train_model_lr(optimal_params, features, labels):
    """Fit the final logistic-regression model (and its PCA) on all training data.

    Args:
        optimal_params: Dict with the keys ``n_components``, ``solver``,
            ``penalty``, ``max_iter`` and ``class_weight``; ``C`` is optional.
        features: DataFrame of input features.
        labels: Array-like of class labels, one per row of ``features``.

    Returns:
        Tuple ``(model_lr, pca_model_lr)``; ``pca_model_lr`` is ``None`` when
        ``n_components`` is 0.

    Raises:
        ValueError: If ``optimal_params`` is ``None`` (no best trial selected).
    """
    if optimal_params is None:
        raise ValueError("optimal_params is None: no best trial was selected for this model.")

    # --- Data preparation ---
    training_labels = np.array(labels)
    if optimal_params['n_components'] == 0:
        pca_model_lr = None
        training_features_pca = features.to_numpy()
    else:
        pca_model_lr = PCA(n_components=optimal_params['n_components'], random_state=42)
        pca_model_lr.fit(features)
        training_features_pca = pca_model_lr.transform(features)

    lr_model_params = {
        "solver": optimal_params['solver'],
        "penalty": optimal_params['penalty'],
        "max_iter": optimal_params['max_iter'],
        "class_weight": optimal_params['class_weight'],
        "random_state": 42,
        "n_jobs": 1,  # single-threaded fit
    }
    # Forward C only when a penalty is applied and a value is available, the
    # same rule objective_lr uses. C=None is never passed; a missing C leaves
    # the estimator default in place.
    c_value = optimal_params.get('C')
    if optimal_params['penalty'] is not None and c_value is not None:
        lr_model_params["C"] = c_value

    model_lr = LogisticRegression(**lr_model_params)
    model_lr.fit(training_features_pca, training_labels)

    return model_lr, pca_model_lr



In [None]:
def train_model_svm(optimal_params, features, labels):
    """Fit the final SVC model (and its PCA) on all training data.

    Args:
        optimal_params: Dict with the keys ``n_components``, ``kernel``, ``C``,
            ``probability`` and ``class_weight``. ``gamma_type`` is read for
            the rbf/poly/sigmoid kernels, ``gamma_float`` when ``gamma_type``
            is ``"float"``, and ``degree`` for the poly kernel.
        features: DataFrame of input features.
        labels: Array-like of class labels, one per row of ``features``.

    Returns:
        Tuple ``(svm_model, pca_svm_model)``; ``pca_svm_model`` is ``None``
        when ``n_components`` is 0.

    Raises:
        ValueError: If ``optimal_params`` is ``None`` (no best trial selected).
    """
    if optimal_params is None:
        raise ValueError("optimal_params is None: no best trial was selected for this model.")

    kernel = optimal_params["kernel"]
    kernel_uses_gamma = kernel in ["rbf", "poly", "sigmoid"]

    # --- Resolve the gamma handed to SVC ---
    actual_gamma = None
    if kernel_uses_gamma:
        if optimal_params["gamma_type"] == "float":
            # Use the tuned numeric gamma so the final model matches the
            # configuration that was scored during the search.
            actual_gamma = optimal_params.get("gamma_float")
            if actual_gamma is None:
                # Parameter sets without a recorded gamma_float keep the
                # previous fixed value instead of failing.
                warnings.warn(
                    "gamma_type is 'float' but no 'gamma_float' was provided; falling back to gamma=0.001."
                )
                actual_gamma = 0.001
        else:
            actual_gamma = optimal_params["gamma_type"]  # 'scale' or 'auto'

    # --- Data preparation ---
    training_labels = np.array(labels)
    if optimal_params['n_components'] == 0:
        pca_svm_model = None
        training_features_pca = features.to_numpy()
    else:
        pca_svm_model = PCA(n_components=optimal_params['n_components'], random_state=42)
        pca_svm_model.fit(features)
        training_features_pca = pca_svm_model.transform(features)

    svm_model_params = {
        "C": optimal_params['C'],
        "kernel": kernel,
        "probability": optimal_params['probability'],
        "class_weight": optimal_params['class_weight'],
        "random_state": 42,
    }
    # gamma is passed only to kernels that use it, degree only to poly.
    if kernel_uses_gamma:
        svm_model_params["gamma"] = actual_gamma
    if kernel == "poly":
        svm_model_params["degree"] = optimal_params['degree']

    svm_model = SVC(**svm_model_params)
    svm_model.fit(training_features_pca, training_labels)

    return svm_model, pca_svm_model

# Step 1 - Training Models on PMOC

In [None]:
# Hyperparameter tuning for LR for PMOC
trial_data_lr_PMOC_step_1 = []  # filled with one dict per completed trial by objective_lr
# Drop the index once up front instead of copying the feature frame on every trial.
tuning_features_lr_PMOC = train_features_step_1.reset_index(drop=True)
study = optuna.create_study(direction="maximize", study_name="LR PMOC step 1")
func = lambda trial: objective_lr(trial, tuning_features_lr_PMOC, training_anno_step_1['Label_PMOC'], trial_data_lr_PMOC_step_1)
optuna.logging.set_verbosity(optuna.logging.FATAL)
# With n_jobs=16 trials run concurrently, so trial_data entries are not guaranteed to be in trial order.
study.optimize(func, n_trials=1000, show_progress_bar=True, gc_after_trial=True, n_jobs=16)

In [None]:
# Hyperparameter tuning for SVM for PMOC
trial_data_svm_PMOC_step_1 = []  # filled with one dict per completed trial by objective_svm
# Drop the index once up front instead of copying the feature frame on every trial.
tuning_features_svm_PMOC = train_features_step_1.reset_index(drop=True)
study = optuna.create_study(direction="maximize", study_name="SVM PMOC step 1")
func = lambda trial: objective_svm(trial, tuning_features_svm_PMOC, training_anno_step_1['Label_PMOC'], trial_data_svm_PMOC_step_1)
optuna.logging.set_verbosity(optuna.logging.FATAL)
# With n_jobs=16 trials run concurrently, so trial_data entries are not guaranteed to be in trial order.
study.optimize(func, n_trials=1000, show_progress_bar=True, gc_after_trial=True, n_jobs=16)

## Save trial data

In [None]:
# Collect the per-model trial logs of step 1 and persist them as one pickle.
trial_data_step_1 = dict(
    LR_PMOC=trial_data_lr_PMOC_step_1,
    SVM_PMOC=trial_data_svm_PMOC_step_1,
)
with open('data/trials_step_1.pkl', 'wb') as trials_file:
    pickle.dump(trial_data_step_1, trials_file)

In [None]:
# Pick, per model, the logged trial with the highest optimization goal.
optimal_models_params_step_1 = []

for model in [
    'LR_PMOC',
    'SVM_PMOC',
]:
    model_trials = trial_data_step_1[model]
    if not model_trials:
        raise ValueError(f"No completed trials recorded for {model}; cannot select optimal parameters.")

    # max() keeps the first of equally scored entries and still returns a trial
    # when every score is 0, so "params" is never None for the training cell.
    best_trial = max(model_trials, key=lambda logged: logged['optimization_goal'])

    optimal_models_params_step_1.append({"model_name": model, "params": best_trial})


In [None]:
# Refit each step-1 model on the full training set with its selected parameters.
for param in optimal_models_params_step_1:
    model_name = param['model_name']
    best_params = param['params']
    print(f"{model_name}: {best_params}")

    if model_name == 'LR_PMOC':
        fitted_pair = train_model_lr(best_params, train_features_step_1, training_anno_step_1['Label_PMOC'])
        lr_model_PMOC_step_1, pca_model_lr_PMOC_step_1 = fitted_pair
        continue

    if model_name == 'SVM_PMOC':
        fitted_pair = train_model_svm(best_params, train_features_step_1, training_anno_step_1['Label_PMOC'])
        svm_model_PMOC_step_1, pca_model_svm_PMOC_step_1 = fitted_pair


In [None]:
# Save models: each fitted estimator / PCA goes to its own pickle file.
for pickle_path, fitted_object in [
    ('models/lr_model_PMOC.pkl', lr_model_PMOC_step_1),
    ('models/pca_model_lr_PMOC.pkl', pca_model_lr_PMOC_step_1),
    ('models/svm_model_PMOC.pkl', svm_model_PMOC_step_1),
    ('models/pca_model_svm_PMOC.pkl', pca_model_svm_PMOC_step_1),
]:
    with open(pickle_path, 'wb') as f:
        pickle.dump(fitted_object, f)

# Step 2 - Training Models on SMOC

## Load features and labels

## Train models SMOC

In [None]:
# Hyperparameter tuning for LR for SMOC
trial_data_lr_SMOC_step_2 = []  # filled with one dict per completed trial by objective_lr
# Drop the index once up front instead of copying the feature frame on every trial.
tuning_features_lr_SMOC = train_features_step_2.reset_index(drop=True)
study = optuna.create_study(direction="maximize", study_name="LR SMOC step 2")
func = lambda trial: objective_lr(trial, tuning_features_lr_SMOC, training_anno_step_2['Label_SMOC'], trial_data_lr_SMOC_step_2)
optuna.logging.set_verbosity(optuna.logging.FATAL)
# With n_jobs=16 trials run concurrently, so trial_data entries are not guaranteed to be in trial order.
study.optimize(func, n_trials=1000, show_progress_bar=True, gc_after_trial=True, n_jobs=16)

In [None]:
# Hyperparameter tuning for SVM for SMOC
trial_data_svm_SMOC_step_2 = []  # filled with one dict per completed trial by objective_svm
# Drop the index once up front instead of copying the feature frame on every trial.
tuning_features_svm_SMOC = train_features_step_2.reset_index(drop=True)
study = optuna.create_study(direction="maximize", study_name="SVM SMOC step 2")
func = lambda trial: objective_svm(trial, tuning_features_svm_SMOC, training_anno_step_2['Label_SMOC'], trial_data_svm_SMOC_step_2)
optuna.logging.set_verbosity(optuna.logging.FATAL)
# With n_jobs=16 trials run concurrently, so trial_data entries are not guaranteed to be in trial order.
study.optimize(func, n_trials=1000, show_progress_bar=True, gc_after_trial=True, n_jobs=16)

In [None]:
# Collect the per-model trial logs of step 2 and persist them as one pickle.
trial_data_step_2 = dict(
    LR_SMOC=trial_data_lr_SMOC_step_2,
    SVM_SMOC=trial_data_svm_SMOC_step_2,
)
with open('data/trials_step_2.pkl', 'wb') as trials_file:
    pickle.dump(trial_data_step_2, trials_file)

In [None]:
# Pick, per model, the logged trial with the highest optimization goal.
optimal_models_params_step_2 = []

for model in [
    'LR_SMOC',
    'SVM_SMOC',
]:
    model_trials = trial_data_step_2[model]
    if not model_trials:
        raise ValueError(f"No completed trials recorded for {model}; cannot select optimal parameters.")

    # max() keeps the first of equally scored entries and still returns a trial
    # when every score is 0, so "params" is never None for the training cell.
    best_trial = max(model_trials, key=lambda logged: logged['optimization_goal'])

    optimal_models_params_step_2.append({"model_name": model, "params": best_trial})


In [None]:
# Refit each step-2 model on the full training set with its selected parameters.
for param in optimal_models_params_step_2:
    model_name = param['model_name']
    best_params = param['params']
    print(f"{model_name}: {best_params}")

    if model_name == 'LR_SMOC':
        fitted_pair = train_model_lr(best_params, train_features_step_2, training_anno_step_2['Label_SMOC'])
        lr_model_SMOC, pca_model_lr_SMOC = fitted_pair
        continue

    if model_name == 'SVM_SMOC':
        fitted_pair = train_model_svm(best_params, train_features_step_2, training_anno_step_2['Label_SMOC'])
        svm_model_SMOC, pca_model_svm_SMOC = fitted_pair


In [None]:
# Save models: each fitted estimator / PCA goes to its own pickle file.
for pickle_path, fitted_object in [
    ('models/lr_model_SMOC.pkl', lr_model_SMOC),
    ('models/pca_model_lr_SMOC.pkl', pca_model_lr_SMOC),
    ('models/svm_model_SMOC.pkl', svm_model_SMOC),
    ('models/pca_model_svm_SMOC.pkl', pca_model_svm_SMOC),
]:
    with open(pickle_path, 'wb') as f:
        pickle.dump(fitted_object, f)


# Step 3 - SMOC Origin

## Train models SMOC Origin

### STAD

In [None]:
# Hyperparameter tuning for LR for STAD
trial_data_lr_STAD = []  # filled with one dict per completed trial by objective_lr
# Drop the index once up front instead of copying the feature frame on every trial.
tuning_features_lr_STAD = train_features_step_3.reset_index(drop=True)
study = optuna.create_study(direction="maximize", study_name="LR STAD step 3")
func = lambda trial: objective_lr(trial, tuning_features_lr_STAD, training_anno_step_3['Label_STAD'], trial_data_lr_STAD)
optuna.logging.set_verbosity(optuna.logging.FATAL)
# With n_jobs=16 trials run concurrently, so trial_data entries are not guaranteed to be in trial order.
study.optimize(func, n_trials=1000, show_progress_bar=True, gc_after_trial=True, n_jobs=16)

In [None]:
# Hyperparameter tuning for SVM for STAD
trial_data_svm_STAD = []  # filled with one dict per completed trial by objective_svm
# Drop the index once up front instead of copying the feature frame on every trial.
tuning_features_svm_STAD = train_features_step_3.reset_index(drop=True)
study = optuna.create_study(direction="maximize", study_name="SVM STAD step 3")
func = lambda trial: objective_svm(trial, tuning_features_svm_STAD, training_anno_step_3['Label_STAD'], trial_data_svm_STAD)
optuna.logging.set_verbosity(optuna.logging.FATAL)
# With n_jobs=16 trials run concurrently, so trial_data entries are not guaranteed to be in trial order.
study.optimize(func, n_trials=1000, show_progress_bar=True, gc_after_trial=True, n_jobs=16)

### COAD

In [None]:
# Hyperparameter tuning for LR for COAD
trial_data_lr_COAD = []  # filled with one dict per completed trial by objective_lr
# Drop the index once up front instead of copying the feature frame on every trial.
tuning_features_lr_COAD = train_features_step_3.reset_index(drop=True)
study = optuna.create_study(direction="maximize", study_name="LR COAD step 3")
func = lambda trial: objective_lr(trial, tuning_features_lr_COAD, training_anno_step_3['Label_COAD'], trial_data_lr_COAD)
optuna.logging.set_verbosity(optuna.logging.FATAL)
# With n_jobs=16 trials run concurrently, so trial_data entries are not guaranteed to be in trial order.
study.optimize(func, n_trials=1000, show_progress_bar=True, gc_after_trial=True, n_jobs=16)

In [None]:
# Hyperparameter tuning for SVM for COAD
trial_data_svm_COAD = []  # filled with one dict per completed trial by objective_svm
# Drop the index once up front instead of copying the feature frame on every trial.
tuning_features_svm_COAD = train_features_step_3.reset_index(drop=True)
study = optuna.create_study(direction="maximize", study_name="SVM COAD step 3")
func = lambda trial: objective_svm(trial, tuning_features_svm_COAD, training_anno_step_3['Label_COAD'], trial_data_svm_COAD)
optuna.logging.set_verbosity(optuna.logging.FATAL)
# NOTE(review): this study runs 100 trials while every other study in the notebook runs 1000 - confirm this is intended.
# With n_jobs=16 trials run concurrently, so trial_data entries are not guaranteed to be in trial order.
study.optimize(func, n_trials=100, show_progress_bar=True, gc_after_trial=True, n_jobs=16)

## Save and evaluate models

In [None]:
# Collect the per-model trial logs of step 3 and persist them as one pickle.
full_trial_data = dict(
    LR_STAD=trial_data_lr_STAD,
    SVM_STAD=trial_data_svm_STAD,
    LR_COAD=trial_data_lr_COAD,
    SVM_COAD=trial_data_svm_COAD,
)
with open('data/trials_step_3.pkl', 'wb') as trials_file:
    pickle.dump(full_trial_data, trials_file)

In [None]:
# Pick, per model, the logged trial with the highest optimization goal.
optimal_models_params_step_3 = []

for model in [
    'LR_STAD',
    'SVM_STAD',
    'LR_COAD',
    'SVM_COAD',
]:
    model_trials = full_trial_data[model]
    if not model_trials:
        raise ValueError(f"No completed trials recorded for {model}; cannot select optimal parameters.")

    # max() keeps the first of equally scored entries and still returns a trial
    # when every score is 0, so "params" is never None for the training cell.
    best_trial = max(model_trials, key=lambda logged: logged['optimization_goal'])

    optimal_models_params_step_3.append({"model_name": model, "params": best_trial})


In [None]:
# Refit each step-3 model on the full training set with its selected parameters.
for param in optimal_models_params_step_3:
    model_name = param['model_name']
    best_params = param['params']
    print(f"{model_name}: {best_params}")

    if model_name == 'LR_STAD':
        fitted_pair = train_model_lr(best_params, train_features_step_3, training_anno_step_3['Label_STAD'])
        lr_model_STAD, pca_model_lr_STAD = fitted_pair
        continue

    if model_name == 'SVM_STAD':
        fitted_pair = train_model_svm(best_params, train_features_step_3, training_anno_step_3['Label_STAD'])
        svm_model_STAD, pca_model_svm_STAD = fitted_pair
        continue

    if model_name == 'LR_COAD':
        fitted_pair = train_model_lr(best_params, train_features_step_3, training_anno_step_3['Label_COAD'])
        lr_model_COAD, pca_model_lr_COAD = fitted_pair
        continue

    if model_name == 'SVM_COAD':
        fitted_pair = train_model_svm(best_params, train_features_step_3, training_anno_step_3['Label_COAD'])
        svm_model_COAD, pca_model_svm_COAD = fitted_pair


In [None]:
# Save models: each fitted estimator / PCA goes to its own pickle file
# (STAD artifacts first, then COAD).
for pickle_path, fitted_object in [
    ('models/lr_model_STAD.pkl', lr_model_STAD),
    ('models/pca_model_lr_STAD.pkl', pca_model_lr_STAD),
    ('models/svm_model_STAD.pkl', svm_model_STAD),
    ('models/pca_model_svm_STAD.pkl', pca_model_svm_STAD),
    ('models/lr_model_COAD.pkl', lr_model_COAD),
    ('models/pca_model_lr_COAD.pkl', pca_model_lr_COAD),
    ('models/svm_model_COAD.pkl', svm_model_COAD),
    ('models/pca_model_svm_COAD.pkl', pca_model_svm_COAD),
]:
    with open(pickle_path, 'wb') as f:
        pickle.dump(fitted_object, f)
