# 03 - Train Models WITH Optuna

This notebook trains **8 classification models** with Optuna hyperparameter tuning.

## Models
1. Logistic Regression
2. Ridge Classifier
3. HistGradientBoostingClassifier
4. XGBoost

## Conditions
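- 4 models WITHOUT PCA + Optuna tuning (trained on the TF-IDF features; HistGradientBoosting still applies `TruncatedSVD` inside its pipeline because the TF-IDF matrix is sparse)
- 4 models WITH PCA + Optuna tuning ("PCA" here means `TruncatedSVD` applied to the TF-IDF features)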

In [13]:
# Mount Google Drive at /content/drive (Colab-only); the project folder used by
# the rest of this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
!pip install optuna xgboost lightgbm "mlflow<3"



In [15]:
# Project root on the mounted Drive; the database, .env and model paths used in
# this notebook are all built from it.
base_folder = "/content/drive/MyDrive/Colab Notebooks/drug_review_classification"
# SQLite database file passed to get_dataframe_from_db().
db_path = f"{base_folder}/data/drug_reviews.db"

In [23]:
import os
import time
import numpy as np
import pandas as pd
import sqlite3
import joblib
from dotenv import load_dotenv

import optuna
from optuna.samplers import TPESampler

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

from sklearn.base import clone
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBClassifier

import mlflow

# Reference point for the "Total time" line printed by the results-summary cell.
# NOTE(review): the saved execution count of this cell (23) is higher than that of
# the first training cell (22), so the saved timing may not cover the whole run —
# confirm with Restart Kernel -> Run All.
start_time = time.monotonic()

In [17]:
# Load data from database
def get_dataframe_from_db(db_path):
    """Load the denormalised review table from the SQLite database.

    Joins ``reviews`` with its lookup tables (``drugs``, ``conditions``,
    ``side_effects``, ``effectiveness_levels``) and aliases the columns to the
    camelCase names used by the rest of the notebook. All joins are inner
    joins, so a review without a matching row in any lookup table is dropped.

    Parameters
    ----------
    db_path : str or path-like
        Location of the SQLite database file.

    Returns
    -------
    pandas.DataFrame
        Columns ``urlDrugName``, ``condition``, ``benefitsReview``,
        ``sideEffectsReview``, ``commentsReview``, ``rating``,
        ``sideEffects``, ``effectiveness`` and ``split``.

    Raises
    ------
    Exception
        Whatever ``pd.read_sql`` raises (e.g. missing tables); the connection
        is closed before the error propagates.
    """
    conn = sqlite3.connect(db_path)
    try:
        return pd.read_sql("""
        SELECT
            d.drug_name as urlDrugName,
            c.condition_name as condition,
            r.benefits_review as benefitsReview,
            r.side_effects_review as sideEffectsReview,
            r.comments_review as commentsReview,
            r.rating,
            s.side_effect_name as sideEffects,
            e.effectiveness_name as effectiveness,
            r.split
        FROM reviews r
        JOIN drugs d ON r.drug_id = d.drug_id
        JOIN conditions c ON r.condition_id = c.condition_id
        JOIN side_effects s ON r.side_effect_id = s.side_effect_id
        JOIN effectiveness_levels e ON r.effectiveness_id = e.effectiveness_id
    """, conn)
    finally:
        # Always release the connection, even when the query fails.
        conn.close()

# Pull the joined review table (all splits together) from SQLite.
df = get_dataframe_from_db(db_path)

# Build combined text: the three free-text review fields joined with single
# spaces; missing fields become empty strings and outer whitespace is stripped.
df["combined_text"] = (
    df["benefitsReview"].fillna("") + " " +
    df["sideEffectsReview"].fillna("") + " " +
    df["commentsReview"].fillna("")
).str.strip()

# Remove rows whose combined text is empty so TF-IDF never sees blank documents.
df = df[df["combined_text"].str.len() > 0].copy()

# Split using the assignment stored in the `split` column.
df_train = df[df["split"] == "train"].copy()
df_test  = df[df["split"] == "test"].copy()

# Encode target as integer class ids.
# NOTE: LabelEncoder keeps its classes in sorted order, so the integer codes
# follow the alphabetical order of the labels, not the order listed in
# EFFECTIVENESS_ORDER. transform() raises for any label not in this list.
EFFECTIVENESS_ORDER = ["Ineffective", "Marginally Effective", "Moderately Effective",
                       "Considerably Effective", "Highly Effective"]
le = LabelEncoder()
le.fit(EFFECTIVENESS_ORDER)
y_train = le.transform(df_train["effectiveness"])
y_test  = le.transform(df_test["effectiveness"])

# TF-IDF over unigrams and bigrams with English stop words removed; a term must
# occur in at least 2 training documents and the vocabulary is capped at 2000.
# The vectorizer is fitted on the training split only; the test split is only
# transformed.
# NOTE(review): the saved output of this cell reports just 17 TF-IDF features
# for 2400 training rows — far below max_features. Verify that the text columns
# loaded from the database actually contain the review text.
tfidf = TfidfVectorizer(
    max_features=2000,
    ngram_range=(1, 2),
    stop_words="english",
    min_df=2
)
X_train_tfidf = tfidf.fit_transform(df_train["combined_text"])
X_test_tfidf  = tfidf.transform(df_test["combined_text"])

# Choose the number of SVD components: at most 100, at most n_features - 1,
# and never fewer than 2.
n_features = X_train_tfidf.shape[1]
n_comp = min(100, n_features - 1)   # must be <= n_features-1
n_comp = max(2, n_comp)             # at least 2

# Despite the `pca` name (and the "PCA" wording used throughout the notebook),
# the reducer is TruncatedSVD, fitted on the training split only.
pca = TruncatedSVD(n_components=n_comp, random_state=42)
X_train_pca = pca.fit_transform(X_train_tfidf)
X_test_pca  = pca.transform(X_test_tfidf)

# Report split sizes and feature dimensions for a quick sanity check.
print(f"Train: {len(df_train)}, Test: {len(df_test)}")
print(f"TF-IDF features: {n_features} | Using n_components: {n_comp}")
print(f"TF-IDF shape: {X_train_tfidf.shape}, PCA shape: {X_train_pca.shape}")


Train: 2400, Test: 600
TF-IDF features: 17 | Using n_components: 16
TF-IDF shape: (2400, 17), PCA shape: (2400, 16)


  self.explained_variance_ratio_ = exp_var / full_var


In [18]:
# Configure MLflow
# Values from the project's .env file take precedence over anything already in
# the process environment (override=True).
load_dotenv(dotenv_path=f"{base_folder}/notebooks/.env", override=True)

MLFLOW_TRACKING_URI = os.environ.get("MLFLOW_TRACKING_URI")
MLFLOW_TRACKING_USERNAME = os.environ.get("MLFLOW_TRACKING_USERNAME")
MLFLOW_TRACKING_PASSWORD = os.environ.get("MLFLOW_TRACKING_PASSWORD")

# Export whichever credentials are present (non-empty) into the environment.
os.environ.update({
    env_key: env_value
    for env_key, env_value in (
        ("MLFLOW_TRACKING_USERNAME", MLFLOW_TRACKING_USERNAME),
        ("MLFLOW_TRACKING_PASSWORD", MLFLOW_TRACKING_PASSWORD),
    )
    if env_value
})

# Only point MLflow at a tracking server when a URI was configured; otherwise
# the tracking location is left untouched.
if MLFLOW_TRACKING_URI:
    mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment("drug_review_classification")

<Experiment: artifact_location='file:///content/mlruns/611218757046066798', creation_time=1766149912356, experiment_id='611218757046066798', last_update_time=1766149912356, lifecycle_stage='active', name='drug_review_classification', tags={}>

In [19]:
# Optuna objective functions (NO PCA)
def objective_logistic(trial, X, y):
    """Optuna objective for logistic regression.

    Samples ``C`` (log-uniform on [0.01, 10.0]) and ``solver`` (``lbfgs`` or
    ``saga``) from ``trial`` and returns the mean macro-F1 of a 3-fold
    cross-validation on ``(X, y)``.
    """
    sampled = {
        'C': trial.suggest_float('C', 0.01, 10.0, log=True),
        'solver': trial.suggest_categorical('solver', ['lbfgs', 'saga']),
    }
    model = LogisticRegression(max_iter=1000, random_state=42, n_jobs=-1, **sampled)
    fold_f1 = cross_val_score(model, X, y, cv=3, scoring='f1_macro', n_jobs=-1)
    return fold_f1.mean()

def objective_ridge(trial, X, y):
    """Optuna objective for the ridge classifier.

    Samples ``alpha`` (log-uniform on [0.01, 100.0]) from ``trial`` and returns
    the mean macro-F1 of a 3-fold cross-validation on ``(X, y)``.
    """
    ridge_alpha = trial.suggest_float('alpha', 0.01, 100.0, log=True)
    fold_f1 = cross_val_score(
        RidgeClassifier(alpha=ridge_alpha, random_state=42),
        X, y, cv=3, scoring='f1_macro', n_jobs=-1,
    )
    return fold_f1.mean()

def objective_hgb(trial, X, y):
    """Optuna objective for HistGradientBoostingClassifier fitted directly on ``X``.

    Samples ``learning_rate`` in [0.01, 0.3], ``max_depth`` in [3, 10] and
    ``max_iter`` in [100, 300], then returns the mean macro-F1 of a 3-fold
    cross-validation on ``(X, y)``.

    NOTE: the "Train with Optuna (NO PCA)" cell defines another function named
    ``objective_hgb``; once that cell has run, this definition is replaced.
    """
    hgb_kwargs = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
        max_depth=trial.suggest_int('max_depth', 3, 10),
        max_iter=trial.suggest_int('max_iter', 100, 300),
    )
    model = HistGradientBoostingClassifier(random_state=42, **hgb_kwargs)
    fold_f1 = cross_val_score(model, X, y, cv=3, scoring='f1_macro', n_jobs=-1)
    return fold_f1.mean()

def objective_xgb(trial, X, y):
    """Optuna objective for XGBoost (multi-class soft-probability objective).

    Samples ``learning_rate`` in [0.01, 0.3], ``max_depth`` in [3, 10] and
    ``n_estimators`` in [100, 300] from ``trial`` and returns the mean macro-F1
    of a 3-fold cross-validation on ``(X, y)``.

    ``use_label_encoder`` is intentionally not passed: the installed XGBoost
    reports it as an unused parameter and emits a warning on every fit.
    """
    lr = trial.suggest_float('learning_rate', 0.01, 0.3)
    max_depth = trial.suggest_int('max_depth', 3, 10)
    n_estimators = trial.suggest_int('n_estimators', 100, 300)
    clf = XGBClassifier(
        learning_rate=lr,
        max_depth=max_depth,
        n_estimators=n_estimators,
        objective='multi:softprob',
        eval_metric='mlogloss',
        random_state=42,
        n_jobs=-1,
    )
    scores = cross_val_score(clf, X, y, cv=3, scoring='f1_macro', n_jobs=-1)
    return scores.mean()

In [22]:
# Train with Optuna (NO PCA)
# Section banner: rule, title, rule — one per line.
print("=" * 80, "TRAINING WITH OPTUNA (NO PCA)", "=" * 80, sep="\n")

def objective_hgb(trial, X, y):
    """Optuna objective for a TruncatedSVD -> HistGradientBoosting pipeline.

    The number of SVD components is tuned between 2 and
    ``max(2, min(300, n_features - 1))`` together with the booster's
    ``learning_rate`` (log scale), ``max_depth``, ``max_iter``,
    ``min_samples_leaf`` and ``l2_regularization``. Returns the mean macro-F1
    over a shuffled, seeded 3-fold stratified cross-validation as a ``float``.

    This definition replaces the earlier ``objective_hgb`` from the objectives
    cell.
    """
    # Upper bound for n_components: capped at 300 and n_features - 1, floor of 2.
    upper = min(300, X.shape[1] - 1)
    if upper < 2:
        upper = 2
    svd_dim = trial.suggest_int("n_components", 2, upper)

    # Sampling order matters for the seeded sampler, so keep it fixed.
    booster_kwargs = dict(
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        max_depth=trial.suggest_int("max_depth", 3, 12),
        max_iter=trial.suggest_int("max_iter", 100, 400),
        min_samples_leaf=trial.suggest_int("min_samples_leaf", 10, 100),
        l2_regularization=trial.suggest_float("l2_regularization", 0.0, 1.0),
    )

    reducer = TruncatedSVD(n_components=svd_dim, random_state=42)
    booster = HistGradientBoostingClassifier(random_state=42, **booster_kwargs)
    model = Pipeline([("svd", reducer), ("hgb", booster)])

    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    fold_f1 = cross_val_score(model, X, y, cv=folds, scoring="f1_macro", n_jobs=-1)
    return float(np.mean(fold_f1))


# Objective function per model family for the run on the TF-IDF features.
objectives = {
    'logistic': objective_logistic,
    'ridge': objective_ridge,
    'histgradientboosting': objective_hgb,
    'xgboost': objective_xgb
}

# Per-run record (scores, fitted estimator, tuned params), keyed by run name.
# Re-running this cell resets it, which also drops any WITH-PCA entries.
results = {}

for name, obj_func in objectives.items():
    print(f"\nOptimizing {name}...")

    # Same seeded TPE sampler for every family so the search is repeatable.
    study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=42))
    # catch=(Exception,) lets the study continue when a single trial raises.
    # If every trial fails, the best_params access below raises instead.
    study.optimize(lambda trial: obj_func(trial, X_train_tfidf, y_train),
                   n_trials=10, show_progress_bar=True, catch=(Exception,))

    tuned_params = dict(study.best_params)   # untouched copy for reporting/logging
    best_params = dict(tuned_params)         # working copy so we can safely pop
    cv_f1 = float(study.best_value)
    print(f"Best params: {best_params}")
    print(f"Best CV F1: {cv_f1:.4f}")

    # Train final model (NO PCA)
    if name == 'logistic':
        clf = LogisticRegression(**best_params, max_iter=1000, random_state=42, n_jobs=-1)

    elif name == 'ridge':
        clf = RidgeClassifier(**best_params, random_state=42)

    elif name == 'histgradientboosting':
        # SVD + HGB even in the "NO PCA" run because TF-IDF is sparse;
        # n_components belongs to the SVD step, the rest to the booster.
        n_components = best_params.pop("n_components")
        clf = Pipeline([
            ("svd", TruncatedSVD(n_components=n_components, random_state=42)),
            ("hgb", HistGradientBoostingClassifier(random_state=42, **best_params))
        ])

    elif name == 'xgboost':
        # use_label_encoder is not passed: the installed XGBoost reports it as
        # an unused parameter and warns on every fit.
        clf = XGBClassifier(
            **best_params,
            objective='multi:softprob',
            eval_metric='mlogloss',
            random_state=42,
            n_jobs=-1
        )

    else:
        # Without this branch an unknown family would silently reuse the
        # estimator left over from the previous loop iteration.
        raise ValueError(f"No final-model recipe for model family {name!r}")

    # Refit on the full training split and score once on the held-out split.
    clf.fit(X_train_tfidf, y_train)
    y_pred = clf.predict(X_test_tfidf)
    test_f1 = f1_score(y_test, y_pred, average='macro')
    print(f"Test F1: {test_f1:.4f}")

    results[f"{name}_optuna"] = {
        'cv_f1': cv_f1,
        'test_f1': test_f1,
        'uses_pca': False,
        'is_tuned': True,
        'model': clf,
        'params': dict(tuned_params)
    }

    # One MLflow run per model family; the fitted model is also registered.
    with mlflow.start_run(run_name=f"{name}_optuna"):
        mlflow.log_param("model_family", name)
        mlflow.log_param("uses_pca", False)
        mlflow.log_param("is_tuned", True)
        mlflow.log_params(dict(tuned_params))
        mlflow.log_metric("cv_f1", cv_f1)
        mlflow.log_metric("test_f1", test_f1)

        # Provide an input_example (two dense rows) to avoid the MLflow
        # signature warning.
        mlflow.sklearn.log_model(
            clf,
            artifact_path="model",
            registered_model_name=f"{name}_pipeline_optuna",
            input_example=X_train_tfidf[:2].toarray()
        )


[I 2025-12-19 14:35:42,790] A new study created in memory with name: no-name-a900a824-a817-45b8-b18a-c3cf26f25060


TRAINING WITH OPTUNA (NO PCA)

Optimizing logistic...


  0%|          | 0/10 [00:00<?, ?it/s]

[I 2025-12-19 14:35:42,962] Trial 0 finished with value: 0.09338863113452328 and parameters: {'C': 0.13292918943162169, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.09338863113452328.
[I 2025-12-19 14:35:43,121] Trial 1 finished with value: 0.09338863113452328 and parameters: {'C': 0.6251373574521749, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.09338863113452328.
[I 2025-12-19 14:35:43,272] Trial 2 finished with value: 0.09338863113452328 and parameters: {'C': 0.014936568554617643, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.09338863113452328.
[I 2025-12-19 14:35:49,166] Trial 3 finished with value: 0.09338863113452328 and parameters: {'C': 1.3311216080736887, 'solver': 'saga'}. Best is trial 0 with value: 0.09338863113452328.
[I 2025-12-19 14:35:49,243] Trial 4 finished with value: 0.09338863113452328 and parameters: {'C': 3.142880890840109, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.09338863113452328.
[I 2025-12-19 14:35:53,579] Trial 5 finished with value: 0.

Registered model 'logistic_pipeline_optuna' already exists. Creating a new version of this model...
Created version '3' of model 'logistic_pipeline_optuna'.
[I 2025-12-19 14:36:05,511] A new study created in memory with name: no-name-9b178e48-b670-4e7e-abc8-34bb1eadd099



Optimizing ridge...


  0%|          | 0/10 [00:00<?, ?it/s]

[I 2025-12-19 14:36:05,576] Trial 0 finished with value: 0.09338863113452328 and parameters: {'alpha': 0.31489116479568624}. Best is trial 0 with value: 0.09338863113452328.
[I 2025-12-19 14:36:05,615] Trial 1 finished with value: 0.09338863113452328 and parameters: {'alpha': 63.512210106407046}. Best is trial 0 with value: 0.09338863113452328.
[I 2025-12-19 14:36:05,662] Trial 2 finished with value: 0.09338863113452328 and parameters: {'alpha': 8.471801418819979}. Best is trial 0 with value: 0.09338863113452328.
[I 2025-12-19 14:36:05,700] Trial 3 finished with value: 0.09338863113452328 and parameters: {'alpha': 2.481040974867813}. Best is trial 0 with value: 0.09338863113452328.
[I 2025-12-19 14:36:05,741] Trial 4 finished with value: 0.09338863113452328 and parameters: {'alpha': 0.04207988669606638}. Best is trial 0 with value: 0.09338863113452328.
[I 2025-12-19 14:36:05,787] Trial 5 finished with value: 0.09338863113452328 and parameters: {'alpha': 0.042070539502879395}. Best is t

Registered model 'ridge_pipeline_optuna' already exists. Creating a new version of this model...
Created version '3' of model 'ridge_pipeline_optuna'.
[I 2025-12-19 14:36:11,800] A new study created in memory with name: no-name-c70185a8-54e0-4d4d-b988-30a0dcac99e7



Optimizing histgradientboosting...


  0%|          | 0/10 [00:00<?, ?it/s]

[I 2025-12-19 14:36:12,817] Trial 0 finished with value: 0.09338863113452328 and parameters: {'n_components': 7, 'learning_rate': 0.2536999076681772, 'max_depth': 10, 'max_iter': 280, 'min_samples_leaf': 24, 'l2_regularization': 0.15599452033620265}. Best is trial 0 with value: 0.09338863113452328.
[I 2025-12-19 14:36:13,680] Trial 1 finished with value: 0.09338863113452328 and parameters: {'n_components': 2, 'learning_rate': 0.19030368381735815, 'max_depth': 9, 'max_iter': 313, 'min_samples_leaf': 11, 'l2_regularization': 0.9699098521619943}. Best is trial 0 with value: 0.09338863113452328.
[I 2025-12-19 14:36:14,270] Trial 2 finished with value: 0.09338863113452328 and parameters: {'n_components': 14, 'learning_rate': 0.020589728197687916, 'max_depth': 4, 'max_iter': 155, 'min_samples_leaf': 37, 'l2_regularization': 0.5247564316322378}. Best is trial 0 with value: 0.09338863113452328.
[I 2025-12-19 14:36:14,775] Trial 3 finished with value: 0.09338863113452328 and parameters: {'n_com

  self.explained_variance_ratio_ = exp_var / full_var


Test F1: 0.0842


Successfully registered model 'histgradientboosting_pipeline_optuna'.
Created version '1' of model 'histgradientboosting_pipeline_optuna'.
[I 2025-12-19 14:36:26,736] A new study created in memory with name: no-name-52e391da-fcf8-484b-9615-7ce17c400119



Optimizing xgboost...


  0%|          | 0/10 [00:00<?, ?it/s]

[I 2025-12-19 14:36:27,326] Trial 0 finished with value: 0.09338863113452328 and parameters: {'learning_rate': 0.11861663446573512, 'max_depth': 10, 'n_estimators': 247}. Best is trial 0 with value: 0.09338863113452328.
[I 2025-12-19 14:36:27,598] Trial 1 finished with value: 0.09338863113452328 and parameters: {'learning_rate': 0.18361096041714062, 'max_depth': 4, 'n_estimators': 131}. Best is trial 0 with value: 0.09338863113452328.
[I 2025-12-19 14:36:27,996] Trial 2 finished with value: 0.09338863113452328 and parameters: {'learning_rate': 0.026844247528777843, 'max_depth': 9, 'n_estimators': 220}. Best is trial 0 with value: 0.09338863113452328.
[I 2025-12-19 14:36:28,517] Trial 3 finished with value: 0.09338863113452328 and parameters: {'learning_rate': 0.21534104756085318, 'max_depth': 3, 'n_estimators': 294}. Best is trial 0 with value: 0.09338863113452328.
[I 2025-12-19 14:36:28,774] Trial 4 finished with value: 0.09338863113452328 and parameters: {'learning_rate': 0.251408365

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Test F1: 0.0842


Successfully registered model 'xgboost_pipeline_optuna'.
Created version '1' of model 'xgboost_pipeline_optuna'.


In [24]:
# Train with Optuna (WITH PCA)
# Section banner: blank line, rule, title, rule — one per line.
print("", "=" * 80, "TRAINING WITH OPTUNA (WITH PCA)", "=" * 80, sep="\n")

def objective_hgb_pca(trial, X, y):
    """Optuna objective for HistGradientBoostingClassifier fitted directly on ``X``.

    Tunes ``learning_rate`` (log scale), ``max_depth``, ``max_iter``,
    ``min_samples_leaf`` and ``l2_regularization``; no dimensionality
    reduction is applied inside the objective. Returns the mean macro-F1 over
    a shuffled, seeded 3-fold stratified cross-validation as a ``float``.
    """
    # Sampling order matters for the seeded sampler, so keep it fixed.
    booster_kwargs = dict(
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        max_depth=trial.suggest_int("max_depth", 3, 12),
        max_iter=trial.suggest_int("max_iter", 100, 400),
        min_samples_leaf=trial.suggest_int("min_samples_leaf", 10, 100),
        l2_regularization=trial.suggest_float("l2_regularization", 0.0, 1.0),
    )
    booster = HistGradientBoostingClassifier(random_state=42, **booster_kwargs)
    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    fold_f1 = cross_val_score(booster, X, y, cv=folds, scoring="f1_macro", n_jobs=-1)
    return float(np.mean(fold_f1))

# Build objectives dict for PCA run (use HGB_PCA objective here: the inputs are
# already SVD-reduced, so no SVD step is tuned inside the objective).
objectives_pca = {
    "logistic": objective_logistic,
    "ridge": objective_ridge,
    "histgradientboosting": objective_hgb_pca,
    "xgboost": objective_xgb
}

# Entries are added to the `results` dict created by the NO-PCA training cell,
# so that cell must have been run first.
for name, obj_func in objectives_pca.items():
    print(f"\nOptimizing {name} with PCA...")

    # Same seeded TPE sampler for every family so the search is repeatable.
    study = optuna.create_study(direction="maximize", sampler=TPESampler(seed=42))
    # catch=(Exception,) lets the study continue when a single trial raises.
    # If every trial fails, the best_params access below raises instead.
    study.optimize(lambda trial: obj_func(trial, X_train_pca, y_train),
                   n_trials=10, show_progress_bar=True, catch=(Exception,))

    best_params = dict(study.best_params)
    cv_f1 = float(study.best_value)
    print(f"Best params: {best_params}")
    print(f"Best CV F1: {cv_f1:.4f}")

    # Train final model (WITH PCA)
    if name == "logistic":
        clf = LogisticRegression(**best_params, max_iter=1000, random_state=42, n_jobs=-1)

    elif name == "ridge":
        clf = RidgeClassifier(**best_params, random_state=42)

    elif name == "histgradientboosting":
        clf = HistGradientBoostingClassifier(**best_params, random_state=42)

    elif name == "xgboost":
        # use_label_encoder is not passed: the installed XGBoost reports it as
        # an unused parameter and warns on every fit.
        clf = XGBClassifier(
            **best_params,
            objective="multi:softprob",
            eval_metric="mlogloss",
            random_state=42,
            n_jobs=-1
        )

    else:
        # Without this branch an unknown family would silently reuse the
        # estimator left over from the previous loop iteration.
        raise ValueError(f"No final-model recipe for model family {name!r}")

    # Refit on the full (SVD-reduced) training split and score once on the
    # held-out split.
    clf.fit(X_train_pca, y_train)
    y_pred = clf.predict(X_test_pca)
    test_f1 = f1_score(y_test, y_pred, average="macro")
    print(f"Test F1: {test_f1:.4f}")

    results[f"{name}_pca_optuna"] = {
        "cv_f1": cv_f1,
        "test_f1": test_f1,
        "uses_pca": True,
        "is_tuned": True,
        "model": clf,
        "params": dict(study.best_params)
    }

    # One MLflow run per model family; the fitted model is also registered.
    # NOTE: the logged model expects SVD-reduced inputs — the TF-IDF and SVD
    # transformers are not part of the logged artifact.
    with mlflow.start_run(run_name=f"{name}_with_pca_optuna"):
        mlflow.log_param("model_family", name)
        mlflow.log_param("uses_pca", True)
        mlflow.log_param("is_tuned", True)
        mlflow.log_params(dict(study.best_params))
        mlflow.log_metric("cv_f1", cv_f1)
        mlflow.log_metric("test_f1", test_f1)
        mlflow.sklearn.log_model(
            clf,
            artifact_path="model",
            registered_model_name=f"{name}_pipeline_with_pca_optuna",
            input_example=X_train_pca[:2]
        )

[I 2025-12-19 14:42:57,659] A new study created in memory with name: no-name-0bf33dcb-0c74-4528-855e-5da65029705f



TRAINING WITH OPTUNA (WITH PCA)

Optimizing logistic with PCA...


  0%|          | 0/10 [00:00<?, ?it/s]

[I 2025-12-19 14:42:59,912] Trial 0 finished with value: 0.09338863113452328 and parameters: {'C': 0.13292918943162169, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.09338863113452328.
[I 2025-12-19 14:42:59,992] Trial 1 finished with value: 0.09338863113452328 and parameters: {'C': 0.6251373574521749, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.09338863113452328.
[I 2025-12-19 14:43:00,040] Trial 2 finished with value: 0.09338863113452328 and parameters: {'C': 0.014936568554617643, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.09338863113452328.
[I 2025-12-19 14:43:02,577] Trial 3 finished with value: 0.09338863113452328 and parameters: {'C': 1.3311216080736887, 'solver': 'saga'}. Best is trial 0 with value: 0.09338863113452328.
[I 2025-12-19 14:43:02,643] Trial 4 finished with value: 0.09338863113452328 and parameters: {'C': 3.142880890840109, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.09338863113452328.
[I 2025-12-19 14:43:07,168] Trial 5 finished with value: 0.

Successfully registered model 'logistic_pipeline_with_pca_optuna'.
Created version '1' of model 'logistic_pipeline_with_pca_optuna'.
[I 2025-12-19 14:43:22,276] A new study created in memory with name: no-name-77026aac-ba1e-4469-bd5d-8d342eaeceb6



Optimizing ridge with PCA...


  0%|          | 0/10 [00:00<?, ?it/s]

[I 2025-12-19 14:43:22,337] Trial 0 finished with value: 0.09338863113452328 and parameters: {'alpha': 0.31489116479568624}. Best is trial 0 with value: 0.09338863113452328.
[I 2025-12-19 14:43:22,368] Trial 1 finished with value: 0.09338863113452328 and parameters: {'alpha': 63.512210106407046}. Best is trial 0 with value: 0.09338863113452328.
[I 2025-12-19 14:43:22,404] Trial 2 finished with value: 0.09338863113452328 and parameters: {'alpha': 8.471801418819979}. Best is trial 0 with value: 0.09338863113452328.
[I 2025-12-19 14:43:22,448] Trial 3 finished with value: 0.09338863113452328 and parameters: {'alpha': 2.481040974867813}. Best is trial 0 with value: 0.09338863113452328.
[I 2025-12-19 14:43:22,475] Trial 4 finished with value: 0.09338863113452328 and parameters: {'alpha': 0.04207988669606638}. Best is trial 0 with value: 0.09338863113452328.
[I 2025-12-19 14:43:22,509] Trial 5 finished with value: 0.09338863113452328 and parameters: {'alpha': 0.042070539502879395}. Best is t

Successfully registered model 'ridge_pipeline_with_pca_optuna'.
Created version '1' of model 'ridge_pipeline_with_pca_optuna'.
[I 2025-12-19 14:43:27,488] A new study created in memory with name: no-name-f631404e-c23e-47a4-be4c-9dfcf22e36e8



Optimizing histgradientboosting with PCA...


  0%|          | 0/10 [00:00<?, ?it/s]

[I 2025-12-19 14:43:28,812] Trial 0 finished with value: 0.09338863113452328 and parameters: {'learning_rate': 0.03574712922600244, 'max_depth': 12, 'max_iter': 320, 'min_samples_leaf': 64, 'l2_regularization': 0.15601864044243652}. Best is trial 0 with value: 0.09338863113452328.
[I 2025-12-19 14:43:30,189] Trial 1 finished with value: 0.09338863113452328 and parameters: {'learning_rate': 0.01699897838270077, 'max_depth': 3, 'max_iter': 360, 'min_samples_leaf': 64, 'l2_regularization': 0.7080725777960455}. Best is trial 0 with value: 0.09338863113452328.
[I 2025-12-19 14:43:31,512] Trial 2 finished with value: 0.09338863113452328 and parameters: {'learning_rate': 0.010725209743171997, 'max_depth': 12, 'max_iter': 350, 'min_samples_leaf': 29, 'l2_regularization': 0.18182496720710062}. Best is trial 0 with value: 0.09338863113452328.
[I 2025-12-19 14:43:32,712] Trial 3 finished with value: 0.09338863113452328 and parameters: {'learning_rate': 0.018659959624904916, 'max_depth': 6, 'max_i

Successfully registered model 'histgradientboosting_pipeline_with_pca_optuna'.
Created version '1' of model 'histgradientboosting_pipeline_with_pca_optuna'.
[I 2025-12-19 14:43:43,865] A new study created in memory with name: no-name-3ac96d9c-f506-43fa-a51c-7f93084a28a1



Optimizing xgboost with PCA...


  0%|          | 0/10 [00:00<?, ?it/s]

[I 2025-12-19 14:43:44,374] Trial 0 finished with value: 0.09338863113452328 and parameters: {'learning_rate': 0.11861663446573512, 'max_depth': 10, 'n_estimators': 247}. Best is trial 0 with value: 0.09338863113452328.
[I 2025-12-19 14:43:44,621] Trial 1 finished with value: 0.09338863113452328 and parameters: {'learning_rate': 0.18361096041714062, 'max_depth': 4, 'n_estimators': 131}. Best is trial 0 with value: 0.09338863113452328.
[I 2025-12-19 14:43:45,004] Trial 2 finished with value: 0.09338863113452328 and parameters: {'learning_rate': 0.026844247528777843, 'max_depth': 9, 'n_estimators': 220}. Best is trial 0 with value: 0.09338863113452328.
[I 2025-12-19 14:43:45,511] Trial 3 finished with value: 0.09338863113452328 and parameters: {'learning_rate': 0.21534104756085318, 'max_depth': 3, 'n_estimators': 294}. Best is trial 0 with value: 0.09338863113452328.
[I 2025-12-19 14:43:45,758] Trial 4 finished with value: 0.09338863113452328 and parameters: {'learning_rate': 0.251408365

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Test F1: 0.0842


Successfully registered model 'xgboost_pipeline_with_pca_optuna'.
Created version '1' of model 'xgboost_pipeline_with_pca_optuna'.


In [25]:
# Results summary
print("\n" + "="*80)
print("RESULTS SUMMARY")
print("="*80)

# Fail with a clear message instead of an opaque max() error on an empty dict.
if not results:
    raise RuntimeError("`results` is empty - run the training cells before the summary.")

for name, res in results.items():
    print(f"{name}: CV F1={res['cv_f1']:.4f}, Test F1={res['test_f1']:.4f}")

# NOTE(review): the winner is picked by test-set F1, so the reported "Best Test
# F1" is an optimistic estimate; consider selecting on cv_f1 and reporting the
# test score only for that model. On ties, max() keeps the first entry inserted.
best_name = max(results, key=lambda x: results[x]['test_f1'])
best_result = results[best_name]
print(f"\nBest model: {best_name}")
print(f"Best Test F1: {best_result['test_f1']:.4f}")

# Persist only the fitted estimator; the TF-IDF / SVD transformers applied
# outside the estimator are not included in this file.
model_path = f"{base_folder}/models/global_best_model_optuna.pkl"
# joblib.dump does not create missing directories, so make sure it exists.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(best_result['model'], model_path)
print(f"Best model saved to: {model_path}")

# Elapsed wall-clock time since the imports cell set start_time.
end_time = time.monotonic()
elapsed = end_time - start_time
print(f"\nTotal time: {int(elapsed//60)} minutes {elapsed%60:.2f} seconds")


RESULTS SUMMARY
logistic_optuna: CV F1=0.0934, Test F1=0.0842
ridge_optuna: CV F1=0.0934, Test F1=0.0842
histgradientboosting_optuna: CV F1=0.0934, Test F1=0.0842
xgboost_optuna: CV F1=0.0934, Test F1=0.0842
logistic_pca_optuna: CV F1=0.0934, Test F1=0.0842
ridge_pca_optuna: CV F1=0.0934, Test F1=0.0842
histgradientboosting_pca_optuna: CV F1=0.0934, Test F1=0.0842
xgboost_pca_optuna: CV F1=0.0934, Test F1=0.0842

Best model: logistic_optuna
Best Test F1: 0.0842
Best model saved to: /content/drive/MyDrive/Colab Notebooks/drug_review_classification/models/global_best_model_optuna.pkl

Total time: 6 minutes 12.08 seconds
