# 02 - Train Models WITHOUT Optuna

This notebook trains **8 classification models** (4 baseline + 4 with PCA) without hyperparameter tuning.

## Models
1. Logistic Regression
2. Ridge Classifier
3. HistGradientBoostingClassifier
4. XGBoost

## Conditions
- 4 models WITHOUT PCA (baseline)
- 4 models WITH PCA (TruncatedSVD)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install xgboost lightgbm "mlflow<3"



In [None]:
base_folder = "/content/drive/MyDrive/Colab Notebooks/drug_review_classification"
db_path = f"{base_folder}/data/drug_reviews.db"

In [16]:
import os
import time
import numpy as np
import pandas as pd
import sqlite3
import joblib
from dotenv import load_dotenv

from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBClassifier

import mlflow
from mlflow.models import infer_signature

start_time = time.monotonic()

In [None]:
# Load data from database
def get_dataframe_from_db(db_path):
    conn = sqlite3.connect(db_path)
    df = pd.read_sql("""
        SELECT
            d.drug_name as urlDrugName,
            c.condition_name as condition,
            r.benefits_review as benefitsReview,
            r.side_effects_review as sideEffectsReview,
            r.comments_review as commentsReview,
            r.rating,
            s.side_effect_name as sideEffects,
            e.effectiveness_name as effectiveness,
            r.split
        FROM reviews r
        JOIN drugs d ON r.drug_id = d.drug_id
        JOIN conditions c ON r.condition_id = c.condition_id
        JOIN side_effects s ON r.side_effect_id = s.side_effect_id
        JOIN effectiveness_levels e ON r.effectiveness_id = e.effectiveness_id
    """, conn)
    conn.close()
    return df

df = get_dataframe_from_db(db_path)
print(f"Loaded {len(df)} reviews")

Loaded 3000 reviews


In [None]:
# Prepare features
df['combined_text'] = df['benefitsReview'].fillna('') + ' ' + df['sideEffectsReview'].fillna('') + ' ' + df['commentsReview'].fillna('')

# Split
df_train = df[df['split'] == 'train'].copy()
df_test = df[df['split'] == 'test'].copy()

# Encode target
EFFECTIVENESS_ORDER = ['Ineffective', 'Marginally Effective', 'Moderately Effective', 'Considerably Effective', 'Highly Effective']
le = LabelEncoder()
le.fit(EFFECTIVENESS_ORDER)
y_train = le.transform(df_train['effectiveness'])
y_test = le.transform(df_test['effectiveness'])

print(f"Train: {len(df_train)}, Test: {len(df_test)}")
print(f"Classes: {le.classes_}")

Train: 2400, Test: 600
Classes: ['Considerably Effective' 'Highly Effective' 'Ineffective'
 'Marginally Effective' 'Moderately Effective']


In [22]:
# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=2000, ngram_range=(1, 2), stop_words='english')
X_train_tfidf = tfidf.fit_transform(df_train['combined_text'])
X_test_tfidf = tfidf.transform(df_test['combined_text'])

print(f"TF-IDF shape: {X_train_tfidf.shape}")

TF-IDF shape: (2400, 17)


In [None]:
# Configure MLflow
load_dotenv(dotenv_path=f"{base_folder}/notebooks/.env", override=True)

MLFLOW_TRACKING_URI = os.getenv("MLFLOW_TRACKING_URI")
MLFLOW_TRACKING_USERNAME = os.getenv("MLFLOW_TRACKING_USERNAME")
MLFLOW_TRACKING_PASSWORD = os.getenv("MLFLOW_TRACKING_PASSWORD")

if MLFLOW_TRACKING_USERNAME:
    os.environ["MLFLOW_TRACKING_USERNAME"] = MLFLOW_TRACKING_USERNAME
if MLFLOW_TRACKING_PASSWORD:
    os.environ["MLFLOW_TRACKING_PASSWORD"] = MLFLOW_TRACKING_PASSWORD

if MLFLOW_TRACKING_URI:
    mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
    print(f"MLflow tracking: {MLFLOW_TRACKING_URI}")
else:
    print("Using local MLflow tracking")

mlflow.set_experiment("drug_review_classification")

Using local MLflow tracking


<Experiment: artifact_location='file:///content/mlruns/396091345232093958', creation_time=1766146315591, experiment_id='396091345232093958', last_update_time=1766146315591, lifecycle_stage='active', name='drug_review_classification', tags={}>

In [19]:
# Define models
def make_classifier(name):
    if name == 'logistic':
        return LogisticRegression(max_iter=1000, random_state=42, n_jobs=-1)
    elif name == 'ridge':
        return RidgeClassifier(random_state=42)
    elif name == 'histgradientboosting':
        n_features = X_train_tfidf.shape[1]
        n_comp = min(100, n_features - 1)  # must be <= n_features-1
        n_comp = max(2, n_comp)            # keep at least 2

        return Pipeline([
            ("svd", TruncatedSVD(n_components=n_comp, random_state=42)),
            ("hgb", HistGradientBoostingClassifier(random_state=42))
        ])
    elif name == 'xgboost':
        return XGBClassifier(objective='multi:softprob', eval_metric='mlogloss', random_state=42, n_jobs=-1, use_label_encoder=False)

model_names = ['logistic', 'ridge', 'histgradientboosting', 'xgboost']
results = {}

In [20]:
# Train baseline models (NO PCA)
print("="*80)
print("TRAINING BASELINE MODELS (NO PCA)")
print("="*80)

for name in model_names:
    print(f"\nTraining {name}...")

    clf = make_classifier(name)

    # Cross-validation
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    cv_scores = cross_val_score(clf, X_train_tfidf, y_train, cv=cv, scoring='f1_macro', n_jobs=-1)
    cv_f1 = cv_scores.mean()

    # Fit and evaluate
    clf.fit(X_train_tfidf, y_train)
    y_pred = clf.predict(X_test_tfidf)
    test_f1 = f1_score(y_test, y_pred, average='macro')

    print(f"{name} CV F1: {cv_f1:.4f}")
    print(f"{name} Test F1: {test_f1:.4f}")

    results[name] = {'cv_f1': cv_f1, 'test_f1': test_f1, 'uses_pca': False, 'is_tuned': False, 'model': clf}

    # Log to MLflow
    with mlflow.start_run(run_name=f"{name}_baseline"):
        mlflow.log_param("model_family", name)
        mlflow.log_param("uses_pca", False)
        mlflow.log_param("is_tuned", False)
        mlflow.log_metric("cv_f1", cv_f1)
        mlflow.log_metric("test_f1", test_f1)
        mlflow.sklearn.log_model(clf, artifact_path="model", registered_model_name=f"{name}_pipeline")

TRAINING BASELINE MODELS (NO PCA)

Training logistic...
logistic CV F1: 0.0934
logistic Test F1: 0.0842


Registered model 'logistic_pipeline' already exists. Creating a new version of this model...
Created version '5' of model 'logistic_pipeline'.



Training ridge...
ridge CV F1: 0.0934
ridge Test F1: 0.0842


Registered model 'ridge_pipeline' already exists. Creating a new version of this model...
Created version '5' of model 'ridge_pipeline'.



Training histgradientboosting...


  self.explained_variance_ratio_ = exp_var / full_var


histgradientboosting CV F1: 0.0934
histgradientboosting Test F1: 0.0842


Successfully registered model 'histgradientboosting_pipeline'.
Created version '1' of model 'histgradientboosting_pipeline'.



Training xgboost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


xgboost CV F1: 0.0934
xgboost Test F1: 0.0842


Successfully registered model 'xgboost_pipeline'.
Created version '1' of model 'xgboost_pipeline'.


In [24]:
# Train PCA models
print("\n" + "="*80)
print("TRAINING PCA MODELS")
print("="*80)

n_features = X_train_tfidf.shape[1]
n_comp = min(100, n_features - 1)   # ensures <= n_features-1
n_comp = max(2, n_comp)             # at least 2

# Apply PCA (TruncatedSVD for sparse matrices)
pca = TruncatedSVD(n_components=n_comp, random_state=42)
X_train_pca = pca.fit_transform(X_train_tfidf)
X_test_pca = pca.transform(X_test_tfidf)
print(f"TF-IDF features: {n_features} | Using n_components: {n_comp}")
print(f"PCA shape: {X_train_pca.shape}")


for name in model_names:
    print(f"\nTraining {name} with PCA...")

    if name == "histgradientboosting":
        clf = HistGradientBoostingClassifier(random_state=42)
    else:
        clf = make_classifier(name)

    # Cross-validation
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    cv_scores = cross_val_score(clf, X_train_pca, y_train, cv=cv, scoring='f1_macro', n_jobs=-1)
    cv_f1 = cv_scores.mean()

    # Fit and evaluate
    clf.fit(X_train_pca, y_train)
    y_pred = clf.predict(X_test_pca)
    test_f1 = f1_score(y_test, y_pred, average='macro')

    print(f"{name}_with_pca CV F1: {cv_f1:.4f}")
    print(f"{name}_with_pca Test F1: {test_f1:.4f}")

    results[f"{name}_with_pca"] = {'cv_f1': cv_f1, 'test_f1': test_f1, 'uses_pca': True, 'is_tuned': False, 'model': clf}

    # Log to MLflow
    with mlflow.start_run(run_name=f"{name}_with_pca"):
        mlflow.log_param("model_family", name)
        mlflow.log_param("uses_pca", True)
        mlflow.log_param("is_tuned", False)
        mlflow.log_metric("cv_f1", cv_f1)
        mlflow.log_metric("test_f1", test_f1)
        mlflow.sklearn.log_model(clf, artifact_path="model", registered_model_name=f"{name}_pipeline_with_pca")


TRAINING PCA MODELS
TF-IDF features: 17 | Using n_components: 16
PCA shape: (2400, 16)

Training logistic with PCA...


  self.explained_variance_ratio_ = exp_var / full_var


logistic_with_pca CV F1: 0.0934
logistic_with_pca Test F1: 0.0842


Successfully registered model 'logistic_pipeline_with_pca'.
Created version '1' of model 'logistic_pipeline_with_pca'.



Training ridge with PCA...
ridge_with_pca CV F1: 0.0934
ridge_with_pca Test F1: 0.0842


Successfully registered model 'ridge_pipeline_with_pca'.
Created version '1' of model 'ridge_pipeline_with_pca'.



Training histgradientboosting with PCA...
histgradientboosting_with_pca CV F1: 0.0934
histgradientboosting_with_pca Test F1: 0.0842


Successfully registered model 'histgradientboosting_pipeline_with_pca'.
Created version '1' of model 'histgradientboosting_pipeline_with_pca'.



Training xgboost with PCA...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


xgboost_with_pca CV F1: 0.0934
xgboost_with_pca Test F1: 0.0842


Successfully registered model 'xgboost_pipeline_with_pca'.
Created version '1' of model 'xgboost_pipeline_with_pca'.


In [25]:
# Find global best model
print("\n" + "="*80)
print("RESULTS SUMMARY")
print("="*80)

for name, res in results.items():
    print(f"{name}: CV F1={res['cv_f1']:.4f}, Test F1={res['test_f1']:.4f}")

best_name = max(results, key=lambda x: results[x]['test_f1'])
best_result = results[best_name]
print(f"\nBest model: {best_name}")
print(f"Best Test F1: {best_result['test_f1']:.4f}")


RESULTS SUMMARY
logistic: CV F1=0.0934, Test F1=0.0842
ridge: CV F1=0.0934, Test F1=0.0842
histgradientboosting: CV F1=0.0934, Test F1=0.0842
xgboost: CV F1=0.0934, Test F1=0.0842
logistic_with_pca: CV F1=0.0934, Test F1=0.0842
ridge_with_pca: CV F1=0.0934, Test F1=0.0842
histgradientboosting_with_pca: CV F1=0.0934, Test F1=0.0842
xgboost_with_pca: CV F1=0.0934, Test F1=0.0842

Best model: logistic
Best Test F1: 0.0842


In [26]:
# Save best model
model_path = f"{base_folder}/models/global_best_model.pkl"
joblib.dump(best_result['model'], model_path)
print(f"Best model saved to: {model_path}")

end_time = time.monotonic()
elapsed = end_time - start_time
print(f"\nTotal time: {int(elapsed//60)} minutes {elapsed%60:.2f} seconds")

Best model saved to: /content/drive/MyDrive/Colab Notebooks/drug_review_classification/models/global_best_model.pkl

Total time: 8 minutes 20.85 seconds
