In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install optuna xgboost lightgbm "mlflow<3"



In [37]:
# =============================================================================
# 03 TRAIN MODELS WITH OPTUNA (8 EXPERIMENTS)
# Classification + DagsHub MLflow (SAFE)
# =============================================================================

# -------------------- INSTALLS --------------------
%pip install -q dagshub mlflow optuna scikit-learn xgboost

# -------------------- IMPORTS --------------------
import sqlite3
import pandas as pd
import numpy as np
import time
import optuna

import dagshub
import mlflow
from mlflow.tracking import MlflowClient

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score

from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.decomposition import PCA

# -------------------- DAGSHUB INIT --------------------
dagshub.init(
    repo_owner="wasiq0",
    repo_name="FinalProjectEAS508",
    mlflow=True
)

mlflow.set_tracking_uri(
    "https://dagshub.com/wasiq0/FinalProjectEAS508.mlflow"
)

client = MlflowClient()

EXPERIMENT_NAME = "student_classification_with_optuna"
exp = client.get_experiment_by_name(EXPERIMENT_NAME)
if exp is None:
    exp_id = client.create_experiment(EXPERIMENT_NAME)
else:
    exp_id = exp.experiment_id

print("Using experiment:", EXPERIMENT_NAME)

# -------------------- LOAD DATA --------------------
DB_PATH = "/content/drive/MyDrive/work/Finalproject/housing_app_fall25-main/data/student_performance.db"
conn = sqlite3.connect(DB_PATH)

df = pd.read_sql_query("""
SELECT
    s.student_id,
    s.age,
    s.gender,
    s.internet_access,
    s.sleep_hours,
    s.sleep_quality,
    s.class_attendance,
    c.course_name AS course,
    c.study_method,
    c.facility_rating,
    e.exam_difficulty,
    e.study_hours,
    e.exam_score,
    e.pass_fail
FROM students s
JOIN exams e ON e.student_id = s.student_id
JOIN courses c ON c.course_id = e.course_id
ORDER BY s.student_id
""", conn)

conn.close()

df["pass_fail"] = df["pass_fail"].map({"fail": 0, "pass": 1})

X = df.drop(columns=["student_id", "pass_fail"])
y = df["pass_fail"]

print("Rows:", len(df))
print("\nClass distribution:\n", y.value_counts(normalize=True))

# -------------------- SPLIT --------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# -------------------- PREPROCESS --------------------
num_cols = X.select_dtypes(include=["int64", "float64"]).columns
cat_cols = X.select_dtypes(include=["object"]).columns

preprocess = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
])

# -------------------- OPTUNA OBJECTIVES --------------------
def objective(trial, model_name, use_pca):
    if model_name == "logreg":
        C = trial.suggest_float("C", 0.01, 10, log=True)
        model = LogisticRegression(C=C, max_iter=3000)
    elif model_name == "ridgeclf":
        alpha = trial.suggest_float("alpha", 0.01, 10, log=True)
        model = RidgeClassifier(alpha=alpha)
    elif model_name == "histgb":
        lr = trial.suggest_float("lr", 0.01, 0.3)
        model = HistGradientBoostingClassifier(learning_rate=lr)
    elif model_name == "xgboost":
        depth = trial.suggest_int("depth", 3, 8)
        n_estimators = trial.suggest_int("n_estimators", 50, 200)
        model = XGBClassifier(
            max_depth=depth,
            n_estimators=n_estimators,
            eval_metric="logloss",
            use_label_encoder=False,
            random_state=42
        )

    steps = [("prep", preprocess)]
    if use_pca:
        steps.append(("pca", PCA(n_components=0.95)))
    steps.append(("model", model))

    pipe = Pipeline(steps)

    return cross_val_score(
        pipe, X_train, y_train,
        scoring="f1", cv=3, n_jobs=-1
    ).mean()

# -------------------- TRAIN LOOP (8 RUNS) --------------------
start = time.time()
run_count = 0

models = ["logreg", "ridgeclf", "histgb", "xgboost"]

for model_name in models:
    for use_pca in [False, True]:

        run_name = f"{model_name}_{'with_pca' if use_pca else 'no_pca'}_optuna"

        print("\n" + "="*80)
        print("Training:", run_name)
        print("="*80)

        study = optuna.create_study(direction="maximize")
        study.optimize(lambda t: objective(t, model_name, use_pca), n_trials=10)

        best_params = study.best_params
        best_cv_f1 = study.best_value

        # Build final model
        trial = study.best_trial
        if model_name == "logreg":
            model = LogisticRegression(C=best_params["C"], max_iter=3000)
        elif model_name == "ridgeclf":
            model = RidgeClassifier(alpha=best_params["alpha"])
        elif model_name == "histgb":
            model = HistGradientBoostingClassifier(learning_rate=best_params["lr"])
        elif model_name == "xgboost":
            model = XGBClassifier(
                max_depth=best_params["depth"],
                n_estimators=best_params["n_estimators"],
                eval_metric="logloss",
                use_label_encoder=False,
                random_state=42
            )

        steps = [("prep", preprocess)]
        if use_pca:
            steps.append(("pca", PCA(n_components=0.95)))
        steps.append(("model", model))

        pipe = Pipeline(steps)
        pipe.fit(X_train, y_train)

        preds = pipe.predict(X_test)
        test_f1 = f1_score(y_test, preds)

        print("Best CV F1:", round(best_cv_f1, 4))
        print("Test F1:", round(test_f1, 4))

        # -------------------- SAFE LOGGING --------------------
        with mlflow.start_run(
            experiment_id=exp_id,
            run_name=run_name
        ):
            mlflow.log_param("model", model_name)
            mlflow.log_param("pca", use_pca)
            mlflow.log_param("optuna", True)
            mlflow.log_params(best_params)
            mlflow.log_metric("cv_f1", best_cv_f1)
            mlflow.log_metric("test_f1", test_f1)
            mlflow.sklearn.log_model(pipe, "model")

        run_count += 1

# -------------------- SUMMARY --------------------
elapsed = time.time() - start
print("\n" + "="*80)
print("‚úì 03 EXPERIMENTS COMPLETE")
print("Runs logged:", run_count)
print("Elapsed time:", round(elapsed, 2), "seconds")
print("="*80)


Using experiment: student_classification_with_optuna
Rows: 20000

Class distribution:
 pass_fail
1    0.87125
0    0.12875
Name: proportion, dtype: float64

Training: logreg_no_pca_optuna
Best CV F1: 0.9985
Test F1: 0.9994




üèÉ View run logreg_no_pca_optuna at: https://dagshub.com/wasiq0/FinalProjectEAS508.mlflow/#/experiments/1/runs/794cebf28ba141d8ad7933aa26a9f701
üß™ View experiment at: https://dagshub.com/wasiq0/FinalProjectEAS508.mlflow/#/experiments/1

Training: logreg_with_pca_optuna
Best CV F1: 0.9808
Test F1: 0.9792




üèÉ View run logreg_with_pca_optuna at: https://dagshub.com/wasiq0/FinalProjectEAS508.mlflow/#/experiments/1/runs/016a5efe8eb544b0996099cd7a269279
üß™ View experiment at: https://dagshub.com/wasiq0/FinalProjectEAS508.mlflow/#/experiments/1

Training: ridgeclf_no_pca_optuna
Best CV F1: 0.9514
Test F1: 0.9531




üèÉ View run ridgeclf_no_pca_optuna at: https://dagshub.com/wasiq0/FinalProjectEAS508.mlflow/#/experiments/1/runs/d834dfeae1194340bc4a650296186d1c
üß™ View experiment at: https://dagshub.com/wasiq0/FinalProjectEAS508.mlflow/#/experiments/1

Training: ridgeclf_with_pca_optuna
Best CV F1: 0.9471
Test F1: 0.9502




üèÉ View run ridgeclf_with_pca_optuna at: https://dagshub.com/wasiq0/FinalProjectEAS508.mlflow/#/experiments/1/runs/d47e5ba0c0ee4fac96cc35fc3a1a5868
üß™ View experiment at: https://dagshub.com/wasiq0/FinalProjectEAS508.mlflow/#/experiments/1

Training: histgb_no_pca_optuna
Best CV F1: 0.9991
Test F1: 0.9993




üèÉ View run histgb_no_pca_optuna at: https://dagshub.com/wasiq0/FinalProjectEAS508.mlflow/#/experiments/1/runs/ada20b4aaaa744848db353e9b31663ef
üß™ View experiment at: https://dagshub.com/wasiq0/FinalProjectEAS508.mlflow/#/experiments/1

Training: histgb_with_pca_optuna
Best CV F1: 0.9879
Test F1: 0.9858




üèÉ View run histgb_with_pca_optuna at: https://dagshub.com/wasiq0/FinalProjectEAS508.mlflow/#/experiments/1/runs/7b1f30d450134f58b8fb689ca94dfee3
üß™ View experiment at: https://dagshub.com/wasiq0/FinalProjectEAS508.mlflow/#/experiments/1

Training: xgboost_no_pca_optuna


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best CV F1: 0.9996
Test F1: 0.9993




üèÉ View run xgboost_no_pca_optuna at: https://dagshub.com/wasiq0/FinalProjectEAS508.mlflow/#/experiments/1/runs/5e5e4d97e3bb4d9b9d968e2687eab1d6
üß™ View experiment at: https://dagshub.com/wasiq0/FinalProjectEAS508.mlflow/#/experiments/1

Training: xgboost_with_pca_optuna


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best CV F1: 0.9891
Test F1: 0.9884




üèÉ View run xgboost_with_pca_optuna at: https://dagshub.com/wasiq0/FinalProjectEAS508.mlflow/#/experiments/1/runs/df6e3656910045b8a9e5734ffaac9616
üß™ View experiment at: https://dagshub.com/wasiq0/FinalProjectEAS508.mlflow/#/experiments/1

‚úì 03 EXPERIMENTS COMPLETE
Runs logged: 8
Elapsed time: 174.47 seconds


In [None]:
import dagshub
dagshub.init(repo_owner="wasiq0", repo_name="FinalProjectEAS508", mlflow=True)


In [None]:
# ---------------- DAGSHUB + MLFLOW INIT (FIRST THING) ----------------
%pip install -q dagshub mlflow optuna xgboost

import dagshub
dagshub.init(
    repo_owner="wasiq0",
    repo_name="FinalProjectEAS508",
    mlflow=True
)

import mlflow


In [None]:
# =============================================================================
# 03 TRAIN MODELS WITH OPTUNA (CLASSIFICATION)
# - 8 Experiments
# - 4 Models √ó (PCA / No PCA)
# - Logs F1-score to Dagshub MLflow
# =============================================================================

# -------------------- INSTALL + INIT (MUST BE FIRST) --------------------
%pip install -q dagshub mlflow optuna xgboost

import dagshub
dagshub.init(
    repo_owner="wasiq0",
    repo_name="FinalProjectEAS508",
    mlflow=True
)

import mlflow

# -------------------- IMPORTS --------------------
import sqlite3
import pandas as pd
import numpy as np
import time
import optuna

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score

from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBClassifier

# -------------------- LOAD DATA FROM DB --------------------
DB_PATH = "/content/drive/MyDrive/work/Finalproject/housing_app_fall25-main/data/student_performance.db"

conn = sqlite3.connect(DB_PATH)

df = pd.read_sql_query("""
SELECT
    s.student_id,
    s.age,
    s.gender,
    s.internet_access,
    s.sleep_hours,
    s.sleep_quality,
    s.class_attendance,
    c.course_name AS course,
    c.study_method,
    c.facility_rating,
    e.exam_difficulty,
    e.study_hours,
    e.exam_score,
    e.pass_fail
FROM students s
JOIN exams e ON e.student_id = s.student_id
JOIN courses c ON c.course_id = e.course_id
ORDER BY s.student_id
""", conn)

conn.close()

print("Rows:", len(df))
print("\nClass distribution:\n", df["pass_fail"].value_counts(normalize=True))

# -------------------- TARGET + FEATURES --------------------
df["pass_fail"] = df["pass_fail"].map({"fail": 0, "pass": 1})

X = df.drop(columns=["student_id", "pass_fail"])
y = df["pass_fail"]

num_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_features = X.select_dtypes(include=["object"]).columns.tolist()

preprocessor = ColumnTransformer([
    ("num", StandardScaler(), num_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features)
])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# -------------------- MODEL FACTORY --------------------
def get_model(name, trial):
    if name == "logreg":
        return LogisticRegression(
            C=trial.suggest_float("C", 0.01, 10, log=True),
            max_iter=500
        )
    if name == "ridge":
        return RidgeClassifier(
            alpha=trial.suggest_float("alpha", 0.1, 10, log=True)
        )
    if name == "histgb":
        return HistGradientBoostingClassifier(
            learning_rate=trial.suggest_float("lr", 0.01, 0.2),
            max_depth=trial.suggest_int("depth", 3, 8)
        )
    if name == "xgboost":
        return XGBClassifier(
            n_estimators=trial.suggest_int("n_estimators", 100, 300),
            max_depth=trial.suggest_int("depth", 3, 8),
            learning_rate=trial.suggest_float("lr", 0.01, 0.2),
            eval_metric="logloss",
            use_label_encoder=False
        )

# -------------------- OPTUNA OBJECTIVE --------------------
def objective(trial, model_name, use_pca):
    steps = [("prep", preprocessor)]
    if use_pca:
        steps.append(("pca", PCA(n_components=0.95)))
    steps.append(("model", get_model(model_name, trial)))

    pipe = Pipeline(steps)

    score = cross_val_score(
        pipe,
        X_train,
        y_train,
        scoring="f1",
        cv=3,
        n_jobs=-1
    ).mean()

    return score

# -------------------- RUN 8 EXPERIMENTS --------------------
models = ["logreg", "ridge", "histgb", "xgboost"]
start = time.time()
run_count = 0

for model_name in models:
    for use_pca in [False, True]:

        run_name = f"{model_name}_{'with_pca' if use_pca else 'no_pca'}_optuna"

        print("\n" + "="*80)
        print("Training:", run_name)
        print("="*80)

        study = optuna.create_study(direction="maximize")
        study.optimize(
            lambda t: objective(t, model_name, use_pca),
            n_trials=10,
            show_progress_bar=False
        )

        best_params = study.best_params
        best_cv_f1 = study.best_value

        # Train final model
        steps = [("prep", preprocessor)]
        if use_pca:
            steps.append(("pca", PCA(n_components=0.95)))
        steps.append(("model", get_model(model_name, optuna.trial.FixedTrial(best_params))))

        final_model = Pipeline(steps)
        final_model.fit(X_train, y_train)

        preds = final_model.predict(X_test)
        test_f1 = f1_score(y_test, preds)

        print("Best CV F1:", best_cv_f1)
        print("Test F1:", test_f1)

        # -------------------- DAGSHUB SAFE LOGGING --------------------
        with mlflow.start_run(run_name=run_name):
            mlflow.log_param("model", model_name)
            mlflow.log_param("pca", use_pca)
            mlflow.log_param("optuna", True)

            for k, v in best_params.items():
                mlflow.log_param(k, float(v))

            mlflow.log_metric("cv_f1", best_cv_f1)
            mlflow.log_metric("test_f1", test_f1)

        run_count += 1

print("\n" + "="*80)
print("‚úì 03 EXPERIMENTS COMPLETE")
print("Experiments run:", run_count)
print("Elapsed time:", round(time.time() - start, 2), "seconds")
print("="*80)


Rows: 20000

Class distribution:
 pass_fail
pass    0.87125
fail    0.12875
Name: proportion, dtype: float64

Training: logreg_no_pca_optuna
Best CV F1: 0.9985651506839163
Test F1: 0.9994261119081779


MlflowException: API request to endpoint /api/2.0/mlflow/runs/create failed with error code 404 != 200. Response body: ''

In [None]:
# =============================================================================
# 03_train_models_with_optuna.ipynb
# DAGSHUB SAFE ‚Äì CLASSIFICATION ‚Äì 8 EXPERIMENTS
# =============================================================================

import sqlite3, time
import pandas as pd
import numpy as np
import mlflow
import optuna
from optuna.samplers import TPESampler

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.decomposition import PCA

from xgboost import XGBClassifier

# =============================================================================
# CONFIG
# =============================================================================
BASE_FOLDER = "/content/drive/MyDrive/work/Finalproject/housing_app_fall25-main"
DB_PATH = f"{BASE_FOLDER}/data/student_performance.db"
EXPERIMENT_NAME = "student_classification_with_optuna"
N_TRIALS = 10

# =============================================================================
# LOAD DATA FROM SQLITE
# =============================================================================
conn = sqlite3.connect(DB_PATH)

df = pd.read_sql_query("""
SELECT
    s.student_id,
    s.age,
    s.gender,
    s.internet_access,
    s.sleep_hours,
    s.sleep_quality,
    s.class_attendance,
    c.course_name AS course,
    c.study_method,
    c.facility_rating,
    e.exam_difficulty,
    e.study_hours,
    e.exam_score,
    e.pass_fail
FROM students s
JOIN exams e ON e.student_id = s.student_id
JOIN courses c ON c.course_id = e.course_id
ORDER BY s.student_id
""", conn)

conn.close()

# Encode target
df["pass_fail"] = df["pass_fail"].map({"fail": 0, "pass": 1})

X = df.drop(columns=["student_id", "pass_fail"])
y = df["pass_fail"]

print("Rows:", len(df))
print("Class distribution:\n", y.value_counts(normalize=True))

# =============================================================================
# PREPROCESSING
# =============================================================================
num_features = X.select_dtypes(include=["int64", "float64"]).columns
cat_features = X.select_dtypes(include=["object"]).columns

preprocessor = ColumnTransformer([
    ("num", StandardScaler(), num_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features),
])

# =============================================================================
# TRAIN / TEST SPLIT
# =============================================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)

# =============================================================================
# MODEL DEFINITIONS
# =============================================================================
models = {
    "logreg": LogisticRegression(max_iter=500),
    "ridgeclf": RidgeClassifier(),
    "histgb": HistGradientBoostingClassifier(),
    "xgboost": XGBClassifier(
        eval_metric="logloss",
        use_label_encoder=False
    ),
}

# =============================================================================
# OPTUNA OBJECTIVE
# =============================================================================
def objective(trial, model_name, use_pca):
    if model_name == "logreg":
        model = LogisticRegression(
            C=trial.suggest_float("C", 0.01, 10, log=True),
            max_iter=500
        )
    elif model_name == "ridgeclf":
        model = RidgeClassifier(
            alpha=trial.suggest_float("alpha", 0.1, 10, log=True)
        )
    elif model_name == "histgb":
        model = HistGradientBoostingClassifier(
            learning_rate=trial.suggest_float("lr", 0.01, 0.3),
            max_depth=trial.suggest_int("depth", 3, 10)
        )
    else:
        model = XGBClassifier(
            n_estimators=trial.suggest_int("n_estimators", 100, 300),
            max_depth=trial.suggest_int("max_depth", 3, 8),
            learning_rate=trial.suggest_float("lr", 0.01, 0.3),
            subsample=0.8,
            colsample_bytree=0.8,
            eval_metric="logloss",
            use_label_encoder=False
        )

    steps = [("prep", preprocessor)]
    if use_pca:
        steps.append(("pca", PCA(n_components=0.95)))
    steps.append(("model", model))

    pipe = Pipeline(steps)

    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    scores = cross_val_score(pipe, X_train, y_train, cv=cv, scoring="f1")
    return scores.mean()

# =============================================================================
# RUN 8 EXPERIMENTS
# =============================================================================
start = time.time()
count = 0

for model_name in models:
    for use_pca in [False, True]:
        count += 1
        run_name = f"{model_name}_{'with_pca' if use_pca else 'no_pca'}_optuna"

        print("\n" + "="*80)
        print("Training:", run_name)
        print("="*80)

        study = optuna.create_study(
            direction="maximize",
            sampler=TPESampler(seed=42)
        )

        study.optimize(
            lambda t: objective(t, model_name, use_pca),
            n_trials=N_TRIALS
        )

        best_params = study.best_params

        # Build final model
        final_model = models[model_name].set_params(**best_params)

        steps = [("prep", preprocessor)]
        if use_pca:
            steps.append(("pca", PCA(n_components=0.95)))
        steps.append(("model", final_model))

        pipe = Pipeline(steps)
        pipe.fit(X_train, y_train)

        preds = pipe.predict(X_test)
        f1 = f1_score(y_test, preds)

        print("Best CV F1:", study.best_value)
        print("Test F1:", f1)

        with mlflow.start_run(
    run_name=run_name,
    tags={
        "experiment": str(EXPERIMENT_NAME),
        "model": str(model_name),
        "pca": str(use_pca),
        "optuna": "true"
    }
):

            mlflow.log_params(best_params)
            mlflow.log_metric("f1_test", f1)
            mlflow.sklearn.log_model(pipe, artifact_path="model")



print("\n" + "="*80)
print("‚úì 03 EXPERIMENTS COMPLETE")
print("Total Optuna Experiments:", count)
print("Elapsed time:", round(time.time() - start, 2), "seconds")
print("="*80)


Rows: 20000
Class distribution:
 pass_fail
1    0.87125
0    0.12875
Name: proportion, dtype: float64

Training: logreg_no_pca_optuna
Best CV F1: 0.9986010895908355
Test F1: 0.9994261119081779


MlflowException: API request to endpoint /api/2.0/mlflow/runs/create failed with error code 404 != 200. Response body: ''

In [None]:
# =============================================================================
# 03 - TRAIN CLASSIFICATION MODELS WITH OPTUNA
# Runs remaining 8 experiments:
#   - 4 models WITHOUT PCA + Optuna
#   - 4 models WITH PCA + Optuna
# Logs everything to MLflow / Dagshub
# =============================================================================

import os
import time
import sqlite3
import joblib
import numpy as np
import pandas as pd
import optuna

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score

from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBClassifier

import mlflow
from mlflow.models import infer_signature


In [None]:
# -----------------------------------------------------------------------------
# Paths
# -----------------------------------------------------------------------------
BASE_FOLDER = "/content/drive/MyDrive/work/Finalproject/housing_app_fall25-main"
DB_PATH = f"{BASE_FOLDER}/data/student_performance.db"
MODEL_DIR = f"{BASE_FOLDER}/models"
os.makedirs(MODEL_DIR, exist_ok=True)

# -----------------------------------------------------------------------------
# MLflow (Dagshub-ready)
# -----------------------------------------------------------------------------
mlflow.set_tracking_uri("file:" + os.path.join(BASE_FOLDER, "mlruns"))
mlflow.set_experiment("student_classification_optuna")
mlflow.set_tracking_uri("https://dagshub.com/<wasiq0>/<FinalProjectEAS508>.mlflow")

print("‚úì MLflow experiment set")


‚úì MLflow experiment set


In [None]:
!pip install dagshub

Collecting dagshub
  Downloading dagshub-0.6.4-py3-none-any.whl.metadata (12 kB)
Collecting appdirs>=1.4.4 (from dagshub)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting dacite~=1.6.0 (from dagshub)
  Downloading dacite-1.6.0-py3-none-any.whl.metadata (14 kB)
Collecting gql[requests] (from dagshub)
  Downloading gql-4.0.0-py3-none-any.whl.metadata (10 kB)
Collecting dataclasses-json (from dagshub)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting treelib>=1.6.4 (from dagshub)
  Downloading treelib-1.8.0-py3-none-any.whl.metadata (3.3 kB)
Collecting pathvalidate>=3.0.0 (from dagshub)
  Downloading pathvalidate-3.3.1-py3-none-any.whl.metadata (12 kB)
Collecting boto3 (from dagshub)
  Downloading boto3-1.42.12-py3-none-any.whl.metadata (6.8 kB)
Collecting semver (from dagshub)
  Downloading semver-3.0.4-py3-none-any.whl.metadata (6.8 kB)
Collecting dagshub-annotation-converter>=0.1.12 (from dagshub)
  Downloading dagshub_annota

In [None]:
base_folder = "/content/drive/MyDrive/work/Finalproject/housing_app_fall25-main"
%cd "{base_folder}"

/content/drive/MyDrive/work/Finalproject/housing_app_fall25-main


In [None]:
import sqlite3

conn = sqlite3.connect("/content/drive/MyDrive/work/Finalproject/housing_app_fall25-main/data/student_performance.db")

tables = conn.execute("""
SELECT name FROM sqlite_master
WHERE type='table'
ORDER BY name;
""").fetchall()

conn.close()

tables


[('courses',), ('exams',), ('sqlite_sequence',), ('students',)]

In [None]:
# =============================================================================
# 03_train_models_with_optuna.ipynb  (Student Performance - Classification)
# FULL PIPELINE with OPTUNA + MLFLOW
# - Load joined modeling data from SQLite
# - Map target: fail->0, pass->1
# - Stratified train/test split
# - Build preprocessing (StandardScaler + MinMaxScaler + OneHotEncoder)
# - Optuna tune 4 classifiers for BOTH scalers
# - Log all runs to MLflow
# - Pick global best by Test F1(macro)
# - Save best model to models/best_optuna_classifier.joblib
# =============================================================================

import os, time, sqlite3
import numpy as np
import pandas as pd

from pathlib import Path

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import HistGradientBoostingClassifier

import joblib
import optuna
from optuna.samplers import TPESampler

import mlflow
from mlflow.models import infer_signature

from xgboost import XGBClassifier

# -----------------------------
# 0) Paths (COLAB)
# -----------------------------


import sqlite3
import pandas as pd

DB_PATH = "/content/drive/MyDrive/work/Finalproject/housing_app_fall25-main/data/student_performance.db"
conn = sqlite3.connect(DB_PATH)

df = pd.read_sql_query(
    """
    SELECT
        s.student_id,
        s.age,
        s.gender,
        s.internet_access,
        s.sleep_hours,
        s.sleep_quality,
        s.class_attendance,

        c.course_name      AS course,
        c.study_method,
        c.facility_rating,

        e.exam_difficulty,
        e.study_hours,
        e.exam_score,
        e.pass_fail
    FROM students s
    JOIN exams e
        ON e.student_id = s.student_id
    JOIN courses c
        ON c.course_id = e.course_id
    ORDER BY s.student_id
    """,
    conn
)

conn.close()

print("Rows:", len(df))
display(df.head())



# -----------------------------
# 2) Target mapping (required for XGBoost)
# -----------------------------
TARGET = "pass_fail"
if TARGET not in df.columns:
    raise ValueError(f"Missing target column: {TARGET}")

# normalize target text
df[TARGET] = df[TARGET].astype(str).str.strip().str.lower()

mapping = {"fail": 0, "pass": 1}
if not set(df[TARGET].unique()).issubset(set(mapping.keys())):
    raise ValueError(f"Unexpected target values: {df[TARGET].unique()} (expected fail/pass)")

y = df[TARGET].map(mapping).astype(int)
X = df.drop(columns=[TARGET])

print("\nTarget mapping:")
print("fail -> 0")
print("pass -> 1")

print("\nClass distribution:")
print(y.value_counts(normalize=True).rename("proportion"))

# -----------------------------
# 3) Stratified split
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    random_state=42,
    stratify=y
)
print(f"‚úì Stratified split done. Train={len(X_train)} Test={len(X_test)}")

# -----------------------------
# 4) Build preprocessing (same philosophy as notebook 02)
# -----------------------------
numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = [c for c in X.columns if c not in numeric_features]

num_standard = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

num_minmax = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", MinMaxScaler())
])

cat_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessors = {
    "standard": ColumnTransformer(
        transformers=[
            ("num", num_standard, numeric_features),
            ("cat", cat_pipe, categorical_features),
        ],
        remainder="drop"
    ),
    "minmax": ColumnTransformer(
        transformers=[
            ("num", num_minmax, numeric_features),
            ("cat", cat_pipe, categorical_features),
        ],
        remainder="drop"
    )
}

# -----------------------------
# 5) MLflow setup (local file tracking in Colab)
# -----------------------------
# This creates /content/mlruns (normal). You do NOT need to move it.
mlflow.set_tracking_uri("file:/content/mlruns")
mlflow.set_experiment("student_performance_classification_optuna")
print("‚úì MLflow tracking:", mlflow.get_tracking_uri())

# -----------------------------
# 6) Optuna objectives (maximize CV F1 macro)
# -----------------------------
optuna.logging.set_verbosity(optuna.logging.WARNING)
sampler = TPESampler(seed=42)

def cv_f1_macro(pipe):
    scores = cross_val_score(
        pipe, X_train, y_train,
        cv=3, scoring="f1_macro", n_jobs=-1
    )
    return float(scores.mean())

def objective_logreg(trial, preprocessor):
    C = trial.suggest_float("C", 0.01, 10.0, log=True)
    pipe = Pipeline(steps=[
        ("preprocess", preprocessor),
        ("model", LogisticRegression(
            C=C, max_iter=3000, class_weight="balanced"
        ))
    ])
    return cv_f1_macro(pipe)

def objective_ridgeclf(trial, preprocessor):
    alpha = trial.suggest_float("alpha", 0.01, 10.0, log=True)
    pipe = Pipeline(steps=[
        ("preprocess", preprocessor),
        ("model", RidgeClassifier(alpha=alpha))
    ])
    return cv_f1_macro(pipe)

def objective_histgb(trial, preprocessor):
    lr = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
    max_depth = trial.suggest_int("max_depth", 2, 10)
    max_leaf_nodes = trial.suggest_int("max_leaf_nodes", 15, 63)

    pipe = Pipeline(steps=[
        ("preprocess", preprocessor),
        ("model", HistGradientBoostingClassifier(
            learning_rate=lr,
            max_depth=max_depth,
            max_leaf_nodes=max_leaf_nodes,
            random_state=42
        ))
    ])
    return cv_f1_macro(pipe)

def objective_xgb(trial, preprocessor):
    n_estimators = trial.suggest_int("n_estimators", 100, 400, step=50)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
    max_depth = trial.suggest_int("max_depth", 2, 10)
    subsample = trial.suggest_float("subsample", 0.6, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.6, 1.0)

    pipe = Pipeline(steps=[
        ("preprocess", preprocessor),
        ("model", XGBClassifier(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            subsample=subsample,
            colsample_bytree=colsample_bytree,
            objective="binary:logistic",
            eval_metric="logloss",
            tree_method="hist",
            random_state=42,
            n_jobs=-1
        ))
    ])
    return cv_f1_macro(pipe)

OBJECTIVES = {
    "logreg": objective_logreg,
    "ridgeclf": objective_ridgeclf,
    "histgb": objective_histgb,
    "xgboost": objective_xgb
}

# -----------------------------
# 7) Run Optuna (for each scaler + each model)
# -----------------------------
N_TRIALS = 20  # increase to 50 if you want stronger tuning
results = []   # store candidates

start_time = time.monotonic()

for prep_name, preprocessor in preprocessors.items():
    for model_name, obj_func in OBJECTIVES.items():
        run_name = f"{prep_name}_{model_name}_optuna"
        print("\n" + "="*90)
        print(f"OPTUNA: {run_name} | trials={N_TRIALS}")
        print("="*90)

        study = optuna.create_study(direction="maximize", sampler=sampler)
        study.optimize(lambda trial: obj_func(trial, preprocessor), n_trials=N_TRIALS)

        best_cv = study.best_value
        best_params = study.best_params
        print("Best CV F1(macro):", round(best_cv, 6))
        print("Best params:", best_params)

        # Build final best pipeline using best params
        if model_name == "logreg":
            model = LogisticRegression(
                C=best_params["C"],
                max_iter=3000,
                class_weight="balanced"
            )
        elif model_name == "ridgeclf":
            model = RidgeClassifier(alpha=best_params["alpha"])
        elif model_name == "histgb":
            model = HistGradientBoostingClassifier(
                learning_rate=best_params["learning_rate"],
                max_depth=best_params["max_depth"],
                max_leaf_nodes=best_params["max_leaf_nodes"],
                random_state=42
            )
        elif model_name == "xgboost":
            model = XGBClassifier(
                n_estimators=best_params["n_estimators"],
                learning_rate=best_params["learning_rate"],
                max_depth=best_params["max_depth"],
                subsample=best_params["subsample"],
                colsample_bytree=best_params["colsample_bytree"],
                objective="binary:logistic",
                eval_metric="logloss",
                tree_method="hist",
                random_state=42,
                n_jobs=-1
            )
        else:
            raise ValueError("Unknown model")

        final_pipe = Pipeline(steps=[
            ("preprocess", preprocessor),
            ("model", model)
        ])

        final_pipe.fit(X_train, y_train)
        preds = final_pipe.predict(X_test)
        test_f1 = f1_score(y_test, preds, average="macro")

        print("Test F1(macro):", round(test_f1, 6))

        # MLflow logging
        with mlflow.start_run(run_name=run_name):
            mlflow.log_param("preprocess", prep_name)
            mlflow.log_param("model_family", model_name)
            mlflow.log_param("n_trials", N_TRIALS)

            for k, v in best_params.items():
                mlflow.log_param(k, v)

            mlflow.log_metric("cv_f1_macro", best_cv)
            mlflow.log_metric("test_f1_macro", test_f1)

            signature = infer_signature(X_train, final_pipe.predict(X_train))
            mlflow.sklearn.log_model(
                sk_model=final_pipe,
                artifact_path="model",
                signature=signature,
                input_example=X_train.head(5),
                registered_model_name=f"{run_name}_pipeline"
            )

        results.append({
            "key": run_name,
            "preprocess": prep_name,
            "model": model_name,
            "cv_f1_macro": best_cv,
            "test_f1_macro": test_f1,
            "best_params": best_params,
            "pipeline": final_pipe
        })

# -----------------------------
# 8) Pick global best by Test F1(macro)
# -----------------------------
results_df = pd.DataFrame([{k:v for k,v in r.items() if k!="pipeline"} for r in results])
results_df = results_df.sort_values("test_f1_macro", ascending=False)

print("\n" + "="*90)
print("GLOBAL RESULTS (top 10 by Test F1 macro)")
print("="*90)
display(results_df.head(10))

best_row = results_df.iloc[0]
best_key = best_row["key"]
best_idx = int(results_df.index[0])
best_pipe = results[best_idx]["pipeline"]

print("\n" + "="*90)
print("GLOBAL BEST MODEL")
print("="*90)
print("Best key:", best_key)
print("Best Test F1(macro):", float(best_row["test_f1_macro"]))
print("Best CV  F1(macro):", float(best_row["cv_f1_macro"]))
print("Best params:", best_row["best_params"])

# Evaluate best with report
best_preds = best_pipe.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, best_preds, target_names=["fail", "pass"]))

# Save confusion matrix to reports
cm = confusion_matrix(y_test, best_preds)
cm_path = REPORTS_DIR / "optuna_best_confusion_matrix.npy"
np.save(cm_path, cm)
print(" Saved confusion matrix:", cm_path)

# Save best model
best_model_path = MODELS_DIR / "best_optuna_classifier.joblib"
joblib.dump(best_pipe, best_model_path)
print(" Saved best model to:", best_model_path)

# Save leaderboard CSV
leaderboard_path = REPORTS_DIR / "optuna_leaderboard.csv"
results_df.to_csv(leaderboard_path, index=False)
print(" Saved optuna leaderboard:", leaderboard_path)

elapsed = time.monotonic() - start_time
print(f"\nElapsed time: {elapsed/60:.2f} minutes")
print("Notebook 03 complete. Next: update FastAPI + Streamlit to load best_optuna_classifier.joblib")


Rows: 20000


Unnamed: 0,student_id,age,gender,internet_access,sleep_hours,sleep_quality,class_attendance,course,study_method,facility_rating,exam_difficulty,study_hours,exam_score,pass_fail
0,1,17,male,yes,7.4,poor,92.9,diploma,coaching,low,hard,2.78,58.9,pass
1,2,23,other,yes,4.6,average,64.8,bca,online videos,medium,moderate,3.37,54.8,pass
2,3,22,male,yes,8.5,poor,76.8,b.sc,coaching,high,moderate,7.88,90.3,pass
3,4,20,other,yes,5.8,average,48.4,diploma,online videos,low,moderate,0.67,29.7,fail
4,5,20,female,yes,9.8,poor,71.6,diploma,coaching,low,moderate,0.89,43.7,fail


2025/12/17 04:30:39 INFO mlflow.tracking.fluent: Experiment with name 'student_performance_classification_optuna' does not exist. Creating a new experiment.



Target mapping:
fail -> 0
pass -> 1

Class distribution:
pass_fail
1    0.7298
0    0.2702
Name: proportion, dtype: float64
‚úì Stratified split done. Train=16000 Test=4000
‚úì MLflow tracking: file:/content/mlruns

OPTUNA: standard_logreg_optuna | trials=20
Best CV F1(macro): 0.992761
Best params: {'C': 9.395445020858189}
Test F1(macro): 0.989641


Successfully registered model 'standard_logreg_optuna_pipeline'.
Created version '1' of model 'standard_logreg_optuna_pipeline'.



OPTUNA: standard_ridgeclf_optuna | trials=20
Best CV F1(macro): 0.984796
Best params: {'alpha': 0.011527987128232402}
Test F1(macro): 0.987579


Successfully registered model 'standard_ridgeclf_optuna_pipeline'.
Created version '1' of model 'standard_ridgeclf_optuna_pipeline'.



OPTUNA: standard_histgb_optuna | trials=20
Best CV F1(macro): 0.998102
Best params: {'learning_rate': 0.05388108577817234, 'max_depth': 2, 'max_leaf_nodes': 59}
Test F1(macro): 0.998102


Successfully registered model 'standard_histgb_optuna_pipeline'.
Created version '1' of model 'standard_histgb_optuna_pipeline'.



OPTUNA: standard_xgboost_optuna | trials=20
Best CV F1(macro): 1.0
Best params: {'n_estimators': 400, 'learning_rate': 0.13962563737015762, 'max_depth': 10, 'subsample': 0.9579309401710595, 'colsample_bytree': 0.8391599915244341}
Test F1(macro): 1.0


Successfully registered model 'standard_xgboost_optuna_pipeline'.
Created version '1' of model 'standard_xgboost_optuna_pipeline'.



OPTUNA: minmax_logreg_optuna | trials=20
Best CV F1(macro): 0.983148
Best params: {'C': 9.386832220365555}
Test F1(macro): 0.981307


Successfully registered model 'minmax_logreg_optuna_pipeline'.
Created version '1' of model 'minmax_logreg_optuna_pipeline'.



OPTUNA: minmax_ridgeclf_optuna | trials=20
Best CV F1(macro): 0.984796
Best params: {'alpha': 0.07400385759087375}
Test F1(macro): 0.987579


Successfully registered model 'minmax_ridgeclf_optuna_pipeline'.
Created version '1' of model 'minmax_ridgeclf_optuna_pipeline'.



OPTUNA: minmax_histgb_optuna | trials=20
Best CV F1(macro): 0.998102
Best params: {'learning_rate': 0.02634777514406047, 'max_depth': 2, 'max_leaf_nodes': 44}
Test F1(macro): 0.998102


Successfully registered model 'minmax_histgb_optuna_pipeline'.
Created version '1' of model 'minmax_histgb_optuna_pipeline'.



OPTUNA: minmax_xgboost_optuna | trials=20
Best CV F1(macro): 1.0
Best params: {'n_estimators': 400, 'learning_rate': 0.022587177280197233, 'max_depth': 3, 'subsample': 0.7957811041110252, 'colsample_bytree': 0.9942601816442402}
Test F1(macro): 1.0





GLOBAL RESULTS (top 10 by Test F1 macro)


Successfully registered model 'minmax_xgboost_optuna_pipeline'.
Created version '1' of model 'minmax_xgboost_optuna_pipeline'.


Unnamed: 0,key,preprocess,model,cv_f1_macro,test_f1_macro,best_params
3,standard_xgboost_optuna,standard,xgboost,1.0,1.0,"{'n_estimators': 400, 'learning_rate': 0.13962..."
7,minmax_xgboost_optuna,minmax,xgboost,1.0,1.0,"{'n_estimators': 400, 'learning_rate': 0.02258..."
6,minmax_histgb_optuna,minmax,histgb,0.998102,0.998102,"{'learning_rate': 0.02634777514406047, 'max_de..."
2,standard_histgb_optuna,standard,histgb,0.998102,0.998102,"{'learning_rate': 0.05388108577817234, 'max_de..."
0,standard_logreg_optuna,standard,logreg,0.992761,0.989641,{'C': 9.395445020858189}
1,standard_ridgeclf_optuna,standard,ridgeclf,0.984796,0.987579,{'alpha': 0.011527987128232402}
5,minmax_ridgeclf_optuna,minmax,ridgeclf,0.984796,0.987579,{'alpha': 0.07400385759087375}
4,minmax_logreg_optuna,minmax,logreg,0.983148,0.981307,{'C': 9.386832220365555}



GLOBAL BEST MODEL
Best key: standard_xgboost_optuna
Best Test F1(macro): 1.0
Best CV  F1(macro): 1.0
Best params: {'n_estimators': 400, 'learning_rate': 0.13962563737015762, 'max_depth': 10, 'subsample': 0.9579309401710595, 'colsample_bytree': 0.8391599915244341}

Classification Report:
              precision    recall  f1-score   support

        fail       1.00      1.00      1.00      1081
        pass       1.00      1.00      1.00      2919

    accuracy                           1.00      4000
   macro avg       1.00      1.00      1.00      4000
weighted avg       1.00      1.00      1.00      4000

 Saved confusion matrix: /content/drive/MyDrive/work/Finalproject/housing_app_fall25-main/reports/optuna_best_confusion_matrix.npy
 Saved best model to: /content/drive/MyDrive/work/Finalproject/housing_app_fall25-main/models/best_optuna_classifier.joblib
 Saved optuna leaderboard: /content/drive/MyDrive/work/Finalproject/housing_app_fall25-main/reports/optuna_leaderboard.csv

Elaps

In [None]:
!cp -r /content/mlruns "{base_folder}/mlruns"


In [None]:
import sqlite3

DB_PATH = "/content/drive/MyDrive/work/Finalproject/housing_app_fall25-main/data/student_performance.db"
conn = sqlite3.connect(DB_PATH)
cur = conn.cursor()

for table in ["students", "courses", "exams"]:
    print(f"\nTABLE: {table}")
    cur.execute(f"PRAGMA table_info({table})")
    for row in cur.fetchall():
        print(row)

conn.close()



TABLE: students
(0, 'student_id', 'INTEGER', 0, None, 1)
(1, 'age', 'INTEGER', 0, None, 0)
(2, 'gender', 'TEXT', 0, None, 0)
(3, 'internet_access', 'TEXT', 0, None, 0)
(4, 'sleep_hours', 'REAL', 0, None, 0)
(5, 'sleep_quality', 'TEXT', 0, None, 0)
(6, 'class_attendance', 'REAL', 0, None, 0)

TABLE: courses
(0, 'course_id', 'INTEGER', 0, None, 1)
(1, 'course_name', 'TEXT', 0, None, 0)
(2, 'study_method', 'TEXT', 0, None, 0)
(3, 'facility_rating', 'TEXT', 0, None, 0)

TABLE: exams
(0, 'exam_id', 'INTEGER', 0, None, 1)
(1, 'student_id', 'INTEGER', 0, None, 0)
(2, 'course_id', 'INTEGER', 0, None, 0)
(3, 'exam_difficulty', 'TEXT', 0, None, 0)
(4, 'study_hours', 'REAL', 0, None, 0)
(5, 'exam_score', 'REAL', 0, None, 0)
(6, 'pass_fail', 'TEXT', 0, None, 0)


In [None]:
import sqlite3
import pandas as pd
conn = sqlite3.connect(f"{base_folder}/data/housing.db")
housing = pd.read_sql_query(
    """
    SELECT
        b.block_id,
        b.longitude,
        b.latitude,
        s.housing_median_age,
        s.total_rooms,
        s.total_bedrooms,
        s.population,
        s.households,
        s.median_income,
        s.median_house_value,
        op.name AS ocean_proximity
    FROM block AS b
    JOIN block_housing_stats AS s
        ON s.block_id = b.block_id
    JOIN ocean_proximity AS op
        ON op.ocean_proximity_id = b.ocean_proximity_id
    ORDER BY b.block_id
    """,
    conn,
)
conn.close()

housing.head()

OperationalError: unable to open database file

In [None]:
# =============================================================================
# FULL PIPELINE with OPTUNA
# - Build preprocessing
# - Stratified train/test split
# - Train & log 4 models WITHOUT PCA (Ridge, HGB, XGBoost, LightGBM)
# - Train & log 4 models WITH PCA (preprocessing + PCA(0.95) + model)
# - Pick GLOBAL best among 8 models by Test MAE
# - Save, load, and compare the global best model
# =============================================================================

import time
import os
import numpy as np
import pandas as pd

from dotenv import load_dotenv

from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler
from sklearn.linear_model import Ridge
from sklearn.ensemble import HistGradientBoostingRegressor

import mlflow
from mlflow.models import infer_signature
import joblib

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

import optuna
from optuna.samplers import TPESampler

from sklearn.base import clone

# Shared components
from housing_pipeline import (
    build_preprocessing,
    make_estimator_for_name,
)

start_time = time.monotonic()
optuna.logging.set_verbosity(optuna.logging.WARNING)


# =============================================================================
# STEP 1: Build Full ML Preprocessing Pipeline
# =============================================================================

preprocessing = build_preprocessing()
print("‚úì STEP 1: Preprocessing pipeline created.")


# =============================================================================
# STEP 2: Split Data into Stratified Train and Test Sets
# =============================================================================

housing["income_cat"] = pd.cut(
    housing["median_income"],
    bins=[0, 1.5, 3.0, 4.5, 6, np.inf],
    labels=[1, 2, 3, 4, 5],
)

train_set, test_set = train_test_split(
    housing,
    test_size=0.20,
    stratify=housing["income_cat"],
    random_state=42,
)

for df in (train_set, test_set):
    df.drop("income_cat", axis=1, inplace=True)

X_train = train_set.drop(["block_id", "median_house_value"], axis=1).copy()
y_train = train_set["median_house_value"].copy()

X_test = test_set.drop(["block_id", "median_house_value"], axis=1).copy()
y_test = test_set["median_house_value"].copy()

print(f"‚úì STEP 2: Stratified split done. Train size: {len(X_train)}, Test size: {len(X_test)}")


# =============================================================================
# STEP 3: Configure MLflow
# =============================================================================

load_dotenv(
    dotenv_path="/content/drive/MyDrive/Colab Notebooks/housing_fall2025/notebooks/.env",
    override=True
)

MLFLOW_TRACKING_URI = os.getenv("MLFLOW_TRACKING_URI")
MLFLOW_TRACKING_USERNAME = os.getenv("MLFLOW_TRACKING_USERNAME")
MLFLOW_TRACKING_PASSWORD = os.getenv("MLFLOW_TRACKING_PASSWORD")

if MLFLOW_TRACKING_USERNAME:
    os.environ["MLFLOW_TRACKING_USERNAME"] = MLFLOW_TRACKING_USERNAME
if MLFLOW_TRACKING_PASSWORD:
    os.environ["MLFLOW_TRACKING_PASSWORD"] = MLFLOW_TRACKING_PASSWORD

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment("median_house_pricing_multi_model_optuna")

print("‚úì STEP 3: MLflow configured.")


# =============================================================================
# STEP 4: Define Optuna Objective Functions (NO PCA)
# =============================================================================

def objective_ridge(trial, preprocessing, X_train, y_train):
    alpha = trial.suggest_float("ridge__alpha", 0.1, 100.0, log=True)
    preprocessing_clone = clone(preprocessing)
    pipeline = make_pipeline(preprocessing_clone, Ridge(alpha=alpha))
    scores = cross_val_score(
        pipeline, X_train, y_train,
        cv=3, scoring="neg_mean_absolute_error", n_jobs=-1
    )
    return -scores.mean()


def objective_hgb(trial, preprocessing, X_train, y_train):
    learning_rate = trial.suggest_float("hgb__learning_rate", 0.05, 0.2)
    max_depth = trial.suggest_int("hgb__max_depth", 3, 8)
    preprocessing_clone = clone(preprocessing)
    pipeline = make_pipeline(
        preprocessing_clone,
        HistGradientBoostingRegressor(
            learning_rate=learning_rate,
            max_depth=max_depth,
            random_state=42
        )
    )
    scores = cross_val_score(
        pipeline, X_train, y_train,
        cv=3, scoring="neg_mean_absolute_error", n_jobs=-1
    )
    return -scores.mean()


def objective_xgb(trial, preprocessing, X_train, y_train):
    learning_rate = trial.suggest_float("xgb__learning_rate", 0.05, 0.2)
    max_depth = trial.suggest_int("xgb__max_depth", 3, 8)
    n_estimators = trial.suggest_int("xgb__n_estimators", 100, 300, step=50)
    preprocessing_clone = clone(preprocessing)
    pipeline = make_pipeline(
        preprocessing_clone,
        XGBRegressor(
            objective="reg:squarederror",
            random_state=42,
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            tree_method="hist",
            n_jobs=-1,
        )
    )
    scores = cross_val_score(
        pipeline, X_train, y_train,
        cv=3, scoring="neg_mean_absolute_error", n_jobs=-1
    )
    return -scores.mean()


def objective_lgbm(trial, preprocessing, X_train, y_train):
    learning_rate = trial.suggest_float("lgbm__learning_rate", 0.05, 0.2)
    num_leaves = trial.suggest_int("lgbm__num_leaves", 20, 80)
    n_estimators = trial.suggest_int("lgbm__n_estimators", 100, 300, step=50)
    preprocessing_clone = clone(preprocessing)
    pipeline = make_pipeline(
        preprocessing_clone,
        LGBMRegressor(
            random_state=42,
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            num_leaves=num_leaves,
            n_jobs=-1,
            verbose=-1,
        )
    )
    scores = cross_val_score(
        pipeline, X_train, y_train,
        cv=3, scoring="neg_mean_absolute_error", n_jobs=-1
    )
    return -scores.mean()


# =============================================================================
# STEP 5: Run Optuna Studies for Each Model (NO PCA)
# =============================================================================

model_names = ["ridge", "histgradientboosting", "xgboost", "lightgbm"]
objective_functions = {
    "ridge": objective_ridge,
    "histgradientboosting": objective_hgb,
    "xgboost": objective_xgb,
    "lightgbm": objective_lgbm,
}

results = {}

for name in model_names:
    print(f"\n{'='*80}")
    print(f"Optimizing {name.upper()} (NO PCA) - 10 trials")
    print(f"{'='*80}")

    study = optuna.create_study(
        direction="minimize",
        sampler=TPESampler(seed=42),
        study_name=f"{name}_study"
    )

    study.optimize(
        lambda trial: objective_functions[name](trial, preprocessing, X_train, y_train),
        n_trials=10,
        show_progress_bar=True
    )

    cv_mae = study.best_value
    print(f"\nBest {name.upper()} CV MAE: ${cv_mae:,.2f}")
    print(f"Best params: {study.best_params}")

    best_params = study.best_params
    preprocessing_clone = clone(preprocessing)

    if name == "ridge":
        final_model = make_pipeline(
            preprocessing_clone,
            Ridge(alpha=best_params["ridge__alpha"])
        )
    elif name == "histgradientboosting":
        final_model = make_pipeline(
            preprocessing_clone,
            HistGradientBoostingRegressor(
                learning_rate=best_params["hgb__learning_rate"],
                max_depth=best_params["hgb__max_depth"],
                random_state=42
            )
        )
    elif name == "xgboost":
        final_model = make_pipeline(
            preprocessing_clone,
            XGBRegressor(
                objective="reg:squarederror",
                random_state=42,
                n_estimators=best_params["xgb__n_estimators"],
                learning_rate=best_params["xgb__learning_rate"],
                max_depth=best_params["xgb__max_depth"],
                tree_method="hist",
                n_jobs=-1,
            )
        )
    elif name == "lightgbm":
        final_model = make_pipeline(
            preprocessing_clone,
            LGBMRegressor(
                random_state=42,
                n_estimators=best_params["lgbm__n_estimators"],
                learning_rate=best_params["lgbm__learning_rate"],
                num_leaves=best_params["lgbm__num_leaves"],
                n_jobs=-1,
                verbose=-1,
            )
        )

    final_model.fit(X_train, y_train)

    y_pred = final_model.predict(X_test)
    test_mae = mean_absolute_error(y_test, y_pred)
    print(f"{name} (no PCA) Test MAE: ${test_mae:,.2f}")

    results[name] = {"pipeline": final_model, "test_mae": test_mae, "cv_mae": cv_mae}

    with mlflow.start_run(run_name=f"{name}_baseline_optuna"):
        mlflow.log_param("model_family", name)
        mlflow.log_param("uses_pca", False)
        mlflow.log_params(best_params)
        mlflow.log_metric("cv_MAE", cv_mae)
        mlflow.log_metric("test_MAE", test_mae)

        signature = infer_signature(X_train, final_model.predict(X_train))
        mlflow.sklearn.log_model(
            sk_model=final_model,
            artifact_path="housing_model",
            signature=signature,
            input_example=X_train,
            registered_model_name=f"{name}_pipeline_optuna",
        )

print("\n‚úì STEP 5: All 4 baseline models optimized and logged.")


# =============================================================================
# STEP 6: PCA Optuna Objectives
# =============================================================================

def objective_ridge_pca(trial, preprocessing, X_train, y_train):
    alpha = trial.suggest_float("ridge__alpha", 0.1, 100.0, log=True)
    pca_components = trial.suggest_float("pca__n_components", 0.90, 0.99)
    preprocessing_clone = clone(preprocessing)
    pipeline = make_pipeline(preprocessing_clone, PCA(n_components=pca_components), Ridge(alpha=alpha))
    scores = cross_val_score(
        pipeline, X_train, y_train,
        cv=3, scoring="neg_mean_absolute_error", n_jobs=-1
    )
    return -scores.mean()


def objective_hgb_pca(trial, preprocessing, X_train, y_train):
    learning_rate = trial.suggest_float("hgb__learning_rate", 0.05, 0.2)
    max_depth = trial.suggest_int("hgb__max_depth", 3, 8)
    pca_components = trial.suggest_float("pca__n_components", 0.90, 0.99)
    preprocessing_clone = clone(preprocessing)
    pipeline = make_pipeline(
        preprocessing_clone,
        PCA(n_components=pca_components),
        HistGradientBoostingRegressor(
            learning_rate=learning_rate,
            max_depth=max_depth,
            random_state=42
        )
    )
    scores = cross_val_score(
        pipeline, X_train, y_train,
        cv=3, scoring="neg_mean_absolute_error", n_jobs=-1
    )
    return -scores.mean()


def objective_xgb_pca(trial, preprocessing, X_train, y_train):
    learning_rate = trial.suggest_float("xgb__learning_rate", 0.05, 0.2)
    max_depth = trial.suggest_int("xgb__max_depth", 3, 8)
    n_estimators = trial.suggest_int("xgb__n_estimators", 100, 300, step=50)
    pca_components = trial.suggest_float("pca__n_components", 0.90, 0.99)
    preprocessing_clone = clone(preprocessing)
    pipeline = make_pipeline(
        preprocessing_clone,
        PCA(n_components=pca_components),
        XGBRegressor(
            objective="reg:squarederror",
            random_state=42,
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            tree_method="hist",
            n_jobs=-1,
        )
    )
    scores = cross_val_score(
        pipeline, X_train, y_train,
        cv=3, scoring="neg_mean_absolute_error", n_jobs=-1
    )
    return -scores.mean()


def objective_lgbm_pca(trial, preprocessing, X_train, y_train):
    learning_rate = trial.suggest_float("lgbm__learning_rate", 0.05, 0.2)
    num_leaves = trial.suggest_int("lgbm__num_leaves", 20, 80)
    n_estimators = trial.suggest_int("lgbm__n_estimators", 100, 300, step=50)
    pca_components = trial.suggest_float("pca__n_components", 0.90, 0.99)
    preprocessing_clone = clone(preprocessing)
    pipeline = make_pipeline(
        preprocessing_clone,
        PCA(n_components=pca_components),
        LGBMRegressor(
            random_state=42,
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            num_leaves=num_leaves,
            n_jobs=-1,
            verbose=-1,
        )
    )
    scores = cross_val_score(
        pipeline, X_train, y_train,
        cv=3, scoring="neg_mean_absolute_error", n_jobs=-1
    )
    return -scores.mean()


# =============================================================================
# STEP 7: Run Optuna Studies for PCA Models
# =============================================================================

pca_model_names = ["ridge_with_pca", "histgradientboosting_with_pca", "xgboost_with_pca", "lightgbm_with_pca"]
pca_objective_functions = {
    "ridge_with_pca": objective_ridge_pca,
    "histgradientboosting_with_pca": objective_hgb_pca,
    "xgboost_with_pca": objective_xgb_pca,
    "lightgbm_with_pca": objective_lgbm_pca,
}

pca_results = {}

for name in pca_model_names:
    base_name = name.replace("_with_pca", "")
    print(f"\n{'='*80}")
    print(f"Optimizing {name.upper()} - 10 trials")
    print(f"{'='*80}")

    study = optuna.create_study(
        direction="minimize",
        sampler=TPESampler(seed=42),
        study_name=f"{name}_study"
    )

    study.optimize(
        lambda trial: pca_objective_functions[name](trial, preprocessing, X_train, y_train),
        n_trials=10,
        show_progress_bar=True
    )

    cv_mae_pca = study.best_value
    print(f"\nBest {name.upper()} CV MAE: ${cv_mae_pca:,.2f}")
    print(f"Best params: {study.best_params}")

    best_params = study.best_params
    preprocessing_clone = clone(preprocessing)

    if base_name == "ridge":
        final_model = make_pipeline(
            preprocessing_clone,
            PCA(n_components=best_params["pca__n_components"]),
            Ridge(alpha=best_params["ridge__alpha"])
        )
    elif base_name == "histgradientboosting":
        final_model = make_pipeline(
            preprocessing_clone,
            PCA(n_components=best_params["pca__n_components"]),
            HistGradientBoostingRegressor(
                learning_rate=best_params["hgb__learning_rate"],
                max_depth=best_params["hgb__max_depth"],
                random_state=42
            )
        )
    elif base_name == "xgboost":
        final_model = make_pipeline(
            preprocessing_clone,
            PCA(n_components=best_params["pca__n_components"]),
            XGBRegressor(
                objective="reg:squarederror",
                random_state=42,
                n_estimators=best_params["xgb__n_estimators"],
                learning_rate=best_params["xgb__learning_rate"],
                max_depth=best_params["xgb__max_depth"],
                tree_method="hist",
                n_jobs=-1,
            )
        )
    elif base_name == "lightgbm":
        final_model = make_pipeline(
            preprocessing_clone,
            PCA(n_components=best_params["pca__n_components"]),
            LGBMRegressor(
                random_state=42,
                n_estimators=best_params["lgbm__n_estimators"],
                learning_rate=best_params["lgbm__learning_rate"],
                num_leaves=best_params["lgbm__num_leaves"],
                n_jobs=-1,
                verbose=-1,
            )
        )

    final_model.fit(X_train, y_train)

    y_pred = final_model.predict(X_test)
    test_mae = mean_absolute_error(y_test, y_pred)
    print(f"{name} Test MAE: ${test_mae:,.2f}")

    pca_results[name] = {"pipeline": final_model, "test_mae": test_mae, "cv_mae": cv_mae_pca}

    with mlflow.start_run(run_name=f"{name}_optuna"):
        mlflow.log_param("model_family", base_name)
        mlflow.log_param("uses_pca", True)
        mlflow.log_params(best_params)
        mlflow.log_metric("cv_MAE", cv_mae_pca)
        mlflow.log_metric("test_MAE", test_mae)

        signature = infer_signature(X_train, final_model.predict(X_train))
        mlflow.sklearn.log_model(
            sk_model=final_model,
            artifact_path="housing_model_with_pca",
            signature=signature,
            input_example=X_train,
            registered_model_name=f"{base_name}_pipeline_with_pca_optuna",
        )

print("\n‚úì STEP 7: All 4 PCA models optimized and logged.")


# =============================================================================
# STEP 8: Choose GLOBAL Best Model
# =============================================================================

all_results = {}
all_results.update(results)
all_results.update(pca_results)

global_best_name = min(all_results, key=lambda k: all_results[k]["test_mae"])
global_best_mae = all_results[global_best_name]["test_mae"]
global_best_cv_mae = all_results[global_best_name]["cv_mae"]
global_best_pipeline = all_results[global_best_name]["pipeline"]

uses_pca = "with_pca" in global_best_name

print("\n" + "=" * 80)
print("GLOBAL BEST MODEL (ACROSS 8 CANDIDATES)")
print("=" * 80)
print(f"Global best model key: {global_best_name}")
print(f"Global best CV MAE:    ${global_best_cv_mae:,.2f}")
print(f"Global best Test MAE:  ${global_best_mae:,.2f}")
print(f"Uses PCA:               {uses_pca}")


# =============================================================================
# STEP 9: Save, Load, and Compare the GLOBAL Best Model
# =============================================================================

def save_model(model, filename="global_best_model_optuna.pkl"):
    joblib.dump(model, filename)
    print(f"‚úì Model saved to {filename}")

print("\n" + "-" * 80)
print("Saving and reloading GLOBAL best model...")
print("-" * 80)

save_model(global_best_pipeline, filename=f"{base_folder}/models/global_best_model_optuna.pkl")

print("\nDone:")
print(f"- GLOBAL best model key: {global_best_name}")
print(f"- GLOBAL best CV MAE:    ${global_best_cv_mae:,.2f}")
print(f"- GLOBAL best Test MAE:  ${global_best_mae:,.2f}")

end_time = time.monotonic()
elapsed_time = end_time - start_time
minutes = int(elapsed_time // 60)
seconds = elapsed_time % 60
print(f"Elapsed time: {minutes} minutes and {seconds:.2f} seconds")