In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install xgboost lightgbm "mlflow<3"



In [None]:
!pip install dagshub

Collecting dagshub
  Downloading dagshub-0.6.4-py3-none-any.whl.metadata (12 kB)
Collecting appdirs>=1.4.4 (from dagshub)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting dacite~=1.6.0 (from dagshub)
  Downloading dacite-1.6.0-py3-none-any.whl.metadata (14 kB)
Collecting gql[requests] (from dagshub)
  Downloading gql-4.0.0-py3-none-any.whl.metadata (10 kB)
Collecting dataclasses-json (from dagshub)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting treelib>=1.6.4 (from dagshub)
  Downloading treelib-1.8.0-py3-none-any.whl.metadata (3.3 kB)
Collecting pathvalidate>=3.0.0 (from dagshub)
  Downloading pathvalidate-3.3.1-py3-none-any.whl.metadata (12 kB)
Collecting boto3 (from dagshub)
  Downloading boto3-1.42.12-py3-none-any.whl.metadata (6.8 kB)
Collecting semver (from dagshub)
  Downloading semver-3.0.4-py3-none-any.whl.metadata (6.8 kB)
Collecting dagshub-annotation-converter>=0.1.12 (from dagshub)
  Downloading dagshub_annota

In [12]:
# =============================================================================
# 02 TRAIN MODELS WITHOUT OPTUNA (8 EXPERIMENTS)
# Classification + DagsHub MLflow (SAFE)
# =============================================================================

# -------------------- INSTALLS (COLAB SAFE) --------------------
%pip install -q dagshub mlflow scikit-learn xgboost

# -------------------- IMPORTS --------------------
import sqlite3
import pandas as pd
import numpy as np
import time

import dagshub
import mlflow
from mlflow.tracking import MlflowClient

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score

from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.decomposition import PCA

# -------------------- DAGSHUB INIT (CRITICAL) --------------------
dagshub.init(
    repo_owner="wasiq0",
    repo_name="FinalProjectEAS508",
    mlflow=True
)

mlflow.set_tracking_uri(
    "https://dagshub.com/wasiq0/FinalProjectEAS508.mlflow"
)

client = MlflowClient()

EXPERIMENT_NAME = "student_classification_no_optuna"
exp = client.get_experiment_by_name(EXPERIMENT_NAME)
if exp is None:
    exp_id = client.create_experiment(EXPERIMENT_NAME)
else:
    exp_id = exp.experiment_id

print("Using experiment:", EXPERIMENT_NAME)

# -------------------- LOAD DATA FROM SQLITE --------------------
DB_PATH = "/content/drive/MyDrive/work/Finalproject/housing_app_fall25-main/data/student_performance.db"
conn = sqlite3.connect(DB_PATH)

df = pd.read_sql_query("""
SELECT
    s.student_id,
    s.age,
    s.gender,
    s.internet_access,
    s.sleep_hours,
    s.sleep_quality,
    s.class_attendance,
    c.course_name AS course,
    c.study_method,
    c.facility_rating,
    e.exam_difficulty,
    e.study_hours,
    e.exam_score,
    e.pass_fail
FROM students s
JOIN exams e ON e.student_id = s.student_id
JOIN courses c ON c.course_id = e.course_id
ORDER BY s.student_id
""", conn)

conn.close()

print("Rows:", len(df))
print("\nClass distribution:\n", df["pass_fail"].value_counts(normalize=True))

# -------------------- TARGET ENCODING --------------------
df["pass_fail"] = df["pass_fail"].map({"fail": 0, "pass": 1})

X = df.drop(columns=["student_id", "pass_fail"])
y = df["pass_fail"]

# -------------------- TRAIN / TEST SPLIT --------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# -------------------- PREPROCESSING --------------------
num_cols = X.select_dtypes(include=["int64", "float64"]).columns
cat_cols = X.select_dtypes(include=["object"]).columns

preprocess = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
])

# -------------------- MODELS --------------------
models = {
    "logreg": LogisticRegression(max_iter=2000),
    "ridgeclf": RidgeClassifier(),
    "histgb": HistGradientBoostingClassifier(),
    "xgboost": XGBClassifier(
        eval_metric="logloss",
        use_label_encoder=False,
        random_state=42
    )
}

# -------------------- TRAINING LOOP (8 RUNS) --------------------
start = time.time()
run_count = 0

for model_name, model in models.items():
    for use_pca in [False, True]:

        run_name = f"{model_name}_{'with_pca' if use_pca else 'no_pca'}"

        print("\n" + "="*80)
        print("Training:", run_name)
        print("="*80)

        steps = [("prep", preprocess)]
        if use_pca:
            steps.append(("pca", PCA(n_components=0.95)))
        steps.append(("model", model))

        pipe = Pipeline(steps)

        cv_f1 = cross_val_score(
            pipe, X_train, y_train,
            scoring="f1", cv=3, n_jobs=-1
        ).mean()

        pipe.fit(X_train, y_train)
        preds = pipe.predict(X_test)
        test_f1 = f1_score(y_test, preds)

        print("CV F1:", round(cv_f1, 4))
        print("Test F1:", round(test_f1, 4))

        # -------------------- SAFE MLFLOW LOGGING --------------------
        with mlflow.start_run(
            experiment_id=exp_id,
            run_name=run_name
        ):
            mlflow.log_param("model", model_name)
            mlflow.log_param("pca", use_pca)
            mlflow.log_param("optuna", False)
            mlflow.log_metric("cv_f1", cv_f1)
            mlflow.log_metric("test_f1", test_f1)
            mlflow.sklearn.log_model(pipe, artifact_path="model")

        run_count += 1

# -------------------- SUMMARY --------------------
elapsed = time.time() - start
print("\n" + "="*80)
print("‚úì 02 EXPERIMENTS COMPLETE")
print("Runs logged:", run_count)
print("Elapsed time:", round(elapsed, 2), "seconds")
print("="*80)


Using experiment: student_classification_no_optuna
Rows: 20000

Class distribution:
 pass_fail
pass    0.87125
fail    0.12875
Name: proportion, dtype: float64

Training: logreg_no_pca
CV F1: 0.9977
Test F1: 0.9987




üèÉ View run logreg_no_pca at: https://dagshub.com/wasiq0/FinalProjectEAS508.mlflow/#/experiments/0/runs/04dad10f97f04be898df02a0cbe25a1b
üß™ View experiment at: https://dagshub.com/wasiq0/FinalProjectEAS508.mlflow/#/experiments/0

Training: logreg_with_pca
CV F1: 0.9807
Test F1: 0.9792




üèÉ View run logreg_with_pca at: https://dagshub.com/wasiq0/FinalProjectEAS508.mlflow/#/experiments/0/runs/e70701a5976a4fa0aed73335e1666f1a
üß™ View experiment at: https://dagshub.com/wasiq0/FinalProjectEAS508.mlflow/#/experiments/0

Training: ridgeclf_no_pca
CV F1: 0.9514
Test F1: 0.9531




üèÉ View run ridgeclf_no_pca at: https://dagshub.com/wasiq0/FinalProjectEAS508.mlflow/#/experiments/0/runs/8fb51963618f47019c0a58bcbd049dc5
üß™ View experiment at: https://dagshub.com/wasiq0/FinalProjectEAS508.mlflow/#/experiments/0

Training: ridgeclf_with_pca
CV F1: 0.9471
Test F1: 0.9502




üèÉ View run ridgeclf_with_pca at: https://dagshub.com/wasiq0/FinalProjectEAS508.mlflow/#/experiments/0/runs/0b776bdebd9644eb816e4ac888a180a2
üß™ View experiment at: https://dagshub.com/wasiq0/FinalProjectEAS508.mlflow/#/experiments/0

Training: histgb_no_pca
CV F1: 0.9991
Test F1: 0.9994




üèÉ View run histgb_no_pca at: https://dagshub.com/wasiq0/FinalProjectEAS508.mlflow/#/experiments/0/runs/14eae57fcf844a64a674c092146bc9ae
üß™ View experiment at: https://dagshub.com/wasiq0/FinalProjectEAS508.mlflow/#/experiments/0

Training: histgb_with_pca
CV F1: 0.9869
Test F1: 0.9855




üèÉ View run histgb_with_pca at: https://dagshub.com/wasiq0/FinalProjectEAS508.mlflow/#/experiments/0/runs/9c27cf668bfc41fc8fd7842afc333289
üß™ View experiment at: https://dagshub.com/wasiq0/FinalProjectEAS508.mlflow/#/experiments/0

Training: xgboost_no_pca


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


CV F1: 0.9996
Test F1: 0.9991




üèÉ View run xgboost_no_pca at: https://dagshub.com/wasiq0/FinalProjectEAS508.mlflow/#/experiments/0/runs/d3c899ffbbbb45339628008c5eefe8a5
üß™ View experiment at: https://dagshub.com/wasiq0/FinalProjectEAS508.mlflow/#/experiments/0

Training: xgboost_with_pca


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


CV F1: 0.9873
Test F1: 0.9865




üèÉ View run xgboost_with_pca at: https://dagshub.com/wasiq0/FinalProjectEAS508.mlflow/#/experiments/0/runs/4a389b6cfd834508990a37f7f565d545
üß™ View experiment at: https://dagshub.com/wasiq0/FinalProjectEAS508.mlflow/#/experiments/0

‚úì 02 EXPERIMENTS COMPLETE
Runs logged: 8
Elapsed time: 113.79 seconds


In [None]:
# ============================================================
# 02_train_models_without_optuna.ipynb
# 8 Experiments (NO Optuna) | Dagshub + MLflow
# ============================================================

import os, sqlite3, time
import numpy as np
import pandas as pd
from pathlib import Path

import dagshub
dagshub.init(repo_owner="wasiq0", repo_name="FinalProjectEAS508", mlflow=True)

import mlflow
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score

from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBClassifier

# ------------------------------------------------------------
# Paths
# ------------------------------------------------------------
BASE_FOLDER = "/content/drive/MyDrive/work/Finalproject/housing_app_fall25-main"
DB_PATH = Path(BASE_FOLDER) / "data" / "student_performance.db"
os.chdir(BASE_FOLDER)

assert DB_PATH.exists(), "‚ùå DB not found"

# ------------------------------------------------------------
# Load data from NORMALIZED DB
# ------------------------------------------------------------
conn = sqlite3.connect(DB_PATH)
df = pd.read_sql_query("""
SELECT
    s.student_id,
    s.age,
    s.gender,
    s.internet_access,
    s.sleep_hours,
    s.sleep_quality,
    s.class_attendance,
    c.course_name AS course,
    c.study_method,
    c.facility_rating,
    e.exam_difficulty,
    e.study_hours,
    e.exam_score,
    e.pass_fail
FROM exams e
JOIN students s ON s.student_id = e.student_id
JOIN courses c ON c.course_id = e.course_id
ORDER BY s.student_id
""", conn)
conn.close()

# ------------------------------------------------------------
# Target mapping
# ------------------------------------------------------------
df["pass_fail"] = df["pass_fail"].map({"fail": 0, "pass": 1})

X = df.drop(columns=["student_id", "pass_fail"])
y = df["pass_fail"]

# ------------------------------------------------------------
# Split (STRATIFIED)
# ------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# ------------------------------------------------------------
# Preprocessing
# ------------------------------------------------------------
num_cols = X.select_dtypes(include=np.number).columns
cat_cols = X.select_dtypes(exclude=np.number).columns

preprocess = ColumnTransformer([
    ("num", Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ]), num_cols),
    ("cat", Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ]), cat_cols)
])

# ------------------------------------------------------------
# Models
# ------------------------------------------------------------
models = {
    "logreg": LogisticRegression(max_iter=2000),
    "ridgeclf": RidgeClassifier(),
    "histgb": HistGradientBoostingClassifier(),
    "xgboost": XGBClassifier(
        objective="binary:logistic",
        eval_metric="logloss",
        use_label_encoder=False
    )
}

# ------------------------------------------------------------
# Run Experiments
# ------------------------------------------------------------
mlflow.set_experiment("student_classification_no_optuna")
start = time.time()
count = 0

for name, model in models.items():
    for use_pca in [False, True]:
        run_name = f"{name}_{'with_pca' if use_pca else 'no_pca'}"
        print(f"\nTraining {run_name}")

        steps = [("prep", preprocess)]
        if use_pca:
            steps.append(("pca", PCA(n_components=0.95)))
        steps.append(("model", model))

        pipe = Pipeline(steps)

        cv_f1 = cross_val_score(pipe, X_train, y_train, cv=3, scoring="f1").mean()
        pipe.fit(X_train, y_train)
        test_f1 = f1_score(y_test, pipe.predict(X_test))

        with mlflow.start_run(run_name=run_name):
            mlflow.log_param("model", name)
            mlflow.log_param("pca", use_pca)
            mlflow.log_metric("cv_f1", cv_f1)
            mlflow.log_metric("test_f1", test_f1)
            mlflow.sklearn.log_model(pipe, run_name)

        print(f"CV F1={cv_f1:.4f} | Test F1={test_f1:.4f}")
        count += 1

print(f"\n‚úÖ 02 COMPLETE ‚Äî Experiments run: {count}")
print(f"Elapsed: {time.time() - start:.2f}s")



Training logreg_no_pca




CV F1=0.9977 | Test F1=0.9987

Training logreg_with_pca




CV F1=0.9807 | Test F1=0.9792

Training ridgeclf_no_pca




CV F1=0.9514 | Test F1=0.9531

Training ridgeclf_with_pca




CV F1=0.9471 | Test F1=0.9502

Training histgb_no_pca




CV F1=0.9989 | Test F1=0.9990

Training histgb_with_pca


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


CV F1=0.9867 | Test F1=0.9862

Training xgboost_no_pca


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


CV F1=0.9996 | Test F1=0.9991

Training xgboost_with_pca


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


CV F1=0.9873 | Test F1=0.9865

‚úÖ 02 COMPLETE ‚Äî Experiments run: 8
Elapsed: 58.88s


In [None]:
# =============================================================================
# 02_train_models_without_optuna.ipynb
# Classification experiments (NO Optuna)
# Models: LogReg, Ridge, HistGB, XGBoost
# PCA: Yes / No
# Metric: F1 (macro)
# =============================================================================

from pathlib import Path
import sqlite3
import pandas as pd
import numpy as np
import time

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score

from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBClassifier

import mlflow
import joblib

# -----------------------------------------------------------------------------
# Paths
# -----------------------------------------------------------------------------
BASE_DIR = Path("/content/drive/MyDrive/work/Finalproject/housing_app_fall25-main")
DATA_DIR = BASE_DIR / "data"
MODELS_DIR = BASE_DIR / "models"
MODELS_DIR.mkdir(exist_ok=True)

DB_PATH = DATA_DIR / "student_performance.db"

print("DB:", DB_PATH)

# -----------------------------------------------------------------------------
# Load data from SQLite
# -----------------------------------------------------------------------------
conn = sqlite3.connect(DB_PATH)
df = pd.read_sql_query(
    """
    SELECT
        s.student_id,
        s.age,
        s.gender,
        s.internet_access,
        s.sleep_hours,
        s.sleep_quality,
        s.class_attendance,
        c.course_name AS course,
        c.study_method,
        c.facility_rating,
        e.exam_difficulty,
        e.study_hours,
        e.exam_score,
        e.pass_fail
    FROM students s
    JOIN courses c ON c.course_id = (
        SELECT course_id FROM exams ex WHERE ex.student_id = s.student_id LIMIT 1
    )
    JOIN exams e ON e.student_id = s.student_id
    ORDER BY s.student_id
    """,
    conn,
)
conn.close()

print("Rows:", len(df))
display(df.head())

# -----------------------------------------------------------------------------
# Encode target
# -----------------------------------------------------------------------------
df["pass_fail"] = df["pass_fail"].map({"fail": 0, "pass": 1})

X = df.drop(columns=["student_id", "pass_fail", "exam_score"])
y = df["pass_fail"]

print("\nClass distribution:")
print(y.value_counts(normalize=True))

# -----------------------------------------------------------------------------
# Train / Test split (STRATIFIED)
# -----------------------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
    stratify=y,
)

print(f"\n‚úì Stratified split done. Train={len(X_train)} Test={len(X_test)}")

# -----------------------------------------------------------------------------
# Preprocessing
# -----------------------------------------------------------------------------
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in X.columns if c not in num_cols]

numeric_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

categorical_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

preprocessing = ColumnTransformer([
    ("num", numeric_pipe, num_cols),
    ("cat", categorical_pipe, cat_cols),
])

# -----------------------------------------------------------------------------
# Models (NO tuning)
# -----------------------------------------------------------------------------
models = {
    "logreg": LogisticRegression(max_iter=2000),
    "ridgeclf": RidgeClassifier(),
    "histgb": HistGradientBoostingClassifier(random_state=42),
    "xgboost": XGBClassifier(
        eval_metric="logloss",
        random_state=42,
        use_label_encoder=False
    ),
}

# -----------------------------------------------------------------------------
# MLflow (local now, Dagshub later)
# -----------------------------------------------------------------------------
mlflow.set_tracking_uri("file:/content/mlruns")
mlflow.set_experiment("student_classification_no_optuna")

# -----------------------------------------------------------------------------
# Run experiments
# -----------------------------------------------------------------------------
results = {}
start_time = time.time()

for use_pca in [False, True]:
    for name, model in models.items():

        exp_name = f"{name}_{'with_pca' if use_pca else 'no_pca'}"

        steps = [("prep", preprocessing)]

        if use_pca:
            steps.append(("pca", PCA(n_components=0.95)))

        steps.append(("model", model))

        pipe = Pipeline(steps)

        print("\n" + "="*80)
        print(f"Training: {exp_name}")
        print("="*80)

        cv_scores = cross_val_score(
            pipe,
            X_train,
            y_train,
            cv=3,
            scoring="f1_macro",
            n_jobs=-1,
        )
        cv_f1 = cv_scores.mean()

        pipe.fit(X_train, y_train)
        preds = pipe.predict(X_test)
        test_f1 = f1_score(y_test, preds, average="macro")

        print(f"{exp_name} CV F1:  {cv_f1:.4f}")
        print(f"{exp_name} Test F1:{test_f1:.4f}")

        results[exp_name] = test_f1

        with mlflow.start_run(run_name=exp_name):
            mlflow.log_param("model", name)
            mlflow.log_param("pca", use_pca)
            mlflow.log_param("tuned", False)
            mlflow.log_metric("cv_f1", cv_f1)
            mlflow.log_metric("test_f1", test_f1)

            mlflow.sklearn.log_model(
                pipe,
                artifact_path="model",
                registered_model_name=exp_name,
            )

        joblib.dump(pipe, MODELS_DIR / f"{exp_name}.joblib")

end_time = time.time()

print("\n" + "="*80)
print("‚úì 02 EXPERIMENTS COMPLETE")
print("="*80)
print("Experiments run:", len(results))
print("Elapsed time:", round(end_time - start_time, 2), "seconds")


DB: /content/drive/MyDrive/work/Finalproject/housing_app_fall25-main/data/student_performance.db
Rows: 20000


Unnamed: 0,student_id,age,gender,internet_access,sleep_hours,sleep_quality,class_attendance,course,study_method,facility_rating,exam_difficulty,study_hours,exam_score,pass_fail
0,1,17,male,yes,7.4,poor,92.9,diploma,coaching,low,hard,2.78,58.9,pass
1,2,23,other,yes,4.6,average,64.8,bca,online videos,medium,moderate,3.37,54.8,pass
2,3,22,male,yes,8.5,poor,76.8,b.sc,coaching,high,moderate,7.88,90.3,pass
3,4,20,other,yes,5.8,average,48.4,diploma,online videos,low,moderate,0.67,29.7,fail
4,5,20,female,yes,9.8,poor,71.6,diploma,coaching,low,moderate,0.89,43.7,pass


2025/12/17 22:15:35 INFO mlflow.tracking.fluent: Experiment with name 'student_classification_no_optuna' does not exist. Creating a new experiment.



Class distribution:
pass_fail
1    0.87125
0    0.12875
Name: proportion, dtype: float64

‚úì Stratified split done. Train=16000 Test=4000

Training: logreg_no_pca
logreg_no_pca CV F1:  0.7579
logreg_no_pca Test F1:0.7588


Successfully registered model 'logreg_no_pca'.
Created version '1' of model 'logreg_no_pca'.



Training: ridgeclf_no_pca
ridgeclf_no_pca CV F1:  0.5320
ridgeclf_no_pca Test F1:0.5324


Successfully registered model 'ridgeclf_no_pca'.
Created version '1' of model 'ridgeclf_no_pca'.



Training: histgb_no_pca
histgb_no_pca CV F1:  0.7450
histgb_no_pca Test F1:0.7608


Successfully registered model 'histgb_no_pca'.
Created version '1' of model 'histgb_no_pca'.



Training: xgboost_no_pca


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


xgboost_no_pca CV F1:  0.7313
xgboost_no_pca Test F1:0.7518


Successfully registered model 'xgboost_no_pca'.
Created version '1' of model 'xgboost_no_pca'.



Training: logreg_with_pca
logreg_with_pca CV F1:  0.7570
logreg_with_pca Test F1:0.7614


Successfully registered model 'logreg_with_pca'.
Created version '1' of model 'logreg_with_pca'.



Training: ridgeclf_with_pca
ridgeclf_with_pca CV F1:  0.5307
ridgeclf_with_pca Test F1:0.5307


Successfully registered model 'ridgeclf_with_pca'.
Created version '1' of model 'ridgeclf_with_pca'.



Training: histgb_with_pca
histgb_with_pca CV F1:  0.7361
histgb_with_pca Test F1:0.7611


Successfully registered model 'histgb_with_pca'.
Created version '1' of model 'histgb_with_pca'.



Training: xgboost_with_pca


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


xgboost_with_pca CV F1:  0.7306
xgboost_with_pca Test F1:0.7440





‚úì 02 EXPERIMENTS COMPLETE
Experiments run: 8
Elapsed time: 60.19 seconds


Successfully registered model 'xgboost_with_pca'.
Created version '1' of model 'xgboost_with_pca'.


In [None]:
base_folder = "/content/drive/MyDrive/work/Finalproject/housing_app_fall25-main"
%cd "{base_folder}"

/content/drive/MyDrive/work/Finalproject/housing_app_fall25-main


In [None]:
# =============================================================================
# FULL CLASSIFICATION PIPELINE (NO OPTUNA) ‚Äî FIXED FOR XGBOOST
# =============================================================================

import os
import time
import sqlite3
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (
    OneHotEncoder,
    StandardScaler,
    MinMaxScaler,
    LabelEncoder,
)
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import f1_score, classification_report
from xgboost import XGBClassifier

import mlflow
from mlflow.models import infer_signature
import joblib

# -----------------------------------------------------------------------------
# CONFIG
# -----------------------------------------------------------------------------
BASE_FOLDER = "/content/drive/MyDrive/work/Finalproject/housing_app_fall25-main"
DB_PATH = f"{BASE_FOLDER}/data/student_performance.db"
EXPERIMENT_NAME = "student_performance_classification_no_optuna"

start_time = time.monotonic()

# -----------------------------------------------------------------------------
# STEP 1: Load data from SQLite
# -----------------------------------------------------------------------------
conn = sqlite3.connect(DB_PATH)

df = pd.read_sql(
    """
    SELECT
        s.student_id,
        s.age,
        s.gender,
        s.internet_access,
        s.sleep_hours,
        s.sleep_quality,
        s.class_attendance,
        c.course_name AS course,
        c.study_method,
        c.facility_rating,
        e.exam_difficulty,
        e.study_hours,
        e.exam_score,
        e.pass_fail
    FROM exams e
    JOIN students s ON e.student_id = s.student_id
    JOIN courses c ON e.course_id = c.course_id
    """,
    conn,
)

conn.close()

print("Rows:", len(df))
display(df.head(3))

# -----------------------------------------------------------------------------
# STEP 2: Target encoding (CRITICAL FIX)
# -----------------------------------------------------------------------------
X = df.drop(columns=["pass_fail"])
y = df["pass_fail"]

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

print("Target mapping:")
for cls, idx in zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)):
    print(f"{cls} -> {idx}")

# -----------------------------------------------------------------------------
# STEP 3: Stratified split
# -----------------------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.20,
    random_state=42,
    stratify=y_encoded,
)

print(f"‚úì Stratified split done. Train={len(X_train)} Test={len(X_test)}")

# -----------------------------------------------------------------------------
# STEP 4: Preprocessing
# -----------------------------------------------------------------------------
numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = [c for c in X.columns if c not in numeric_features]

numeric_standard = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

numeric_minmax = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", MinMaxScaler()),
    ]
)

categorical_pipe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

preprocessors = {
    "standard": ColumnTransformer(
        [
            ("num", numeric_standard, numeric_features),
            ("cat", categorical_pipe, categorical_features),
        ]
    ),
    "minmax": ColumnTransformer(
        [
            ("num", numeric_minmax, numeric_features),
            ("cat", categorical_pipe, categorical_features),
        ]
    ),
}

# -----------------------------------------------------------------------------
# STEP 5: Models
# -----------------------------------------------------------------------------
models = {
    "logreg": LogisticRegression(max_iter=2000),
    "ridgeclf": RidgeClassifier(),
    "histgb": HistGradientBoostingClassifier(random_state=42),
    "xgboost": XGBClassifier(
        objective="binary:logistic",
        eval_metric="logloss",
        use_label_encoder=False,
        random_state=42,
        n_estimators=300,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        tree_method="hist",
    ),
}

# -----------------------------------------------------------------------------
# STEP 6: MLflow
# -----------------------------------------------------------------------------
mlflow.set_tracking_uri("file:/content/mlruns")
mlflow.set_experiment(EXPERIMENT_NAME)

# -----------------------------------------------------------------------------
# STEP 7: Train + Evaluate
# -----------------------------------------------------------------------------
results = {}

for prep_name, preprocessor in preprocessors.items():
    for model_name, model in models.items():
        run_name = f"{prep_name}_{model_name}"
        print("\n" + "=" * 80)
        print(f"Training baseline: {run_name}")
        print("=" * 80)

        pipe = Pipeline(
            steps=[
                ("preprocess", preprocessor),
                ("model", model),
            ]
        )

        cv_scores = cross_val_score(
            pipe,
            X_train,
            y_train,
            cv=3,
            scoring="f1_macro",
            n_jobs=-1,
        )
        cv_f1 = cv_scores.mean()

        pipe.fit(X_train, y_train)
        preds = pipe.predict(X_test)
        test_f1 = f1_score(y_test, preds, average="macro")

        print(f"{run_name} CV F1(macro):  {cv_f1:.4f}")
        print(f"{run_name} Test F1(macro): {test_f1:.4f}")

        results[run_name] = {
            "pipeline": pipe,
            "cv_f1": cv_f1,
            "test_f1": test_f1,
        }

        with mlflow.start_run(run_name=run_name):
            mlflow.log_param("preprocessor", prep_name)
            mlflow.log_param("model", model_name)
            mlflow.log_metric("cv_f1_macro", cv_f1)
            mlflow.log_metric("test_f1_macro", test_f1)

            signature = infer_signature(X_train, pipe.predict(X_train))
            mlflow.sklearn.log_model(
                pipe,
                artifact_path="classification_model",
                signature=signature,
                input_example=X_train,
                registered_model_name=f"{run_name}_pipeline",
            )

# -----------------------------------------------------------------------------
# STEP 8: Select GLOBAL BEST
# -----------------------------------------------------------------------------
best_key = max(results, key=lambda k: results[k]["test_f1"])
best_model = results[best_key]["pipeline"]

print("\n" + "=" * 80)
print("GLOBAL BEST MODEL")
print("=" * 80)
print("Best key:", best_key)
print("Best Test F1(macro):", results[best_key]["test_f1"])

# -----------------------------------------------------------------------------
# STEP 9: Save model
# -----------------------------------------------------------------------------
MODEL_PATH = f"{BASE_FOLDER}/models/best_classification_model.joblib"
joblib.dump(
    {
        "model": best_model,
        "label_encoder": label_encoder,
    },
    MODEL_PATH,
)

print("‚úì Saved best model to:", MODEL_PATH)

# -----------------------------------------------------------------------------
# STEP 10: Final report
# -----------------------------------------------------------------------------
best_preds = best_model.predict(X_test)

print("\nClassification Report (decoded):")
print(
    classification_report(
        label_encoder.inverse_transform(y_test),
        label_encoder.inverse_transform(best_preds),
    )
)

elapsed = time.monotonic() - start_time
print(f"\nElapsed time: {elapsed/60:.2f} minutes")


Rows: 20000


Unnamed: 0,student_id,age,gender,internet_access,sleep_hours,sleep_quality,class_attendance,course,study_method,facility_rating,exam_difficulty,study_hours,exam_score,pass_fail
0,1,17,male,yes,7.4,poor,92.9,diploma,coaching,low,hard,2.78,58.9,pass
1,2,23,other,yes,4.6,average,64.8,bca,online videos,medium,moderate,3.37,54.8,pass
2,3,22,male,yes,8.5,poor,76.8,b.sc,coaching,high,moderate,7.88,90.3,pass


Target mapping:
fail -> 0
pass -> 1
‚úì Stratified split done. Train=16000 Test=4000

Training baseline: standard_logreg
standard_logreg CV F1(macro):  0.9946
standard_logreg Test F1(macro): 0.9949


Registered model 'standard_logreg_pipeline' already exists. Creating a new version of this model...
Created version '2' of model 'standard_logreg_pipeline'.



Training baseline: standard_ridgeclf
standard_ridgeclf CV F1(macro):  0.9848
standard_ridgeclf Test F1(macro): 0.9876


Registered model 'standard_ridgeclf_pipeline' already exists. Creating a new version of this model...
Created version '2' of model 'standard_ridgeclf_pipeline'.



Training baseline: standard_histgb
standard_histgb CV F1(macro):  0.9977
standard_histgb Test F1(macro): 0.9981


Registered model 'standard_histgb_pipeline' already exists. Creating a new version of this model...
Created version '2' of model 'standard_histgb_pipeline'.



Training baseline: standard_xgboost


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


standard_xgboost CV F1(macro):  1.0000
standard_xgboost Test F1(macro): 1.0000


Successfully registered model 'standard_xgboost_pipeline'.
Created version '1' of model 'standard_xgboost_pipeline'.



Training baseline: minmax_logreg
minmax_logreg CV F1(macro):  0.9800
minmax_logreg Test F1(macro): 0.9864


Successfully registered model 'minmax_logreg_pipeline'.
Created version '1' of model 'minmax_logreg_pipeline'.



Training baseline: minmax_ridgeclf
minmax_ridgeclf CV F1(macro):  0.9843
minmax_ridgeclf Test F1(macro): 0.9879


Successfully registered model 'minmax_ridgeclf_pipeline'.
Created version '1' of model 'minmax_ridgeclf_pipeline'.



Training baseline: minmax_histgb
minmax_histgb CV F1(macro):  0.9977
minmax_histgb Test F1(macro): 0.9981


Successfully registered model 'minmax_histgb_pipeline'.
Created version '1' of model 'minmax_histgb_pipeline'.



Training baseline: minmax_xgboost


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


minmax_xgboost CV F1(macro):  1.0000
minmax_xgboost Test F1(macro): 1.0000





GLOBAL BEST MODEL
Best key: standard_xgboost
Best Test F1(macro): 1.0
‚úì Saved best model to: /content/drive/MyDrive/work/Finalproject/housing_app_fall25-main/models/best_classification_model.joblib

Classification Report (decoded):
              precision    recall  f1-score   support

        fail       0.43      1.00      0.60      1081
        pass       1.00      0.50      0.67      2919

    accuracy                           0.64      4000
   macro avg       0.71      0.75      0.63      4000
weighted avg       0.84      0.64      0.65      4000


Elapsed time: 1.22 minutes


Successfully registered model 'minmax_xgboost_pipeline'.
Created version '1' of model 'minmax_xgboost_pipeline'.
