# 08 Optimized Word2Vec Model Training

## 1 Imports

In [14]:
import psutil, os
print([p.info for p in psutil.process_iter(['pid','name']) if 'python' in p.info['name'].lower()])

[{'pid': 624, 'name': 'python.exe'}, {'pid': 8760, 'name': 'python.exe'}, {'pid': 10172, 'name': 'python.exe'}, {'pid': 10408, 'name': 'python.exe'}, {'pid': 11200, 'name': 'python.exe'}, {'pid': 11552, 'name': 'python.exe'}, {'pid': 20524, 'name': 'python.exe'}, {'pid': 20940, 'name': 'python.exe'}, {'pid': 21696, 'name': 'python.exe'}, {'pid': 25992, 'name': 'python.exe'}, {'pid': 30956, 'name': 'python.exe'}, {'pid': 32132, 'name': 'python.exe'}]


In [2]:
import importlib
import pandas as pd
import numpy as np
import os
from sklearn.metrics import roc_auc_score, make_scorer

from src.models import (
    get_classifiers,
    get_param_distributions,
    get_n_iter_random_per_clf,
    repeated_cv_with_mixed_search,
    auc_scorer
)

from src.resampling import (
    resample_training_data,
    print_class_balance
)

from src.utils import resolve_path
from src.evaluation import export_summary

print("‚úÖ All modules reloaded successfully (models, resampling, utils, evaluation).")


‚úÖ All modules reloaded successfully (models, resampling, utils, evaluation).


## 2 Load optimized Radiology train/test sets

In [3]:
variants = ["w2v_optimized_radiology"]
datasets = {}

for variant in variants:
    X_train = pd.read_csv(resolve_path(f"data/processed/{variant}/data_{variant}_xtrain.csv"))
    X_test  = pd.read_csv(resolve_path(f"data/processed/{variant}/data_{variant}_xtest.csv"))
    y_train = pd.read_csv(resolve_path(f"data/processed/{variant}/data_{variant}_ytrain.csv")).squeeze()
    y_test  = pd.read_csv(resolve_path(f"data/processed/{variant}/data_{variant}_ytest.csv")).squeeze()

    datasets[variant] = {
        "X_train": X_train,
        "X_test": X_test,
        "y_train": y_train,
        "y_test": y_test
    }

    print(f"‚úÖ Loaded {variant} dataset ‚Üí Train: {X_train.shape}, Test: {X_test.shape}")
    print_class_balance(y_train, f"{variant} training set (before SMOTE)")

‚úÖ Loaded w2v_optimized_radiology dataset ‚Üí Train: (4166, 147), Test: (1042, 147)
w2v_optimized_radiology training set (before SMOTE) class balance: {0: 3204, 1: 962}


## 3 Apply SMOTE Resampling

In [4]:
for variant, data in datasets.items():
    X_train_res, y_train_res = resample_training_data(
        data["X_train"], data["y_train"], method="smote"
    )
    datasets[variant]["X_train_res"] = X_train_res
    datasets[variant]["y_train_res"] = y_train_res
    print_class_balance(y_train_res, f"{variant} training set (after SMOTE)")



üîÅ Applying SMOTE to training data ...
‚úÖ Resampled training set shape: (6408, 147)
   Class balance after resampling: Counter({0: 3204, 1: 3204})
w2v_optimized_radiology training set (after SMOTE) class balance: {0: 3204, 1: 3204}


## 4 Save resampled SMOTE Sets

In [5]:
for variant, data in datasets.items():
    # Create variant-specific directory
    out_dir = resolve_path(f"data/processed/{variant}")
    os.makedirs(out_dir, exist_ok=True)

    # Save SMOTE-balanced training data and labels separately
    X_train_res = pd.DataFrame(data["X_train_res"])
    y_train_res = pd.Series(data["y_train_res"], name="target")

    # Save consistent with 03_feature_engineering style
    X_train_res.to_csv(os.path.join(out_dir, f"data_{variant}_xtrain_res.csv"), index=False)
    y_train_res.to_csv(os.path.join(out_dir, f"data_{variant}_ytrain_res.csv"), index=False)

    print(f"‚úÖ Saved SMOTE-balanced training sets for {variant} under {out_dir}")

‚úÖ Saved SMOTE-balanced training sets for w2v_optimized_radiology under C:\Users\tyler\OneDrive - University of Pittsburgh\BIOST 2021 Thesis\Masters-Thesis\data\processed\w2v_optimized_radiology


## 5 Remove Subject ID

In [6]:
# ============================================================
# Remove subject_id column from all X_* datasets (post-SMOTE)
# ============================================================
mode = "w2v_optimized_radiology"

# Unpack dataset prepared in Step 1
X_train = datasets[mode]["X_train"]
X_test  = datasets[mode]["X_test"]
y_train = datasets[mode]["y_train"]
y_test  = datasets[mode]["y_test"]
X_train_res = datasets[mode]["X_train_res"]
y_train_res = datasets[mode]["y_train_res"]

drop_cols = ["subject_id", "first_hosp_stay", "suspected_infection", "sepsis3"]

# Drop from original scaled train/test sets (if still present)
if "subject_id" in X_train.columns:
    X_train = X_train.drop(columns=drop_cols)
if "subject_id" in X_test.columns:
    X_test = X_test.drop(columns=drop_cols)

# Drop from resampled training set
if "subject_id" in X_train_res.columns:
    X_train_res = X_train_res.drop(columns=drop_cols)

print("‚úÖ Removed 'subject_id' column from all X_* datasets.")
print(f"X_train: {X_train.shape}, X_train_res: {X_train_res.shape}, X_test: {X_test.shape}")


‚úÖ Removed 'subject_id' column from all X_* datasets.
X_train: (4166, 143), X_train_res: (6408, 143), X_test: (1042, 143)


## 6 Define Classifiers & Hyperparameter Distributions

In [12]:
classifiers = get_classifiers()
param_spaces = get_param_distributions()
n_iter_random_per_clf = get_n_iter_random_per_clf()

print("‚úÖ Classifiers and hyperparameter grids initialized.")
print("Available classifiers:", list(classifiers.keys()))

# Top 6 classifiers by baseline AUROC
selected_classifiers = [
    "CatBoost",
    "LogisticRegression",
    "GradientBoosting",
    "XGB",
    "LGBM",
    "RandomForest"
]

# Subset dictionaries
classifiers_subset = {k: v for k, v in classifiers.items() if k in selected_classifiers}
param_spaces_subset = {k: v for k, v in param_spaces.items() if k in selected_classifiers}

# Iteration configuration: per-clf dictionary only
n_iter_random_subset = {k: n_iter_random_per_clf.get(k, 50) for k in selected_classifiers}

print("‚úÖ Classifiers subset initialized.")
print("Included:", list(classifiers_subset.keys()))
print("n_iter_random_subset:", n_iter_random_subset)

‚úÖ Classifiers and hyperparameter grids initialized.
Available classifiers: ['LogisticRegression', 'DecisionTree', 'RandomForest', 'GradientBoosting', 'XGB', 'LGBM', 'CatBoost', 'SVC', 'MLP', 'NaiveBayes']
‚úÖ Classifiers subset initialized.
Included: ['LogisticRegression', 'RandomForest', 'GradientBoosting', 'XGB', 'LGBM', 'CatBoost']
n_iter_random_subset: {'CatBoost': 75, 'LogisticRegression': 50, 'GradientBoosting': 75, 'XGB': 75, 'LGBM': 75, 'RandomForest': 75}


## 7 Classifier Re-Training with Optimized Radiology Embeddings

In [13]:
# ============================================================
# Run repeated cross-validation with mixed search strategy
# ============================================================
# Define variant mode for saving and MLflow experiment tagging
mode = "w2v_optimized_radiology"
save_prefix = f"results/models/{mode}/"

results_opt, summary_opt = repeated_cv_with_mixed_search(
    X_train,
    y_train,
    X_test,
    y_test,
    classifiers=classifiers_subset,
    param_spaces=param_spaces_subset,
    X_train_smote=X_train_res,
    y_train_smote=y_train_res,
    n_splits=5,
    n_repeats=10,
    scoring=auc_scorer,  # matches Task 4
    n_iter_random=None,
    n_iter_random_per_clf=n_iter_random_subset,
    save_prefix=save_prefix,
    mode=mode,
    log_mlflow=True,
)

# ============================================================
# Export summary to reports/ for unified tracking
# ============================================================
export_summary(summary_opt, save_prefix="reports/", mode=mode)
print(f"‚úÖ Finished model training for {mode} dataset.")

‚úÖ MLflow tracking initialized under unified experiment 'Thesis_ModelTraining'
Tracking URI: file:///C:/Users/tyler/OneDrive%20-%20University%20of%20Pittsburgh/BIOST%202021%20Thesis/Masters-Thesis/mlflow_tracking (Experiment ID: 169692831354922862)

üîπ Running LogisticRegression...
Fitting 50 folds for each of 44 candidates, totalling 2200 fits




   Performing descriptive StratifiedKFold CV on original training set for LogisticRegression...




   Descriptive CV AUC: 0.7417 ¬± 0.0184
üíæ Saved LogisticRegression model to C:\Users\tyler\OneDrive - University of Pittsburgh\BIOST 2021 Thesis\Masters-Thesis\results\models\w2v_optimized_radiology\w2v_optimized_radiology_20251027_2357\w2v_optimized_radiology_20251027_2357_LogisticRegression_model.pkl
‚úÖ LogisticRegression done. Best params: {'clf__C': 0.1, 'clf__l1_ratio': 0, 'clf__max_iter': 1000, 'clf__penalty': 'l1', 'clf__solver': 'saga'}
   CV ROC-AUC: 0.7449 ¬± 0.018
   Holdout ROC-AUC: 0.757
üíæ Saved non-SMOTE metrics for LogisticRegression to C:\Users\tyler\OneDrive - University of Pittsburgh\BIOST 2021 Thesis\Masters-Thesis\results\models\w2v_optimized_radiology\w2v_optimized_radiology_20251027_2357\w2v_optimized_radiology_20251027_2357_LogisticRegression_metrics_non_smote.json




   SMOTE Holdout ROC-AUC: 0.7530
   Performing descriptive StratifiedKFold CV on SMOTE training set for LogisticRegression...




   Descriptive CV AUC (SMOTE): 0.7792 ¬± 0.0106
   (5 valid folds out of 5)
üíæ Saved SMOTE-trained LogisticRegression model to C:\Users\tyler\OneDrive - University of Pittsburgh\BIOST 2021 Thesis\Masters-Thesis\results\models\w2v_optimized_radiology\w2v_optimized_radiology_20251027_2357\w2v_optimized_radiology_20251027_2357_LogisticRegression_smote_model.pkl
üíæ Saved SMOTE metrics for LogisticRegression to C:\Users\tyler\OneDrive - University of Pittsburgh\BIOST 2021 Thesis\Masters-Thesis\results\models\w2v_optimized_radiology\w2v_optimized_radiology_20251027_2357\w2v_optimized_radiology_20251027_2357_LogisticRegression_metrics_smote.json
‚è±Ô∏è  Runtime for LogisticRegression: 4.64 minutes
üèÅ MLflow run for 'LogisticRegression' closed cleanly.

üîπ Running RandomForest...
Fitting 50 folds for each of 75 candidates, totalling 3750 fits
   Performing descriptive StratifiedKFold CV on original training set for RandomForest...
   Descriptive CV AUC: 0.7345 ¬± 0.0214
üíæ Saved Rand

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


   Performing descriptive StratifiedKFold CV on original training set for XGB...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


   Descriptive CV AUC: 0.7496 ¬± 0.0188
üíæ Saved XGB model to C:\Users\tyler\OneDrive - University of Pittsburgh\BIOST 2021 Thesis\Masters-Thesis\results\models\w2v_optimized_radiology\w2v_optimized_radiology_20251027_2357\w2v_optimized_radiology_20251027_2357_XGB_model.pkl
‚úÖ XGB done. Best params: {'clf__colsample_bytree': 0.9454044297767479, 'clf__gamma': 0.4303652916281717, 'clf__learning_rate': 0.01208563915935721, 'clf__max_depth': 10, 'clf__min_child_weight': 3, 'clf__n_estimators': 848, 'clf__subsample': 0.8454489914076949}
   CV ROC-AUC: 0.7479 ¬± 0.018
   Holdout ROC-AUC: 0.751
üíæ Saved non-SMOTE metrics for XGB to C:\Users\tyler\OneDrive - University of Pittsburgh\BIOST 2021 Thesis\Masters-Thesis\results\models\w2v_optimized_radiology\w2v_optimized_radiology_20251027_2357\w2v_optimized_radiology_20251027_2357_XGB_metrics_non_smote.json


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


   SMOTE Holdout ROC-AUC: 0.7516
   Performing descriptive StratifiedKFold CV on SMOTE training set for XGB...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


   Descriptive CV AUC (SMOTE): 0.9445 ¬± 0.0081
   (5 valid folds out of 5)
üíæ Saved SMOTE-trained XGB model to C:\Users\tyler\OneDrive - University of Pittsburgh\BIOST 2021 Thesis\Masters-Thesis\results\models\w2v_optimized_radiology\w2v_optimized_radiology_20251027_2357\w2v_optimized_radiology_20251027_2357_XGB_smote_model.pkl
üíæ Saved SMOTE metrics for XGB to C:\Users\tyler\OneDrive - University of Pittsburgh\BIOST 2021 Thesis\Masters-Thesis\results\models\w2v_optimized_radiology\w2v_optimized_radiology_20251027_2357\w2v_optimized_radiology_20251027_2357_XGB_metrics_smote.json
‚è±Ô∏è  Runtime for XGB: 53.40 minutes
üèÅ MLflow run for 'XGB' closed cleanly.

üîπ Running LGBM...
Fitting 50 folds for each of 75 candidates, totalling 3750 fits
   Performing descriptive StratifiedKFold CV on original training set for LGBM...
   Descriptive CV AUC: 0.7453 ¬± 0.0205
üíæ Saved LGBM model to C:\Users\tyler\OneDrive - University of Pittsburgh\BIOST 2021 Thesis\Masters-Thesis\results\mod



üéØ Final evaluation of best SMOTE-trained classifier = LogisticRegression, ROC-AUC = 0.7530
üíæ Saved best SMOTE-trained model to C:\Users\tyler\OneDrive - University of Pittsburgh\BIOST 2021 Thesis\Masters-Thesis\results\models\w2v_optimized_radiology\w2v_optimized_radiology_20251027_2357\w2v_optimized_radiology_20251027_2357_best_smote_model.pkl
üíæ Saved final SMOTE metrics to C:\Users\tyler\OneDrive - University of Pittsburgh\BIOST 2021 Thesis\Masters-Thesis\results\models\w2v_optimized_radiology\w2v_optimized_radiology_20251027_2357\w2v_optimized_radiology_20251027_2357_best_smote_metrics.json
üíæ Saved full summary including original and SMOTE metrics to C:\Users\tyler\OneDrive - University of Pittsburgh\BIOST 2021 Thesis\Masters-Thesis\results\models\w2v_optimized_radiology\w2v_optimized_radiology_20251027_2357\w2v_optimized_radiology_20251027_2357_full_summary_with_smote.csv
üíæ Saved full results dict to C:\Users\tyler\OneDrive - University of Pittsburgh\BIOST 2021 Thesi

## 8 Compare to baseline