In [0]:
import os
import pandas as pd
import numpy as np
import json
import mlflow
import mlflow.pyfunc
import mlflow.sklearn
from mlflow.tracking import MlflowClient

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import RidgeClassifierCV
from sklearn.metrics import roc_auc_score, accuracy_score, cohen_kappa_score, confusion_matrix

# --- configuration ---
# Job parameters are exposed as notebook widgets; the defaults apply to interactive runs.
dbutils.widgets.text(
    "processed_data_path",
    "/dbfs/FileStore/tables/p300_files/processed-features/",
    "DBFS path to processed features",
)
# the default experiment lives under the home folder of the user running the notebook
_default_experiment_name = (
    f"/Users/{dbutils.notebook.entry_point.getDbutils().notebook().getContext().userName().get()}"
    "/P300_BCI_Training"
)
dbutils.widgets.text("experiment_name", _default_experiment_name, "MLflow Experiment Name")
dbutils.widgets.text("registered_model_name", "P300-Classifier", "MLflow Registered Model Name")

# read the effective values back (widget defaults unless overridden by the job)
processed_data_root_path, mlflow_experiment_name, mlflow_registered_model_name = (
    dbutils.widgets.get(widget_name)
    for widget_name in ("processed_data_path", "experiment_name", "registered_model_name")
)

# point MLflow at the registry and select the experiment that runs are logged to
mlflow_registry_uri = "databricks"
mlflow.set_registry_uri(mlflow_registry_uri)
mlflow.set_experiment(mlflow_experiment_name)
display(f"MLflow experiment set to: {mlflow_experiment_name}")

In [0]:
# configuration for model and training
# MODEL_CONFIG keys:
#   type          - model family label; logged as the "model_type" run parameter
#   lda_solver    - solver passed to LinearDiscriminantAnalysis
#   lda_shrinkage - shrinkage passed to LinearDiscriminantAnalysis
#   ridge_alphas  - 7 log-spaced values from 1e-3 to 1e3
MODEL_CONFIG = dict(
    type="LDA",
    lda_solver="lsqr",
    lda_shrinkage="auto",
    ridge_alphas=np.logspace(-3, 3, num=7),
)

# label code that is reported as the "Target" class; every other code is "NonTarget"
TARGET_CLASS_LABEL_CODE = 2

# fraction of rows kept for training: the split passes (1.0 - this value) as test_size
INTERNAL_VAL_SPLIT_RATIO = 0.8

# helper function for evaluation
def evaluate_model_sklearn(model, x_test, y_test, scaler_instance=None):
    """Score a fitted classifier as a binary Target-vs-rest problem.

    Labels and predictions are binarised against TARGET_CLASS_LABEL_CODE before
    accuracy and Cohen's kappa are computed.

    Args:
        model: fitted estimator exposing ``predict``. ``predict_proba`` and
            ``classes_`` are only needed for the AUC.
        x_test: feature matrix to score.
        y_test: true label codes (any array-like).
        scaler_instance: optional fitted transformer; when given, ``x_test`` is
            passed through its ``transform`` before prediction.

    Returns:
        dict with keys ``"accuracy"``, ``"kappa"`` and ``"auc"``. ``"auc"`` is
        None when the model has no ``predict_proba``, when the model was never
        fitted on the target class, or when ``y_test`` contains only one of the
        two binary classes.
    """
    # explicit None check: do not rely on the truthiness of an estimator object
    x_test_scaled = scaler_instance.transform(x_test) if scaler_instance is not None else x_test

    # np.asarray keeps the comparisons element-wise even for plain Python lists
    y_true = np.asarray(y_test)
    y_pred = np.asarray(model.predict(x_test_scaled))

    y_test_binary = np.where(y_true == TARGET_CLASS_LABEL_CODE, 1, 0)
    y_pred_binary = np.where(y_pred == TARGET_CLASS_LABEL_CODE, 1, 0)

    accuracy = accuracy_score(y_test_binary, y_pred_binary)
    kappa = cohen_kappa_score(y_test_binary, y_pred_binary)
    auc = None

    # ROC AUC needs both binary classes present in the ground truth
    if hasattr(model, "predict_proba") and len(np.unique(y_test_binary)) > 1:
        class_labels = list(model.classes_)
        # a model that never saw the target class has no probability column for it
        if TARGET_CLASS_LABEL_CODE in class_labels:
            target_class_idx = class_labels.index(TARGET_CLASS_LABEL_CODE)
            y_pred_proba_target = model.predict_proba(x_test_scaled)[:, target_class_idx]
            auc = roc_auc_score(y_test_binary, y_pred_proba_target)

    return {"accuracy": accuracy, "kappa": kappa, "auc": auc}


# package model and scaler together
class SklearnModelWithScaler(mlflow.pyfunc.PythonModel):
    """Pyfunc wrapper that applies the fitted scaler before the classifier.

    Args:
        model: fitted classifier exposing ``predict``; ``predict_proba`` and
            ``classes_`` are used when available.
        scaler: fitted transformer exposing ``transform``.
        target_label_code: label code reported as "Target". Defaults to
            TARGET_CLASS_LABEL_CODE as it is when the wrapper is constructed.
    """

    def __init__(self, model, scaler, target_label_code=None):
        self.model = model
        self.scaler = scaler
        # Pin the target label on the instance so that inference does not depend
        # on a notebook-level global being present where the model is loaded.
        self.target_label_code = (
            TARGET_CLASS_LABEL_CODE if target_label_code is None else target_label_code
        )

    def predict(self, context, model_input):
        """Scale ``model_input`` and classify each row.

        Args:
            context: pyfunc context supplied by the caller (unused).
            model_input: pandas DataFrame/Series or any array-like of features.

        Returns:
            pandas Series with one dict per input row, holding
            ``predicted_label_code`` (int), ``predicted_class_name``
            ("Target" / "NonTarget") and ``probability_target`` (float, or None
            when the wrapped model cannot provide a target-class probability).
        """
        # accept pandas containers as well as plain arrays
        if isinstance(model_input, (pd.DataFrame, pd.Series)):
            features = model_input.values
        else:
            features = np.asarray(model_input)

        scaled_input = self.scaler.transform(features)
        predictions = self.model.predict(scaled_input)
        target_probabilities = self._target_probabilities(scaled_input)

        # prepare a dictionary for each row of the output
        results = [
            {
                "predicted_label_code": int(pred),
                "predicted_class_name": "Target" if int(pred) == self.target_label_code else "NonTarget",
                "probability_target": probability,
            }
            for pred, probability in zip(predictions, target_probabilities)
        ]

        # return a pandas series of dictionary objects for easy parsing
        return pd.Series(results)

    def _target_probabilities(self, scaled_input):
        """Return one target-class probability per row, or None where unavailable."""
        row_count = len(scaled_input)
        # same guard as evaluate_model_sklearn: not every classifier has predict_proba
        if not hasattr(self.model, "predict_proba"):
            return [None] * row_count

        class_labels = list(self.model.classes_)
        # a model that was never fitted on the target class has no column for it
        if self.target_label_code not in class_labels:
            return [None] * row_count

        target_class_idx = class_labels.index(self.target_label_code)
        probabilities = self.model.predict_proba(scaled_input)
        return [float(p) for p in probabilities[:, target_class_idx]]

In [0]:
# data loading logic from DBFS
# Sort the listing so the row order of the combined arrays (and therefore any
# seeded split performed on them) does not depend on os.listdir's arbitrary
# ordering, and ignore stray entries that are not directories.
candidate_subject_folders = sorted(
    path
    for path in (
        os.path.join(processed_data_root_path, entry)
        for entry in os.listdir(processed_data_root_path)
    )
    if os.path.isdir(path)
)
display(f"found {len(candidate_subject_folders)} processed subject folders.")

all_x, all_y = [], []
# Only folders that actually contributed data are kept here, because later cells
# log this list as the subjects the model was trained on.
all_subject_folders = []

for subject_folder_path in candidate_subject_folders:
    features_path = os.path.join(subject_folder_path, "features.parquet")
    labels_path = os.path.join(subject_folder_path, "labels.parquet")

    if not (os.path.exists(features_path) and os.path.exists(labels_path)):
        display(f"  warning: missing features or labels for {subject_folder_path}. skipping.")
        continue

    x_df = pd.read_parquet(features_path)
    y_df = pd.read_parquet(labels_path)

    # misaligned files would otherwise only surface later as an opaque shape error
    if len(x_df) != len(y_df):
        raise ValueError(
            f"row count mismatch in {subject_folder_path}: "
            f"{len(x_df)} feature rows vs {len(y_df)} labels"
        )

    all_x.append(x_df.values)
    all_y.append(y_df['label'].values)
    all_subject_folders.append(subject_folder_path)

# np.vstack([]) fails with a generic message; name the actual problem instead
if not all_x:
    raise ValueError(f"no subject data could be loaded from '{processed_data_root_path}'")

x_combined = np.vstack(all_x)
y_combined = np.concatenate(all_y)
display(f"successfully loaded and combined data. features shape: {x_combined.shape}, labels shape: {y_combined.shape}")

In [0]:
# Only the LDA branch is implemented in this cell. Without this guard any other
# MODEL_CONFIG['type'] would still train an LDA but be logged and registered
# under the wrong "model_type", so stop before a run is opened.
if MODEL_CONFIG['type'] != "LDA":
    raise ValueError(
        f"unsupported model type '{MODEL_CONFIG['type']}': this training cell only implements 'LDA'"
    )

# end any run that is still active so that start_run below opens a fresh one
if mlflow.active_run() is not None:
    mlflow.end_run()

# MLflow training run
with mlflow.start_run(run_name="LDA_Training_Run") as run:
    run_id = run.info.run_id
    display(f"starting MLflow run with ID: {run_id}")

    # log parameters
    mlflow.log_param("model_type", MODEL_CONFIG['type'])
    mlflow.log_param("lda_solver", MODEL_CONFIG.get("lda_solver"))
    mlflow.log_param("lda_shrinkage", MODEL_CONFIG.get("lda_shrinkage"))

    mlflow.log_param("internal_val_split_ratio", INTERNAL_VAL_SPLIT_RATIO)
    mlflow.log_param("num_subjects_trained_on", len(all_subject_folders))

    # split data: stratified on the labels, seeded for a repeatable split;
    # INTERNAL_VAL_SPLIT_RATIO is the training fraction, the remainder is validation
    x_train, x_val, y_train, y_val = train_test_split(
        x_combined, y_combined,
        test_size=(1.0 - INTERNAL_VAL_SPLIT_RATIO),
        stratify=y_combined,
        random_state=42
    )
    mlflow.log_param("training_set_size", x_train.shape[0])
    mlflow.log_param("validation_set_size", x_val.shape[0])

    # train model and scaler; the scaler is fitted on the training rows only
    scaler = StandardScaler()
    model = LinearDiscriminantAnalysis(solver=MODEL_CONFIG['lda_solver'], shrinkage=MODEL_CONFIG['lda_shrinkage'])

    x_train_scaled = scaler.fit_transform(x_train)
    model.fit(x_train_scaled, y_train)

    # evaluate and log metrics (metrics that could not be computed come back as None and are dropped)
    metrics = evaluate_model_sklearn(model, x_val, y_val, scaler_instance=scaler)
    metrics_to_log = {f"validation_{k}": v for k, v in metrics.items() if v is not None}
    mlflow.log_metrics(metrics_to_log)
    display("model metrics logged:")
    display(metrics_to_log)

    # log training subject names
    # NOTE(review): the tracking backend limits param value length - confirm the
    # joined list fits for large cohorts; the JSON artifact is the complete record.
    training_subject_names = [os.path.basename(p) for p in all_subject_folders]
    mlflow.log_param("training_subject_names", ", ".join(training_subject_names))
    training_subjects_payload = {"training_subjects": training_subject_names}
    mlflow.log_dict(training_subjects_payload, "training_subjects.json")

    # log and register the model
    # the input example uses positional column names feature_0 .. feature_{n-1}
    input_example = pd.DataFrame(x_train[:5,:], columns=[f"feature_{i}" for i in range(x_train.shape[1])])
    pyfunc_artifact_path = "p300-pyfunc-model"
    pyfunc_model_with_scaler = SklearnModelWithScaler(model=model, scaler=scaler)

    # kept under its own name so it is not confused with the registry result below
    logged_model_info = mlflow.pyfunc.log_model(
        name=pyfunc_artifact_path,
        python_model=pyfunc_model_with_scaler,
        input_example=input_example,
        )

    model_uri = f"runs:/{run_id}/{pyfunc_artifact_path}"

    print(f"registering model from URI: {model_uri}")
    registered_model_info = mlflow.register_model(
        model_uri=model_uri,
        name=mlflow_registered_model_name
    )
    new_model_version = registered_model_info.version


    # this is the legacy method for updating the model stage
    # aliases don't work with Workspace Registry (DBFS based approach)
    client = MlflowClient()
    client.transition_model_version_stage(
        name=mlflow_registered_model_name,
        version=new_model_version,
        stage="Staging",
        archive_existing_versions=True # this will move any other version currently in 'Staging' to 'Archived'
    )


    display(f"model successfully packaged with scaler and registered as '{mlflow_registered_model_name} version {new_model_version}' in 'Staging'")