In [0]:
import os,sys
import mlflow, json, os, time
from mlflow.tracking import MlflowClient
import mlflow, time, json
from mlflow.tracking import MlflowClient
from pyspark.sql import functions as F
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, classification_report

In [0]:
# Notebook parameters, exposed as Databricks widgets so an orchestrating job can pass them in.
dbutils.widgets.text("only_node", "")
dbutils.widgets.text("mlflow_experiment", "/Workspace/9900-f18a-cake/classifier")
dbutils.widgets.text("id_col",         "sample_id")
# FEATURES_TABLE, LABELS_TABLE, LABEL_COL and PARENT_ID are read by later cells of this
# notebook but were never defined, so a fresh run stopped with NameError. They are
# declared here as widgets, following the same pattern as the parameters above.
dbutils.widgets.text("features_table", "")
dbutils.widgets.text("labels_table",   "")
dbutils.widgets.text("label_col",      "")
dbutils.widgets.text("parent_run_id",  "")

ONLY_NODE        = dbutils.widgets.get("only_node").strip()
MLF_EXPERIMENT   = dbutils.widgets.get("mlflow_experiment").strip()
ID_COL           = dbutils.widgets.get("id_col").strip()
FEATURES_TABLE   = dbutils.widgets.get("features_table").strip()
LABELS_TABLE     = dbutils.widgets.get("labels_table").strip()
LABEL_COL        = dbutils.widgets.get("label_col").strip()
# Optional: an empty string means "no parent run" (the run cell only tags when truthy).
PARENT_ID        = dbutils.widgets.get("parent_run_id").strip()

# Fail fast with a readable message instead of a NameError / empty-table error mid-run.
assert ONLY_NODE, "Pass only_node"
assert FEATURES_TABLE, "Pass features_table"
assert LABELS_TABLE, "Pass labels_table"
assert LABEL_COL, "Pass label_col"
print(f"Training node: {ONLY_NODE}")

In [0]:
# Cap native thread pools at one thread unless the caller already chose a value.
# NOTE(review): numpy/sklearn are imported in an earlier cell; BLAS/OpenMP libraries
# may read these variables only when first loaded, so this may be too late to take
# effect - confirm, or move this cell above the imports.
_THREAD_CAP_VARS = ("OMP_NUM_THREADS", "OPENBLAS_NUM_THREADS", "MKL_NUM_THREADS")
for _thread_var in _THREAD_CAP_VARS:
    os.environ.setdefault(_thread_var, "1")

# These two are always overwritten (no setdefault), matching the serial training setup.
os.environ.update({"RF_N_JOBS": "1", "CV_N_JOBS": "1"})

In [0]:
def _quote_sql_identifier(part: str) -> str:
    """Return ``part`` wrapped in back-ticks, leaving already-quoted identifiers untouched."""
    if len(part) >= 2 and part[0] == "`" and part[-1] == "`":
        return part
    return "`" + part.replace("`", "``") + "`"


def use_catalog_and_schema_from_3part(name: str):
    """Switch the Spark session to the catalog and schema of a three-part table name.

    ``name`` is expected to look like ``catalog.schema.table``; each part may be
    back-tick quoted. Anything that is not a well-formed three-part name (empty,
    ``None``, one/two/four parts) is ignored and no SQL is issued.

    Catalog and schema are always emitted back-tick quoted, so identifiers that
    contain characters such as ``-`` no longer break the ``USE`` statements, and
    the table name is never interpolated as raw SQL.

    Args:
        name: Fully qualified table name.
    """
    import re  # local import keeps this cell self-contained

    if not name:
        return

    # One identifier: either a back-tick quoted run (with `` as an escaped tick)
    # or a plain run without dots, ticks or whitespace. Matching this way keeps
    # dots inside quoted identifiers from being treated as separators.
    ident = r"(?:`(?:[^`]|``)*`|[^.`\s]+)"
    match = re.fullmatch(
        rf"\s*({ident})\s*\.\s*({ident})\s*\.\s*({ident})\s*", name
    )
    if match is None:
        return

    catalog, schema = match.group(1), match.group(2)
    spark.sql(f"USE CATALOG {_quote_sql_identifier(catalog)}")
    spark.sql(f"USE SCHEMA {_quote_sql_identifier(schema)}")

In [0]:
def load_data_for_node(node_name: str):
    """Assemble the Spark training frame for one node.

    Reads the long-format features table, pivots it to one row per
    ``biosample_id`` with one column per distinct ``Name``, and inner-joins it
    with the labels whose ``node_id`` equals ``node_name``.

    Side effect: the session's current catalog/schema is switched by
    ``use_catalog_and_schema_from_3part`` (the labels table is applied last).

    Args:
        node_name: Value matched against the labels table's ``node_id`` column.

    Returns:
        Spark DataFrame with ``biosample_id``, the pivoted feature columns, and ``y``.
    """
    for qualified_name in (FEATURES_TABLE, LABELS_TABLE):
        use_catalog_and_schema_from_3part(qualified_name)

    # Long features: (biosample_id, Name, m); rows with any null among these are discarded.
    long_features = spark.table(FEATURES_TABLE).select(
        F.col(ID_COL).alias("biosample_id"),
        "Name",
        F.col("MValue").alias("m"),
    ).dropna()

    # Long -> wide. F.first collapses each (biosample_id, Name) group to one value.
    per_sample = long_features.groupBy("biosample_id")
    wide_features = per_sample.pivot("Name").agg(F.first("m"))

    # Labels restricted to this node (the labels table must expose a node_id column),
    # reduced to at most one (biosample_id, y) row per sample.
    node_rows = spark.table(LABELS_TABLE).where(F.col("node_id") == F.lit(node_name))
    node_labels = node_rows.select(
        F.col(ID_COL).alias("biosample_id"),
        F.col(LABEL_COL).alias("y"),
    )
    node_labels = node_labels.dropna().dropDuplicates(["biosample_id"])

    # Keep only samples present on both sides.
    return wide_features.join(node_labels, on="biosample_id", how="inner")

In [0]:
def train_node(node_name: str):
    """Train and evaluate a random-forest classifier for a single node.

    Loads the node's joined feature/label frame via ``load_data_for_node``,
    collects it to pandas, holds out a stratified 20% test split, holds out a
    further stratified 20% of the remainder as validation, fits a 600-tree
    ``RandomForestClassifier`` on what is left, and scores both held-out splits.

    Args:
        node_name: Node whose labelled samples are used for training.

    Returns:
        dict with keys:
            ``train_sec``: wall-clock seconds spent in ``model.fit``.
            ``val`` / ``test``: ``{"acc": ..., "f1_weighted": ...}`` per split.
            ``report``: ``classification_report(..., output_dict=True)`` on the test split.
            ``model``: the fitted classifier.

    Raises:
        ValueError: If fewer than 10 joined samples are available for the node.
    """
    df = load_data_for_node(node_name)

    # Collect once and count in pandas: df.count() followed by df.toPandas()
    # would execute the Spark plan (including the pivot) twice.
    pdf = df.toPandas()
    n = len(pdf)
    if n < 10:
        raise ValueError(f"Not enough samples for {node_name}: {n}")

    # load_data_for_node() aliases the ID column to "biosample_id", so indexing by
    # ID_COL raised KeyError whenever ID_COL had any other value (its default is
    # "sample_id"). Fall back to ID_COL only if the alias is absent.
    key_col = "biosample_id" if "biosample_id" in pdf.columns else ID_COL
    y = pdf["y"].values
    X = pdf.drop(columns=[key_col, "y"]).values

    # Stratified splits: 20% test, 20% of remaining as val
    sss1 = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=1337)
    (trv_idx, te_idx), = sss1.split(X, y)
    X_trv, y_trv = X[trv_idx], y[trv_idx]
    sss2 = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=1337)
    # tr_idx / va_idx index into X_trv / y_trv, not into the full X / y.
    (tr_idx, va_idx), = sss2.split(X_trv, y_trv)

    model = RandomForestClassifier(
        n_estimators=600, max_depth=None, n_jobs=1, random_state=1337
    )

    t0 = time.time()
    model.fit(X_trv[tr_idx], y_trv[tr_idx])
    train_sec = time.time() - t0

    def score(y_true, y_pred):
        """Summary metrics for one split from precomputed predictions."""
        return {
            "acc": accuracy_score(y_true, y_pred),
            "f1_weighted": f1_score(y_true, y_pred, average="weighted"),
        }

    # Predict each held-out split once and reuse the predictions for every metric.
    y_val_pred = model.predict(X_trv[va_idx])
    y_test_pred = model.predict(X[te_idx])

    m_val = score(y_trv[va_idx], y_val_pred)
    m_test = score(y[te_idx], y_test_pred)

    # Detailed per-class report, logged as an artifact by the caller.
    report = classification_report(y[te_idx], y_test_pred, output_dict=True)

    return {
        "train_sec": train_sec,
        "val":  m_val,
        "test": m_test,
        "report": report,
        "model": model,
    }

In [0]:
# Ad-hoc discovery: list schemas in the catalog, then tables whose names match a few patterns.
c = "cb_prod"
s = "`comp9300-9900-f18a-cake`"  # back-ticks are part of the value (the name contains hyphens)

print("Schemas in catalog:", c)
display(spark.sql(f"SHOW SCHEMAS IN {c}"))

# (heading printed above the listing, LIKE pattern) - extend this tuple to probe more names.
for _heading, _pattern in (
    ("Candidates with 'label' in name:", "*label*"),
    ("Other candidates (you can adjust patterns):", "*class*"),
):
    print(_heading)
    display(spark.sql(f"SHOW TABLES IN {c}.{s} LIKE '{_pattern}'"))

In [0]:
# Train ONLY_NODE inside a single MLflow run and record params, metrics, report and model.
mlflow.set_experiment(MLF_EXPERIMENT)
client = MlflowClient()

with mlflow.start_run(run_name=f"node={ONLY_NODE}") as run:
    run_id = run.info.run_id

    # Attach this run to a parent run when a parent id is supplied.
    if PARENT_ID:
        client.set_tag(run_id, "mlflow.parentRunId", PARENT_ID)

    for _tag_key, _tag_value in (
        ("node_id", ONLY_NODE),
        ("orchestrator", "fanout_notebook"),
    ):
        mlflow.set_tag(_tag_key, _tag_value)

    run_params = {
        "rf_n_estimators": 600,
        "splits": "80/20 test, then 80/20 train/val",
    }
    mlflow.log_params(run_params)

    out = train_node(ONLY_NODE)

    # Scalar metrics: fit time first, then every val_* metric, then every test_* metric.
    mlflow.log_metric("train_sec", out["train_sec"])
    for _split in ("val", "test"):
        for _metric_name, _metric_value in out[_split].items():
            mlflow.log_metric(f"{_split}_{_metric_name}", float(_metric_value))

    mlflow.log_dict(out["report"], "reports/test_classification_report.json")

    # Model logging is best-effort: on failure the error text is stored as an
    # artifact and the run continues.
    try:
        import mlflow.sklearn
        mlflow.sklearn.log_model(out["model"], artifact_path="model")
    except Exception as e:
        mlflow.log_text(str(e), "logs/model_log_model_error.txt")

    # Flat JSON copy of the metrics for quick scraping.
    summary = {"node": ONLY_NODE, "train_sec": out["train_sec"]}
    for _split in ("val", "test"):
        summary.update(
            {f"{_split}_{_name}": _value for _name, _value in out[_split].items()}
        )
    mlflow.log_dict(summary, "metrics/summary.json")

print("Done:", ONLY_NODE)