# Model Training - AutoML & Manual Training
Demonstrates MLflow experiment tracking with both approaches: manual training (Isolation Forest and PCA) and an optional AutoML run.

In [0]:
import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature
from sklearn.ensemble import IsolationForest
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from databricks.feature_engineering import FeatureEngineeringClient
from pyspark.sql import functions as F

# Point the MLflow model registry at Unity Catalog ("databricks-uc").
mlflow.set_registry_uri("databricks-uc")

# Enable scikit-learn autologging, including input examples and model signatures.
mlflow.sklearn.autolog(
    log_input_examples=True,
    log_model_signatures=True,
)

## Model Training - Isolation Forest

## Load Training Data

In [0]:

# Columns used as model inputs.
feature_cols = [
    'days_since_login',
    'Ebooks_Downloaded_6_Months',
    'Average_Session_Time',
    'Days_Since_Last_Activity',
    'engagement_score',
    'subscription_tier_numeric',
]

# Read the customer feature table through the Feature Engineering client.
fe = FeatureEngineeringClient()
features_df = fe.read_table(name="main.ttw_workshop_demo.customer_features")

# Collect to pandas for scikit-learn training.
training_data = features_df.toPandas()

# Feature matrix: selected columns only, with missing values replaced by 0.
X = training_data.loc[:, feature_cols].fillna(0)

# Persist the collected training data as a table (dbdemos pattern).
training_spark_df = spark.createDataFrame(training_data)
(
    training_spark_df.write
    .mode("overwrite")
    .saveAsTable("main.ttw_workshop_demo.training_dataset")
)

print(f"Training data shape: {X.shape}")



Training data shape: (100, 6)


In [0]:
# Track the run with MLflow
experiment_path = "/Users/tingting.wan@databricks.com/anomaly_demo_qs"
mlflow.set_experiment(experiment_path)

# Train-test split (fixed seed so the split is repeatable for a given row order)
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)
print(f"Training: {X_train.shape[0]}, Testing: {X_test.shape[0]}")

# Single source of truth for the hyperparameters: the same dict is logged to
# MLflow and passed to the estimator, so the two can never drift apart.
isolation_forest_params = {"contamination": 0.05, "n_estimators": 100}

with mlflow.start_run(run_name="isolation_forest") as run:
    # Tag this run as the "champion" so the comparison cell can find it.
    mlflow.set_tag("role", "champion")
    mlflow.log_param("algorithm", "IsolationForest")
    mlflow.log_params(isolation_forest_params)

    # Scale the features, then fit the Isolation Forest on the scaled values.
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('isolation_forest', IsolationForest(
            random_state=42,
            **isolation_forest_params
        ))
    ])

    # Train model
    pipeline.fit(X_train)

    # Predictions (-1 marks an anomaly, which is what is counted below) and
    # decision-function scores for both splits.
    train_predictions = pipeline.predict(X_train)
    test_predictions = pipeline.predict(X_test)
    train_scores = pipeline.decision_function(X_train)
    test_scores = pipeline.decision_function(X_test)

    # Count anomalies; cast numpy scalars to plain ints before logging/printing.
    train_anomaly_count = int(np.sum(train_predictions == -1))
    test_anomaly_count = int(np.sum(test_predictions == -1))

    # Log metrics
    mlflow.log_metric("train_anomaly_count", train_anomaly_count)
    mlflow.log_metric("test_anomaly_count", test_anomaly_count)
    mlflow.log_metric("train_anomaly_rate", train_anomaly_count / len(X_train))
    mlflow.log_metric("test_anomaly_rate", test_anomaly_count / len(X_test))

    # Register model with an explicit signature (inputs -> predictions),
    # matching how the PCA model is logged, instead of relying on inference
    # from the input example alone.
    model_name = "main.ttw_workshop_demo.customer_anomaly_detector"
    input_example = X_train.head(5)
    signature = infer_signature(input_example, pipeline.predict(input_example))
    mlflow.sklearn.log_model(
        pipeline,
        "model",
        registered_model_name=model_name,
        input_example=input_example,
        signature=signature
    )

    # Keep the run id available to later cells.
    isolation_run_id = run.info.run_id
    print(f"✅ Isolation Forest: {test_anomaly_count} anomalies detected")

Training: 80, Testing: 20


Uploading artifacts:   0%|          | 0/11 [00:00<?, ?it/s]



Uploading artifacts:   0%|          | 0/11 [00:00<?, ?it/s]

Registered model 'main.ttw_workshop_demo.customer_anomaly_detector' already exists. Creating a new version of this model...


Uploading artifacts:   0%|          | 0/11 [00:00<?, ?it/s]

Created version '2' of model 'main.ttw_workshop_demo.customer_anomaly_detector'.


✅ Isolation Forest: 0 anomalies detected
🏃 View run isolation_forest_v1 at: https://e2-demo-west.cloud.databricks.com/ml/experiments/30434066767557/runs/dbdc5c9dfd77461187d2887a756941d4
🧪 View experiment at: https://e2-demo-west.cloud.databricks.com/ml/experiments/30434066767557


## Model Training - PCA (Comparison)

In [0]:
# Hyperparameters defined once so the logged values always match what is used.
pca_n_components = 3
pca_threshold_percentile = 95  # training errors above this percentile count as anomalies

with mlflow.start_run(run_name="pca_reconstruction") as run:
    # Tag this run as the "challenger" so the comparison cell can find it.
    mlflow.set_tag("role", "challenger")
    mlflow.log_param("algorithm", "PCA")
    mlflow.log_param("n_components", pca_n_components)
    mlflow.log_param("threshold_percentile", pca_threshold_percentile)

    # Scale, project onto the principal components, then map back to the
    # scaled feature space to measure how much information was lost.
    scaler = StandardScaler()
    pca = PCA(n_components=pca_n_components)

    X_train_scaled = scaler.fit_transform(X_train)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_train_reconstructed = pca.inverse_transform(X_train_pca)

    # Per-row squared reconstruction error; the threshold is taken from the
    # training distribution only.
    train_errors = np.sum((X_train_scaled - X_train_reconstructed) ** 2, axis=1)
    threshold = np.percentile(train_errors, pca_threshold_percentile)

    # Apply the already-fitted scaler and PCA to the held-out rows.
    X_test_scaled = scaler.transform(X_test)
    X_test_pca = pca.transform(X_test_scaled)
    X_test_reconstructed = pca.inverse_transform(X_test_pca)
    test_errors = np.sum((X_test_scaled - X_test_reconstructed) ** 2, axis=1)

    # Rows whose error exceeds the training threshold are flagged as anomalies.
    train_anomalies = int(np.sum(train_errors > threshold))
    test_anomalies = int(np.sum(test_errors > threshold))

    # Log metrics. The threshold is logged too: the registered pipeline only
    # performs the scale + PCA transform, so without it the anomaly decision
    # cannot be reproduced from the run.
    mlflow.log_metric("reconstruction_error_threshold", float(threshold))
    mlflow.log_metric("train_anomaly_count", train_anomalies)
    mlflow.log_metric("train_anomaly_rate", train_anomalies / len(X_train))
    mlflow.log_metric("test_anomaly_count", test_anomalies)
    mlflow.log_metric("test_anomaly_rate", test_anomalies / len(X_test))
    mlflow.log_metric("explained_variance_ratio", pca.explained_variance_ratio_.sum())

    # Wrap the already-fitted steps in a pipeline so they are logged as one model.
    pca_pipeline = Pipeline([('scaler', scaler), ('pca', pca)])
    pca_model_name = "main.ttw_workshop_demo.pca_anomaly_detector"

    # Signature: raw feature rows in, PCA-transformed components out.
    pca_input_example = X_train.head(5)
    pca_sample_output = pca_pipeline.transform(pca_input_example)
    pca_signature = infer_signature(pca_input_example, pca_sample_output)

    mlflow.sklearn.log_model(
        pca_pipeline,
        "model",
        registered_model_name=pca_model_name,
        input_example=pca_input_example,
        signature=pca_signature
    )

    print(f"✅ PCA Model: {test_anomalies} anomalies detected")

Uploading artifacts:   0%|          | 0/11 [00:00<?, ?it/s]



Uploading artifacts:   0%|          | 0/11 [00:00<?, ?it/s]

Registered model 'main.ttw_workshop_demo.pca_anomaly_detector' already exists. Creating a new version of this model...


Uploading artifacts:   0%|          | 0/11 [00:00<?, ?it/s]

Created version '2' of model 'main.ttw_workshop_demo.pca_anomaly_detector'.


✅ PCA Model: 1 anomalies detected
🏃 View run pca_reconstruction_v1 at: https://e2-demo-west.cloud.databricks.com/ml/experiments/30434066767557/runs/c341e8d9d1cd470bafcdaea457196d0c
🧪 View experiment at: https://e2-demo-west.cloud.databricks.com/ml/experiments/30434066767557


In [0]:
# Compare champion vs challenger runs from the tracked experiment
client = mlflow.tracking.MlflowClient()
exp = mlflow.get_experiment_by_name(experiment_path)

# Fetch every run tagged with either role, newest first.
runs = client.search_runs(
    experiment_ids=[exp.experiment_id],
    filter_string="tags.role IN ('champion','challenger')",
    order_by=["start_time DESC"],
)

# One summary row per run: its id, its role tag, and the test anomaly rate.
comparison_rows = []
for tracked_run in runs:
    comparison_rows.append({
        "run_id": tracked_run.info.run_id,
        "role": tracked_run.data.tags.get("role"),
        "anomaly_rate": tracked_run.data.metrics.get("test_anomaly_rate"),
    })

display(pd.DataFrame(comparison_rows))

run_id,role,anomaly_rate
c341e8d9d1cd470bafcdaea457196d0c,challenger,0.05
dbdc5c9dfd77461187d2887a756941d4,champion,0.0


## (Optional) AutoML Training

In [0]:
# NOTE: this optional cell is currently disabled (every line is commented out).
# It builds a labelled Spark DataFrame `X` for AutoML and saves it as a table.
# NOTE(review): if re-enabled, `X` is rebound from the pandas feature matrix used
# by the training cells above to a Spark DataFrame.
# NOTE(review): F.rand() is unseeded, so the train/test split is not repeatable
# and may differ between the collected copy (X_pd) and the written table — confirm.
# NOTE(review): the row_idx join near the end looks fragile — the pandas index after
# toPandas() and the ids from F.monotonically_increasing_id() are presumably not
# guaranteed to line up, so flipped anomaly_target values may not reach `X`.
# Verify before relying on it.

# from databricks.feature_engineering import FeatureEngineeringClient
# from datetime import datetime
# from databricks import automl
# import getpass
# fe = FeatureEngineeringClient()
# features_df = fe.read_table(name="main.ttw_workshop_demo.customer_features")

# # 1. FEATURES ARE LOADED ABOVE (the dataset itself is saved at the end of this cell)
# # 2. DEFINE BINARY TARGET FOR AUTOML
# # Anomaly = 1 if 'engagement_score' below 10% quantile, else 0
# quantile = features_df.approxQuantile("engagement_score", [0.1], 0.01)[0]
# X = features_df.withColumn(
#     "anomaly_target",
#     (F.col("engagement_score") < quantile).cast("int")
# )
# # 3. TRAIN/TEST SPLIT
# # For demo/test only: random split
# X = X.withColumn("split", F.when(F.rand() > 0.2, "train").otherwise("test"))

# # --- Guarantee at least two classes per split for AutoML ---
# # Convert to pandas for logic, operate only on 'anomaly_target' and 'split'
# X_pd = X.select("anomaly_target", "split").toPandas()

# for split_val in ["train", "test"]:
#     subset = X_pd[X_pd['split'] == split_val]
#     # Only flip if split is nonempty and not yet both classes
#     if subset['anomaly_target'].nunique() < 2 and len(subset) > 1:
#         first_idx = subset.index[0]
#         X_pd.at[first_idx, 'anomaly_target'] = 1 - X_pd.at[first_idx, 'anomaly_target']

# # Write updated anomaly_target back to the Spark DataFrame X, in-place with all original columns preserved
# X_pd = X_pd.reset_index().rename(columns={"index": "row_idx"})
# X = X.withColumn("row_idx", F.monotonically_increasing_id())
# fix_df = spark.createDataFrame(X_pd[["row_idx", "anomaly_target"]])
# X = X.drop("anomaly_target").join(fix_df, on="row_idx", how="left").drop("row_idx")


# X.write.mode("overwrite").saveAsTable("main.ttw_workshop_demo.training_dataset_automl_w_target")

In [0]:

# NOTE: this optional cell is currently disabled (every line is commented out).
# It depends on `X` and `datetime` from the AutoML data-preparation cell above,
# which is also commented out, so both cells must be re-enabled together.
# Any exception raised by automl.classify is caught and printed, not re-raised.
# NOTE(review): the recorded output of this cell shows a past run failing with
# "Target column must contain at least 2 distinct target classes" — the target
# construction in the preparation cell should be checked before re-running.

# xp_path = f"/Users/tingting.wan@databricks.com/databricks_automl/"
# xp_name = f"demo_automl_anomaly_{datetime.now().strftime('%Y-%m-%d_%H:%M:%S')}"

# try:
#     from databricks import automl

#     automl_run = automl.classify(
#         experiment_name=xp_name,
#         experiment_dir=xp_path,
#         dataset=X,
#         target_col="anomaly_target",
#         split_col="split",
#         timeout_minutes=10,
#         exclude_cols=["Customer_ID"]  # Remove identifier columns from training
#     )
#     print("✅ AutoML started. Check MLflow for runs.")

# except Exception as e:
#     print(f"❌ AutoML exception: {e}")

2025/08/04 10:55:19 INFO databricks.automl.client.manager: AutoML will optimize for F1 score metric, which is tracked as val_f1_score in the MLflow experiment.
2025/08/04 10:55:20 INFO databricks.automl.client.manager: MLflow Experiment ID: 30434066767549
2025/08/04 10:55:20 INFO databricks.automl.client.manager: MLflow Experiment: https://e2-demo-west.cloud.databricks.com/?o=2556758628403379#mlflow/experiments/30434066767549


🏃 View run wise-conch-726 at: https://e2-demo-west.cloud.databricks.com/ml/experiments/30434066767549/runs/79f3545e56d444a49f815468c2370598
🧪 View experiment at: https://e2-demo-west.cloud.databricks.com/ml/experiments/30434066767549


2025/08/04 10:56:13 INFO databricks.automl.client.manager: Data exploration notebook: https://e2-demo-west.cloud.databricks.com/?o=2556758628403379#notebook/30434066767554


❌ AutoML exception: AutoML experiment failed with error "Target column must contain at least 2 distinct target classes". See job run at https://e2-demo-west.cloud.databricks.com/?o=2556758628403379#job/261022380564976/run/352847742545931 for more details.
