In [0]:
%run ./setup

## Read data from gold layer to memory

In [0]:
# Read our churn_features table
# Load the `churn_features` table as a Spark DataFrame and render it in the notebook.
# NOTE(review): a later cell drops columns such as 'email', 'address', 'firstname' and
# 'lastname' from this data, so the rendered output here may contain personal data —
# consider clearing it before sharing the notebook.
churn_dataset = spark.table("churn_features")
display(churn_dataset)

## Convert to pandas API on Spark and remove irrelevant columns

In [0]:
# Convert the Spark DataFrame to a pandas-on-Spark DataFrame
dataset = churn_dataset.pandas_api()

# Render the summary statistics explicitly: a bare `dataset.describe()` in the middle
# of a cell computes the statistics but never shows them, because only the last
# expression of a cell is echoed.
display(dataset.describe())

# Drop the columns we don't want to use in our model, then drop every row
# that still has a missing value.
dataset = dataset.drop(columns=['address', 'email', 'firstname', 'lastname', 'creation_date', 'last_activity_date', 'last_event'])
dataset = dataset.dropna()

# Show the first ten rows of the cleaned dataset
dataset.head(10)

In [0]:
# Print a concise summary of the cleaned dataset produced by the previous cell
dataset.info()

## Write to feature store

After running the cell below, a feature table should appear under 'Features', and a physical table named 'churn_user_features' is added to catalog.schema.

In [0]:
from databricks.feature_store import FeatureStoreClient

# Single place for the feature table name used by every call in this cell
FEATURE_TABLE_NAME = 'churn_user_features'

fs = FeatureStoreClient()

# Start from a clean slate: remove any feature table left over from a previous run.
try:
  fs.drop_table(FEATURE_TABLE_NAME)
except Exception as exc:
  # Best effort: keep going (presumably the table simply does not exist yet), but
  # report the reason instead of silently swallowing every error. Catching
  # `Exception` rather than using a bare `except:` lets KeyboardInterrupt through.
  print(f"Could not drop feature table '{FEATURE_TABLE_NAME}' (continuing): {exc}")

#Note: You might need to delete the FS table using the UI
# Create the (empty) feature table keyed by user_id, using the schema of the cleaned dataset
churn_feature_table = fs.create_table(
  name=FEATURE_TABLE_NAME,
  primary_keys='user_id',
  schema=dataset.spark.schema(),
  description='These features are derived from the churn_bronze_customers table in the lakehouse.  We created dummy variables for the categorical columns, cleaned up their names, and added a boolean flag for whether the customer churned or not.  No aggregations were performed.'
)

# Fill the table with the cleaned data, then read it back to confirm what was stored
fs.write_table(df=dataset.to_spark(), name=FEATURE_TABLE_NAME, mode='overwrite')
features = fs.read_table(FEATURE_TABLE_NAME)
display(features)

## Training a model from the table in the Feature Store

Convert the feature table to a pandas DataFrame, since we use scikit-learn.

In [0]:
# Convert to Pandas
# Collect the Spark feature table into a pandas DataFrame for use with scikit-learn
df = features.toPandas()

In [0]:
# Split to train and test set
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; the fixed random_state keeps the split
# the same on every run.
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

In [0]:
# Select the columns
from databricks.automl_runtime.sklearn.column_selector import ColumnSelector

# Feature columns passed to the column selector (first pipeline step)
supported_cols = [
    "event_count",
    "gender",
    "total_amount",
    "country",
    "order_count",
    "channel",
    "total_item",
    "days_since_last_activity",
    "days_last_event",
    "days_since_creation",
    "session_count",
    "age_group",
    "platform",
]
col_selector = ColumnSelector(supported_cols)

In [0]:
# Preprocessing
# `pd` is used by the converter step below; import it here so this cell does not
# depend on an import that only happens in a later cell.
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler

# Impute missing numeric values (SimpleImputer's default strategy) for these columns
num_imputers = [
    (
        "impute_mean",
        SimpleImputer(),
        ["age_group", "days_last_event", "days_since_creation", "days_since_last_activity", "event_count", "gender", "order_count", "session_count", "total_amount", "total_item"],
    ),
]

# Numeric pipeline: coerce every value to a number (unparseable values become NaN),
# impute the gaps, then standardize.
numerical_pipeline = Pipeline(steps=[
    ("converter", FunctionTransformer(lambda df: df.apply(pd.to_numeric, errors="coerce"))),
    ("imputers", ColumnTransformer(num_imputers)),
    ("standardizer", StandardScaler()),
])

# Columns routed through the numeric pipeline by the final ColumnTransformer
numerical_transformers = [("numerical", numerical_pipeline, ["event_count", "gender", "total_amount", "order_count", "total_item", "days_since_last_activity", "days_last_event", "days_since_creation", "session_count", "age_group"])]

In [0]:
# Treating categorical variables
from databricks.automl_runtime.sklearn import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# No imputers are configured for the one-hot columns, so the first step simply
# passes every column through unchanged.
one_hot_imputers = []
one_hot_columns = [
    "age_group",
    "channel",
    "country",
    "event_count",
    "gender",
    "order_count",
    "platform",
    "session_count",
]

passthrough_step = ColumnTransformer(one_hot_imputers, remainder="passthrough")
encoder_step = OneHotEncoder(handle_unknown="indicator")
one_hot_pipeline = Pipeline(steps=[
    ("imputers", passthrough_step),
    ("one_hot_encoder", encoder_step),
])

# Columns routed through the one-hot pipeline by the final ColumnTransformer
categorical_one_hot_transformers = [("onehot", one_hot_pipeline, one_hot_columns)]

In [0]:
# Final transformation of the columns
from sklearn.compose import ColumnTransformer

# Numeric transformers first, followed by the one-hot transformers
transformers = [*numerical_transformers, *categorical_one_hot_transformers]
preprocessor = ColumnTransformer(
    transformers,
    remainder="passthrough",
    sparse_threshold=1,
)

In [0]:
# Split each frame into its feature columns (X) and the target column (y)
target_col = "churn"

X_train, y_train = train_df.drop(columns=[target_col]), train_df[target_col]
X_test, y_test = test_df.drop(columns=[target_col]), test_df[target_col]

In [0]:
import pandas as pd
import mlflow
from mlflow.models import Model
from mlflow import pyfunc
from mlflow.pyfunc import PyFuncModel

import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# Start a run
# Only the model pipeline is built inside this run; fitting and evaluation happen
# in the following cells.
with mlflow.start_run(run_name="simple-RF-run") as run:
  # Seed the forest (same seed as the train/test split) so repeated runs of the
  # notebook train the same model.
  classifier = RandomForestClassifier(random_state=42)
  # Full pipeline: select the supported columns, preprocess them, then classify
  model = Pipeline([
      ("column_selector", col_selector),
      ("preprocessor", preprocessor),
      ("classifier", classifier),
  ])

Here the code is split across three cells (the one above and the two below). As a result, three separate runs appear in the experiment.

In [0]:
  # Enable automatic logging of input samples, metrics, parameters, and models
  # NOTE(review): this cell is separate from the cell containing the
  # `with mlflow.start_run(...)` block, so the fit below is not executed inside
  # that `with` statement. The leading indentation appears to be left over from
  # when this code was part of the block — confirm the notebook accepts it.
  mlflow.sklearn.autolog(
      log_input_examples=True,
      silent=True)
  # Train the pipeline built in the previous cell on the training split
  model.fit(X_train, y_train)

In [0]:
  # Log metrics for the test set
  # Wrap the fitted sklearn pipeline as a pyfunc model so mlflow.evaluate can score it
  mlflow_model = Model()
  pyfunc.add_to_model(mlflow_model, loader_module="mlflow.sklearn")
  pyfunc_model = PyFuncModel(model_meta=mlflow_model, model_impl=model)

  # mlflow.evaluate needs features and target in one frame. Build a separate
  # evaluation frame instead of writing the target into X_test, so X_test stays a
  # pure feature frame and this cell can be re-run safely.
  test_eval_df = X_test.assign(**{target_col: y_test})

  test_eval_result = mlflow.evaluate(
      model=pyfunc_model,
      data=test_eval_df,
      targets=target_col,
      model_type="classifier",
      # Skip explainability logging, prefix the metrics with "test_", and treat
      # label 1 as the positive class
      evaluator_config={
          "log_model_explainability": False,
          "metric_prefix": "test_",
          "pos_label": 1,
      },
  )


Running everything in a single cell puts all the results into one run in the experiment.

In [0]:
# Build, train and evaluate the model inside one MLflow run
with mlflow.start_run(run_name="simple-RF-run2") as run:
  # Seed the forest (same seed as the train/test split) so repeated runs of the
  # notebook train the same model.
  classifier = RandomForestClassifier(random_state=42)
  # Full pipeline: select the supported columns, preprocess them, then classify
  model = Pipeline([
      ("column_selector", col_selector),
      ("preprocessor", preprocessor),
      ("classifier", classifier),
  ])

  # Enable automatic logging of input samples, metrics, parameters, and models
  mlflow.sklearn.autolog(
      log_input_examples=True,
      silent=True)

  model.fit(X_train, y_train)

  # Log metrics for the test set
  # Wrap the fitted sklearn pipeline as a pyfunc model so mlflow.evaluate can score it
  mlflow_model = Model()
  pyfunc.add_to_model(mlflow_model, loader_module="mlflow.sklearn")
  pyfunc_model = PyFuncModel(model_meta=mlflow_model, model_impl=model)

  # mlflow.evaluate needs features and target in one frame. Build a separate
  # evaluation frame instead of writing the target into X_test, so X_test stays a
  # pure feature frame and this cell can be re-run safely.
  test_eval_df = X_test.assign(**{target_col: y_test})

  test_eval_result = mlflow.evaluate(
      model=pyfunc_model,
      data=test_eval_df,
      targets=target_col,
      model_type="classifier",
      # Skip explainability logging, prefix the metrics with "test_", and treat
      # label 1 as the positive class
      evaluator_config={
          "log_model_explainability": False,
          "metric_prefix": "test_",
          "pos_label": 1,
      },
  )


## Register the model 

Register the model so that it appears under 'Models'.

In [0]:
# Show the ID of the run currently bound to `run` (set by the most recently
# executed `with mlflow.start_run(...) as run:` cell); it is used below to build
# the model URI.
run.info.run_id

In [0]:
# Short model name; later cells prepend a catalog.schema prefix to form the full
# registered model name.
modelName='RF2'

This saves the model to catalog.schema and also shows it under 'Models', indicating that the model is registered.

In [0]:
# Use the "databricks-uc" registry so the model is registered under catalog.schema
mlflow.set_registry_uri("databricks-uc")

# URI of the model artifact logged by the run, and the fully qualified registry name
logged_model = f"runs:/{run.info.run_id}/model"
full_model_name = f"dbacademy.labuser9128531_1738705451.{modelName}"

print(f"Registering the model under the name '{modelName}'")
# await_registration_for=0: do not block here; readiness is polled in a later cell
result = mlflow.register_model(logged_model, full_model_name, await_registration_for=0)

## Retrieving model 

In [0]:
from mlflow.tracking.client import MlflowClient

In [0]:
# Show the version assigned to the model by the registration call above
result.version

In [0]:
# Look up the details of the model version that was just registered
client = MlflowClient()
registered_name = f"dbacademy.labuser9128531_1738705451.{modelName}"
client.get_model_version(name=registered_name, version=result.version)

When model training and registration are automated, this code can be used to make sure the model version is fully available and ready before any further operations are performed.

In [0]:
# Retrieving the model
import time

# Give up after this many seconds instead of polling forever
REGISTRATION_TIMEOUT_S = 600
# Pause between two status checks
POLL_INTERVAL_S = 5

client = MlflowClient() # query about model version, info
full_model_name = 'dbacademy.labuser9128531_1738705451.' + modelName
deadline = time.monotonic() + REGISTRATION_TIMEOUT_S

model_version_details = None
while True:
  model_version_details = client.get_model_version(name=full_model_name, version=result.version)
  status = model_version_details.status
  if status == 'READY':
    break
  # A failed registration never becomes READY; stop instead of looping forever
  if status == 'FAILED_REGISTRATION':
    raise RuntimeError(
        f"Registration of '{full_model_name}' version {result.version} failed: "
        f"{model_version_details.status_message}"
    )
  if time.monotonic() >= deadline:
    raise TimeoutError(
        f"'{full_model_name}' version {result.version} was not READY after "
        f"{REGISTRATION_TIMEOUT_S} seconds (last status: {status})"
    )
  time.sleep(POLL_INTERVAL_S) # not READY yet: wait before checking again

In [0]:
# Print the details of the model version fetched by the polling loop above
print(model_version_details)

## Set up model as production
`client.transition_model_version_stage` has been deprecated, so an alias is used instead.

In [0]:
# Create the "production" alias for the model version registered above.
# Use result.version rather than a hard-coded 1: when the model name already
# exists (e.g. the notebook is re-run), the new version number is no longer 1 and
# the alias would otherwise point at a stale version.
client.set_registered_model_alias('dbacademy.labuser9128531_1738705451.'+modelName, "production", result.version)