# Experiment & Config Setup

Before starting any experiment, you should define the `EXPERIMENT_NAME` (ideally, matching the name of the notebook) and the baseline configuration for all the classes that will be used across the experiment.

In [0]:
%load_ext autoreload
%autoreload 2
#-------------------------------------------------------------------------------
# REQUIRED PACKAGES
#-------------------------------------------------------------------------------
import os
import sys
from pathlib import Path
#-------------------------------------------------------------------------------
# CONFIGURABLE OPTIONS
#-------------------------------------------------------------------------------
EXPERIMENT_NAME = "birdnet-ave-3s-chunk-embeddings"     # Experiment name (should match notebook name)
# SCHEMA and MODEL_NAME are combined into the model name as
# "<SCHEMA>.<MODEL_NAME>" at training time and, prefixed with the catalog,
# as "<catalog>.<SCHEMA>.<MODEL_NAME>" when the model URI is published.
SCHEMA = "frogid_ml"
MODEL_NAME = "birdnet-ave-3s-chunk-embeddings"
CURRENT_USER="yulin.zhou@matrgroup.com"                   # Author email address on databricks             

#-------------------------------------------------------------------------------
# SYSTEM SETUP FOR EXPERIMENT
#-------------------------------------------------------------------------------
# Treat the session as Databricks when the runtime-version variable is set.
IS_DATABRICKS = "DATABRICKS_RUNTIME_VERSION" in os.environ
# The root directory is the parent of the current working directory; it is
# put first on sys.path before the `mlops` import below.
# NOTE(review): presumably the `mlops` package lives directly under ROOT_DIR
# -- confirm if the notebook is moved to a different folder depth.
ROOT_DIR = Path(os.getcwd()).parent
sys.path.insert(0, str(ROOT_DIR))
from mlops.utils.environment_setup import start_experiment
# The returned `experiment` object is later passed to generate_pipeline_config.
experiment = start_experiment(
    experiment_name=EXPERIMENT_NAME,
    root_dir=ROOT_DIR, 
    is_databricks=IS_DATABRICKS, 
    current_user=CURRENT_USER
)

In [0]:
# Echo the resolved root directory as the cell output (quick sanity check).
ROOT_DIR

# Run Setup

In the following section you can run a version of this experiment by defining a run.

In [0]:
################################################################################
# START MLFLOW RUN
################################################################################
# This can be a new run (run_id = None) or an existing run that you want to
# reload, by specifying the run_id
################################################################################
from mlops.utils.environment_setup import start_mlflow_run
from mlops.utils.pipeline import generate_pipeline_config, instantiate_pipeline

# The returned run_id is reused by the sampling and evaluation cells below.
run_name, run_id = start_mlflow_run(run_id = None)
# Build the pipeline configuration for this experiment and run.
# NOTE(review): the meaning of force_save and of the
# 'mlflow_config.log_model_wrapper' override is defined in
# mlops.utils.pipeline -- presumably they persist the config and enable
# logging of the model wrapper; confirm against that module.
config = generate_pipeline_config(
    experiment, 
    run_id, 
    force_save=True,
    overrides={
        'mlflow_config.log_model_wrapper': True
    }
)
# `pipeline` exposes the components used in the cells below: data_selector,
# data_sampler, data_downloader, data_preprocessor, model_trainer and
# model_evaluator.
pipeline = instantiate_pipeline(config)

In [0]:
################################################################################
# LOAD CLEAN DATA
################################################################################
# The anchoring function determines how you select the class_label_single
# from a list of species in multi-species settings. Below we use the
# most-frequent-target strategy, which means that if there are multiple species
# the single class label will be the species that is most frequently represented
# among the list of class labels.
################################################################################

from mlops.feature_engineering.registry_anchoring_strategies import ANCHORING_STRATEGY_REGISTRY

# Load the cleaned data and their classes. Two values come back:
#   - df_data: the input to the modelling, test and holdout sampling below
#   - class_labels_to_species_mapping: passed to the evaluator in both
#     evaluation cells
df_data, class_labels_to_species_mapping = pipeline.data_selector.load_data(
    label_anchor_fn=ANCHORING_STRATEGY_REGISTRY["most-frequent-target"]
)

In [0]:
################################################################################
# MODELLING
################################################################################
# The following code snippet demonstrates how to produce a reproducible ML
# model training pipeline. The process involves:
# 1. Selecting the subset of data to use for modelling based on a strategy
# 2. Downloading the files for the selected subset
# 3. Preprocessing the selected subset to create a feature df
# 4. Training the model according to the initial experiment setup
#    (the training call is in the next cell)
################################################################################
from mlops.training.tf_model_registry import MODEL_REGISTRY

# Step 1: Sample the modelling data using the data_sampler
df_modelling = pipeline.data_sampler.sample_modelling_dataset(
    df_data=df_data,
    modelling_strategy=config.modelling_data_strategy,
)

# NOTE(review): only the first 1000 rows of the sampled dataset are kept from
# here on -- presumably to keep the run short; confirm this cap is intended
# before relying on the trained model.
df_modelling = df_modelling.head(1000)

# Step 2: Download the data and return the updated dataframe (in case of missing files)
df_modelling = pipeline.data_downloader.download_files(df_modelling)

# Step 3: Create embeddings for the data using the data_preprocessor
df_modelling_features = pipeline.data_preprocessor.run(df_modelling)

In [0]:

# Step 4: Train the model. The model function is looked up in MODEL_REGISTRY
# under 'birdnet_mlp_multiclass' and the model is named "<SCHEMA>.<MODEL_NAME>".
model = pipeline.model_trainer.train(
    df_modelling_features,
    model_fn=MODEL_REGISTRY['birdnet_mlp_multiclass'],
    name=f"{SCHEMA}.{MODEL_NAME}",
)

In [0]:
################################################################################
# EVALUATION: TEST DATA
################################################################################
# The following code snippet demonstrates how to do an evaluation of a model
# 1. Select a sample you are interested in using the data_sampler
# 2. Downloading & Preprocessing the selected subset to create a feature df
# 3. Evaluating the model by pointing to the correct run_id
################################################################################

# Sample the data you are interested in
df_sample = pipeline.data_sampler.sample_test_data(run_id=run_id, df=df_data)

# Download any files required to evaluate this sample
df_sample = pipeline.data_downloader.download_files(df_sample)

# Create embeddings for the data using this sample
df_sample_features = pipeline.data_preprocessor.run(df_sample)

# Evaluate the results for the model stored inside the given run_id.
# The call returns six values, unpacked in order below.
# NOTE(review): the results directory is named "single-species-max-1000",
# but no species filter or sample cap is passed in this cell -- confirm the
# name matches what sample_test_data actually selects.
y_true, y_true_binarized, y_pred, y_probs, macro_results, per_species_results = pipeline.model_evaluator.evaluate(
    run_id=run_id,
    df_features=df_sample_features,
    class_label_to_species_mapping=class_labels_to_species_mapping,
    dir_name_to_store_results="single-species-max-1000"
)

In [0]:
################################################################################
# EVALUATION: HOLDOUT DATA
################################################################################
# The following code snippet demonstrates how to do an evaluation of a model
# 1. Select a sample you are interested in using the data_sampler
# 2. Downloading & Preprocessing the selected subset to create a feature df
# 3. Evaluating the model by pointing to the correct run_id
#
# Note: this cell reuses the variable names of the test-data evaluation cell
# (df_sample, df_sample_features and the six evaluation outputs), so running
# it overwrites those results in the notebook namespace.
################################################################################
from mlops.feature_engineering.registry_filtering_strategies import FILTERING_STRATEGY_REGISTRY

# Sample the data you are interested in: the 'single-species-only' filtering
# strategy is applied and max_samples_per_class is set to 100.
df_sample = pipeline.data_sampler.sample_hold_out_data(
    run_id=run_id,
    df_cleaned=df_data,
    filtering_strategy_fn=FILTERING_STRATEGY_REGISTRY['single-species-only'],
    max_samples_per_class=100,
)

# Download any files required to evaluate this sample
df_sample = pipeline.data_downloader.download_files(df_sample)

# Create embeddings for the data using this sample
df_sample_features = pipeline.data_preprocessor.run(df_sample)

# Evaluate the results for the model stored inside the given run_id
y_true, y_true_binarized, y_pred, y_probs, macro_results, per_species_results = pipeline.model_evaluator.evaluate(
    run_id=run_id,
    df_features=df_sample_features,
    class_label_to_species_mapping=class_labels_to_species_mapping,
    dir_name_to_store_results="holdout-data-max-100"
)

In [0]:
from mlflow.tracking import MlflowClient
def get_latest_model_version(model_name):
    """Return the highest version number found for ``model_name``.

    Every model version returned by the MLflow client search for
    ``name='<model_name>'`` is converted to ``int`` and the largest one is
    returned. The result is never lower than 1, so 1 is also returned when
    the search yields no versions at all.
    """
    client = MlflowClient()
    found_versions = client.search_model_versions(f"name='{model_name}'")
    # Seed the maximum with 1 so an empty search result still yields 1.
    return max([1, *(int(entry.version) for entry in found_versions)])

In [0]:
# Choose the catalog from the notebook location: any ROOT_DIR whose string
# form contains "prod" selects the production catalog, everything else uses
# the dev catalog.
# NOTE(review): this is a plain substring match, so unrelated path segments
# containing "prod" (e.g. "products") would also select production -- confirm
# the workspace layout makes this safe.
catalog = "Aus_Museum_DBX_Prod" if "prod" in str(ROOT_DIR) else "Aus_Museum_DBX_Dev"

In [0]:
# The returned model URI is needed by the model deployment notebook.
# The fully qualified model name is "<catalog>.<SCHEMA>.<MODEL_NAME>" and the
# URI has the form "models:/<model_name>/<model_version>".
model_name =  f"{catalog}.{SCHEMA}.{MODEL_NAME}"
# get_latest_model_version returns 1 when no version is found, so the URI
# below is built even if the search comes back empty.
model_version = get_latest_model_version(model_name)
model_uri = f"models:/{model_name}/{model_version}"
# NOTE(review): `dbutils` is not imported or defined in this notebook --
# presumably it is the global injected by the Databricks runtime, in which
# case this cell fails when run elsewhere (see IS_DATABRICKS); confirm.
# Publish the values as job task values, then exit the notebook with the URI.
dbutils.jobs.taskValues.set("model_uri", model_uri)
dbutils.jobs.taskValues.set("model_name", model_name)
dbutils.jobs.taskValues.set("model_version", model_version)
dbutils.notebook.exit(model_uri)