# Experiment & Config Setup

Before starting any experiment, you should define the `EXPERIMENT_NAME` (ideally, matching the name of the notebook) and the baseline configuration for all the classes that will be used across the experiment.

This notebook goes over how to utilise the stored embeddings, how to add new embeddings to the database, and how to work with a mix of stored embeddings and ones that still need to be created.

In [0]:
#%load_ext autoreload
#%autoreload 2
#-------------------------------------------------------------------------------
# REQUIRED PACKAGES
#-------------------------------------------------------------------------------
import os
import sys
from pathlib import Path
from mlops.utils.environment_setup import start_experiment

#-------------------------------------------------------------------------------
# CONFIGURABLE OPTIONS
#-------------------------------------------------------------------------------
# Experiment name (should match the notebook name).
EXPERIMENT_NAME = "testing-embeddings"
# Author email address on Databricks.
CURRENT_USER = "elise.hampton@matrgroup.com"

#-------------------------------------------------------------------------------
# SYSTEM SETUP FOR EXPERIMENT
#-------------------------------------------------------------------------------
# The presence of the runtime-version environment variable is used as the
# marker for running on Databricks.
IS_DATABRICKS = os.environ.get("DATABRICKS_RUNTIME_VERSION") is not None

# The parent of the current working directory is treated as the project root
# and is placed first on the import path.
ROOT_DIR = Path.cwd().parent
sys.path.insert(0, str(ROOT_DIR))

# Start the experiment; the returned object is used later when generating the
# pipeline config.
experiment = start_experiment(
    experiment_name=EXPERIMENT_NAME,
    root_dir=ROOT_DIR,
    is_databricks=IS_DATABRICKS,
    current_user=CURRENT_USER,
)

# Run Setup

In the following section you can run a version of this experiment by defining a run.

In [0]:
################################################################################
# START MLFLOW RUN
################################################################################
# This can be a new run (run_id=None) or an existing run that you want to
# reload, by specifying the run_id.
################################################################################
from mlops.utils.environment_setup import start_mlflow_run
from mlops.utils.pipeline import generate_pipeline_config, instantiate_pipeline

run_name, run_id = start_mlflow_run(run_id=None)

# Build the pipeline config for this run. `overrides` maps dotted config paths
# to the values that should replace the baseline configuration.
config = generate_pipeline_config(
    experiment,
    run_id,
    overrides={
        # Example override, currently disabled:
        # "selector_config.target_species_definition.base_target_species": [
        #     "Litoria verreauxii",
        #     "Litoria ewingii",
        # ],
        "modelling_data_strategy.other_species_sampling_strategy": "stratify",
        # These next two parameters need to be changed in sync.
        "preprocessor_config.birdnet_extractor_config.output_mode": "stack",
        "preprocessor_config.embeddings_databricks_config.embedding_strategy": "stack",
        "modelling_data_strategy.target_species_max_samples_per_class": 5,
        # Specific to the storing and retrieving of data.
        "preprocessor_config.orchestrator_config.embeddings_save_format": "dbx-table",
        "preprocessor_config.embeddings_databricks_config.table": "aus_museum_dbx_dev.frogid_ml.dev_embed_table",
        # These two parameters are specific to the embeddings created through
        # BirdNet; if you switch from BirdNet these need to be updated.
        "preprocessor_config.embeddings_databricks_config.embeddings": "BirdNet",
        "preprocessor_config.embeddings_databricks_config.window_duration": 3.0,
        # This parameter is currently 0.0, but when sliding windows are brought
        # in there will be another parameter that will need to be kept in sync
        # with this one.
        "preprocessor_config.embeddings_databricks_config.overlap_duration": 0.0,
    },
    force_save=True,
)

pipeline = instantiate_pipeline(config)

In [0]:
################################################################################
# LOAD CLEAN DATA
################################################################################
# The anchoring function determines how you select the class_label_single
# from a list of species in multi-species settings. Below we use the
# most-frequent-target strategy, which means that if there are multiple species
# the single class label will be the species that is most frequently represented
# among the list of class labels.
################################################################################

from mlops.feature_engineering.registry_anchoring_strategies import ANCHORING_STRATEGY_REGISTRY

# Load the cleaned data and their classes.
# `df_data` is reused by the sampling cells below, and
# `class_labels_to_species_mapping` is passed to the evaluation cells.
df_data, class_labels_to_species_mapping = pipeline.data_selector.load_data(
    label_anchor_fn=ANCHORING_STRATEGY_REGISTRY["most-frequent-target"]
)

In [0]:
################################################################################
# MODELLING
################################################################################
# The following cells demonstrate how to produce a reproducible ML model
# training pipeline. The process involves:
# 1. Selecting the subset of data to use for modelling based on a strategy
# 2. Identifying which of the selected data already has stored embeddings
# 3. Retrieving the stored embeddings and/or downloading & preprocessing the
#    remaining data to create a feature df
# 4. Training the model according to the initial experiment setup
################################################################################
# MODEL_REGISTRY is imported here and used by the training cell (Step 4).
from mlops.training.tf_model_registry import MODEL_REGISTRY

# Step 1: Sample the modelling data using the data_sampler, following the
# modelling data strategy held in the run config.
df_modelling = pipeline.data_sampler.sample_modelling_dataset(
    df_data=df_data,
    modelling_strategy=config.modelling_data_strategy,
)


In [0]:
import pandas as pd

# Step 2: Identify what data is already stored in the embeddings table and what
# is not. After this call `df_modelling_comp` holds the part that is treated as
# already stored, and `df_modelling` is narrowed to the part that is still
# missing.
df_modelling_comp, df_modelling = pipeline.data_databricks.data_embeddings_search(df_modelling)

# Step 3: Retrieve the embeddings that are stored and/or create the ones that
# are missing (and store them for next time).

#### Logic for both retrieving and producing embeddings where needed ####
# Columns and dtypes of the empty placeholder frames. A placeholder stands in
# for whichever of the two sources (database / freshly created) yields nothing.
columns_with_dtypes = {
    'id': int,
    'chunk_index': int,
    'features': 'O',
    'class_label': float,
    'species_name': str,
}
df_modelling_embed = pd.DataFrame(
    {col: pd.Series(dtype=dtype) for col, dtype in columns_with_dtypes.items()}
)
df_modelling_features = df_modelling_embed.copy()

if df_modelling_comp.empty:
    print("No data in database - need to download and produce embeddings for all.")
else:
    # Step 3.1: There is data in the database, so fetch its embeddings.
    df_modelling_embed = pipeline.data_databricks.data_embeddings_retrieve(df_modelling_comp)

if not df_modelling.empty:
    # Step 3.2: Download the missing data and keep the updated dataframe
    # (in case of missing files).
    df_modelling = pipeline.data_downloader.download_files(df_modelling)
    # Step 3.3: Create embeddings for the data using the data_preprocessor.
    df_modelling_features = pipeline.data_preprocessor.run(df_modelling)

# Step 3.4: Combine the data in cases where it was obtained both ways.
# Empty frames are left out of the concat: passing empty entries alongside
# non-empty ones is deprecated in pandas (FutureWarning) and will change the
# resulting dtypes. When both frames are empty, the empty frames themselves are
# concatenated so the result is still an (empty) dataframe.
feature_frames = [df_modelling_features, df_modelling_embed]
non_empty_frames = [frame for frame in feature_frames if not frame.empty]
df_modelling_features = pd.concat(non_empty_frames or feature_frames, ignore_index=True)


In [0]:
# Step 4: Train the model
# `df_modelling_features` is the combined feature dataframe built in the
# previous cell. The model function is looked up by name in MODEL_REGISTRY,
# which is imported in the modelling cell above.
model = pipeline.model_trainer.train(df_modelling_features, model_fn=MODEL_REGISTRY['birdnet_mlp_multiclass'])

# Before you continue
There are two options going forward: 
1. All the data you need is already stored, so you only need to call 
> pipeline.data_databricks.data_embeddings_retrieve(df_modelling) 
2. You don't know what data is available, so you will need to use the same logic as above to work out which of the data you want is missing and to add it to the database.

In [0]:
################################################################################
# EVALUATION: TEST DATA
################################################################################
# This cell demonstrates how to evaluate a model on the test data:
# 1. Use the data_sampler to select the sample you are interested in
# 2. Download & preprocess the selected sample to create a feature df
# 3. Evaluate the model by pointing to the correct run_id
################################################################################

# Select the test sample for this run.
df_sample = pipeline.data_sampler.sample_test_data(
    run_id=run_id,
    df=df_data,
)

# Download any files the sample requires, then create embeddings for it.
df_sample = pipeline.data_downloader.download_files(df_sample)
df_sample_features = pipeline.data_preprocessor.run(df_sample)

# Evaluate the model stored inside the given run_id on the sample features.
(
    y_true,
    y_true_binarized,
    y_pred,
    y_probs,
    macro_results,
    per_species_results,
) = pipeline.model_evaluator.evaluate(
    run_id=run_id,
    df_features=df_sample_features,
    class_label_to_species_mapping=class_labels_to_species_mapping,
    dir_name_to_store_results="single-species-max-1000",
)

In [0]:
################################################################################
# EVALUATION: HOLDOUT DATA
################################################################################
# This cell demonstrates how to evaluate a model on the holdout data:
# 1. Use the data_sampler to select the sample you are interested in
# 2. Download & preprocess the selected sample to create a feature df
# 3. Evaluate the model by pointing to the correct run_id
################################################################################
from mlops.feature_engineering.registry_filtering_strategies import FILTERING_STRATEGY_REGISTRY

# Select the holdout sample, filtered with the single-species-only strategy
# and capped at 100 samples per class.
df_sample = pipeline.data_sampler.sample_hold_out_data(
    run_id=run_id,
    df_cleaned=df_data,
    filtering_strategy_fn=FILTERING_STRATEGY_REGISTRY["single-species-only"],
    max_samples_per_class=100,
)

# Download any files the sample requires, then create embeddings for it.
df_sample = pipeline.data_downloader.download_files(df_sample)
df_sample_features = pipeline.data_preprocessor.run(df_sample)

# Evaluate the model stored inside the given run_id on the sample features.
(
    y_true,
    y_true_binarized,
    y_pred,
    y_probs,
    macro_results,
    per_species_results,
) = pipeline.model_evaluator.evaluate(
    run_id=run_id,
    df_features=df_sample_features,
    class_label_to_species_mapping=class_labels_to_species_mapping,
    dir_name_to_store_results="holdout-data-max-100",
)