In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import sys
from pathlib import Path

In [None]:
#============================================================================       
# Databricks Setup & Variables
#============================================================================
def is_databricks():
    """Tell whether the Databricks runtime marker is present in the environment.

    Returns:
        bool: True when the ``DATABRICKS_RUNTIME_VERSION`` environment
        variable is set (to any value), False when it is absent.
    """
    return os.environ.get("DATABRICKS_RUNTIME_VERSION") is not None

# Evaluate the runtime check once; the rest of this cell branches on the result.
IS_DATABRICKS = is_databricks()

# Experiment identity and the workspace location of the deployed bundle.
EXPERIMENT_NAME = "birdnet48000_averaging_multiclass_aftermergetest"
# NOTE(review): user-specific address is hard-coded here — confirm the domain
# is spelled as intended and consider sourcing it from configuration instead.
CURRENT_USER = "hannah.weng@matrgrou.com.au"
BUNDLE_TARGET = "frogid-ml-15species-dbx"
DATABRICKS_PATH = f"/Workspace/Users/{CURRENT_USER}/{BUNDLE_TARGET}/mlops/"

if IS_DATABRICKS:
    # Notebook cells get re-run; without this guard every execution would
    # append another copy of the same entry to sys.path.
    if DATABRICKS_PATH not in sys.path:
        sys.path.append(DATABRICKS_PATH)
    print(f"✅ Databricks path: {DATABRICKS_PATH}")

In [None]:
import sys
import os
# Resolve the directory this notebook lives in, move one level above it, and
# expose the directory two levels further up to the import system so that the
# `mlops` package can be imported.
if IS_DATABRICKS:
    # `dbutils` is only touched on this branch; referencing it unconditionally
    # raises NameError on any kernel where that name is not defined.
    notebook_path = '/Workspace/' + os.path.dirname(
        dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()
    )
elif 'notebook_path' not in globals():
    # NOTE(review): assumes a non-Databricks kernel starts in this notebook's
    # directory — confirm. Captured only on the first run so that re-running
    # the cell does not keep climbing one directory higher each time.
    notebook_path = os.getcwd()
os.chdir(notebook_path)
os.chdir('..')
# The entry is relative, so it is resolved against the working directory set
# just above; skip it when a previous run of this cell already added it.
if "../.." not in sys.path:
    sys.path.append("../..")
from mlops.utils.notebook_setup import generate_project_path

print('notebook_setup imported successfully')

#============================================================================       
# Setup Paths & Project
#============================================================================

# Unpack the ten values returned by generate_project_path. Note that
# IS_DATABRICKS is rebound here to whatever the helper reports.
(
    ROOT_DIR,
    DIR_DATA,
    DIR_AUDIO_FILES,
    DIR_RAW_DATA,
    DIR_EMBEDDINGS,
    DIR_ARTIFACTS,
    DIR_PROCESSED,
    FROGID_CSV_PATH,
    EXPERIMENT_FULL_NAME,
    IS_DATABRICKS,
) = generate_project_path(EXPERIMENT_NAME, CURRENT_USER, BUNDLE_TARGET)

# Backward-compatible alias: this name was originally misspelled, and other
# cells may still refer to it under the old spelling.
EXPERIEMNT_FULL_NAME = EXPERIMENT_FULL_NAME

# Echo the resolved locations so a misconfigured run is visible immediately.
print(f"✅ Databricks: {IS_DATABRICKS}")
print(f"✅ Project root directory: {ROOT_DIR}")
print(f"📁 CSV Path: {FROGID_CSV_PATH}")
print(f"📁 Audio Files Path: {DIR_AUDIO_FILES}")
print(f"📁 Processed Path: {DIR_PROCESSED}")
print(f"📁 Embeddings Path: {DIR_EMBEDDINGS}")
print(f"📁 Artifacts Path: {DIR_ARTIFACTS}")
print(f"📁 Experiment Path: {EXPERIMENT_FULL_NAME}")

In [None]:
#============================================================================
# Experiment Parameters & Configurations
#============================================================================

# Tunable knobs for this experiment.
TARGET_AUDIO_FORMAT = "wav"
MAX_SAMPLES_PER_CLASS = 1350
INCLUDE_OTHER_CATEGORY = True  # when set, one extra "other" class is counted on top of the target species

# Species names the experiment targets; the list order is kept as authored.
TARGET_SPECIES = [
    "Rhinella marina", "Crinia signifera", "Limnodynastes peronii",
    "Litoria moorei", "Litoria fallax", "Limnodynastes tasmaniensis",
    "Crinia parinsignifera", "Limnodynastes dumerilii", "Litoria caerulea",
    "Litoria ewingii", "Litoria verreauxii", "Litoria rubella",
    "Crinia glauerti", "Litoria peronii", "Litoria ridibunda",
]

# One class per target species, plus one more when the "other" bucket is enabled.
NUM_CLASSES = len(TARGET_SPECIES) + (1 if INCLUDE_OTHER_CATEGORY else 0)

# Summarise the configuration, one line per value.
print(
    f"🐸 Target species: {len(TARGET_SPECIES)}",
    f"🐸 Total classes: {NUM_CLASSES}",
    f"🐸 Max samples/class: {MAX_SAMPLES_PER_CLASS}",
    sep="\n",
)

In [None]:
################################################################################
# Data Selection
################################################################################
from feature_engineering.data_selector import FrogDataSelector, FrogDataSelectorConfig

# Settings that control how multi-species and non-target recordings are treated.
INCLUDE_MULTI_SPECIES = False  # multi-species recordings are left out for a cleaner "Other" category analysis
SEPARATE_MULTI_SPECIES_OTHER = False  # keep a single "Other" category
OTHER_SPECIES_BOOST_FACTOR = 3.0  # boost factor applied for "Other" representation
# NOTE(review): this value is defined but never passed into the selector
# config below — confirm whether it should be wired in.
OTHER_SPECIES_MIN_SAMPLES = 15  # minimum samples intended for "Other" species

# Keyword arguments for FrogDataSelectorConfig, kept as a plain dict so the
# exact settings remain inspectable after the selector has been built.
selector_config_dict = dict(
    force_rerun=True,
    csv_path=str(FROGID_CSV_PATH),
    target_species=TARGET_SPECIES,
    output_dir=str(DIR_ARTIFACTS / EXPERIMENT_NAME / "data_input"),
    allow_duplicates=False,
    allow_inappropriate=False,
    allow_people_activity=False,
    avoid_poor_quality=True,
    # Sampling criteria
    sampling_strategy='downsample',
    max_samples_per_class=MAX_SAMPLES_PER_CLASS,
    include_multi_species=INCLUDE_MULTI_SPECIES,
    separate_multi_species_other=SEPARATE_MULTI_SPECIES_OTHER,
    other_species_boost_factor=OTHER_SPECIES_BOOST_FACTOR,
)

# Build the config object, hand it to the selector, and fetch its dataframe.
selector_config = FrogDataSelectorConfig(**selector_config_dict)
selector = FrogDataSelector(selector_config)
df = selector.get_dataframe()