# Demo: load the binary classifier models and compute their daily evaluation metrics for drift detection

### Loading the binary_models.yaml file

In [1]:
from os import truncate
import sys
import pandas as pd
from pathlib import Path
import pandas as pd
import joblib
from pathlib import Path
import os
import yaml
import h3

# Make the sibling `lib` directory importable: os.path.abspath('') resolves to the
# current working directory, and `lib` is looked up next to it (one level up).
path_root = Path(os.path.abspath('')).parent / 'lib'
sys.path.append(os.fspath(path_root))
# Project-local helpers; this import only resolves once `lib` is on sys.path.
from ds_shared.service_level import lp_sla_time_in_minutes, unify_service_levels

# Folder holding the serialized models and their YAML description
# (an `output_models` directory one level above the working directory).
MODEL_FOLDER = Path().resolve().parents[0] / 'output_models'

# YAML file describing the binary models
yaml_file_path = MODEL_FOLDER / "binary_models.yaml"

# Parse the YAML with the safe loader (plain data types only)
with yaml_file_path.open('r') as file:
    yaml_data = yaml.safe_load(file)

# H3 resolution used later to derive the `h3_zone` feature
best_resolution = yaml_data['geospatial_resolution']

# Feature list per model, keyed by model number; the config is expected to
# provide the entries `set0` .. `set3` under `feature_sets`.
predictors = {
    model_num: yaml_data['feature_sets'][f'set{model_num}']
    for model_num in range(4)
}

### Loading all the models

In [2]:
# One serialized model is expected per feature set
n_models = len(predictors)
all_models = {}

# Load `model_set<i>.joblib` for every model number and report where it came from.
# NOTE(review): joblib.load unpickles arbitrary Python objects -- only point
# MODEL_FOLDER at artefacts you trust.
for i in range(n_models):
    model_path = MODEL_FOLDER / f'model_set{i}.joblib'
    all_models[i] = joblib.load(model_path)
    print(f"Loaded models from: {model_path}")

Loaded models from: /Users/tariehkgeter/projects-active/lm-aspm-1-notebook-play/output_models/model_set0.joblib
Loaded models from: /Users/tariehkgeter/projects-active/lm-aspm-1-notebook-play/output_models/model_set1.joblib
Loaded models from: /Users/tariehkgeter/projects-active/lm-aspm-1-notebook-play/output_models/model_set2.joblib
Loaded models from: /Users/tariehkgeter/projects-active/lm-aspm-1-notebook-play/output_models/model_set3.joblib


### Loading the last day's data for evaluating metrics

In [3]:
import sys
# Run the extraction scripts as shell commands using the same Python interpreter
# as this kernel (`{sys.executable}` is interpolated by IPython into the command).
print("Extracting data!!")
# Writes into the 'daily_data' folder with --capture_days 1; presumably this pulls
# the most recent day of attempts -- TODO confirm against the script's argument help.
!{sys.executable} ../data_extraction/sf-extract-attempt-data-evenly-by-sla.py --capture_days 1 --folder_name 'daily_data' --sample_rate 10

print("Combining CSVs!!")
# Presumably merges the extracted CSV parts into the single file that the next
# cell reads from 'daily_data' -- TODO confirm against combine_csvs.py.
!{sys.executable} ../data_extraction/combine_csvs.py --folder_name 'daily_data'

Extracting data!!
THE PATH IS ../daily_data
('PROD_REPORTING_RW_AR', 'PROD_DB_V2', 'CURATED_DATA')
completed the execution
Combining CSVs!!


In [4]:
# Load the combined daily extract (the variable is called PRICING_SET, but the
# file holds attempt-level rows with their delivery status).
DATA = Path().resolve().parents[0] / 'daily_data'
PRICING_SET = DATA / 'attempt_success_prediction_final_mi.csv'
# dtype={7: str} forces the column at position 7 to be read as strings.
df_day = pd.read_csv(PRICING_SET, dtype={7: str})
df_day = df_day.sort_values(by='dispatchedOn').reset_index(drop=True)
# keep=False drops *every* row whose attemptId occurs more than once
# (no representative of a duplicated attempt is kept).
df_day = df_day.drop_duplicates(subset=['attemptId'], keep=False)
df_day, all_sls = unify_service_levels(df_day)

# Binary target: 1 when the attempt was delivered, 0 otherwise.
# Vectorised comparison instead of a Python-level row-wise apply.
df_day['attemptSuccess'] = (df_day['attemptStatus'] == 'DELIVERED').astype(int)
df_day['dispatchedOn'] = pd.to_datetime(df_day['dispatchedOn'])

# Including only the rows with serviceLevels in the desired modes.
# .copy() makes the filtered frame independent, so the column assignment below
# does not write into a slice of the unfiltered frame (SettingWithCopyWarning).
df_day = df_day[df_day['serviceLevel'].isin(lp_sla_time_in_minutes.keys())].copy()

# Adding the geospatial feature: H3 cell of the origin coordinates at the
# configured resolution. Building the values from the two coordinate columns
# (rather than apply(axis=1)) also behaves sensibly when the frame has no rows.
df_day['h3_zone'] = pd.Series(
    [
        h3.latlng_to_cell(lat, lon, best_resolution)
        for lat, lon in zip(df_day['fromLat'], df_day['fromLon'])
    ],
    index=df_day.index,
    dtype='category',
)

# TODO: change this block to read the measures from Redis, which already holds the past-1-month driver features

In [5]:
import sys
# Run the extraction scripts as shell commands using the same Python interpreter
# as this kernel (`{sys.executable}` is interpolated by IPython into the command).
print("Extracting data!!")
# Writes into the 'past_month' folder with --capture_days 31; presumably this pulls
# the trailing 31 days of attempts -- TODO confirm against the script's argument help.
!{sys.executable} ../data_extraction/sf-extract-attempt-data-evenly-by-sla.py --capture_days 31 --folder_name 'past_month' --sample_rate 10

print("Combining CSVs!!")
# Presumably merges the extracted CSV parts into the single file that is read
# from 'past_month' right below -- TODO confirm against combine_csvs.py.
!{sys.executable} ../data_extraction/combine_csvs.py --folder_name 'past_month'

# Load the combined past-month extract (the variable is called PRICING_SET, but
# the file holds attempt-level rows with their delivery status).
DATA = Path().resolve().parents[0] / 'past_month'
PRICING_SET = DATA / 'attempt_success_prediction_final_mi.csv'
# dtype={7: str} forces the column at position 7 to be read as strings.
df_month = pd.read_csv(PRICING_SET, dtype={7: str})
df_month = df_month.sort_values(by='dispatchedOn').reset_index(drop=True)
# keep=False drops *every* row whose attemptId occurs more than once
# (no representative of a duplicated attempt is kept).
df_month = df_month.drop_duplicates(subset=['attemptId'], keep=False)
df_month, all_sls = unify_service_levels(df_month)

# Binary target: 1 when the attempt was delivered, 0 otherwise.
# Vectorised comparison instead of a Python-level row-wise apply.
df_month['attemptSuccess'] = (df_month['attemptStatus'] == 'DELIVERED').astype(int)
df_month['dispatchedOn'] = pd.to_datetime(df_month['dispatchedOn'])

# Including only the rows with serviceLevels in the desired modes.
# .copy() makes the filtered frame independent, so the column assignment below
# does not write into a slice of the unfiltered frame (SettingWithCopyWarning).
df_month = df_month[df_month['serviceLevel'].isin(lp_sla_time_in_minutes.keys())].copy()

# Adding the geospatial feature: H3 cell of the origin coordinates at the
# configured resolution. Building the values from the two coordinate columns
# (rather than apply(axis=1)) also behaves sensibly when the frame has no rows.
df_month['h3_zone'] = pd.Series(
    [
        h3.latlng_to_cell(lat, lon, best_resolution)
        for lat, lon in zip(df_month['fromLat'], df_month['fromLon'])
    ],
    index=df_month.index,
    dtype='category',
)

# Keep only the month rows that are NOT part of the evaluation day, so the
# trailing statistics never include the outcomes being evaluated.
# This is an anti-join on attemptId (the key already used for de-duplication).
# Matching on the key instead of on every shared column means an attempt is
# still excluded when some other field differs between the two extracts.
df_month = df_month[~df_month['attemptId'].isin(df_day['attemptId'])]

# Per-organisation trailing counts: number of attempts and number of successes
# (attemptSuccess is 0/1, so its sum is the success count).
past_1mo_stats = df_month.groupby('lpOrganizationId').agg(
    past_1mo_attempts=('attemptSuccess', 'size'),
    past_1mo_success=('attemptSuccess', 'sum')
)

Extracting data!!
THE PATH IS ../past_month
('PROD_REPORTING_RW_AR', 'PROD_DB_V2', 'CURATED_DATA')
completed the execution
Combining CSVs!!


# Daily evaluation metrics for each model

In [6]:
# Attach the trailing-month counts to every daily row by looking up the row's
# lpOrganizationId in the index of past_1mo_stats. Organisations without any
# history in past_1mo_stats get 0 for both features.
# Series.map is a vectorised, label-based lookup (no row-wise apply) and is
# also well defined when df_day has no rows.
org_ids = df_day['lpOrganizationId']
df_day['past_1mo_total_attempts'] = org_ids.map(past_1mo_stats['past_1mo_attempts']).fillna(0).astype(int)
df_day['past_1mo_successes'] = org_ids.map(past_1mo_stats['past_1mo_success']).fillna(0).astype(int)

In [7]:
import numpy as np
from sklearn.metrics import accuracy_score, recall_score

# Ground truth for the evaluation day (1 = delivered, 0 = otherwise)
y_test = df_day['attemptSuccess']

# Per-model cut-offs from the YAML config, keyed as `model_set<n>`
decision_boundaries = yaml_data['decision_boundary']

# Metric value per model number, rounded to 4 decimals
daily_accuracy, daily_sensitivity = {}, {}

for model_num in range(len(predictors)):
    feature_cols = predictors[model_num]
    model_output = all_models[model_num].predict(df_day[feature_cols])
    # A row counts as a predicted success when the model output reaches the
    # configured decision boundary for this model.
    preds = model_output >= decision_boundaries[f'model_set{model_num}']
    daily_accuracy[model_num] = np.round(accuracy_score(y_test, preds), 4)
    # Sensitivity is the recall of the positive (successful) class.
    daily_sensitivity[model_num] = np.round(recall_score(y_test, preds), 4)

In [8]:
from pathlib import Path
import pandas as pd
from datetime import datetime

# Step 1: build a single log row for this run: the run date plus accuracy,
# sensitivity and a drift flag for every evaluated model.
records = []
today = datetime.today().date()

# Per-model minimum acceptable values from the YAML config, keyed as `model_set<n>`
drift_cfg = yaml_data['drift_detection']
accuracy_thresholds = drift_cfg['accuracy_condition']['thresholds']
sensitivity_thresholds = drift_cfg['sensitivity_condition']['thresholds']

row = {'date': today}
# Iterate over the models that were actually evaluated instead of a hard-coded
# count, so this cell stays in step with the evaluation loop.
for model_num in daily_accuracy:
    model_name = f'model_set{model_num}'
    accuracy = daily_accuracy[model_num]
    sensitivity = daily_sensitivity[model_num]
    row[f'accuracy_set{model_num}'] = accuracy
    row[f'sensitivity_set{model_num}'] = sensitivity
    # Drift is flagged when either metric falls below its configured threshold.
    row[f'drift_set{model_num}'] = bool(
        accuracy < accuracy_thresholds[model_name]
        or sensitivity < sensitivity_thresholds[model_name]
    )
records.append(row)

df_daily = pd.DataFrame(records)
# Step 2: Append to CSV log
# NOTE: every execution of this cell appends one more row, so re-running it on
# the same day produces several rows with the same date.
log_path = Path().resolve().parents[0] / 'output_models' / 'model_eval_log.csv'

if log_path.exists():
    df_log = pd.read_csv(log_path)
    df_log = pd.concat([df_log, df_daily], ignore_index=True)
else:
    df_log = df_daily

df_log.to_csv(log_path, index=False)