# This notebook demonstrates how to load the binary classifier models built to predict the success of an LP attempt

### Loading the binary_models.yaml file

In [1]:
from os import truncate
import sys
import pandas as pd
from pathlib import Path
import pandas as pd
import joblib
from pathlib import Path
import os
import yaml
import h3

# Make the project's shared `lib` package importable; it sits one level above
# the notebook's working directory.
path_root = Path(os.path.abspath('')).parent / 'lib'
sys.path.append(str(path_root))
from ds_shared.service_level import unify_service_levels, lp_sla_time_in_minutes

# The trained models and their YAML manifest are stored in `output_models`,
# also one level above the working directory.
MODEL_FOLDER = Path().resolve().parent / 'output_models'
yaml_file_path = MODEL_FOLDER / "binary_models.yaml"

# Parse the manifest; safe_load only builds plain Python objects.
with yaml_file_path.open('r') as manifest:
    yaml_data = yaml.safe_load(manifest)

# H3 resolution used later when deriving the `h3_zone` feature.
best_resolution = yaml_data['geospatial_resolution']

# Feature list for each of the four models, keyed by model number (set0..set3).
predictors = {
    set_idx: yaml_data['feature_sets'][f'set{set_idx}']
    for set_idx in range(4)
}

### Loading all the models

In [2]:
# Load one fitted model per feature set; files are named model_set<k>.joblib.
# NOTE: joblib.load unpickles the file, so only load artifacts from a trusted source.
n_models = len(predictors)
all_models = {}

for i in range(n_models):
    fname = f'model_set{i}.joblib'
    model_path = MODEL_FOLDER / fname
    all_models[i] = joblib.load(model_path)
    print(f"Loaded models from: {model_path}")

Loaded models from: /Users/tariehkgeter/projects-active/lm-aspm-1-notebook-dev/output_models/model_set0.joblib
Loaded models from: /Users/tariehkgeter/projects-active/lm-aspm-1-notebook-dev/output_models/model_set1.joblib
Loaded models from: /Users/tariehkgeter/projects-active/lm-aspm-1-notebook-dev/output_models/model_set2.joblib
Loaded models from: /Users/tariehkgeter/projects-active/lm-aspm-1-notebook-dev/output_models/model_set3.joblib


# Loading the past month's data to update driver features

In [3]:
import sys
# Run the extraction script with the kernel's own interpreter. Arguments passed:
# --capture_days 30, --folder_name 'past_month', --sample_rate 10.
# (The exact meaning of --sample_rate is defined by the script, not shown here.)
print("Extracting data!!")
!{sys.executable} ../data_extraction/sf-extract-attempt-data-evenly-by-sla.py --capture_days 30 --folder_name 'past_month' --sample_rate 10

# Run the CSV-combining script on the same 'past_month' folder.
print("Combining CSVs!!")
!{sys.executable} ../data_extraction/combine_csvs.py --folder_name 'past_month'

Extracting data!!
THE PATH IS ../past_month
('PROD_REPORTING_RW_AR', 'PROD_DB_V2', 'CURATED_DATA')
completed the execution
Combining CSVs!!


In [4]:
# Load last month's extracted attempt data (the combined CSV in ../past_month).
DATA = Path().resolve().parents[0] / 'past_month'
PRICING_SET = DATA / 'attempt_success_prediction_final_mi.csv'
# Force the column at position 7 to be read as text
# (presumably serviceLevel -- TODO confirm against the CSV header).
df_month = pd.read_csv(PRICING_SET, dtype={7: str})

# Parse the timestamps before sorting so the ordering is chronological rather
# than lexicographic on the raw strings.
df_month['dispatchedOn'] = pd.to_datetime(df_month['dispatchedOn'])
df_month = df_month.sort_values(by='dispatchedOn').reset_index(drop=True)
# keep=False discards every row whose attemptId occurs more than once.
df_month = df_month.drop_duplicates(subset=['attemptId'], keep=False)

# Binary target: 1 when attemptStatus is 'DELIVERED', otherwise 0.
df_month['attemptSuccess'] = (df_month['attemptStatus'] == 'DELIVERED').astype('int64')

df_month, all_sls = unify_service_levels(df_month)
# Keep only rows whose serviceLevel is a key of lp_sla_time_in_minutes. The
# explicit copy makes the result an independent frame, so the column
# assignment below does not raise SettingWithCopyWarning.
df_month = df_month[df_month['serviceLevel'].isin(lp_sla_time_in_minutes.keys())].copy()

# Geospatial feature: H3 cell of (fromLat, fromLon) at the configured resolution.
# Built column-wise so an empty frame still yields an (empty) categorical Series.
df_month['h3_zone'] = pd.Series(
    [
        h3.latlng_to_cell(lat, lon, best_resolution)
        for lat, lon in zip(df_month['fromLat'], df_month['fromLon'])
    ],
    index=df_month.index,
    dtype='category',
)

In [5]:
# Per-LP totals for the loaded month: number of attempts (row count) and
# number of successful attempts (sum of the 0/1 attemptSuccess flag).
past_1mo_stats = (
    df_month
    .groupby('lpOrganizationId')['attemptSuccess']
    .agg(past_1mo_attempts='size', past_1mo_success='sum')
)

In [6]:
# Show the column names of the prepared frame.
df_month.columns

Index(['deliveryId', 'attemptId', 'lpOrganizationId', 'attemptNumber',
       'deliveryStatus', 'attemptStatus', 'shipperCost', 'serviceLevel',
       'dispatchedOn', 'distanceMi', 'weightLbsTotal', 'largestWeightLbs',
       'sizeCuInTotal', 'largestDimIn', 'secondDimIn', 'thirdDimIn',
       'itemCount', 'fromLat', 'fromLon', 'toLat', 'toLon', 'stateCode',
       'NETWORK_TYPE', 'SHIPPER_CONTRACT_SLA_ID', 'attemptSuccess', 'h3_zone'],
      dtype='object')

In [7]:
# Display the per-LP attempt and success counts computed above.
past_1mo_stats

Unnamed: 0_level_0,past_1mo_attempts,past_1mo_success
lpOrganizationId,Unnamed: 1_level_1,Unnamed: 2_level_1
0274e9a2-7d49-4607-bdfe-bf058ff8d4e5,51,4
0323cd0e-23fb-4959-8587-51761336a435,81,0
05848181-3122-4207-a46e-0ec29ee944bd,1142,169
060b25cd-4e91-4928-90a7-51859f061c59,24,2
063a6987-7c21-4c9f-8473-a80b9382b925,26,1
...,...,...
e9606baf-834f-4ab2-a06b-5ce8ba60d8af,27,4
ed002bcd-a64d-4b2e-a81b-734e3a83fcf7,557,21
f070aa38-48ab-4ef2-a2ac-c88914b02981,247,2
f3a11bfe-41ce-48ec-9136-0b22e12b1925,3941,35


# Storing the past month's attempt and success counts in Redis

In [8]:
# We are going to use Redis to store the field: past_1mo_attemppts and past_1mo_success
# If you don't have it installed, run the following command:
%pip install redis python-dotenv

import os
from pathlib import Path
from dotenv import load_dotenv
import redis

# Load the .env that lives in data_extraction/
load_dotenv(Path("..") / "data_extraction" / ".env")  
# Now REDIS_HOST, REDIS_PORT, REDIS_KEY are in os.environ

# Build the Redis client (SSL only)
redis_client = redis.StrictRedis(
    host=os.getenv("REDIS_HOST"),
    port=int(os.getenv("REDIS_PORT")),
    password=os.getenv("REDIS_KEY"),
    ssl=True
)

print("✅ Connected to Redis at", os.getenv("REDIS_HOST"))

Note: you may need to restart the kernel to use updated packages.
✅ Connected to Redis at onerail-data-science-staging.redis.cache.windows.net


In [9]:
# ─── 3) Seed Redis ──────────
# lpOrganizationId is the index of past_1mo_stats, so it is read from each
# tuple's Index field. All SET commands are queued on a non-transactional
# pipeline and sent together, instead of two network round trips per LP.
# NOTE(review): keys are written without an expiry, and LPs absent from this
# month's data keep whatever value was stored previously -- confirm that is intended.
with redis_client.pipeline(transaction=False) as pipe:
    for lp_row in past_1mo_stats.itertuples():
        lp = lp_row.Index
        # Store two keys for each LP
        pipe.set(f"lpOrganizationId:{lp}:past_1mo_total_attempts", int(lp_row.past_1mo_attempts))
        pipe.set(f"lpOrganizationId:{lp}:past_1mo_success", int(lp_row.past_1mo_success))
    pipe.execute()

print(f"✅ Seeded Redis with {len(past_1mo_stats)} LP records")

✅ Seeded Redis with 74 LP records


In [10]:
# ─── 4) Verify Redis contents ──────────
import random

# Spot-check up to 3 LPs drawn from the index of past_1mo_stats. The sample
# size is capped at the number of LPs because random.sample raises when asked
# for more items than the population holds.
lp_ids = list(past_1mo_stats.index)
sample_lps = random.sample(lp_ids, k=min(3, len(lp_ids)))
mismatched_lps = []

print(f"Verifying Redis data for {len(sample_lps)} random LP organizations:")
print("-" * 70)
for lp in sample_lps:
    # Get values from Redis (None when the key does not exist)
    t = redis_client.get(f"lpOrganizationId:{lp}:past_1mo_total_attempts")
    s = redis_client.get(f"lpOrganizationId:{lp}:past_1mo_success")
    redis_total = int(t or 0)
    redis_success = int(s or 0)

    # Get values from DataFrame for comparison
    df_total = past_1mo_stats.loc[lp, 'past_1mo_attempts']
    df_success = past_1mo_stats.loc[lp, 'past_1mo_success']

    # Display in a more readable format
    print(f"LP ID: {lp}")
    print(f"Redis:     total={redis_total:4}  success={redis_success:4}")
    print(f"DataFrame: total={df_total:4}  success={df_success:4}")

    # Show the actual Redis key and value
    print(f"Redis Key: lpOrganizationId:{lp}:past_1mo_success  Redis Value: {s.decode('utf-8') if s else 0}")
    print("-" * 70)

    # A missing key counts as a mismatch: seeding writes both keys for every LP.
    if t is None or s is None or (redis_total, redis_success) != (int(df_total), int(df_success)):
        mismatched_lps.append(lp)

# Only report success when the sampled values were actually compared and agree.
if mismatched_lps:
    raise AssertionError(f"Redis values differ from the DataFrame for: {mismatched_lps}")

print("✅ Verification complete: Redis data matches DataFrame values")

Verifying Redis data for 3 random LP organizations:
----------------------------------------------------------------------
LP ID: 71b4384b-4c14-471e-8a06-1045feef95a1
Redis:     total= 285  success=   4
DataFrame: total= 285  success=   4
Redis Key: lpOrganizationId:71b4384b-4c14-471e-8a06-1045feef95a1:past_1mo_success  Redis Value: 4
----------------------------------------------------------------------
LP ID: 38224777-cc12-4271-b29e-d61f1d8e108f
Redis:     total=   8  success=   0
DataFrame: total=   8  success=   0
Redis Key: lpOrganizationId:38224777-cc12-4271-b29e-d61f1d8e108f:past_1mo_success  Redis Value: 0
----------------------------------------------------------------------
LP ID: 6a126a25-a8dc-4f10-a8af-e3c0ee540cd6
Redis:     total=  77  success=   0
DataFrame: total=  77  success=   0
Redis Key: lpOrganizationId:6a126a25-a8dc-4f10-a8af-e3c0ee540cd6:past_1mo_success  Redis Value: 0
----------------------------------------------------------------------
✅ Verification complete: Redis data matches DataFrame values

# Testing on sample raw data

In [None]:
# Sample test data
# Make the shared library importable. The membership test keeps re-runs of this
# cell from appending the same entry to sys.path again.
path_root = Path(os.path.abspath('')).parents[0] / 'lib'
if str(path_root) not in sys.path:
    sys.path.append(str(path_root))
from ds_shared.service_level import unify_service_levels

# Create a new row as a dictionary
raw_data = {'distanceMi': 50, 'weightLbsTotal': 0, 'largestDimIn': 0, 'secondDimIn': 0, 'thirdDimIn': 0, 'largestWeightLbs': 0, 'serviceLevel': '1 Minute', 'fromLat': 32, 'fromLon': -90, 'lpOrganizationId':'16b6c714-f8d7-4a81-9ac2-a3938f3450c2'}

X_test = pd.DataFrame([raw_data])
X_test, all_sls = unify_service_levels(X_test)

# Derived features: the H3 cell of (fromLat, fromLon) and the LP's past-month
# counters. LPs missing from past_1mo_stats get 0 for both counters.
X_test['h3_zone'] = pd.Series(
    [
        h3.latlng_to_cell(lat, lon, best_resolution)
        for lat, lon in zip(X_test['fromLat'], X_test['fromLon'])
    ],
    index=X_test.index,
    dtype='category',
)
X_test['past_1mo_total_attempts'] = X_test['lpOrganizationId'].map(
    lambda lp_id: past_1mo_stats.past_1mo_attempts.get(lp_id, 0)
)
X_test['past_1mo_successes'] = X_test['lpOrganizationId'].map(
    lambda lp_id: past_1mo_stats.past_1mo_success.get(lp_id, 0)
)

# Model selection for a single instance (row 0 only); adapt this when X_test
# holds multiple rows. The choice depends on whether the service level is a
# key of lp_sla_time_in_minutes and whether both extra dimension columns exist.
has_known_sla = bool(X_test['serviceLevel'].isin(lp_sla_time_in_minutes.keys()).loc[0])
has_extra_dims = ('secondDimIn' in X_test.columns) and ('thirdDimIn' in X_test.columns)
model_num = {
    (True, True): 3,
    (True, False): 1,
    (False, True): 2,
    (False, False): 0,
}[(has_known_sla, has_extra_dims)]

prediction = all_models[model_num].predict(X_test[predictors[model_num]])
print('For the above set of features using model#{} and its prediction is {}'.format(model_num, prediction))