In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np
import h3
import joblib


In [None]:
# Load the waybill CSV into the working DataFrame used by the cells below.
# NOTE(review): the path starts with "....Data/" rather than a conventional
# "../" prefix - confirm it resolves from the notebook's working directory.
data_4 = pd.read_csv("....Data/all_waybill_info_meituan_0322.csv")

In [None]:
# Bounding box for valid sender/recipient coordinates, expressed in the raw
# integer units of the CSV (the values are divided by scaling_factor below
# before being used as latitude/longitude).
min_lat = 0
min_lon = 0
max_lat = 46000000
max_lon = 174700000

# First filter the data, removing an order area that lies outside the main
# area of operation. Series.between is inclusive on both ends, matching the
# original >= / <= comparisons.
in_bounds = (
    data_4['recipient_lat'].between(min_lat, max_lat)
    & data_4['recipient_lng'].between(min_lon, max_lon)
    & data_4['sender_lat'].between(min_lat, max_lat)
    & data_4['sender_lng'].between(min_lon, max_lon)
)
# .copy() detaches the filtered frame from the original so that the column
# assignment below does not raise SettingWithCopyWarning.
data_4 = data_4[in_bounds].copy()

# Epoch seconds -> datetime, shifted by 8 hours because the data is probably
# recorded for a Chinese timezone (UTC+8).
data_4['platform_order_date'] = (
    pd.to_datetime(data_4['platform_order_time'], unit='s') + pd.Timedelta(hours=8)
)

# Data cleaning: drop orders with a zero estimated meal preparation time and
# drop pre-booked orders.
data_4 = data_4[data_4['estimate_meal_prepare_time'] != 0].reset_index(drop=True)
data_4 = data_4[data_4['is_prebook'] != 1].reset_index(drop=True)

# Rescale the raw integer coordinates. Note that grab_lat / grab_lng are
# rescaled here but were not part of the bounding-box filter above.
# WARNING: this cell mutates data_4 in place, so re-running it without
# reloading the CSV divides the coordinates a second time.
scaling_factor = 1_000_000
coordinate_columns = [
    'sender_lat', 'sender_lng',
    'recipient_lat', 'recipient_lng',
    'grab_lat', 'grab_lng',
]
data_4[coordinate_columns] = data_4[coordinate_columns] / scaling_factor

In [None]:
WORK_RESOLUTION = 13  # All courier positions and specific locations (restaurants, customers) are mapped to this grid.
MACRO_RESOLUTION = 8  # Lower resolution for strategic demand analysis (res = 8 ~0.74 km²): available couriers in a cell versus demand.

# Convert all relevant lat/lon coordinates from the DataFrame into H3 cell
# indices at WORK_RESOLUTION. This maps every point of interest (sender,
# recipient and grab positions) onto the discrete hexagon grid and stores the
# result in '<prefix>_h3' columns.
try:
    for prefix in ('sender', 'recipient', 'grab'):
        # Iterating the two coordinate columns directly avoids building a
        # full row Series per record, which row-wise DataFrame.apply does.
        data_4[f'{prefix}_h3'] = [
            h3.latlng_to_cell(lat, lng, WORK_RESOLUTION)
            for lat, lng in zip(data_4[f'{prefix}_lat'], data_4[f'{prefix}_lng'])
        ]
    print("Conversion to H3 indices complete.")
except h3.H3ValueError as e:
    # The message must be an f-string, otherwise "{e}" is printed literally
    # and the actual H3 error text is lost.
    # NOTE(review): the error is only reported, not re-raised; later cells
    # that read the *_h3 columns will fail if the conversion did not finish.
    print(f"ERROR: A coordinate in the dataset is invalid. H3 message: {e}")

In [None]:
#Prepare the functions for the features

def get_hex_distance(start_hex, end_hex):
    """Return the H3 grid distance between two cells.

    Args:
        start_hex: H3 index of the origin cell.
        end_hex: H3 index of the destination cell.

    Returns:
        The result of ``h3.grid_distance(start_hex, end_hex)``, or
        ``float('inf')`` when H3 raises ``H3FailedError`` or a ``TypeError``
        occurs (for example because one of the inputs is missing).
    """
    # The former unused `key = tuple(sorted(...))` was evaluated outside the
    # try block and raised an uncaught TypeError for mixed/missing inputs,
    # bypassing the fallback below.
    try:
        return h3.grid_distance(start_hex, end_hex)
    except (h3.H3FailedError, TypeError):
        # Sentinel for "distance unavailable"; consumers must handle inf.
        return float('inf')
    
# Feature: grid distance between each order's sender cell and recipient cell.
data_4['hex_distance'] = [
    get_hex_distance(origin_cell, destination_cell)
    for origin_cell, destination_cell in zip(data_4['sender_h3'], data_4['recipient_h3'])
]

# Show the first rows as a quick sanity check of the new column.
print(data_4[['sender_h3', 'recipient_h3', 'hex_distance']].head())

In [None]:
# Hour of day at which the order was pushed, derived from the epoch-seconds
# 'order_push_time' column.
# NOTE(review): no +8h offset is applied here, unlike 'platform_order_date'
# earlier in the notebook - confirm that the consumer of this feature expects
# the hour in the same time base.
data_4['push_hour'] = pd.to_datetime(data_4['order_push_time'], unit='s').dt.hour

In [None]:
# Input columns of the model.
features = [
    'is_weekend',
    'push_hour',
    #'estimate_meal_prepare_time',
    'hex_distance'
]

# get_hex_distance marks failed lookups with float('inf'), and scikit-learn
# estimators refuse inf/NaN inputs, so train only on rows whose features are
# all finite. When every row is finite this selects the full frame and the
# split below is unchanged.
finite_rows = np.isfinite(data_4[features].to_numpy(dtype=float)).all(axis=1)
n_dropped = int((~finite_rows).sum())
if n_dropped:
    print(f"Dropping {n_dropped} rows with non-finite feature values before training.")
model_rows = data_4.loc[finite_rows]

# Define objective and features.
# The target is whether the courier grabbed the order.
# NOTE(review): the model is later saved as 'rejection_model' although it is
# fitted on 'is_courier_grabbed' - confirm which class the consumer treats as
# a rejection.
X = model_rows[features]
y = model_rows['is_courier_grabbed']

# Split data into train and test sets (stratified on the target, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Train the logistic regression model.
model = LogisticRegression()
model.fit(X_train, y_train)

# Report hold-out accuracy so the held-out split is actually used; with an
# imbalanced target this number should be read alongside the class balance.
test_accuracy = accuracy_score(y_test, model.predict(X_test))
print(f"Hold-out accuracy: {test_accuracy:.3f}")

In [None]:
# We save the fitted model so it can be loaded by our ABM.
# The file is written relative to the current working directory; joblib.dump
# returns the list of written file names, which the notebook displays as the
# cell output.
model_filename = 'rejection_model.joblib'
joblib.dump(model, model_filename)