In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, explained_variance_score

In [3]:
# Read the historical delivery records from CSV into a DataFrame.
file_path = "historical_data.csv"  # Update with actual path
data = pd.read_csv(filepath_or_buffer=file_path)

In [4]:
# Parse both timestamp columns into pandas datetimes so that datetime
# arithmetic and the .dt accessor can be used on them.
for ts_col in ('created_at', 'actual_delivery_time'):
    data[ts_col] = pd.to_datetime(data[ts_col])

In [5]:
# Target: time between order creation and actual delivery, expressed in seconds.
delivery_elapsed = data['actual_delivery_time'] - data['created_at']
data['total_delivery_duration_seconds'] = delivery_elapsed.dt.total_seconds()

In [6]:
# Feature Engineering
# Calendar features derived from the order creation timestamp.
order_time = data['created_at'].dt
data['hour_of_day'] = order_time.hour
data['day_of_week'] = order_time.dayofweek
# Weekend flag: 1 when day_of_week is 5 or greater, otherwise 0.
data['is_weekend'] = (data['day_of_week'] >= 5).astype('int64')

# Ratio features. A small constant (1e-5) is added to every denominator so a
# zero count does not trigger a division by zero.
onshift_plus_eps = data['total_onshift_dashers'] + 1e-5
data['dasher_utilization_rate'] = data['total_busy_dashers'] / onshift_plus_eps
data['order_to_dasher_ratio'] = data['total_outstanding_orders'] / onshift_plus_eps
data['average_item_price'] = data['subtotal'] / (data['total_items'] + 1e-5)

# Spread between the most and least expensive item in the order.
data['price_range'] = data['max_item_price'] - data['min_item_price']

In [7]:
# Remove the raw timestamps and the store identifier; the features derived
# from the timestamps in the previous cells stay in the frame.
drop_cols = ['created_at', 'actual_delivery_time', 'store_id']
data = data.drop(labels=drop_cols, axis='columns')
data.head()

Unnamed: 0,market_id,store_primary_category,order_protocol,total_items,subtotal,num_distinct_items,min_item_price,max_item_price,total_onshift_dashers,total_busy_dashers,...,estimated_order_place_duration,estimated_store_to_consumer_driving_duration,total_delivery_duration_seconds,hour_of_day,day_of_week,is_weekend,dasher_utilization_rate,order_to_dasher_ratio,average_item_price,price_range
0,1.0,american,1.0,4,3441,4,557,1239,33.0,14.0,...,446,861.0,3779.0,22,4,0,0.424242,0.636363,860.247849,682
1,2.0,mexican,2.0,1,1900,1,1400,1400,1.0,2.0,...,446,690.0,4024.0,21,1,0,1.99998,1.99998,1899.981,0
2,3.0,,1.0,1,1900,1,1900,1900,1.0,0.0,...,446,690.0,1781.0,20,3,0,0.0,0.0,1899.981,0
3,3.0,,1.0,6,6900,5,600,1800,1.0,1.0,...,446,289.0,3075.0,21,1,0,0.99999,1.99998,1149.998083,1200
4,3.0,,1.0,3,3900,3,1100,1600,6.0,6.0,...,446,650.0,2390.0,2,6,1,0.999998,1.499998,1299.995667,500


In [8]:
# Handle missing and infinite values
# Treat +/-inf as missing so they go through the same imputation as NaN.
data = data.replace([np.inf, -np.inf], np.nan)
# Impute numeric columns with their column median. numeric_only=True is
# needed because the frame still holds the text column
# 'store_primary_category' at this point: without it pandas 1.x emits a
# FutureWarning and pandas 2.x raises TypeError when computing the median.
# Non-numeric columns are left untouched, which matches the previous outcome.
# NOTE(review): the target 'total_delivery_duration_seconds' is still part of
# the frame here, so any missing target values are median-imputed as well;
# consider dropping those rows instead of imputing the label.
data = data.fillna(data.median(numeric_only=True))


  data.fillna(data.median(), inplace=True)


In [9]:
# Encoding categorical variables
# Fit a separate LabelEncoder per column and keep each fitted encoder, so the
# integer-code -> original-value mapping of every column stays available
# (e.g. label_encoders[col].inverse_transform(...)). Refitting one shared
# encoder would discard the mapping learned for the first column.
label_encoders = {}
for col in ['store_primary_category', 'order_protocol']:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le
# After the loop `le` refers to the 'order_protocol' encoder, the same state
# the name had when a single encoder was reused for both columns.

In [10]:
# Preview the first five rows again, now that missing values have been filled
# and the two categorical columns have been label-encoded.
data.head()

Unnamed: 0,market_id,store_primary_category,order_protocol,total_items,subtotal,num_distinct_items,min_item_price,max_item_price,total_onshift_dashers,total_busy_dashers,...,estimated_order_place_duration,estimated_store_to_consumer_driving_duration,total_delivery_duration_seconds,hour_of_day,day_of_week,is_weekend,dasher_utilization_rate,order_to_dasher_ratio,average_item_price,price_range
0,1.0,4,0,4,3441,4,557,1239,33.0,14.0,...,446,861.0,3779.0,22,4,0,0.424242,0.636363,860.247849,682
1,2.0,47,1,1,1900,1,1400,1400,1.0,2.0,...,446,690.0,4024.0,21,1,0,1.99998,1.99998,1899.981,0
2,3.0,74,0,1,1900,1,1900,1900,1.0,0.0,...,446,690.0,1781.0,20,3,0,0.0,0.0,1899.981,0
3,3.0,74,0,6,6900,5,600,1800,1.0,1.0,...,446,289.0,3075.0,21,1,0,0.99999,1.99998,1149.998083,1200
4,3.0,74,0,3,3900,3,1100,1600,6.0,6.0,...,446,650.0,2390.0,2,6,1,0.999998,1.499998,1299.995667,500


In [11]:
# Separate the prediction target from the feature matrix.
target_col = 'total_delivery_duration_seconds'
y = data[target_col]
X = data.drop(columns=[target_col])

In [12]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [13]:
# Standardizing numerical features
# Learn the scaling statistics from the training split only, then apply that
# same fitted transform to both the training and the test split.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [14]:
# Candidate regressors, keyed by the display name used in the reports.
# Insertion order is the order in which they are trained and printed.
models = {}
models["Linear Regression"] = LinearRegression()
models["Random Forest"] = RandomForestRegressor(n_estimators=100, random_state=42)
models["XGBoost"] = XGBRegressor(objective='reg:squarederror', random_state=42)
models["K-Nearest Neighbors"] = KNeighborsRegressor()

In [15]:
# Fit every candidate on the training split and score it on the held-out split.
metric_names = ("MAE", "RMSE", "R2", "Explained Variance")
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # RMSE is the square root of the mean squared error.
    mae, rmse, r2, evs = (
        mean_absolute_error(y_test, y_pred),
        np.sqrt(mean_squared_error(y_test, y_pred)),
        r2_score(y_test, y_pred),
        explained_variance_score(y_test, y_pred),
    )
    results[name] = dict(zip(metric_names, (mae, rmse, r2, evs)))
    print(f"{name} - MAE: {mae:.2f}, RMSE: {rmse:.2f}, R²: {r2:.2f}, EVS: {evs:.2f}")


Linear Regression - MAE: 736.55, RMSE: 1696.61, R²: 0.08, EVS: 0.08
Random Forest - MAE: 795.09, RMSE: 10724.85, R²: -35.88, EVS: -35.87
XGBoost - MAE: 780.93, RMSE: 26728.52, R²: -228.06, EVS: -228.05
K-Nearest Neighbors - MAE: 777.06, RMSE: 1765.70, R²: 0.00, EVS: 0.00


In [16]:
# Tabulate the metrics with one row per model and list the models from the
# lowest to the highest mean absolute error.
results_df = pd.DataFrame(results).transpose()
ranked_by_mae = results_df.sort_values(by="MAE")
print("\nModel Comparison:")
print(ranked_by_mae)


Model Comparison:
                            MAE          RMSE          R2  Explained Variance
Linear Regression    736.550535   1696.608437    0.077093            0.078653
K-Nearest Neighbors  777.059349   1765.703084    0.000391            0.000410
XGBoost              780.928390  26728.518843 -228.057571         -228.052049
Random Forest        795.091928  10724.851276  -35.878854          -35.869735
