# Logistic Regression Model for Delivery Prediction

In [2]:
# Logistic Regression Model for Delivery Prediction

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Load dataset
df = pd.read_csv("dataset.csv")  # Replace with your actual file path

# Parse the order timestamps and compute the delivery duration in seconds
df['created_at'] = pd.to_datetime(df['created_at'])
df['actual_delivery_time'] = pd.to_datetime(df['actual_delivery_time'])
df['delivery_duration'] = (df['actual_delivery_time'] - df['created_at']).dt.total_seconds()

# Drop rows with a missing or invalid (non-positive) duration BEFORE deriving
# the target. A NaN duration compares False against any threshold, so labelling
# first would silently mark unknown deliveries as on-time (0). `NaN > 0` is
# also False, so this single mask removes missing durations as well.
# .copy() makes the filtered frame independent, so the column assignments
# below never write into a view of the unfiltered data.
df = df.loc[df['delivery_duration'] > 0].copy()

# Create binary target: 1 if delivery is late (> 1 hour), else 0
LATE_THRESHOLD_SECONDS = 3600
df['is_late'] = (df['delivery_duration'] > LATE_THRESHOLD_SECONDS).astype(int)

# Extract datetime features from the order creation time
df['hour'] = df['created_at'].dt.hour
df['day_of_week'] = df['created_at'].dt.dayofweek  # Monday=0 ... Sunday=6
df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)  # Saturday/Sunday

# Feature columns, split into the numeric and the categorical group
numeric_features = [
    'order_protocol',
    'total_items',
    'subtotal',
    'num_distinct_items',
    'min_item_price',
    'max_item_price',
    'total_onshift_partners',
    'total_busy_partners',
    'total_outstanding_orders',
    'hour',
    'day_of_week',
    'is_weekend',
]
categorical_features = [
    'market_id',
    'store_primary_category',
]

# Feature matrix (numeric columns first, then categorical) and binary target
X = df[[*numeric_features, *categorical_features]]
y = df['is_late']

# Numeric columns: fill gaps with the column mean, then standardize
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; handle_unknown='ignore' avoids an error on categories not seen in fit
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Full model: preprocessing followed by logistic regression
# (max_iter=1000 raises the solver's iteration cap)
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000)),
])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the preprocessing + classifier pipeline on the training rows only
clf.fit(X_train, y_train)

# Score the held-out rows and print per-class precision / recall / F1
y_pred = clf.predict(X_test)
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)


              precision    recall  f1-score   support

           0       0.82      0.98      0.89     31675
           1       0.60      0.14      0.23      7810

    accuracy                           0.81     39485
   macro avg       0.71      0.56      0.56     39485
weighted avg       0.78      0.81      0.76     39485

