# Food Delivery prediction

## Data cleaning

### Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
from sklearn.model_selection import train_test_split
import missingno as msno
from geopy.distance import geodesic
import plotly.express as px

In [None]:
# Load the raw training data and print a quick structural overview.
df = pd.read_csv("../data/train.csv")

print(f"Rows/columns dimension - {df.shape}")

# Each label is printed on its own line, followed by the object it describes.
overview = (
    ("Data types - ", df.dtypes),
    ("Columns - ", df.columns),
)
for label, value in overview:
    print(label)
    print(value)

# Last expression of the cell: preview the first rows.
df.head()


In [None]:
# The delivery person ID embeds the city code: keep the text before "RES".
person_id_parts = df["Delivery_person_ID"].str.split("RES", expand=True)
df["city_code"] = person_id_parts[0]

# Drop the unique identifier columns.
df.drop(["ID", "Delivery_person_ID"], axis=1, inplace=True)


In [None]:
# Map the raw column headers to short snake_case names.
# "multiple_deliveries" is not listed, so it keeps its original name.
column_renames = {
    "Delivery_person_Age": "driver_age",
    "Delivery_person_Ratings": "driver_rating",
    "Restaurant_latitude": "restaurant_lat",
    "Restaurant_longitude": "restaurant_long",
    "Delivery_location_latitude": "dest_location_lat",
    "Delivery_location_longitude": "dest_location_long",
    "Order_Date": "order_date",
    "Time_Orderd": "time_ordered",
    "Time_Order_picked": "time_order_picked",
    "Weatherconditions": "weather",
    "Road_traffic_density": "traffic_density",
    "Vehicle_condition": "vehicle_condition",
    "Type_of_order": "order_type",
    "Type_of_vehicle": "vehicle_type",
    "Festival": "festival",
    "City": "city",
    "Time_taken(min)": "time_taken_min",
}
df.rename(columns=column_renames, inplace=True)
df.head(3)


In [None]:
# Summary statistics for every column (numeric and non-numeric), one row per column.
df.describe(include="all").transpose()

- Missing values are present in some columns as the string `NaN`
- Additional features can be derived, for example:
    - food_prep_time = picked_time - order_time
    - distance = distance between restaurant_lat_long and dest_lat_long


In [None]:
# Print the number of distinct values in every column; for columns with
# fewer than 25 distinct values, also print the values themselves.
for column_name, series in df.items():
    n_distinct = series.nunique()
    print(f"# unique in {column_name} \t- {n_distinct}")
    if n_distinct >= 25:
        continue
    print("\t\t", series.unique())


### Check and convert missing to NaN

In [None]:
# Missing values arrive as text rather than real nulls. regex=True makes
# "NaN" a pattern instead of an exact value, so matching does not require the
# cell to equal "NaN" exactly (presumably to tolerate padding around the
# token; confirm against the raw file).
df.replace("NaN", np.nan, regex=True, inplace=True)

# Null count per column after the conversion.
display(df.isna().sum())

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
df.info()


In [None]:
# Cast the numeric columns to float64.
numeric_columns = ["driver_age", "driver_rating", "multiple_deliveries"]
for name in numeric_columns:
    df[name] = df[name].astype("float64")

# Order dates are given as day-month-year.
df["order_date"] = pd.to_datetime(df["order_date"], format="%d-%m-%Y")

In [None]:
# Missing values could either be imputed or the affected rows dropped.
# Very few features are missing and there is a good amount of data, so rows
# with any missing value are dropped.
df2 = df.dropna(axis="index", how="any")
df2.shape

In [None]:
# Persist the cleaned rows; the index is not written to the file.
cleaned_csv_path = "../data/train_cleaned.csv"
df2.to_csv(cleaned_csv_path, index=False)

In [None]:
# Preview the first five cleaned rows.
df2.head(n=5)

## Feature Engineering

In [None]:
# Reload the cleaned data. Only order_date is parsed as a date
# (year-month-day); the two time-of-day columns are left unparsed.
cleaned_read_options = {
    "parse_dates": ["order_date"],
    "date_format": "%Y-%m-%d",
}
df = pd.read_csv("../data/train_cleaned.csv", **cleaned_read_options)
df.shape

In [None]:
# Show the dtype of every column of the reloaded frame.
df.dtypes

In [None]:
# create additional features

# True when the order date falls on a Saturday or Sunday (day_of_week 5 or 6).
df["weekend"] = df["order_date"].dt.day_of_week > 4

# Bucket the day of the month: up to the 10th, up to the 20th, and the rest.
day_of_month = df["order_date"].dt.day
df["month_intervals"] = np.select(
    [day_of_month <= 10, day_of_month <= 20],
    ["start_month", "middle_month"],
    default="end_month",
)

df["year_quarter"] = df["order_date"].dt.quarter

# Pickup timestamp. When the pickup time-of-day sorts before the order
# time-of-day, the pickup is treated as happening on the next day, so one day
# is added to the order date.
picked_next_day = np.where(df["time_order_picked"] < df["time_ordered"], 1, 0)
df["time_order_picked_formatted"] = (
    df["order_date"]
    + pd.to_timedelta(picked_next_day, unit="D")
    + pd.to_timedelta(df["time_order_picked"])
)

# Order timestamp
df["time_ordered_formatted"] = df["order_date"] + pd.to_timedelta(df["time_ordered"])

# Minutes between ordering and pickup
df["order_prepare_time"] = (
    df["time_order_picked_formatted"] - df["time_ordered_formatted"]
).dt.total_seconds() / 60

# Fill null preparation times with the median. The result is assigned back to
# the column: calling fillna(..., inplace=True) on df[...] is chained
# assignment, which is deprecated and does not update the frame when pandas
# Copy-on-Write is enabled.
median_prepare_time = df["order_prepare_time"].median()
df["order_prepare_time"] = df["order_prepare_time"].fillna(median_prepare_time)

# remove redundant columns
df = df.drop(
    columns=[
        "time_ordered",
        "time_order_picked",
        "time_ordered_formatted",
        "time_order_picked_formatted",
        "order_date",
    ]
)

df.head()

In [None]:
R = 6371  # The earth's radius (in km)


def deg_to_rad(degrees):
    """Convert an angle from degrees to radians (scalar or array-like)."""
    return degrees * (np.pi / 180)


def distcalculate(lat1, lon1, lat2, lon2):
    """Great-circle distance in km between two points (haversine formula).

    All four arguments are in degrees and may be scalars or array-likes of
    matching shape; only NumPy ufuncs are used, so the computation is
    element-wise.

    Returns ``R * c`` where ``c`` is the central angle in radians.
    """
    d_lat = deg_to_rad(lat2 - lat1)
    d_lon = deg_to_rad(lon2 - lon1)
    # Haversine term: the latitude part is ADDED to the longitude part, and the
    # longitude part is scaled by both cosines.
    a = (
        np.sin(d_lat / 2) ** 2
        + np.cos(deg_to_rad(lat1)) * np.cos(deg_to_rad(lat2)) * np.sin(d_lon / 2) ** 2
    )
    # Clamp 1 - a at zero so rounding near antipodal points cannot produce the
    # square root of a tiny negative number.
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(np.maximum(0.0, 1 - a)))
    return R * c

# Distance between restaurant and delivery location, computed for all rows in
# a single vectorised call (distcalculate only uses NumPy ufuncs, so it works
# element-wise on whole columns). This replaces a per-row Python loop that
# also required the index labels to be exactly 0..n-1.
# The result is cast to int64, which drops the fractional part.
df["distance"] = distcalculate(
    df["restaurant_lat"],
    df["restaurant_long"],
    df["dest_location_lat"],
    df["dest_location_long"],
).astype("int64")
df["distance"].head()

In [None]:
# Keep only the second whitespace-separated token of the lower-cased weather text.
df["weather"] = df["weather"].str.lower().str.split(expand=True)[1]

# The target is also stored as text: take its second token and cast it to int.
df["time_taken_min"] = df["time_taken_min"].str.lower().str.split(expand=True)[1]
df["time_taken_min"] = df["time_taken_min"].str.strip().astype(int)

# Boolean festival flag. Surrounding whitespace is stripped before comparing,
# because an exact == "Yes" yields False for a padded value such as "Yes ".
# NOTE(review): the other text columns in this cell need split/strip, so
# padding looks likely here too - confirm against the raw values.
df["festival"] = df["festival"].str.strip() == "Yes"

In [None]:
# First rows shown transposed (one line per column).
df.head().transpose()

In [None]:
# Persist the feature-engineered frame; the index is not written to the file.
engineered_csv_path = "../data/train_feature_engineered.csv"
df.to_csv(engineered_csv_path, index=False)

In [None]:
# Delivery time against distance; marker size also encodes delivery time and
# an OLS trendline is overlaid.
distance_scatter_options = dict(
    x="distance",
    y="time_taken_min",
    size="time_taken_min",
    trendline="ols",
    title="Relationship Between Time Taken and Distance",
)
figure = px.scatter(df, **distance_scatter_options)
figure.show()

In [None]:
# Delivery time against driver age, coloured by distance; marker size also
# encodes delivery time and an OLS trendline is overlaid.
age_scatter_options = dict(
    x="driver_age",
    y="time_taken_min",
    size="time_taken_min",
    color="distance",
    trendline="ols",
    title="Relationship Between Delivery Partner Age and Time Taken",
)
figure = px.scatter(df, **age_scatter_options)
figure.show()

In [None]:
# Delivery time against driver rating, coloured by distance; marker size also
# encodes delivery time and an OLS trendline is overlaid.
rating_scatter_options = dict(
    x="driver_rating",
    y="time_taken_min",
    size="time_taken_min",
    color="distance",
    trendline="ols",
    title="Relationship Between Delivery Partner Ratings and Time Taken",
)
figure = px.scatter(df, **rating_scatter_options)
figure.show()

In [None]:
# vehicle_type vs time vs order type
fig = px.box(df,
             x="vehicle_type",
             y="time_taken_min",
             color="order_type",
             title = "Relationship Between Type of Vehicle and Type of Order")
fig.show()

## Data prep for Model Training

In [None]:
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [None]:
# The feature-engineered CSV is written with index=False, so it has no index
# column. Reading it with index_col=0 would turn the first feature column into
# the index and silently remove it from the model inputs, so the default
# RangeIndex is used instead.
df = pd.read_csv("../data/train_feature_engineered.csv")
df.head()

In [None]:
# Show the dtype of every column of the reloaded frame.
df.dtypes

In [None]:
# Integer-encode every object-dtype column.
# A single LabelEncoder is re-fitted for each column in turn, so after the
# loop it only holds the classes of the last column it saw.
categorical_columns = df.select_dtypes(include='object').columns
label_encoder = LabelEncoder()
for column_name in categorical_columns:
    df[column_name] = label_encoder.fit_transform(df[column_name])

In [None]:
# Preview the first five rows after encoding.
df.head(n=5)

In [None]:
# Column name -> positional index in df.
column_mapping = dict(zip(df.columns, range(len(df.columns))))
column_mapping

In [None]:
# Separate the target from the predictors.
target_column = 'time_taken_min'
y = df[target_column]
X = df.drop(columns=[target_column])

# Hold out 20% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)
for features_part, target_part in ((X_train, y_train), (X_test, y_test)):
    print(features_part.shape, target_part.shape)

In [None]:
# Standardise the features. The scaler is fitted on the training split only
# and then applied to both splits, so test-set statistics never influence it.
# Every column of X_train goes through the scaler.
scaler = StandardScaler()
scaler.fit(X_train)

X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

## Model Building
- Try different models with hyperparameter tuning using grid/random search CV

In [None]:
# Candidate regressors. The stochastic ones get a fixed random_state (the same
# seed as the train/test split) so the cross-validated scores are reproducible
# across reruns.
models = [
    LinearRegression(),
    DecisionTreeRegressor(random_state=42),
    RandomForestRegressor(random_state=42),
    xgb.XGBRegressor(random_state=42),      # XGBoost gradient-boosted trees
    # Optional further candidates (not enabled): LGBMRegressor, CatBoostRegressor
]

# One hyper-parameter grid per model, in the same order as `models`.
param_grid = [
    {},                                                             # LinearRegression: nothing to tune
    {'max_depth': [3, 5, 7]},                                       # decision tree
    # Small forests keep training time low; try e.g. 100/200/300 trees for a
    # longer, more thorough search.
    {'n_estimators': [3, 5, 7], 'max_features': ['sqrt', 'log2']},  # random forest
    {'n_estimators': [20, 25, 30], 'max_depth': [5, 7, 9]},         # xgb params
]

# 5-fold grid search per model, scored by R2.
for model, grid in zip(models, param_grid):
    grid_search = GridSearchCV(model, grid, cv=5, scoring='r2')
    grid_search.fit(X_train, y_train)

    print(f"{model.__class__.__name__}:")
    print("Best parameters:", grid_search.best_params_)
    print("Best R2 score:", grid_search.best_score_)
    print()

In [None]:
# Retrain the best regressor from scratch: an XGBoost model with fixed
# hyper-parameters, fitted on the full training split.
best_xgb_params = {"n_estimators": 20, "max_depth": 7}
xgb_model = xgb.XGBRegressor(**best_xgb_params)
xgb_model.fit(X_train, y_train)

In [None]:
# model evaluation using adjusted r2
# Predict on the held-out test features with the retrained XGBoost model.
y_pred = xgb_model.predict(X_test)

def adjusted_r_squared(r2, n, k):
    """Return R2 adjusted for the number of predictors.

    r2 is the plain R2 score, n the number of samples and k the number of
    predictors. Computes 1 - (1 - r2) * (n - 1) / (n - k - 1).
    """
    unexplained = (1 - r2) * (n - 1)
    return 1 - unexplained / (n - k - 1)

# Score the test-set predictions.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
# n = number of test rows, k = number of feature columns.
adjusted_r2 = adjusted_r_squared(r2, len(y_test), X_test.shape[1])

# Print every metric rounded to two decimals.
metric_report = [
    ("Mean Absolute Error (MAE):", mae),
    ("Mean Squared Error (MSE):", mse),
    ("Root Mean Squared Error (RMSE):", rmse),
    ("R-squared (R2) Score:", r2),
    ("Adjusted R-squared Score:", adjusted_r2),
]
for label, value in metric_report:
    print(label, round(value, 2))

In [None]:
# Feature selection - forward selection with an XGBoost regressor, scored by
# R2 under 2-fold cross-validation; k_features="best" lets the selector pick
# the subset size.
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

forward_estimator = xgb.XGBRegressor(n_estimators=20, max_depth=7)
sfs = SFS(
    forward_estimator,
    k_features="best",
    forward=True,
    floating=False,
    verbose=2,
    scoring='r2',
    cv=2,
)
sfs = sfs.fit(X_train, y_train)

# The reported feature names are converted to integers.
# NOTE(review): presumably they are positional indices because X_train is an
# unnamed array at this point - confirm; compare with column_mapping.
selected_feat_ = [int(feature_name) for feature_name in sfs.k_feature_names_]

selected_feat_

In [None]:
# Display the column name -> position mapping again, for reading selected_feat_.
column_mapping

In [None]:
# Feature subset used for the reduced model.
selected_columns = [
    "driver_rating", "weather", "traffic_density", "vehicle_condition",
    "vehicle_type", "multiple_deliveries", "city", "order_prepare_time",
]

X_selected = df[selected_columns]     # Features (selected subset only)
y = df['time_taken_min']              # Target variable

# Split the SELECTED features into train and test sets. Splitting the stale
# full-feature X here would train on every column and make the comparison
# with the full model meaningless.
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.2, random_state=42
)

# Standardise: fit on the training split only, then apply to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Fit XGBRegressor model with selected features
model = xgb.XGBRegressor(n_estimators=20, max_depth=7)
model.fit(X_train, y_train)

# Predict with the reduced model to compare against the full-feature model.
xgb_pred = model.predict(X_test)


def adjusted_r_squared(r2, n, k):
    """Return R2 adjusted for the number of predictors.

    r2 is the plain R2 score, n the number of samples and k the number of
    predictors. Computes 1 - (1 - r2) * (n - 1) / (n - k - 1).
    """
    scaled_residual = (1 - r2) * (n - 1)
    return 1 - scaled_residual / (n - k - 1)

# Score the reduced model's test-set predictions.
mae = mean_absolute_error(y_test, xgb_pred)
mse = mean_squared_error(y_test, xgb_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, xgb_pred)
# n = number of test rows, k = number of feature columns.
adjusted_r2 = adjusted_r_squared(r2, len(y_test), X_test.shape[1])

# Print every metric rounded to two decimals.
metric_report = [
    ("Mean Absolute Error (MAE):", mae),
    ("Mean Squared Error (MSE):", mse),
    ("Root Mean Squared Error (RMSE):", rmse),
    ("R-squared (R2) Score:", r2),
    ("Adjusted R-squared Score:", adjusted_r2),
]
for label, value in metric_report:
    print(label, round(value, 2))

In [None]:
# Feature selection - backward elimination via recursive feature elimination
from sklearn.feature_selection import RFE

# Features and target
target_name = 'time_taken_min'
y = df[target_name]
X = df.drop(columns=[target_name])

# Train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise: fit on the training split only, keep the scaled copies under
# separate names so X_train / X_test stay untouched.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Base estimator shared by RFE and by the final fit in every iteration
xgb_model = xgb.XGBRegressor(n_estimators=20, max_depth=9)

# Subset sizes to try
num_features_list = [7, 11, 15, 18, 20]

# Column labels, used to translate selected positions back into names
feature_names = X.columns

for num_features in num_features_list:

    # Run the elimination down to the requested number of features
    rfe = RFE(estimator=xgb_model, n_features_to_select=num_features)
    rfe.fit(X_train_scaled, y_train)

    # Features with rank 1 are the selected ones
    feature_ranking = rfe.ranking_
    selected_features_indices = np.flatnonzero(feature_ranking == 1)
    selected_feature_names = feature_names[selected_features_indices]

    # Bar chart of the rank of every feature (x ticks are 1-based positions)
    bar_positions = range(len(feature_ranking))
    plt.figure(figsize=(5, 5))
    plt.title(f"RFE - Feature Ranking for {num_features} Features")
    plt.xlabel("Feature Index")
    plt.ylabel("Ranking")
    plt.xticks(bar_positions, np.arange(1, len(feature_ranking) + 1))
    plt.bar(bar_positions, feature_ranking)
    plt.show()

    print(f"Selected {num_features} Features:", selected_feature_names)

    # Refit on the selected columns only and score on the matching test columns.
    # The printed value is whatever the regressor's score() returns
    # (NOTE(review): R2 for scikit-learn style regressors, not a
    # classification accuracy - confirm).
    train_subset = X_train_scaled[:, selected_features_indices]
    test_subset = X_test_scaled[:, selected_features_indices]
    xgb_model.fit(train_subset, y_train)
    accuracy = xgb_model.score(test_subset, y_test)
    print("Accuracy on the Test Set:", accuracy)

Selecting 7 features (by backward elimination) yields the same score as the full XGBoost model (`~0.72`).