In [1]:
import numpy as np 
import pandas as pd 
import mlflow 
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, OneHotEncoder, PowerTransformer

In [2]:
df = pd.read_csv("swiggy_cleaned.csv")

In [3]:
df.head()

Unnamed: 0,rider_id,age,ratings,restaurant_latitude,restaurant_longitude,delivery_latitude,delivery_longitude,order_date,weather,traffic,...,city_name,order_day,order_month,order_day_of_week,is_weekend,pickup_time_minutes,order_time_hour,order_time_of_day,distance,distance_type
0,INDORES13DEL02,37.0,4.9,22.745049,75.892471,22.765049,75.912471,2022-03-19,sunny,high,...,INDO,19,3,saturday,1,15.0,11.0,morning,3.025149,short
1,BANGRES18DEL02,34.0,4.5,12.913041,77.683237,13.043041,77.813237,2022-03-25,stormy,jam,...,BANG,25,3,friday,0,5.0,19.0,evening,20.18353,very_long
2,BANGRES19DEL01,23.0,4.4,12.914264,77.6784,12.924264,77.6884,2022-03-19,sandstorms,low,...,BANG,19,3,saturday,1,15.0,8.0,morning,1.552758,short
3,COIMBRES13DEL02,38.0,4.7,11.003669,76.976494,11.053669,77.026494,2022-04-05,sunny,medium,...,COIMB,5,4,tuesday,0,10.0,18.0,evening,7.790401,medium
4,CHENRES12DEL01,32.0,4.6,12.972793,80.249982,13.012793,80.289982,2022-03-26,cloudy,high,...,CHEN,26,3,saturday,1,15.0,13.0,afternoon,6.210138,medium


In [4]:
df.shape

(45502, 27)

In [5]:
df.isna().sum()

rider_id                   0
age                     1854
ratings                 1908
restaurant_latitude     3630
restaurant_longitude    3630
delivery_latitude       3630
delivery_longitude      3630
order_date                 0
weather                  525
traffic                  510
vehicle_condition          0
type_of_order              0
type_of_vehicle            0
multiple_deliveries      993
festival                 228
city_type               1198
time_taken                 0
city_name                  0
order_day                  0
order_month                0
order_day_of_week          0
is_weekend                 0
pickup_time_minutes     1640
order_time_hour         1640
order_time_of_day       2070
distance                3630
distance_type           3630
dtype: int64

In [6]:
df.duplicated().sum()

np.int64(0)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45502 entries, 0 to 45501
Data columns (total 27 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   rider_id              45502 non-null  object 
 1   age                   43648 non-null  float64
 2   ratings               43594 non-null  float64
 3   restaurant_latitude   41872 non-null  float64
 4   restaurant_longitude  41872 non-null  float64
 5   delivery_latitude     41872 non-null  float64
 6   delivery_longitude    41872 non-null  float64
 7   order_date            45502 non-null  object 
 8   weather               44977 non-null  object 
 9   traffic               44992 non-null  object 
 10  vehicle_condition     45502 non-null  int64  
 11  type_of_order         45502 non-null  object 
 12  type_of_vehicle       45502 non-null  object 
 13  multiple_deliveries   44509 non-null  float64
 14  festival              45274 non-null  object 
 15  city_type          

In [8]:
df.columns

Index(['rider_id', 'age', 'ratings', 'restaurant_latitude',
       'restaurant_longitude', 'delivery_latitude', 'delivery_longitude',
       'order_date', 'weather', 'traffic', 'vehicle_condition',
       'type_of_order', 'type_of_vehicle', 'multiple_deliveries', 'festival',
       'city_type', 'time_taken', 'city_name', 'order_day', 'order_month',
       'order_day_of_week', 'is_weekend', 'pickup_time_minutes',
       'order_time_hour', 'order_time_of_day', 'distance', 'distance_type'],
      dtype='object')

In [9]:
columns_to_drop = ['rider_id', 'restaurant_latitude', 'restaurant_longitude',
       'delivery_latitude','delivery_longitude', 'city_name','order_day',
       'order_month', 'order_day_of_week', 'order_time_hour', 'order_date'
]

df.drop(columns=columns_to_drop, inplace = True)

In [10]:
df.columns

Index(['age', 'ratings', 'weather', 'traffic', 'vehicle_condition',
       'type_of_order', 'type_of_vehicle', 'multiple_deliveries', 'festival',
       'city_type', 'time_taken', 'is_weekend', 'pickup_time_minutes',
       'order_time_of_day', 'distance', 'distance_type'],
      dtype='object')

In [11]:
missing_cols = (df
                .isna()
                .any(axis = 0)
                .loc[lambda x : x]
                .index
                )

missing_cols

Index(['age', 'ratings', 'weather', 'traffic', 'multiple_deliveries',
       'festival', 'city_type', 'pickup_time_minutes', 'order_time_of_day',
       'distance', 'distance_type'],
      dtype='object')

In [12]:
temp_df = df.copy().dropna()

In [13]:
temp_df.isna().sum()

age                    0
ratings                0
weather                0
traffic                0
vehicle_condition      0
type_of_order          0
type_of_vehicle        0
multiple_deliveries    0
festival               0
city_type              0
time_taken             0
is_weekend             0
pickup_time_minutes    0
order_time_of_day      0
distance               0
distance_type          0
dtype: int64

In [14]:
temp_df.shape

(37695, 16)

In [15]:
X = temp_df.drop(columns= 'time_taken')
y = temp_df['time_taken']

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
from sklearn import set_config
set_config(transform_output="pandas")

In [18]:
pt = PowerTransformer()

y_train_pt = pt.fit_transform(y_train.values.reshape(-1 , 1))
y_test_pt = pt.transform(y_test.values.reshape(-1 , 1))

In [19]:
X_train.shape , y_train_pt.shape

((30156, 15), (30156, 1))

In [20]:
X_test.shape , y_test_pt.shape

((7539, 15), (7539, 1))

In [21]:
len(X_train.columns)


15

In [22]:
num_cols = ['age', 'ratings', 'distance', 'pickup_time_minutes']

nominal_cat_cols = ['weather','type_of_order','type_of_vehicle','city_type','is_weekend','order_time_of_day', 'festival']

ordinal_cat_cols = ['traffic', 'distance_type']

In [23]:
len(num_cols + nominal_cat_cols + ordinal_cat_cols)

13

In [24]:
for col in ordinal_cat_cols:
    print(col , X_train[col].unique())

traffic ['jam' 'medium' 'high' 'low']
distance_type ['medium' 'short' 'long' 'very_long']


In [25]:
traffic_order = ['low', 'medium' , 'high', 'jam']

distance_type_order = ['short', 'medium' , 'long' , 'very_long']

In [26]:
preprocessor = ColumnTransformer(
    [
        ('scaling', MinMaxScaler() , num_cols),
        ('ordinal_encoding',OrdinalEncoder(
            categories=[traffic_order,distance_type_order],
            handle_unknown= "use_encoded_value",
            unknown_value=-1, 
            encoded_missing_value=-999
        ), ordinal_cat_cols),
        ('nominal_encoding',OneHotEncoder(
            handle_unknown="ignore",
            sparse_output=False
            ), nominal_cat_cols)
    ], remainder= 'passthrough',verbose_feature_names_out=False,force_int_remainder_cols = False
)

In [27]:
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [28]:
X_train_processed.shape , X_test_processed.shape

((30156, 32), (7539, 32))

In [29]:
best_lgbm_params = {'n_estimators':
200,
'max_depth':
15,
'learning_rate':
0.025811265327068796,
'num_leaves':
84}

best_rf_params = {'n_estimators':
400,
'max_depth':
14,
'min_samples_leaf':
5, 
'max_features':
0.7
}


In [30]:
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor

In [31]:
best_lgbm = LGBMRegressor(**best_lgbm_params)

best_rf = RandomForestRegressor(**best_rf_params)

In [32]:
import optuna 

In [33]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

In [34]:
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import StackingRegressor

In [35]:
import dagshub
dagshub.init(repo_owner='vinayak910', repo_name='swiggy-delivery-time-prediction', mlflow=True)

In [36]:
mlflow.set_tracking_uri("https://dagshub.com/vinayak910/swiggy-delivery-time-prediction.mlflow")
mlflow.set_experiment("Stacking Regressor Tuning")

<Experiment: artifact_location='mlflow-artifacts:/0ad1d7785d094282862a06a1ac89e914', creation_time=1752556108852, experiment_id='9', last_update_time=1752556108852, lifecycle_stage='active', name='Stacking Regressor Tuning', tags={}>

In [37]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
import mlflow

# First, split training set into sub-train and validation
X_train_sub, X_val, y_train_sub, y_val = train_test_split(
    X_train_processed, y_train, test_size=0.2, random_state=42
)

def objective(trial):
    with mlflow.start_run(nested=True):
        meta_model_name = trial.suggest_categorical("model", ['LR', 'KNN', 'DT'])

        if meta_model_name == 'LR':
            meta = LinearRegression()

        elif meta_model_name == 'KNN':
            n_neighbors = trial.suggest_int("n_neighbors", 1, 15)
            weights = trial.suggest_categorical("weights", ['uniform', 'distance'])            
            meta = KNeighborsRegressor(
                n_neighbors=n_neighbors,
                weights=weights,
                n_jobs=-1
            )

        elif meta_model_name == 'DT':
            max_depth_dt = trial.suggest_int("max_depth_dt", 1, 10)
            min_samples_split_dt = trial.suggest_int("min_samples_split_dt", 2, 10)
            min_samples_leaf_dt = trial.suggest_int("min_samples_leaf_dt", 1, 10)
            meta = DecisionTreeRegressor(
                max_depth=max_depth_dt,
                min_samples_split=min_samples_split_dt,
                min_samples_leaf=min_samples_leaf_dt,
                random_state=42
            )

        mlflow.log_param("meta_model", meta_model_name)

        stacking_reg = StackingRegressor(
            estimators=[
                ('rf', best_rf),
                ('lgbm', best_lgbm)
            ],
            final_estimator=meta,
            cv=5,
            n_jobs=-1
        )

        model = TransformedTargetRegressor(
            regressor=stacking_reg,
            transformer=pt
        )

        # Fit on sub-train
        model.fit(X_train_sub, y_train_sub)

        # Predict on train, val and test
        y_pred_train = model.predict(X_train_sub)
        y_pred_val = model.predict(X_val)
        y_pred_test = model.predict(X_test_processed)

        # Calculate all metrics
        train_mae = mean_absolute_error(y_train_sub, y_pred_train)
        val_mae = mean_absolute_error(y_val, y_pred_val)
        test_mae = mean_absolute_error(y_test, y_pred_test)

        train_r2 = r2_score(y_train_sub, y_pred_train)
        test_r2 = r2_score(y_test, y_pred_test)

        # Log everything
        mlflow.log_metric("train_mae", train_mae)
        mlflow.log_metric("val_mae", val_mae)
        mlflow.log_metric("test_mae", test_mae)
        mlflow.log_metric("train_r2", train_r2)
        mlflow.log_metric("test_r2", test_r2)

        # This is what Optuna will minimize
        return val_mae


In [38]:
study = optuna.create_study(direction="minimize")

with mlflow.start_run(run_name="best_model"):
    # optimize the objective function
    study.optimize(objective,n_trials=20,n_jobs=-1,show_progress_bar=True)

    # log the best parameters
    mlflow.log_params(study.best_params)

    # log the best score
    mlflow.log_metric("best_score",study.best_value)

[I 2025-07-15 11:01:35,601] A new study created in memory with name: no-name-09ef26c6-afd0-45c6-a673-3213efa119d8


  0%|          | 0/20 [00:00<?, ?it/s]

🏃 View run legendary-loon-452 at: https://dagshub.com/vinayak910/swiggy-delivery-time-prediction.mlflow/#/experiments/9/runs/539b1cc8066d410e84c34fbfdf75044c
🧪 View experiment at: https://dagshub.com/vinayak910/swiggy-delivery-time-prediction.mlflow/#/experiments/9
[I 2025-07-15 11:10:51,169] Trial 1 finished with value: 3.1106149164582533 and parameters: {'model': 'KNN', 'n_neighbors': 8, 'weights': 'uniform'}. Best is trial 1 with value: 3.1106149164582533.
🏃 View run intrigued-gnat-675 at: https://dagshub.com/vinayak910/swiggy-delivery-time-prediction.mlflow/#/experiments/9/runs/d8a4d0bc4cd541cb966339c8e7744229
🧪 View experiment at: https://dagshub.com/vinayak910/swiggy-delivery-time-prediction.mlflow/#/experiments/9
[I 2025-07-15 11:11:05,891] Trial 7 finished with value: 3.119860589339954 and parameters: {'model': 'KNN', 'n_neighbors': 9, 'weights': 'uniform'}. Best is trial 1 with value: 3.1106149164582533.
🏃 View run nervous-deer-575 at: https://dagshub.com/vinayak910/swiggy-del

In [39]:
# best parameter value

best_params = study.best_params

best_params

{'model': 'DT',
 'max_depth_dt': 6,
 'min_samples_split_dt': 4,
 'min_samples_leaf_dt': 5}

In [46]:
study.trials_dataframe().sort_values(by = 'value').head()[['value', 'params_model']]

Unnamed: 0,value,params_model
13,3.00132,DT
14,3.002233,LR
17,3.002413,LR
18,3.002613,LR
6,3.002727,LR


In [40]:
study.trials_dataframe()["params_model"].value_counts()

params_model
LR     7
DT     7
KNN    6
Name: count, dtype: int64

In [41]:
study.trials_dataframe().groupby(by="params_model")['value'].mean().sort_values()


params_model
LR     3.002671
KNN    3.283160
DT     3.475764
Name: value, dtype: float64

In [52]:
df = study.trials_dataframe()

lr_trials = df[df['params_model'] == 'LR']

best_lr_trial = lr_trials.loc[lr_trials['value'].idxmin()]

In [54]:
best_lr_trial 

number                                                 14
value                                            3.002233
datetime_start                 2025-07-15 11:12:01.124491
datetime_complete              2025-07-15 11:23:09.923285
duration                           0 days 00:11:08.798794
params_max_depth_dt                                   NaN
params_min_samples_leaf_dt                            NaN
params_min_samples_split_dt                           NaN
params_model                                           LR
params_n_neighbors                                    NaN
params_weights                                        NaN
state                                            COMPLETE
Name: 14, dtype: object

In [48]:
from sklearn.model_selection import cross_val_score, KFold

In [51]:


estimators = [
    ('rf', best_rf),
    ('lgb', best_lgbm)
]

meta_model = LinearRegression()

stack_model = StackingRegressor(
    estimators=estimators,
    final_estimator=meta_model,
    passthrough=False,
    cv=5
)


regressor = TransformedTargetRegressor(
    regressor=stack_model,
    transformer=pt
)

# ✅ K-Fold Cross Validation
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# ✅ Perform cross_val_score (e.g., scoring='neg_mean_absolute_error')
scores = cross_val_score(regressor, X_train_processed, y_train, cv=cv, scoring='neg_mean_absolute_error')

# ✅ Results
print("Cross-validated MAE scores:", -scores)
print("Average MAE:", -scores.mean())




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001663 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 366
[LightGBM] [Info] Number of data points in the train set: 24124, number of used features: 32
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001130 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 366
[LightGBM] [Info] Number of data points in the train set: 19299, number of used features: 32
[LightGBM] [Info] Start training from score 0.002972
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001010 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough,



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001164 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 366
[LightGBM] [Info] Number of data points in the train set: 24125, number of used features: 32
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001120 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 366
[LightGBM] [Info] Number of data points in the train set: 19300, number of used features: 32
[LightGBM] [Info] Start training from score 0.003526
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001014 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough,



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001166 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 366
[LightGBM] [Info] Number of data points in the train set: 24125, number of used features: 32
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000836 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 366
[LightGBM] [Info] Number of data points in the train set: 19300, number of used features: 32
[LightGBM] [Info] Start training from score -0.000451
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001115 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enoug



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002572 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 366
[LightGBM] [Info] Number of data points in the train set: 24125, number of used features: 32
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002529 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 366
[LightGBM] [Info] Number of data points in the train set: 19300, number of used features: 32
[LightGBM] [Info] Start training from score 0.001496
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001790 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough,



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001287 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 366
[LightGBM] [Info] Number of data points in the train set: 24125, number of used features: 32
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008317 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 366
[LightGBM] [Info] Number of data points in the train set: 19300, number of used features: 32
[LightGBM] [Info] Start training from score 0.000937
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004298 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins



In [55]:
regressor.fit(X_train_processed, y_train)
y_pred_train= regressor.predict(X_train_processed)

y_pred_test = regressor.predict(X_test_processed)



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002260 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 366
[LightGBM] [Info] Number of data points in the train set: 30156, number of used features: 32
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001440 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 366
[LightGBM] [Info] Number of data points in the train set: 24124, number of used features: 32
[LightGBM] [Info] Start training from score 0.001943
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001816 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough



In [56]:
import mlflow.sklearn


with mlflow.start_run(run_name= "best_model"):
    mlflow.log_param("meta_model", "LR")
    

    mlflow.log_metric("test_mae", mean_absolute_error(y_test , y_pred_test))
    mlflow.log_metric("train_mae", mean_absolute_error(y_train , y_pred_train))

    mlflow.log_metric("train_r2", r2_score(y_train , y_pred_train))
    mlflow.log_metric("test_r2", r2_score(y_test , y_pred_test))

    mlflow.log_metric("best_score",3.002233	)
    mlflow.sklearn.log_model(regressor,artifact_path="model")



🏃 View run best_model at: https://dagshub.com/vinayak910/swiggy-delivery-time-prediction.mlflow/#/experiments/9/runs/6d41968844e04fcdbc7493acb8db681d
🧪 View experiment at: https://dagshub.com/vinayak910/swiggy-delivery-time-prediction.mlflow/#/experiments/9
