In [4]:
import numpy as np 
import pandas as pd 
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OneHotEncoder,StandardScaler,LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

In [5]:
import mlflow

In [6]:
import dagshub
dagshub.init(repo_owner='vinayak910', repo_name='swiggy-delivery-time-prediction', mlflow=True)

In [7]:
mlflow.set_tracking_uri("https://dagshub.com/vinayak910/swiggy-delivery-time-prediction.mlflow")

In [8]:
mlflow.set_experiment("RF HP Tuning")

<Experiment: artifact_location='mlflow-artifacts:/a84b417ae75c41958a57d2c96b85ffe8', creation_time=1752491881633, experiment_id='8', last_update_time=1752491881633, lifecycle_stage='active', name='RF HP Tuning', tags={}>

In [9]:
from sklearn import set_config 

set_config(transform_output="pandas")

In [10]:
df = pd.read_csv("swiggy_cleaned.csv")

In [11]:
# drop columns not required for model input

columns_to_drop =  ['rider_id',
                    'restaurant_latitude',
                    'restaurant_longitude',
                    'delivery_latitude',
                    'delivery_longitude',
                    'order_date',
                    "order_time_hour",
                    "order_day",
                    "city_name",
                    "order_day_of_week",
                    "order_month"]

df.drop(columns=columns_to_drop, inplace=True)

df

Unnamed: 0,age,ratings,weather,traffic,vehicle_condition,type_of_order,type_of_vehicle,multiple_deliveries,festival,city_type,time_taken,is_weekend,pickup_time_minutes,order_time_of_day,distance,distance_type
0,37.0,4.9,sunny,high,2,snack,motorcycle,0.0,no,urban,24,1,15.0,morning,3.025149,short
1,34.0,4.5,stormy,jam,2,snack,scooter,1.0,no,metropolitian,33,0,5.0,evening,20.183530,very_long
2,23.0,4.4,sandstorms,low,0,drinks,motorcycle,1.0,no,urban,26,1,15.0,morning,1.552758,short
3,38.0,4.7,sunny,medium,0,buffet,motorcycle,1.0,no,metropolitian,21,0,10.0,evening,7.790401,medium
4,32.0,4.6,cloudy,high,1,snack,scooter,1.0,no,metropolitian,30,1,15.0,afternoon,6.210138,medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45497,30.0,4.8,windy,high,1,meal,motorcycle,0.0,no,metropolitian,32,0,10.0,morning,1.489846,short
45498,21.0,4.6,windy,jam,0,buffet,motorcycle,1.0,no,metropolitian,36,0,15.0,evening,,
45499,30.0,4.9,cloudy,low,1,drinks,scooter,0.0,no,metropolitian,16,0,15.0,night,4.657195,short
45500,20.0,4.7,cloudy,high,0,snack,motorcycle,1.0,no,metropolitian,26,0,5.0,afternoon,6.232393,medium


In [12]:
temp_df = df.copy().dropna()

In [13]:
# split into X and y

X = temp_df.drop(columns='time_taken')
y = temp_df['time_taken']

X

Unnamed: 0,age,ratings,weather,traffic,vehicle_condition,type_of_order,type_of_vehicle,multiple_deliveries,festival,city_type,is_weekend,pickup_time_minutes,order_time_of_day,distance,distance_type
0,37.0,4.9,sunny,high,2,snack,motorcycle,0.0,no,urban,1,15.0,morning,3.025149,short
1,34.0,4.5,stormy,jam,2,snack,scooter,1.0,no,metropolitian,0,5.0,evening,20.183530,very_long
2,23.0,4.4,sandstorms,low,0,drinks,motorcycle,1.0,no,urban,1,15.0,morning,1.552758,short
3,38.0,4.7,sunny,medium,0,buffet,motorcycle,1.0,no,metropolitian,0,10.0,evening,7.790401,medium
4,32.0,4.6,cloudy,high,1,snack,scooter,1.0,no,metropolitian,1,15.0,afternoon,6.210138,medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45496,35.0,4.2,windy,jam,2,drinks,motorcycle,1.0,no,metropolitian,0,10.0,night,16.600272,very_long
45497,30.0,4.8,windy,high,1,meal,motorcycle,0.0,no,metropolitian,0,10.0,morning,1.489846,short
45499,30.0,4.9,cloudy,low,1,drinks,scooter,0.0,no,metropolitian,0,15.0,night,4.657195,short
45500,20.0,4.7,cloudy,high,0,snack,motorcycle,1.0,no,metropolitian,0,5.0,afternoon,6.232393,medium


In [14]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [15]:
from sklearn.preprocessing import PowerTransformer, OrdinalEncoder, OneHotEncoder

In [16]:
# transform target column

pt = PowerTransformer()

y_train_pt = pt.fit_transform(y_train.values.reshape(-1,1))
y_test_pt = pt.transform(y_test.values.reshape(-1,1))

In [17]:
num_cols = ["age","ratings","pickup_time_minutes","distance"]

nominal_cat_cols = ['weather',
                    'type_of_order',
                    'type_of_vehicle',
                    "festival",
                    "city_type",
                    "is_weekend",
                    "order_time_of_day"]

ordinal_cat_cols = ["traffic","distance_type"]

In [18]:
# generate order for ordinal encoding

traffic_order = ["low","medium","high","jam"]

distance_type_order = ["short","medium","long","very_long"]

In [19]:
# build a preprocessor

preprocessor = ColumnTransformer(transformers=[
    ("scale", MinMaxScaler(), num_cols),
    ("nominal_encode", OneHotEncoder(drop="first",handle_unknown="ignore",
                                     sparse_output=False), nominal_cat_cols),
    ("ordinal_encode", OrdinalEncoder(categories=[traffic_order,distance_type_order],
                                      encoded_missing_value=-999,
                                      handle_unknown="use_encoded_value",
                                      unknown_value=-1), ordinal_cat_cols)
],remainder="passthrough",n_jobs=-1,force_int_remainder_cols=False,verbose_feature_names_out=False)


preprocessor

In [20]:
# do data preprocessing

X_train_processed = preprocessor.fit_transform(X_train)

X_test_processed = preprocessor.transform(X_test)

In [21]:
from sklearn.ensemble import RandomForestRegressor
import optuna
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.compose import TransformedTargetRegressor

In [19]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error, r2_score

def objective(trial):
    with mlflow.start_run(nested=True):
        # Hyperparameter search space
        params = {
    "n_estimators": trial.suggest_int("n_estimators", 100, 500, step=50),
    "max_depth": trial.suggest_int("max_depth", 5, 20),
    "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
    "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", 0.5, 0.7]),
    "n_jobs": -1,
    "random_state": 42
    }


        mlflow.log_params(params)

        base_model = RandomForestRegressor(**params)
        model = TransformedTargetRegressor(regressor=base_model, transformer=pt)

        # Fit on training data

        # Cross-validation
        cv_score = cross_val_score(
            model,
            X_train_processed,
            y_train,
            cv=5,
            scoring="neg_mean_absolute_error",
            n_jobs=-1
        )
        mean_cv_mae = -cv_score.mean()
        mlflow.log_metric("cross_val_error", mean_cv_mae)
        
        model.fit(X_train_processed, y_train)
        # Predictions
        y_pred_train = model.predict(X_train_processed)
        y_pred_test = model.predict(X_test_processed)

        # Evaluation
        train_mae = mean_absolute_error(y_train, y_pred_train)
        test_mae = mean_absolute_error(y_test, y_pred_test)
        train_r2 = r2_score(y_train, y_pred_train)
        test_r2 = r2_score(y_test, y_pred_test)

        mlflow.log_metric("train_mae", train_mae)
        mlflow.log_metric("test_mae", test_mae)
        mlflow.log_metric("train_r2", train_r2)
        mlflow.log_metric("test_r2", test_r2)

        return mean_cv_mae


In [20]:
from lightgbm import LGBMRegressor

In [21]:
study = optuna.create_study(direction="minimize")

with mlflow.start_run(run_name="best_model"):
    study.optimize(objective, n_trials=50, n_jobs=-1, show_progress_bar=True)

    mlflow.log_params(study.best_params)
    mlflow.log_metric("best_score", study.best_value)

    # Train final model with best parameters
    best_lgbm = LGBMRegressor(**study.best_params)
    final_model = TransformedTargetRegressor(regressor=best_lgbm, transformer=pt)
    final_model.fit(X_train_processed, y_train)

    # Predict
    y_pred_train = final_model.predict(X_train_processed)
    y_pred_test = final_model.predict(X_test_processed)

    # Evaluate
    train_mae = mean_absolute_error(y_train, y_pred_train)
    test_mae = mean_absolute_error(y_test, y_pred_test)
    train_r2 = r2_score(y_train, y_pred_train)
    test_r2 = r2_score(y_test, y_pred_test)

    # CV Score
    cv_score = cross_val_score(
        final_model,
        X_train_processed,
        y_train,
        cv=5,
        scoring="neg_mean_absolute_error",
        n_jobs=-1
    )

    mlflow.log_metric("training_error", train_mae)
    mlflow.log_metric("test_error", test_mae)
    mlflow.log_metric("training_r2", train_r2)
    mlflow.log_metric("test_r2", test_r2)
    mlflow.log_metric("cross_val", -cv_score.mean())

    mlflow.sklearn.log_model(final_model, artifact_path="model")

[I 2025-07-14 17:01:13,018] A new study created in memory with name: no-name-ee5843c0-0687-4c11-b50b-d2563bc21faf


  0%|          | 0/50 [00:00<?, ?it/s]

🏃 View run efficient-shoat-742 at: https://dagshub.com/vinayak910/swiggy-delivery-time-prediction.mlflow/#/experiments/8/runs/583c8f9f5d2f40d4b10e6062faf3407b
🧪 View experiment at: https://dagshub.com/vinayak910/swiggy-delivery-time-prediction.mlflow/#/experiments/8
[I 2025-07-14 17:01:57,273] Trial 2 finished with value: 4.381731133696319 and parameters: {'n_estimators': 250, 'max_depth': 6, 'min_samples_leaf': 4, 'max_features': 'log2'}. Best is trial 2 with value: 4.381731133696319.
🏃 View run nebulous-shoat-674 at: https://dagshub.com/vinayak910/swiggy-delivery-time-prediction.mlflow/#/experiments/8/runs/08783fea76c2441cb211f8c9dc8a0360
🧪 View experiment at: https://dagshub.com/vinayak910/swiggy-delivery-time-prediction.mlflow/#/experiments/8
[I 2025-07-14 17:03:28,935] Trial 7 finished with value: 3.1818145056695704 and parameters: {'n_estimators': 450, 'max_depth': 19, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 7 with value: 3.1818145056695704.
🏃 View run serio



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001914 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 352
[LightGBM] [Info] Number of data points in the train set: 30156, number of used features: 25
[LightGBM] [Info] Start training from score -0.000000




🏃 View run best_model at: https://dagshub.com/vinayak910/swiggy-delivery-time-prediction.mlflow/#/experiments/8/runs/20d43a8ac2354ecb8fada780151b4367
🧪 View experiment at: https://dagshub.com/vinayak910/swiggy-delivery-time-prediction.mlflow/#/experiments/8


In [2]:
params = {'n_estimators':
400,
'max_depth' :
14,
'min_samples_leaf':
5,
'max_features':
0.7}