In [1]:
# Print the version of the `python` executable that the notebook's shell resolves to.
!python -V

Python 3.9.24


In [None]:
# The cells below talk to an MLflow tracking server at http://localhost:5000.
# Start it from a terminal before running them. The command is shell syntax,
# not Python, and it blocks until the server is stopped, so it must not be
# executed inside this kernel:
#
#   mlflow server \
#       --backend-store-uri sqlite:///mlflow.db \
#       --default-artifact-root ./mlruns \
#       --host 0.0.0.0 \
#       --port 5000

 

In [2]:
import pandas as pd

In [3]:
import pickle

In [4]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import root_mean_squared_error

In [6]:
import mlflow

# Point the MLflow client at the local tracking server and select the
# experiment that the NYC taxi runs are recorded under.
TRACKING_URI = "http://localhost:5000"
EXPERIMENT_NAME = "nyc-taxi-experiment"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='/workspaces/mlops-zoomcamp/03-Orchestration/artifacts/1', creation_time=1761664682548, experiment_id='1', last_update_time=1761664682548, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [7]:
def read_dataframe(filename):
    """Load a trip parquet file and derive the columns used for modelling.

    Parameters
    ----------
    filename : str or path-like
        Anything ``pd.read_parquet`` accepts (local path or URL). The file
        must contain ``lpep_pickup_datetime``, ``lpep_dropoff_datetime``,
        ``PULocationID`` and ``DOLocationID``.

    Returns
    -------
    pandas.DataFrame
        The rows whose trip duration lies in [1, 60] minutes, with three
        changes: a ``duration`` column (minutes), the two location ID columns
        cast to ``str``, and a ``PU_DO`` column holding
        ``"<PULocationID>_<DOLocationID>"``.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes conversion (no per-row Python lambda).
    df['duration'] = (
        df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    ).dt.total_seconds() / 60

    # Take an explicit copy of the filtered rows so the column assignments
    # below operate on an independent frame instead of a slice of the
    # original (avoids pandas' SettingWithCopyWarning).
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']
    return df

In [8]:
# January 2021 is the training month, February 2021 the validation month.
train_url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet'
val_url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-02.parquet'

df_train = read_dataframe(train_url)
df_val = read_dataframe(val_url)

In [9]:
# Feature configuration: the combined pickup/drop-off pair is the only
# categorical input and the trip distance the only numerical one.
categorical = ['PU_DO']
numerical = ['trip_distance']
feature_columns = categorical + numerical

# The vectorizer is fitted on the training frame only and then applied
# unchanged to the validation frame.
dv = DictVectorizer()

train_dicts = df_train[feature_columns].to_dict(orient='records')
val_dicts = df_val[feature_columns].to_dict(orient='records')

X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [10]:
# Regression target: the trip duration column (minutes) of each frame.
target = 'duration'
y_train = df_train.loc[:, target].values
y_val = df_val.loc[:, target].values

In [11]:
import xgboost as xgb

In [18]:
from pathlib import Path
# Local directory that later cells write files into (paths under "models/")
# before logging them to MLflow; exist_ok makes re-running this cell harmless.
models_folder = Path('models')
models_folder.mkdir(exist_ok=True)

In [12]:
with mlflow.start_run():
    # Train an XGBoost regressor on the NYC taxi features inside one MLflow
    # run, logging the hyper-parameters, the validation RMSE, the fitted
    # DictVectorizer and the booster itself.

    train = xgb.DMatrix(X_train, label=y_train)
    valid = xgb.DMatrix(X_val, label=y_val)

    best_params = {
        'learning_rate': 0.09585355369315604,
        'max_depth': 30,
        'min_child_weight': 1.060597050922164,
        # 'reg:linear' is the deprecated name for squared-error regression
        # (newer XGBoost releases may warn about or reject it). Use the current
        # name, which is also what the Delhivery training cell uses.
        'objective': 'reg:squarederror',
        'reg_alpha': 0.018060244040060163,
        'reg_lambda': 0.011658731377413597,
        'seed': 42
    }

    mlflow.log_params(best_params)

    # NOTE(review): early_stopping_rounds (50) is larger than num_boost_round
    # (30), so early stopping can never trigger and all 30 rounds always run.
    # Raise num_boost_round or lower the patience if early stopping is wanted.
    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=30,
        evals=[(valid, 'validation')],
        early_stopping_rounds=50
    )

    y_pred = booster.predict(valid)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    # Persist the fitted vectorizer next to the model so inference can
    # reproduce the exact feature encoding.
    with open("models/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)
    mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

    mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")



[0]	validation-rmse:11.44482
[1]	validation-rmse:10.77202
[2]	validation-rmse:10.18363
[3]	validation-rmse:9.67396
[4]	validation-rmse:9.23166
[5]	validation-rmse:8.84808
[6]	validation-rmse:8.51883
[7]	validation-rmse:8.23597
[8]	validation-rmse:7.99320
[9]	validation-rmse:7.78709
[10]	validation-rmse:7.61022
[11]	validation-rmse:7.45952
[12]	validation-rmse:7.33049
[13]	validation-rmse:7.22098
[14]	validation-rmse:7.12713
[15]	validation-rmse:7.04752
[16]	validation-rmse:6.98005
[17]	validation-rmse:6.92232
[18]	validation-rmse:6.87112
[19]	validation-rmse:6.82740
[20]	validation-rmse:6.78995
[21]	validation-rmse:6.75792
[22]	validation-rmse:6.72994
[23]	validation-rmse:6.70547
[24]	validation-rmse:6.68390
[25]	validation-rmse:6.66421
[26]	validation-rmse:6.64806
[27]	validation-rmse:6.63280
[28]	validation-rmse:6.61924
[29]	validation-rmse:6.60773




🏃 View run persistent-yak-513 at: http://localhost:5000/#/experiments/1/runs/7b6d6b04648d4c268941d92c1aac31e8
🧪 View experiment at: http://localhost:5000/#/experiments/1


In [13]:
import pandas as pd

# Side-by-side comparison of the validation targets and the model predictions.
results = pd.DataFrame({
    'actual_duration': y_val,
    'predicted_duration': y_pred
})
print(results.head(10))
results['difference'] = results['actual_duration'] - results['predicted_duration']
print("Average difference:", results['difference'].mean())

results.to_csv("models/predictions_vs_actuals.csv", index=False)

# The training run was closed when its `with` block exited. Calling
# mlflow.log_artifact with no active run makes MLflow start a brand-new run
# and leave it open: the CSV lands on the wrong run and the next
# mlflow.start_run() fails because a run is still active. Instead, re-open
# the most recent run, attach the CSV to it, and let the context manager
# close it again.
if mlflow.active_run() is not None:
    mlflow.log_artifact("models/predictions_vs_actuals.csv", artifact_path="predictions")
else:
    previous_run = mlflow.last_active_run()
    # With no previous run in this session, run_id=None starts a fresh run
    # that is still closed properly on exit.
    resume_id = previous_run.info.run_id if previous_run is not None else None
    with mlflow.start_run(run_id=resume_id):
        mlflow.log_artifact("models/predictions_vs_actuals.csv", artifact_path="predictions")


   actual_duration  predicted_duration
0        17.916667           16.125685
1         6.500000            7.607263
2        15.250000           19.447042
3        18.233333           24.010212
4         8.966667           10.241908
5         7.850000           17.095030
6         9.700000           14.144274
7        11.283333            9.423327
8         8.733333            9.298021
9         1.716667           19.088913
Average difference: 1.2705986995605845


In [None]:
import pandas as pd
import pickle
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
import mlflow
from pathlib import Path


import os

# Location of the Delhivery CSV. The default is the original workspace path;
# set the DELHIVERY_DATA_PATH environment variable to run the notebook from a
# different checkout or machine without editing this cell.
data_path = os.environ.get(
    "DELHIVERY_DATA_PATH",
    "/workspaces/mlops-zoomcamp/03-Orchestration/delhivery_data.csv",
)
df = pd.read_csv(data_path)
print(df.head())

# Drop identifier/timestamp columns that are not used as features;
# errors='ignore' tolerates files where some of them are absent.
df = df.drop(columns=['data', 'trip_creation_time', 'route_schedule_uuid', 'trip_uuid', 'cutoff_timestamp'], errors='ignore')

target = 'actual_time'

categorical = [
    'source_name',
    'destination_name',
    'route_type',
    'is_cutoff'
]

# NOTE(review): `factor` and `segment_factor` may be derived from the actual
# (target-side) times. If so they leak the target and the validation RMSE is
# over-optimistic -- confirm against the dataset documentation.
numerical = [
    'actual_distance_to_destination',
    'osrm_time',
    'osrm_distance',
    'factor',
    'segment_osrm_time',
    'segment_osrm_distance',
    'segment_factor'
]

# Rows without a target cannot be used for training or evaluation.
df = df.dropna(subset=[target])

# NOTE(review): the printed head shows several consecutive rows sharing one
# trip_uuid, so this row-level random split may put rows of the same trip in
# both sets -- consider a grouped split; TODO confirm.
df_train, df_val = train_test_split(df, test_size=0.2, random_state=42)

# Encode the feature columns with a DictVectorizer that is fitted on the
# training split only and then applied unchanged to the validation split.
feature_columns = categorical + numerical
dv = DictVectorizer()

train_dicts = df_train[feature_columns].to_dict(orient='records')
val_dicts = df_val[feature_columns].to_dict(orient='records')

X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

# Target arrays for the two splits.
y_train = df_train.loc[:, target].values
y_val = df_val.loc[:, target].values

# Scratch directory for files that are written locally before being logged.
models_folder = Path('models')
models_folder.mkdir(exist_ok=True)

# Record the Delhivery runs under their own experiment on the local server.
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("delhivery-duration-experiment")

with mlflow.start_run():
    # Everything in this block is recorded against a single MLflow run:
    # hyper-parameters, validation RMSE, the fitted vectorizer, the booster
    # and a CSV of validation predictions.
    train = xgb.DMatrix(X_train, label=y_train)
    valid = xgb.DMatrix(X_val, label=y_val)

    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'learning_rate': 0.1,
        'max_depth': 6,
        'min_child_weight': 1,
        'seed': 42,
    }
    mlflow.log_params(params)

    # Up to 100 boosting rounds, stopping once the validation metric has not
    # improved for 10 consecutive rounds.
    booster = xgb.train(
        params=params,
        dtrain=train,
        num_boost_round=100,
        evals=[(valid, 'validation')],
        early_stopping_rounds=10,
    )

    y_pred = booster.predict(valid)
    rmse = root_mean_squared_error(y_val, y_pred)
    print(f"RMSE: {rmse:.3f}")
    mlflow.log_metric("rmse", rmse)

    # Fitted vectorizer, stored so the feature encoding can be reproduced.
    preprocessor_path = "models/preprocessor_delhivery.b"
    with open(preprocessor_path, "wb") as f_out:
        pickle.dump(dv, f_out)
    mlflow.log_artifact(preprocessor_path, artifact_path="preprocessor")

    mlflow.xgboost.log_model(booster, artifact_path="model")

    # Validation predictions next to their ground truth.
    predictions_path = "models/delhivery_predictions.csv"
    results = pd.DataFrame({
        "actual_duration": y_val,
        "predicted_duration": y_pred,
    })
    results.to_csv(predictions_path, index=False)
    mlflow.log_artifact(predictions_path, artifact_path="predictions")

       data          trip_creation_time  \
0  training  2018-09-20 02:35:36.476840   
1  training  2018-09-20 02:35:36.476840   
2  training  2018-09-20 02:35:36.476840   
3  training  2018-09-20 02:35:36.476840   
4  training  2018-09-20 02:35:36.476840   

                                 route_schedule_uuid route_type  \
0  thanos::sroute:eb7bfc78-b351-4c0e-a951-fa3d5c3...    Carting   
1  thanos::sroute:eb7bfc78-b351-4c0e-a951-fa3d5c3...    Carting   
2  thanos::sroute:eb7bfc78-b351-4c0e-a951-fa3d5c3...    Carting   
3  thanos::sroute:eb7bfc78-b351-4c0e-a951-fa3d5c3...    Carting   
4  thanos::sroute:eb7bfc78-b351-4c0e-a951-fa3d5c3...    Carting   

                 trip_uuid source_center                 source_name  \
0  trip-153741093647649320  IND388121AAA  Anand_VUNagar_DC (Gujarat)   
1  trip-153741093647649320  IND388121AAA  Anand_VUNagar_DC (Gujarat)   
2  trip-153741093647649320  IND388121AAA  Anand_VUNagar_DC (Gujarat)   
3  trip-153741093647649320  IND388121AAA  Anand_VU

2025/10/31 13:40:54 INFO mlflow.tracking.fluent: Experiment with name 'delhivery-duration-experiment' does not exist. Creating a new experiment.


[0]	validation-rmse:539.48926
[1]	validation-rmse:486.27303
[2]	validation-rmse:438.52949
[3]	validation-rmse:395.56884
[4]	validation-rmse:356.87901
[5]	validation-rmse:321.95071
[6]	validation-rmse:290.58722
[7]	validation-rmse:262.33153
[8]	validation-rmse:236.81886
[9]	validation-rmse:213.98505
[10]	validation-rmse:193.30926
[11]	validation-rmse:174.78856
[12]	validation-rmse:158.08293
[13]	validation-rmse:143.06514
[14]	validation-rmse:129.54643
[15]	validation-rmse:117.40477
[16]	validation-rmse:106.47050
[17]	validation-rmse:96.65722
[18]	validation-rmse:87.89909
[19]	validation-rmse:80.01022
[20]	validation-rmse:72.95315
[21]	validation-rmse:66.68623
[22]	validation-rmse:61.07896
[23]	validation-rmse:56.03590
[24]	validation-rmse:51.59117
[25]	validation-rmse:47.68410
[26]	validation-rmse:44.16707
[27]	validation-rmse:41.09498
[28]	validation-rmse:38.37053
[29]	validation-rmse:35.99390
[30]	validation-rmse:33.94487
[31]	validation-rmse:32.10847
[32]	validation-rmse:30.52589
[33



🏃 View run redolent-bird-597 at: http://localhost:5000/#/experiments/2/runs/0edac139aa534c00bf444964c9be4dda
🧪 View experiment at: http://localhost:5000/#/experiments/2


In [None]:
import pandas as pd
import os
import mlflow

# Side-by-side comparison of the validation targets and the model predictions.
results = pd.DataFrame({
    'actual_duration': y_val,
    'predicted_duration': y_pred
})

results['difference'] = results['actual_duration'] - results['predicted_duration']
print(results.head(10))
print("\nAverage difference:", results['difference'].mean())

os.makedirs("models", exist_ok=True)
results.to_csv("models/predictions_vs_actuals.csv", index=False)
print("Saved results to models/predictions_vs_actuals.csv")

# The training run was closed when its `with` block exited. Calling
# mlflow.log_artifact with no active run makes MLflow start a brand-new run
# and leave it open: the CSV lands on the wrong run and any later
# mlflow.start_run() fails because a run is still active. Instead, re-open
# the most recent run, attach the CSV to it, and let the context manager
# close it again.
if mlflow.active_run() is not None:
    mlflow.log_artifact("models/predictions_vs_actuals.csv", artifact_path="predictions")
else:
    previous_run = mlflow.last_active_run()
    # With no previous run in this session, run_id=None starts a fresh run
    # that is still closed properly on exit.
    resume_id = previous_run.info.run_id if previous_run is not None else None
    with mlflow.start_run(run_id=resume_id):
        mlflow.log_artifact("models/predictions_vs_actuals.csv", artifact_path="predictions")
print("Logged artifact to MLflow under 'predictions/'")


   actual_duration  predicted_duration  difference
0            440.0          442.456970   -2.456970
1             57.0           48.008472    8.991528
2            238.0          241.649765   -3.649765
3           1724.0         1711.571899   12.428101
4            999.0          996.462341    2.537659
5            793.0          786.916626    6.083374
6            276.0          279.496216   -3.496216
7            764.0          745.722717   18.277283
8             65.0           67.620132   -2.620132
9            894.0          896.887451   -2.887451

Average difference: 0.07410632733718747
✅ Saved results to models/predictions_vs_actuals.csv
✅ Logged artifact to MLflow under 'predictions/'
