In [1]:
import pandas as pd
import numpy as np
import mlflow
import xgboost as xgb
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import root_mean_squared_error
from pathlib import Path
import pickle

In [3]:
def file_load(path:str) -> pd.DataFrame:
    """Load a trip parquet file and derive the columns used for modelling.

    Only the pickup/dropoff timestamps, the pickup/dropoff location IDs and
    the trip distance are read. The trip duration is computed in minutes,
    trips shorter than 1 minute or longer than 60 minutes are dropped, and a
    combined ``PU_DO`` key of the form ``"<PULocationID>_<DOLocationID>"`` is
    built for the remaining rows.

    Parameters
    ----------
    path : str
        Location of the parquet file; passed unchanged to ``pd.read_parquet``.

    Returns
    -------
    pd.DataFrame
        Columns ``duration``, ``PU_DO`` and ``trip_distance`` (in that order).
        The index is not reset, so rows keep the labels they had when read
        and the index may contain gaps where trips were filtered out.
    """
    cols = ['lpep_pickup_datetime', 'lpep_dropoff_datetime', 'PULocationID', 'DOLocationID',
       'trip_distance']
    trips = pd.read_parquet(path, columns=cols)

    # Duration in minutes; rows with a missing timestamp yield NaN and fail
    # the range check below, so they are dropped as well.
    duration = (trips['lpep_dropoff_datetime'] - trips['lpep_pickup_datetime']).dt.total_seconds() / 60
    keep = duration.between(1, 60)  # inclusive on both ends: 1 <= duration <= 60

    kept = trips.loc[keep]

    # Assemble the result as a brand-new frame instead of assigning a column
    # onto a filtered slice, which triggers pandas' SettingWithCopyWarning.
    return pd.DataFrame({
        'duration': duration[keep],
        'PU_DO': kept['PULocationID'].astype(str) + "_" + kept['DOLocationID'].astype(str),
        'trip_distance': kept['trip_distance'],
    })

In [4]:
# Training frame (file name indicates the 2021-01 green-taxi extract); the path is
# relative to the notebook's working directory.
df_train = file_load('../Data/green_tripdata_2021-01.parquet')

In [5]:
# Validation frame (file name indicates the 2021-02 extract), loaded with the same
# filtering and feature derivation as the training frame.
df_val = file_load('../Data/green_tripdata_2021-02.parquet')

In [6]:
# Quick look at the first rows; gaps in the index are rows removed by the duration filter.
df_train.head()

Unnamed: 0,duration,PU_DO,trip_distance
0,3.933333,43_151,1.01
1,8.75,166_239,2.53
2,5.966667,41_42,1.12
3,7.083333,168_75,1.99
7,2.316667,75_75,0.45


In [7]:
# Record runs in a local SQLite database (mlflow.db in the working directory)
# and select the experiment that subsequent runs are logged under.
mlflow.set_tracking_uri('sqlite:///mlflow.db')
mlflow.set_experiment('nyc-taxi-experiment')
# Local folder for pickled artifacts; the training step writes
# models/preprocessor.b here before logging it to MLflow.
models_folder = Path('models')
models_folder.mkdir(exist_ok=True)

2025/06/17 18:16:06 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/06/17 18:16:06 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.


In [13]:
def X_feature(df:pd.DataFrame, dv:DictVectorizer = None):
    """Turn a trips frame into a feature matrix and a target series.

    The ``PU_DO`` and ``trip_distance`` columns are converted to one dict per
    row and passed through a ``DictVectorizer``. When ``dv`` is ``None`` a new
    vectorizer is created and fitted on ``df``; otherwise the supplied
    vectorizer is only used to transform, leaving its fitted state untouched.

    Returns a tuple ``(features, target, vectorizer)`` where ``target`` is the
    ``duration`` column of ``df`` and ``vectorizer`` is the one that produced
    ``features`` (the new one, or the ``dv`` that was passed in).
    """
    records = df[['PU_DO', 'trip_distance']].to_dict(orient='records')

    needs_fit = dv is None
    if needs_fit:
        dv = DictVectorizer()

    # Fit only for a freshly created vectorizer; an existing one is reused as-is.
    features = dv.fit_transform(records) if needs_fit else dv.transform(records)

    return features, df['duration'], dv

In [14]:
# No vectorizer is passed, so X_feature creates one and fits it on the training frame.
X_train, y_train, dv = X_feature(df_train)

In [15]:
# Reuse the vectorizer fitted on the training frame (transform only); X_feature
# hands back the same object, so rebinding `dv` here does not change it.
X_val, y_val, dv = X_feature(df_val, dv)

In [32]:
# Force a garbage-collection pass; the displayed value is gc.collect()'s return
# (the number of unreachable objects it found).
# NOTE(review): this cell's execution count (32) is higher than the training cells
# that follow (30, 31), so it was last run out of order; the import also sits
# outside the notebook's top import cell.
import gc
gc.collect()

408

In [30]:
import mlflow.xgboost


def model_training(X_train, y_train, X_val, y_val, dv, num_boost_round=30, early_stopping_rounds=50):
    """Train an XGBoost regressor inside an MLflow run and log the results.

    Within a single ``mlflow.start_run()`` context this function:

    * logs the hyper-parameters in ``best_params``;
    * trains a booster on ``(X_train, y_train)`` while evaluating on
      ``(X_val, y_val)``;
    * logs the validation error as the metric ``RMSE``;
    * pickles ``dv`` to ``models/preprocessor.b`` and logs that file under the
      artifact path ``preprocessor``;
    * logs the booster under the artifact path ``models_mlflow``.

    Parameters
    ----------
    X_train, y_train : feature matrix and labels used to build the training ``DMatrix``.
    X_val, y_val : feature matrix and labels used for evaluation and the RMSE metric.
    dv : fitted preprocessor object to persist alongside the model.
    num_boost_round : int, default 30
        Maximum number of boosting rounds.
    early_stopping_rounds : int, default 50
        Rounds without improvement on the validation set before training
        stops. With the defaults the patience (50) exceeds the number of
        rounds (30), so early stopping cannot trigger; raise
        ``num_boost_round`` above this value to make it effective.

    Returns
    -------
    tuple
        ``(booster, dv)`` - the trained booster and the same ``dv`` passed in.
    """
    # presumably the result of an earlier hyper-parameter search - TODO confirm source
    best_params = {
        'learning_rate': 0.09585355369315604,
        'max_depth': 30,
        'min_child_weight': 1.060597050922164,
        # 'reg:linear' is a deprecated alias that makes XGBoost emit a warning;
        # 'reg:squarederror' is the current name of the same objective.
        'objective': 'reg:squarederror',
        'reg_alpha': 0.018060244040060163,
        'reg_lambda': 0.011658731377413597,
        'seed': 42
    }

    preprocessor_path = Path('models') / 'preprocessor.b'

    with mlflow.start_run():
        train = xgb.DMatrix(X_train, label = y_train)
        val = xgb.DMatrix(X_val, label = y_val)

        mlflow.log_params(best_params)

        booster = xgb.train(
            params=best_params,
            dtrain=train,
            num_boost_round=num_boost_round,
            evals=[(val, 'validation')],
            early_stopping_rounds=early_stopping_rounds,
        )

        y_pred = booster.predict(val)
        rmse = root_mean_squared_error(y_val, y_pred)
        mlflow.log_metric('RMSE', rmse)

        # Create the output folder here so the function does not depend on
        # another notebook cell having been run first.
        preprocessor_path.parent.mkdir(parents=True, exist_ok=True)
        with preprocessor_path.open('wb') as f_out:
            pickle.dump(dv, f_out)

        mlflow.log_artifact(str(preprocessor_path), artifact_path='preprocessor')
        mlflow.xgboost.log_model(booster, artifact_path='models_mlflow')

    return booster, dv

            

In [31]:
# Train once on the prepared matrices; this opens an MLflow run, logs params, the
# RMSE metric, the pickled vectorizer and the model, and returns the booster.
booster, dv = model_training(X_train, y_train, X_val, y_val, dv)

  self.starting_round = model.num_boosted_rounds()


[0]	validation-rmse:11.44482
[1]	validation-rmse:10.77202
[2]	validation-rmse:10.18363
[3]	validation-rmse:9.67396
[4]	validation-rmse:9.23166
[5]	validation-rmse:8.84808
[6]	validation-rmse:8.51883
[7]	validation-rmse:8.23597
[8]	validation-rmse:7.99320
[9]	validation-rmse:7.78709
[10]	validation-rmse:7.61022
[11]	validation-rmse:7.45952
[12]	validation-rmse:7.33049
[13]	validation-rmse:7.22098
[14]	validation-rmse:7.12713
[15]	validation-rmse:7.04752
[16]	validation-rmse:6.98005
[17]	validation-rmse:6.92232
[18]	validation-rmse:6.87112
[19]	validation-rmse:6.82740
[20]	validation-rmse:6.78995
[21]	validation-rmse:6.75792
[22]	validation-rmse:6.72994
[23]	validation-rmse:6.70547
[24]	validation-rmse:6.68390
[25]	validation-rmse:6.66421
[26]	validation-rmse:6.64806
[27]	validation-rmse:6.63280
[28]	validation-rmse:6.61924
[29]	validation-rmse:6.60773


  xgb_model.save_model(model_data_path)
