mlflow ui --backend-store-uri sqlite:///mlflow.db

In [3]:
import mlflow

In [4]:
# Point MLflow at the SQLite tracking store and select the experiment that
# the runs started in this notebook are logged under.
tracking_uri = 'sqlite:///mlflow.db'
experiment_name = 'nyc-taxi-experiment'

mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='/workspaces/mlops-zoomcamp/02-experiment_tracking/mlruns/1', creation_time=1747436587009, experiment_id='1', last_update_time=1747436587009, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [None]:
# Standard library
import pickle

# Third-party: data handling and plotting
# NOTE: `plt` must be bound to matplotlib.pyplot; the top-level matplotlib
# package does not expose plot()/figure()/show().
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Third-party: modelling
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error

import xgboost as xgb

# Third-party: hyperparameter search
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [6]:
# Load the 2023-01 trips (used for training) and the 2023-02 trips (used for
# validation) from local parquet files.
train_path = "data/yellow_tripdata_2023-01.parquet"
val_path = 'data/yellow_tripdata_2023-02.parquet'

df = pd.read_parquet(train_path)
val_df = pd.read_parquet(val_path)

In [7]:
# Trip duration in minutes, derived from the pickup and dropoff timestamps.
df['duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60
val_df['duration'] = (val_df['tpep_dropoff_datetime'] - val_df['tpep_pickup_datetime']).dt.total_seconds() / 60

# Keep only trips lasting between 1 and 60 minutes (inclusive).
# The explicit .copy() makes each filtered frame an independent object, so
# the column assignments performed on it afterwards do not raise
# SettingWithCopyWarning or risk writing into a temporary.
df = df.query('duration >= 1 and duration <= 60').copy()
val_df = val_df.query('duration >= 1 and duration <= 60').copy()

In [8]:
# Location IDs are treated as categories, so cast them to strings in both
# the training and the validation frame.
categorical = ['PULocationID', 'DOLocationID']
for frame in (df, val_df):
    frame[categorical] = frame[categorical].astype(str)

In [9]:
# One {column: value} dict per trip, restricted to the categorical columns;
# these record lists are what gets fed to the DictVectorizer.
train_dict, val_dict = (
    frame[categorical].to_dict(orient='records')
    for frame in (df, val_df)
)

In [10]:
# Regression target: trip duration for each split.
target = 'duration'
y_train = df[target].values
y_val = val_df[target].values

# Fit the vectorizer on the training records only, then reuse the fitted
# vocabulary to encode the validation records.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

In [None]:
# Baseline model: ordinary least squares on the vectorized location features.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

In [None]:
import os

# Make sure the output directory exists; open(..., 'wb') does not create
# missing parent directories and would raise FileNotFoundError otherwise.
os.makedirs('models', exist_ok=True)

# Persist the fitted vectorizer together with the model so that inference
# can apply exactly the same feature encoding.
with open('models/lin_reg.pkl', 'wb') as f_out:
    pickle.dump((dv, lr), f_out)

In [None]:
# Predict trip durations for the validation split with the fitted model.
y_pred = lr.predict(X_val)

In [None]:
# RMSE on the validation split, shown as a plain Python float.
mse = mean_squared_error(y_val, y_pred)
float(np.sqrt(mse))

# 2.3

In [None]:
# Train a Lasso model inside an MLflow run, recording who ran it, which data
# files were used, the regularisation strength and the validation RMSE.
with mlflow.start_run():
    mlflow.set_tag('developer', 'tman0004')

    data_paths = (
        ('train-data-path', 'data/yellow_tripdata_2023-01.parquet'),
        ('valid-data-path', 'data/yellow_tripdata_2023-02.parquet'),
    )
    for param_name, param_value in data_paths:
        mlflow.log_param(param_name, param_value)

    alpha = 0.01
    mlflow.log_param('alpha', alpha)

    lr = Lasso(alpha=alpha)
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_val)

    mse = mean_squared_error(y_val, y_pred)
    rmse = float(np.sqrt(mse))
    mlflow.log_metric('rmse', rmse)

In [12]:
# Wrap the feature matrices and targets in XGBoost's DMatrix containers.
train = xgb.DMatrix(data=X_train, label=y_train)
valid = xgb.DMatrix(data=X_val, label=y_val)

In [None]:
def objective(params):
    """Train one XGBoost model for a hyperopt trial and report its RMSE.

    Each call opens its own MLflow run, tags it with ``model=xgboost``, logs
    the received ``params`` and the resulting validation ``rmse``.

    Args:
        params: XGBoost parameter dict for this trial; passed unchanged to
            ``xgb.train`` and to ``mlflow.log_params``.

    Returns:
        dict with ``'loss'`` (validation RMSE as a float) and ``'status'``
        (``STATUS_OK``), the result format handed back to hyperopt.
    """
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)

        # Up to 1000 boosting rounds, stopping after 50 rounds without
        # improvement on the validation set.
        training_config = dict(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50,
        )
        booster = xgb.train(**training_config)

        predictions = booster.predict(valid)
        mse = mean_squared_error(y_val, predictions)
        rmse = float(np.sqrt(mse))
        mlflow.log_metric("rmse", rmse)

    trial_result = {'loss': rmse, 'status': STATUS_OK}
    return trial_result

In [None]:
# Hyperparameter search space for XGBoost.
# hp.loguniform(label, low, high) samples exp(uniform(low, high)), so the
# bounds below are exponents (e.g. learning_rate lies in [exp(-3), 1]).
search_space = {
    # quniform yields floats; scope.int casts them to the int XGBoost expects.
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    # 'reg:linear' is the deprecated alias of the squared-error objective;
    # use the current name to avoid the deprecation warning.
    'objective': 'reg:squarederror',
    'seed': 42
}

# Keep a handle on the Trials object so the per-trial history can be
# inspected after the search instead of being discarded.
trials = Trials()

# Minimise the validation RMSE returned by `objective` with TPE over 50 trials.
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials
)

In [13]:
import mlflow.xgboost

# Fixed hyperparameters for the final model (hard-coded in this notebook).
params = {
    'learning_rate': 0.1233407742765768,
    'max_depth': 30,
    'min_child_weight': 13.092247284357976,
    # 'reg:linear' is the deprecated alias of the squared-error objective;
    # use the current name to avoid the deprecation warning.
    'objective': 'reg:squarederror',
    'reg_alpha': 0.3399656070101837,
    'reg_lambda': 0.35806973799616537,
    'seed': 42
}

# Let MLflow capture parameters, metrics and the model automatically; no
# explicit start_run() is needed, autologging creates the run itself.
mlflow.xgboost.autolog()

# Up to 300 boosting rounds, stopping after 50 rounds without improvement
# on the validation set.
booster = xgb.train(
    params=params,
    dtrain=train,
    num_boost_round=300,
    evals=[(valid, 'validation')],
    early_stopping_rounds=50
)

2025/05/16 18:40:55 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'e55dc8b16b8840848b9a853b228c40db', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


[0]	validation-rmse:9.55691
[1]	validation-rmse:9.14126
[2]	validation-rmse:8.81143
[3]	validation-rmse:8.47926
[4]	validation-rmse:8.24807
[5]	validation-rmse:8.07030
[6]	validation-rmse:7.90747
[7]	validation-rmse:7.76855
[8]	validation-rmse:7.67264
[9]	validation-rmse:7.49945
[10]	validation-rmse:7.43364
[11]	validation-rmse:7.37173
[12]	validation-rmse:7.32484
[13]	validation-rmse:7.28572
[14]	validation-rmse:7.15281
[15]	validation-rmse:7.12299
[16]	validation-rmse:7.09362
[17]	validation-rmse:7.07014
[18]	validation-rmse:7.04888
[19]	validation-rmse:6.92740
[20]	validation-rmse:6.90768
[21]	validation-rmse:6.81168
[22]	validation-rmse:6.79530
[23]	validation-rmse:6.78326
[24]	validation-rmse:6.77143
[25]	validation-rmse:6.75656
[26]	validation-rmse:6.72900
[27]	validation-rmse:6.71478
[28]	validation-rmse:6.70465
[29]	validation-rmse:6.61078
[30]	validation-rmse:6.59977
[31]	validation-rmse:6.58997
[32]	validation-rmse:6.58183
[33]	validation-rmse:6.50019
[34]	validation-rmse:6.4



# 2.4

In [14]:
# Switch XGBoost autologging off again so that subsequent training is only
# logged through explicit mlflow.log_* calls.
mlflow.xgboost.autolog(disable=True)

In [15]:
import os

import mlflow.xgboost

# Train the final XGBoost model inside one MLflow run and log, by hand, its
# parameters, validation RMSE, the fitted DictVectorizer and the model itself.
with mlflow.start_run():
    train = xgb.DMatrix(X_train, label=y_train)
    valid = xgb.DMatrix(X_val, label=y_val)

    params = {
        'learning_rate': 0.1233407742765768,
        'max_depth': 30,
        'min_child_weight': 13.092247284357976,
        # 'reg:linear' is the deprecated alias of the squared-error
        # objective; use the current name to avoid the deprecation warning.
        'objective': 'reg:squarederror',
        'reg_alpha': 0.3399656070101837,
        'reg_lambda': 0.35806973799616537,
        'seed': 42
    }

    mlflow.log_params(params)

    # Up to 100 boosting rounds, stopping after 50 rounds without
    # improvement on the validation set.
    booster = xgb.train(
        params=params,
        dtrain=train,
        num_boost_round=100,
        evals=[(valid, 'validation')],
        early_stopping_rounds=50
    )

    y_pred = booster.predict(valid)
    rmse = float(np.sqrt(mean_squared_error(y_val, y_pred)))
    mlflow.log_metric('rmse', rmse)

    # open(..., 'wb') does not create missing parent directories, so make
    # sure 'models/' exists before writing the preprocessor.
    os.makedirs('models', exist_ok=True)
    with open('models/preprocessor.b', 'wb') as f_out:
        pickle.dump(dv, f_out)

    # Store the vectorizer next to the model so inference can reproduce the
    # same feature encoding.
    mlflow.log_artifact('models/preprocessor.b', artifact_path='preprocessor')

    mlflow.xgboost.log_model(booster, artifact_path='models_mlflow')




[0]	validation-rmse:9.55691
[1]	validation-rmse:9.14126
[2]	validation-rmse:8.81143
[3]	validation-rmse:8.47926
[4]	validation-rmse:8.24807
[5]	validation-rmse:8.07030
[6]	validation-rmse:7.90747
[7]	validation-rmse:7.76855
[8]	validation-rmse:7.67264
[9]	validation-rmse:7.49945
[10]	validation-rmse:7.43364
[11]	validation-rmse:7.37173
[12]	validation-rmse:7.32484
[13]	validation-rmse:7.28572
[14]	validation-rmse:7.15281
[15]	validation-rmse:7.12299
[16]	validation-rmse:7.09362
[17]	validation-rmse:7.07014
[18]	validation-rmse:7.04888
[19]	validation-rmse:6.92740
[20]	validation-rmse:6.90768
[21]	validation-rmse:6.81168
[22]	validation-rmse:6.79530
[23]	validation-rmse:6.78326
[24]	validation-rmse:6.77143
[25]	validation-rmse:6.75656
[26]	validation-rmse:6.72900
[27]	validation-rmse:6.71478
[28]	validation-rmse:6.70465
[29]	validation-rmse:6.61078
[30]	validation-rmse:6.59977
[31]	validation-rmse:6.58997
[32]	validation-rmse:6.58183
[33]	validation-rmse:6.50019
[34]	validation-rmse:6.4



In [16]:
# URI of the model stored under 'models_mlflow' in a specific, hard-coded run.
logged_model = 'runs:/31fc9269b3624429b1efdc9e760b2ad3/models_mlflow'

# Load it through the generic pyfunc interface (yields a PyFuncModel).
loaded_model = mlflow.pyfunc.load_model(model_uri=logged_model)

In [17]:
# Display the pyfunc wrapper's summary (artifact path, flavor, run id).
loaded_model

mlflow.pyfunc.loaded_model:
  artifact_path: models_mlflow
  flavor: mlflow.xgboost
  run_id: 31fc9269b3624429b1efdc9e760b2ad3

In [18]:
import mlflow.xgboost

# Load the same logged model through the XGBoost flavor, which returns the
# native xgboost Booster rather than the generic pyfunc wrapper.
xgboost_model = mlflow.xgboost.load_model(model_uri=logged_model)

In [20]:
# Display the loaded object to confirm it is a native xgboost Booster.
xgboost_model

<xgboost.core.Booster at 0x19780abd550>

In [21]:
# Score the validation DMatrix with the model reloaded from MLflow.
y_pred = xgboost_model.predict(valid)

In [22]:
# Preview the first ten predictions instead of dumping the whole array.
y_pred[:10]

array([ 7.4349585, 42.689312 , 15.670577 , 18.74556  , 21.164425 ,
        6.0658417, 20.723898 , 10.218073 , 10.719147 , 11.963271 ],
      dtype=float32)

# 2.5

In [25]:
from mlflow.tracking import MlflowClient

# Client bound to the SQLite tracking store, used for the queries that follow.
MLFLOW_TRACKING_URI = 'sqlite:///mlflow.db'
client = MlflowClient(MLFLOW_TRACKING_URI)

In [28]:
# List the experiments recorded in the tracking store.
client.search_experiments()

[<Experiment: artifact_location='/workspaces/mlops-zoomcamp/02-experiment_tracking/mlruns/1', creation_time=1747436587009, experiment_id='1', last_update_time=1747436587009, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>,
 <Experiment: artifact_location='/workspaces/mlops-zoomcamp/02-experiment_tracking/mlruns/0', creation_time=1747436587004, experiment_id='0', last_update_time=1747436587004, lifecycle_stage='active', name='Default', tags={}>]

In [29]:
from mlflow.entities import ViewType

# Query experiment '1' for up to five active runs, ordered by ascending
# validation RMSE (best first); the empty filter string applies no filter.
run_query = dict(
    experiment_ids='1',
    filter_string='',
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=['metrics.rmse ASC'],
)
runs = client.search_runs(**run_query)

In [35]:
# Report only the first run in the result list (the lowest RMSE given the
# ascending sort order); nothing is printed when the list is empty.
for run in runs[:1]:
    run_id = run.info.run_id
    run_rmse = run.data.metrics['rmse']
    print(f"run id: {run_id}, rmse: {run_rmse:.4f}")

run id: 19952056b6284d0ea30dc5b28ac78072, rmse: 5.3229


In [36]:
# Show the latest registered version(s) of the 'nyc-taxi-regressor' model.
# NOTE(review): the captured output echoes this call the way a Python warning
# does; get_latest_versions appears to be deprecated in recent MLflow
# releases - confirm and consider search_model_versions instead.
client.get_latest_versions(name='nyc-taxi-regressor')

  client.get_latest_versions(name='nyc-taxi-regressor')


[<ModelVersion: aliases=[], creation_timestamp=1747451731796, current_stage='None', description='', last_updated_timestamp=1747451731796, name='nyc-taxi-regressor', run_id='e55dc8b16b8840848b9a853b228c40db', run_link='', source='/workspaces/mlops-zoomcamp/02-experiment_tracking/mlruns/1/e55dc8b16b8840848b9a853b228c40db/artifacts/model', status='READY', status_message=None, tags={'model': 'xgboost', 'stage': 'staging'}, user_id=None, version=2>]

In [37]:
# Fetch the 'preprocessor' artifact directory of the given (hard-coded) run;
# the call returns the local path where the artifacts are available.
client.download_artifacts(run_id='31fc9269b3624429b1efdc9e760b2ad3', path='preprocessor')

'c:\\workspaces\\mlops-zoomcamp\\02-experiment_tracking\\mlruns\\1\\31fc9269b3624429b1efdc9e760b2ad3\\artifacts\\preprocessor'