In [40]:
import matplotlib.pyplot as plt
import mlflow
import pandas as pd
import seaborn as sns
import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
from mlflow.entities import ViewType
from mlflow.tracking import MlflowClient
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import mean_squared_error

In [3]:
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("nyc-taxi-experiment")

2024/03/02 18:18:42 INFO mlflow.tracking.fluent: Experiment with name 'nyc-taxi-experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='/Users/matthewzak/projects/MLOps/mlruns/1', creation_time=1709399922983, experiment_id='1', last_update_time=1709399922983, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [4]:
def read_data(path):
    data = pd.read_parquet(path)
    data['duration'] = data.tpep_dropoff_datetime - data.tpep_pickup_datetime
    data['duration'] = data.duration.apply(lambda duration: duration.total_seconds() / 60)
    data = data[(data.duration >= 1) & (data.duration <= 60)]

    location_features = ['PULocationID', 'DOLocationID']
    data[location_features] = data[location_features].astype(str)
    
    return data

In [7]:
location_features = ['PULocationID', 'DOLocationID']
num_features = ['trip_distance']
target_col = ['duration']

In [8]:
data = read_data('./data/yellow_tripdata_2022-01.parquet')
val_data = read_data('./data/yellow_tripdata_2022-02.parquet')
train_set = data[location_features].to_dict(orient='records')
val_set = val_data[location_features].to_dict(orient='records')

In [9]:
dict_vectorizer = DictVectorizer()
x_train = dict_vectorizer.fit_transform(train_set)
x_val = dict_vectorizer.transform(val_set)

In [10]:
target = data[target_col].values
val_target = val_data[target_col].values

In [27]:
train = xgb.DMatrix(data=x_train, label=target)
valid = xgb.DMatrix(data=x_val, label=val_target)

In [30]:
def objective(params):
    """
    """

    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)

        regressor = xgb.train(
            params=params,
            dtrain = train,
            num_boost_round = 1000,
            evals=[(valid, "validation")],
            early_stopping_rounds=50
        )
        y_pred = regressor.predict(valid)
        rmse = mean_squared_error(y_pred, val_target, squared=False)
        mlflow.log_metric("rmse", rmse)

    return {"loss": rmse, "status": STATUS_OK}

search_space = {
    "max_depth": scope.int(hp.quniform("max_depth", 4, 100, 1)),
    "learning_rate": hp.loguniform("learning_rate", -3, 0),
    "reg_alpha": hp.loguniform("reg_alpha", -5, -1),
    "reg_lambda": hp.loguniform("reg_lambda", -6, -1),
    "min_child_weight": hp.loguniform("min_child_weight", -1, 3),
    "objective": "reg:squarederror",
    "seed": 42
}

best_result = fmin(
    fn = objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials()
)

[0]	validation-rmse:8.48073                                                                                                                                                                                           
[1]	validation-rmse:7.38414                                                                                                                                                                                           
[2]	validation-rmse:7.00545                                                                                                                                                                                           
[3]	validation-rmse:6.36123                                                                                                                                                                                           
[4]	validation-rmse:6.24068                                                                                                                 

In [32]:
best_params = {
    "learning_rate": 0.9933151941778549,
    "max_depth": 91,
    "min_child_weight": 1.9776284209695494,
    "objective": "reg:squarederror",
    "reg_alpha": 0.1178703504331994,
    "reg_lambda": 0.08609674239776832,
    "seed": 42
}

mlflow.xgboost.autolog()
regressor = xgb.train(
    params = best_params,
    dtrain = train,
    num_boost_round = 1000,
    evals = [(valid, "validation")],
    early_stopping_rounds = 50
)

2024/03/03 13:30:24 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '640a77901a0e432ca362301ff2fbcfaa', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


[0]	validation-rmse:6.99841
[1]	validation-rmse:5.74361
[2]	validation-rmse:5.62005
[3]	validation-rmse:5.55132
[4]	validation-rmse:5.46078
[5]	validation-rmse:5.45214
[6]	validation-rmse:5.43988
[7]	validation-rmse:5.43701
[8]	validation-rmse:5.43630
[9]	validation-rmse:5.43547
[10]	validation-rmse:5.43483
[11]	validation-rmse:5.43423
[12]	validation-rmse:5.43349
[13]	validation-rmse:5.43333
[14]	validation-rmse:5.43314
[15]	validation-rmse:5.43308
[16]	validation-rmse:5.43248
[17]	validation-rmse:5.43237
[18]	validation-rmse:5.43195
[19]	validation-rmse:5.43168
[20]	validation-rmse:5.43175
[21]	validation-rmse:5.43144
[22]	validation-rmse:5.43121
[23]	validation-rmse:5.43121
[24]	validation-rmse:5.43083
[25]	validation-rmse:5.43105
[26]	validation-rmse:5.43069
[27]	validation-rmse:5.43055
[28]	validation-rmse:5.43028
[29]	validation-rmse:5.43012
[30]	validation-rmse:5.43013
[31]	validation-rmse:5.42994
[32]	validation-rmse:5.42985
[33]	validation-rmse:5.42952
[34]	validation-rmse:5.4



In [34]:
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

In [44]:
experiment = client.search_experiments()[0]

In [47]:
client.search_runs(
    experiment_ids=experiment.experiment_id,
    filter_string="metrics.rmse < 5.9",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.rmse ASC"]
)

[<Run: data=<RunData: metrics={'rmse': 5.4354690830309975}, params={'learning_rate': '0.9933151941778549',
  'max_depth': '91',
  'min_child_weight': '1.9776284209695494',
  'objective': 'reg:squarederror',
  'reg_alpha': '0.1178703504331994',
  'reg_lambda': '0.08609674239776832',
  'seed': '42'}, tags={'mlflow.runName': 'orderly-stork-657',
  'mlflow.source.name': '/Users/matthewzak/miniconda3/lib/python3.11/site-packages/ipykernel_launcher.py',
  'mlflow.source.type': 'LOCAL',
  'mlflow.user': 'matthewzak',
  'model': 'xgboost'}>, info=<RunInfo: artifact_uri='/Users/matthewzak/projects/MLOps/mlruns/1/978de4ddd82145c5914a002c27fde383/artifacts', end_time=1709468075394, experiment_id='1', lifecycle_stage='active', run_id='978de4ddd82145c5914a002c27fde383', run_name='orderly-stork-657', run_uuid='978de4ddd82145c5914a002c27fde383', start_time=1709468069589, status='FINISHED', user_id='matthewzak'>, inputs=<RunInputs: dataset_inputs=[]>>,
 <Run: data=<RunData: metrics={'rmse': 5.43754852