In [1]:
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import pickle
import os, tempfile
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import root_mean_squared_error

In [2]:
import mlflow

# Alternative local backend, kept for reference:
# mlflow.set_tracking_uri(f"sqlite:///mlflow.db")

# Point the client at the tracking server, then select the experiment that the
# runs started below are recorded under. The call is left as the last
# expression so the notebook displays the resulting Experiment object.
mlflow.set_tracking_uri(uri='http://localhost:5000')
mlflow.set_experiment(experiment_name="sklearn-pred")

2025/09/24 08:11:50 INFO mlflow.tracking.fluent: Experiment with name 'sklearn-pred' does not exist. Creating a new experiment.


<Experiment: artifact_location='gs://mlflow_vaibhav/3', creation_time=1758701510754, experiment_id='3', last_update_time=1758701510754, lifecycle_stage='active', name='sklearn-pred', tags={}>

In [3]:
# Column groups shared by the feature-building cells below.
categorical = ['PULocationID','DOLocationID']
numerical = ['trip_distance']
target = 'duration'
def get_df(parquet_file_path):
    """Load a trip parquet file and derive the columns used for modelling.

    Args:
        parquet_file_path: Path of a parquet file containing at least
            `tpep_pickup_datetime`, `tpep_dropoff_datetime`, `PULocationID`
            and `DOLocationID` columns.

    Returns:
        A new DataFrame restricted to trips lasting between 1 and 60 minutes
        (inclusive), with `duration` in minutes, the location ids cast to
        strings, and a combined `PU_DO` pickup/drop-off feature.
    """
    df = pd.read_parquet(parquet_file_path)
    df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df["duration"] = df["duration"].dt.total_seconds() / 60
    # Take an explicit copy of the filtered rows: assigning columns into a
    # boolean-mask slice otherwise raises SettingWithCopyWarning and leaves it
    # ambiguous whether the writes land on the slice or on the parent frame.
    df = df[(df.duration >= 1) & (df.duration <=60)].copy()
    df[categorical] = df[categorical].astype(str)
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']   # creating a new feature
    return df

In [4]:
# January trips are used for training, February trips for validation.
train_df = get_df("data/yellow_tripdata_2023-01.parquet")
val_df = get_df("data/yellow_tripdata_2023-02.parquet")
# Down-sample to keep the experiments fast. The seed is fixed so that a
# restart-and-run-all trains and scores on the same rows every time, which
# keeps the logged RMSE values comparable across runs.
train_df = train_df.sample(n=5000, random_state=42)
val_df = val_df.sample(n=2000, random_state=42)

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

# Feature records: trip distance plus the combined pickup/drop-off id.
new_features = ['PU_DO']
features = numerical + new_features
train_dicts = train_df[features].to_dict(orient='records')
val_dicts = val_df[features].to_dict(orient='records')

# Hyperparameters are defined once so the values logged to MLflow can never
# drift from the values the model is actually built with.
rf_params = {"n_estimators": 10, "max_depth": 10}

# Vectorizer and model are bundled so the logged artifact accepts raw dicts.
pipeline = Pipeline([
    ('dv', DictVectorizer()),
    ('model', RandomForestRegressor(**rf_params, random_state=1))
])


with mlflow.start_run():
    mlflow.set_tag("developer", "vaibhav")
    mlflow.log_param("model", "RandomForestRegressor")
    mlflow.log_params(rf_params)
    pipeline.fit(train_dicts, train_df[target])
    val_preds = pipeline.predict(val_dicts)
    rmse = root_mean_squared_error(val_df[target], val_preds)
    print(f"RMSE: {rmse}")
    mlflow.log_metric("rmse", rmse)

    # Stage the serialized model in a temporary directory that is deleted as
    # soon as the upload finishes; a bare mkdtemp() would leave one directory
    # behind on disk for every run of this cell.
    with tempfile.TemporaryDirectory() as tmpdir:
        saved_path = os.path.join(tmpdir, "sklearn-model")
        mlflow.sklearn.save_model(pipeline, saved_path)
        mlflow.log_artifacts(saved_path, artifact_path="sklearn-model")

RMSE: 5.2943745486700395
🏃 View run gentle-vole-594 at: http://localhost:5000/#/experiments/3/runs/8fec157e03fc48c189f1d8424ef30875
🧪 View experiment at: http://localhost:5000/#/experiments/3


In [5]:
# Columns handed to the vectorizer: the combined location feature plus the
# numerical columns.
new_feature = ['PU_DO']
feature_columns = new_feature + numerical

train_dict = train_df[feature_columns].to_dict(orient='records')
val_dict = val_df[feature_columns].to_dict(orient='records')

# The vectorizer is fitted on the training records only; validation records
# are transformed with the already-fitted vocabulary.
dv = DictVectorizer()
x_train = dv.fit_transform(train_dict)
x_val = dv.transform(val_dict)

y_train = train_df[target].values
y_val = val_df[target].values

# Persist the fitted vectorizer so later cells can log and reload it.
with open('preprocessor.b', 'wb') as f_out:
    pickle.dump(dv, f_out)

In [6]:
# Ridge baseline: one tracked run that logs alpha, the validation RMSE and a
# pickle holding the (vectorizer, model) pair.
with mlflow.start_run():
    alpha = 0.1
    mlflow.log_param("alpha", alpha)
    mlflow.set_tag('model', 'Ridge')

    model = Ridge(alpha=alpha, random_state=42)
    model.fit(x_train, y_train)

    y_pred = model.predict(x_val)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)
    print(f"RMSE: {rmse}")

    # The vectorizer is stored next to the model so the artifact is
    # self-contained when it is loaded elsewhere.
    bundle = (dv, model)
    with open('model.bin', 'wb') as f_out:
        pickle.dump(bundle, f_out)
    mlflow.log_artifact('model.bin')

RMSE: 5.930433282292824
🏃 View run useful-grouse-470 at: http://localhost:5000/#/experiments/1/runs/47c567be045645f185e0877667d663d5
🧪 View experiment at: http://localhost:5000/#/experiments/1


In [7]:
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll.base import scope

In [8]:
# Wrap the vectorized features and their targets in XGBoost DMatrix objects;
# the training and tuning cells below consume these two globals.
dtrain = xgb.DMatrix(data=x_train, label=y_train)
dvalid = xgb.DMatrix(data=x_val, label=y_val)

In [15]:
def objective(params):
    """Train one XGBoost candidate inside its own MLflow run.

    Args:
        params: XGBoost parameter dict (one sample from the hyperopt search
            space); logged to MLflow as-is and passed to `xgb.train`.

    Returns:
        A hyperopt result dict with the validation RMSE as `loss` and
        `STATUS_OK` as `status`.

    Reads the module-level `dtrain`, `dvalid` and `y_val`.
    """
    with mlflow.start_run():
        mlflow.set_tag('model','XGBoost')
        mlflow.log_params(params)
        booster = xgb.train(params=params, dtrain=dtrain, num_boost_round=10,
                            evals=[(dvalid, 'validation')], early_stopping_rounds=5)
        y_pred = booster.predict(dvalid)
        rmse = root_mean_squared_error(y_val, y_pred)
        mlflow.log_metric("rmse", rmse)
        # This function runs once per hyperopt trial, so the staging directory
        # must be cleaned up; mkdtemp() alone would leak one directory (with a
        # serialized model inside) for every evaluation.
        with tempfile.TemporaryDirectory() as tmpdir:
            saved_path = os.path.join(tmpdir, "xgboost-model")
            mlflow.xgboost.save_model(booster, saved_path)
            mlflow.log_artifacts(saved_path, artifact_path="xgboost-model")
    return {'loss': rmse, 'status': STATUS_OK}

In [16]:
# Hyperopt search space for the XGBoost booster. The loguniform bounds are
# exponents, i.e. the sampled value is exp(uniform(low, high)).
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 10, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    # 'reg:linear' is the deprecated spelling of this objective; use the
    # current name so tuning runs match the final training cell.
    'objective': 'reg:squarederror',
    'seed': 42
}

In [17]:
# Run a short TPE search (4 evaluations); every evaluation is recorded as its
# own MLflow run by `objective`.
trials = Trials()
best_result = fmin(fn=objective, space=search_space, algo=tpe.suggest,
                   max_evals=4, trials=trials, verbose=False)

[0]	validation-rmse:9.11354
[1]	validation-rmse:8.28563
[2]	validation-rmse:7.61566
[3]	validation-rmse:7.07784
[4]	validation-rmse:6.64824
[5]	validation-rmse:6.31100
[6]	validation-rmse:6.04517
[7]	validation-rmse:5.83866
[8]	validation-rmse:5.67899
[9]	validation-rmse:5.55333


  self.starting_round = model.num_boosted_rounds()
  xgb_model.save_model(model_data_path)


[0]	validation-rmse:8.51168
[1]	validation-rmse:7.38880
[2]	validation-rmse:6.61092
[3]	validation-rmse:6.09126
[4]	validation-rmse:5.74906
[5]	validation-rmse:5.52172
[6]	validation-rmse:5.38813
[7]	validation-rmse:5.28206
[8]	validation-rmse:5.21391
[9]	validation-rmse:5.17392


  self.starting_round = model.num_boosted_rounds()
  xgb_model.save_model(model_data_path)


[0]	validation-rmse:9.61249
[1]	validation-rmse:9.14399
[2]	validation-rmse:8.72092
[3]	validation-rmse:8.33782
[4]	validation-rmse:7.99254
[5]	validation-rmse:7.68166


  self.starting_round = model.num_boosted_rounds()


[6]	validation-rmse:7.40217
[7]	validation-rmse:7.15215
[8]	validation-rmse:6.92872
[9]	validation-rmse:6.72935


  xgb_model.save_model(model_data_path)


[0]	validation-rmse:9.46355
[1]	validation-rmse:8.87832
[2]	validation-rmse:8.36533
[3]	validation-rmse:7.91616
[4]	validation-rmse:7.52640
[5]	validation-rmse:7.18762
[6]	validation-rmse:6.89524
[7]	validation-rmse:6.64215
[8]	validation-rmse:6.42609
[9]	validation-rmse:6.24037


  self.starting_round = model.num_boosted_rounds()
  xgb_model.save_model(model_data_path)


In [None]:

# Parameters for the final XGBoost run, written out explicitly.
best_params = {
    "learning_rate": 0.6856143053942376,
    "max_depth": 7,
    "min_child_weight": 4.434221562748838,
    "objective": "reg:squarederror",  # Corrected from "reg:linear"
    "reg_alpha": 0.16562147689397697,
    "reg_lambda": 0.06268756235943868,
    "seed": 42
}

with mlflow.start_run():
    # Same tag as the tuning runs, so this run shows up under the same filter.
    mlflow.set_tag('model', 'XGBoost')
    mlflow.log_params(best_params)

    booster = xgb.train(params=best_params, dtrain=dtrain, num_boost_round=10,
                        evals=[(dvalid, 'validation')], early_stopping_rounds=5)

    y_pred = booster.predict(dvalid)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    # The context manager deletes the staging directory after the upload;
    # mkdtemp() would leave it (and the serialized model) on disk.
    with tempfile.TemporaryDirectory() as tmpdir:
        saved_path = os.path.join(tmpdir, "xgboost-model")
        mlflow.xgboost.save_model(booster, saved_path)
        mlflow.log_artifacts(saved_path, artifact_path="xgboost-model")


[0]	validation-rmse:5.96364
[1]	validation-rmse:5.27381
[2]	validation-rmse:5.17035
[3]	validation-rmse:5.12828
[4]	validation-rmse:5.12205
[5]	validation-rmse:5.12208
[6]	validation-rmse:5.11905
[7]	validation-rmse:5.11908


[8]	validation-rmse:5.11822
[9]	validation-rmse:5.11775


  xgb_model.save_model(model_data_path)


In [9]:
# Reload a previously logged booster by run id and score the validation matrix.
# NOTE(review): the artifact path here is "xgb_model", while the cells above
# log under "xgboost-model" - confirm this run really used that path.
logged_model_uri = "runs:/39e5920c4b234df28b96dc6bd5fc385e/xgb_model"
model = mlflow.xgboost.load_model(logged_model_uri)
preds = model.predict(dvalid)
print(preds[:10])

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

[ 4.26634    9.457501   8.050992  21.871208   4.3089643 10.519963
 10.460446  17.684958  19.719645  15.765076 ]


In [22]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import LinearSVR

# Let MLflow record estimator parameters and the fitted model automatically.
mlflow.sklearn.autolog()

# One tracked run per estimator class, each fitted with default settings on
# the same vectorized train/validation split.
for model_class in (RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, LinearSVR):

    with mlflow.start_run():

        # Record the files the data was actually read from: the notebook loads
        # parquet files, so logging ".csv" paths would point readers of the
        # run at files that were never used.
        mlflow.log_param("train-data-path", "./data/yellow_tripdata_2023-01.parquet")
        mlflow.log_param("valid-data-path", "./data/yellow_tripdata_2023-02.parquet")
        # Attach the fitted vectorizer so the run can be reproduced end to end.
        mlflow.log_artifact("preprocessor.b", artifact_path="preprocessor")

        mlmodel = model_class()
        mlmodel.fit(x_train, y_train)

        y_pred = mlmodel.predict(x_val)
        rmse = root_mean_squared_error(y_val, y_pred)
        mlflow.log_metric("rmse", rmse)
        

### Using the MlflowClient to interact with the model registry and experiments

In [7]:
from mlflow.tracking import MlflowClient
# Use the same scheme as the tracking server the runs were logged to
# ('http://localhost:5000'). With 'https://' the client attempts a TLS
# handshake against what is presumably a plain-HTTP server, so requests
# would fail or hang instead of returning.
tracking_uri ='http://localhost:5000'
client = MlflowClient(tracking_uri=tracking_uri)

In [8]:
# Print every experiment the tracking server returns, one per line.
experiments = client.search_experiments()
for experiment in experiments:
    print(experiment)

KeyboardInterrupt: 

In [4]:
from mlflow.entities import ViewType

# Up to five active runs of experiment '1' whose RMSE is below 7, best first.
runs = client.search_runs(
    experiment_ids='1',
    filter_string='metrics.rmse<7',
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.rmse ASC"],
)
for candidate in runs:
    print(f"run_id={candidate.info.run_id},rmse={candidate.data.metrics['rmse']}")
                           

run_id=55911009447c47589abb8b8ebd5b6bae,rmse=5.085960913694517
run_id=b3ad1a3ba96a41ac8d4b8b0c97274a29,rmse=5.117746383729167
run_id=07caa94b1d00491a9bac12fc092f7b41,rmse=5.173922926402307
run_id=82d64c33b10b48749cc04f813f279ff6,rmse=5.469835432003781
run_id=88cd1d68870049b9bc9dfdbf3441dce2,rmse=5.6079884282968075


In [5]:
import mlflow

# Make the fluent API (mlflow.register_model etc.) talk to the same server as
# the MlflowClient created above.
mlflow.set_tracking_uri(uri=tracking_uri)

In [6]:
# Register the model logged by this run as a new version of 'NYC_taxi'.
# The call stays last so the notebook displays the returned ModelVersion.
run_id = '88cd1d68870049b9bc9dfdbf3441dce2'
model_uri = 'runs:/{}/model'.format(run_id)
mlflow.register_model(name='NYC_taxi', model_uri=model_uri)

2025/09/12 05:09:26 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/09/12 05:09:26 INFO mlflow.store.db.utils: Updating database tables


INFO  [alembic.runtime.migration] Context impl PostgresqlImpl.
INFO  [alembic.runtime.migration] Will assume transactional DDL.
Registered model 'NYC_taxi' already exists. Creating a new version of this model...
Created version '6' of model 'NYC_taxi'.


<ModelVersion: aliases=[], creation_timestamp=1757653767294, current_stage='None', deployment_job_state=None, description=None, last_updated_timestamp=1757653767294, metrics=None, model_id=None, name='NYC_taxi', params=None, run_id='88cd1d68870049b9bc9dfdbf3441dce2', run_link=None, source='models:/m-a82893f5bbcf476895abecba60f4211f', status='READY', status_message=None, tags={}, user_id=None, version=6>

In [7]:
# Show the latest version of the registered model for each stage.
# NOTE(review): the captured output includes a warning for this call; the
# stage-based registry API is deprecated in recent MLflow releases - confirm
# before relying on it.
model_name = 'NYC_taxi'
versions = client.get_latest_versions(name=model_name)

for model_version in versions:
    print(f'version {model_version.version},stage {model_version.current_stage}')

version 2,stage Production
version 4,stage Staging
version 6,stage None


  versions = client.get_latest_versions(name=model_name)


In [8]:
# Put version 4 into staging without archiving whatever is already there.
client.transition_model_version_stage(
    name=model_name,
    version=4,
    stage='staging',
    archive_existing_versions=False,
)

  client.transition_model_version_stage(name=model_name,


<ModelVersion: aliases=[], creation_timestamp=1757651481283, current_stage='Staging', deployment_job_state=None, description='This is the first model in staging', last_updated_timestamp=1757653768001, metrics=None, model_id=None, name='NYC_taxi', params=None, run_id='88cd1d68870049b9bc9dfdbf3441dce2', run_link=None, source='models:/m-a82893f5bbcf476895abecba60f4211f', status='READY', status_message=None, tags={}, user_id=None, version=4>

In [9]:
# Attach a human-readable description to version 4 of the registered model.
client.update_model_version(
    name=model_name,
    version=4,
    description="This is the first model in staging",
)

<ModelVersion: aliases=[], creation_timestamp=1757651481283, current_stage='Staging', deployment_job_state=None, description='This is the first model in staging', last_updated_timestamp=1757653768439, metrics=None, model_id=None, name='NYC_taxi', params=None, run_id='88cd1d68870049b9bc9dfdbf3441dce2', run_link=None, source='models:/m-a82893f5bbcf476895abecba60f4211f', status='READY', status_message=None, tags={}, user_id=None, version=4>

In [10]:
# Register the models of two different runs as new versions of 'NYC_taxi'.
run_id1 = '7271cee61c844471b3c1468de051b724'
# Build the URI from run_id1. Using the leftover `run_id` from an earlier cell
# silently registered that older run again instead of this one.
model_uri1 = f'runs:/{run_id1}/model'
mlflow.register_model(model_uri=model_uri1,name='NYC_taxi')

run_id2 = '88cd1d68870049b9bc9dfdbf3441dce2'
model_uri2 = f'runs:/{run_id2}/model'
mlflow.register_model(model_uri=model_uri2,name='NYC_taxi')


Registered model 'NYC_taxi' already exists. Creating a new version of this model...
Created version '7' of model 'NYC_taxi'.
Registered model 'NYC_taxi' already exists. Creating a new version of this model...
Created version '8' of model 'NYC_taxi'.


<ModelVersion: aliases=[], creation_timestamp=1757653769625, current_stage='None', deployment_job_state=None, description=None, last_updated_timestamp=1757653769625, metrics=None, model_id=None, name='NYC_taxi', params=None, run_id='88cd1d68870049b9bc9dfdbf3441dce2', run_link=None, source='models:/m-a82893f5bbcf476895abecba60f4211f', status='READY', status_message=None, tags={}, user_id=None, version=8>

In [11]:
# List every registered version of the model together with its stage.
model_name = 'NYC_taxi'
versions = client.search_model_versions(filter_string=f"name='{model_name}'")
for model_version in versions:
    print(f'version {model_version.version},stage {model_version.current_stage}')

version 8,stage None
version 7,stage None
version 4,stage Staging
version 6,stage None
version 2,stage Production
version 5,stage None


### Updating the Production Model

In [18]:
def read_dataframe(filename):
    """Load a trip parquet file and prepare it for evaluation.

    Args:
        filename: Path of a parquet file containing at least
            `tpep_pickup_datetime`, `tpep_dropoff_datetime`, `PULocationID`
            and `DOLocationID` columns.

    Returns:
        A new DataFrame restricted to trips lasting between 1 and 60 minutes
        (inclusive), with `duration` in minutes and both location id columns
        cast to strings.
    """
    df = pd.read_parquet(filename)
    df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df["duration"] = df["duration"].dt.total_seconds() / 60

    # Copy the filtered rows before writing to them: assigning into a
    # boolean-mask slice raises SettingWithCopyWarning and leaves it ambiguous
    # whether the write reaches the slice or the parent frame.
    df = df[(df["duration"] >= 1) & (df["duration"] <= 60)].copy()

    categorical = ["PULocationID", "DOLocationID"]
    df[categorical] = df[categorical].astype(str)

    return df


def preprocess(df, dv):
    """Turn a trips DataFrame into the feature matrix expected by the models.

    Args:
        df: DataFrame with string-typed `PULocationID` / `DOLocationID`
            columns and a `trip_distance` column. It is not modified.
        dv: An already-fitted vectorizer exposing `transform(list_of_dicts)`.

    Returns:
        Whatever `dv.transform` returns for the per-row feature dicts
        (`PU_DO` and `trip_distance`).
    """
    categorical = ["PU_DO"]
    numerical = ["trip_distance"]
    # Build the combined feature on a small derived frame rather than writing
    # a new column into the caller's DataFrame: the caller's data stays
    # untouched, and no SettingWithCopyWarning is raised when `df` is itself a
    # slice or sample of another frame.
    features = df[numerical].assign(PU_DO=df["PULocationID"] + "_" + df["DOLocationID"])
    records = features[categorical + numerical].to_dict(orient="records")
    return dv.transform(records)


def test_model(name, stage, X_test, y_test):
    model = mlflow.pyfunc.load_model(f"models:/{name}/{stage}")
    y_pred = model.predict(X_test)
    return {"rmse": root_mean_squared_error(y_test, y_pred)}


In [19]:
# February trips, down-sampled to 2000 rows with a fixed seed so the stage
# comparison below is repeatable.
df = (
    read_dataframe("data/yellow_tripdata_2023-02.parquet")
    .sample(n=2000, random_state=42)
)

In [20]:
# Fetch the vectorizer that was logged with the run into ./preprocessor/.
client.download_artifacts(run_id=run_id, path='preprocessor', dst_path='.')

# NOTE: unpickling executes arbitrary code - only load artifacts from a
# tracking server you trust.
with open("preprocessor/preprocessor.b", "rb") as preprocessor_file:
    dv = pickle.load(preprocessor_file)

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

In [22]:
# Features come from the reloaded vectorizer; the target is the trip duration.
target = "duration"
X_test = preprocess(df, dv)
y_test = df[target].values

In [23]:
# Time loading the "Production" stage of the registered model and scoring it
# on the test sample; the returned dict holds its RMSE.
%time test_model(name=model_name, stage="Production", X_test=X_test, y_test=y_test)

CPU times: user 762 ms, sys: 138 ms, total: 900 ms
Wall time: 1.2 s


{'rmse': 6.853813289743332}

In [24]:
# Same measurement for the "Staging" stage, for comparison with the
# "Production" result.
%time test_model(name=model_name, stage="Staging", X_test=X_test, y_test=y_test)

CPU times: user 89.8 ms, sys: 7.57 ms, total: 97.4 ms
Wall time: 152 ms


{'rmse': 5.988687671624652}

In [25]:
# Promote version 4 to Production and archive the versions currently in that
# stage.
client.transition_model_version_stage(
    name=model_name,
    version=4,
    stage='Production',
    archive_existing_versions=True,
)

  client.transition_model_version_stage(name=model_name,


<ModelVersion: aliases=[], creation_timestamp=1757651481283, current_stage='Production', deployment_job_state=None, description='This is the first model in staging', last_updated_timestamp=1757654365618, metrics=None, model_id=None, name='NYC_taxi', params=None, run_id='88cd1d68870049b9bc9dfdbf3441dce2', run_link=None, source='models:/m-a82893f5bbcf476895abecba60f4211f', status='READY', status_message=None, tags={}, user_id=None, version=4>