Import statements

In [1]:
%load_ext autoreload
%autoreload 2
import sys
import os
import pandas as pd
from datetime import datetime
import xgboost as xgb
from dotenv import load_dotenv
load_dotenv() 
from sklearn.metrics import mean_absolute_error

# Put the parent of the current working directory on sys.path so that the
# `src.*` imports below can be resolved.
# NOTE(review): presumably the notebook is run from a folder one level below
# the project root — confirm, since this depends on os.getcwd() at launch.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

from src.config import TRANSFORMED_DATA_DIR
from src.data_utils import split_time_series_data
from src.experiment_utils import set_mlflow_tracking, log_model_to_mlflow


Data Loading and Train/Test Split

In [2]:
# Load the tabular feature table produced upstream and stored under
# TRANSFORMED_DATA_DIR.
df = pd.read_parquet(TRANSFORMED_DATA_DIR / "tabular_data.parquet")

# Split into train/test around a fixed cutoff date, separating the "target"
# column from the features.
# NOTE(review): presumably rows before the cutoff go to train and the rest to
# test — verify against src.data_utils.split_time_series_data.
X_train, y_train, X_test, y_test = split_time_series_data(
    df,
    cutoff_date=datetime(2023, 9, 1, 0, 0, 0),
    target_column="target"
)

# Quick sanity check of the split sizes.
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

# Keep only the feature columns whose name starts with "rides_"; every other
# column in X_train / X_test is dropped from the model inputs.
past_ride_columns = [c for c in X_train.columns if c.startswith("rides_")]
# Fail early with a clear message instead of fitting a model on zero features.
assert past_ride_columns, "no 'rides_' feature columns found in X_train"
X_train_only_numeric = X_train[past_ride_columns]
X_test_only_numeric = X_test[past_ride_columns]

(55900, 674)
(55900,)
(31720, 674)
(31720,)


XGBoost Model Predictions and Logging

In [3]:
# Hyperparameters are collected in one place so they are easy to spot and tune.
xgb_params = {"max_depth": 10}

# Fit the regressor on the "rides_" feature columns selected earlier.
model = xgb.XGBRegressor(**xgb_params)
model.fit(X=X_train_only_numeric, y=y_train)

# Score the held-out split with mean absolute error.
predictions = model.predict(X=X_test_only_numeric)
test_mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
print(f"{test_mae:.4f}")

# Configure MLflow tracking before logging anything.
mlflow = set_mlflow_tracking()

# Log the fitted model together with its test score. The test features are
# passed alongside the model; the recorded log output reports "XGBoost" as the
# experiment and "mean_absolute_error" as the logged metric.
log_model_to_mlflow(
    model,
    X_test_only_numeric,
    "XGBoost",
    "mean_absolute_error",
    score=test_mae,
)

INFO:src.experiment_utils:MLflow tracking URI and credentials set.


3.4586


INFO:src.experiment_utils:Experiment set to: XGBoost
INFO:src.experiment_utils:Logged mean_absolute_error: 3.458620309829712
INFO:src.experiment_utils:Model signature inferred.
  from .autonotebook import tqdm as notebook_tqdm
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 91.91it/s]  
2025/03/04 17:08:08 INFO mlflow.models.model: Found the following environment variables used during model inference: [HOPSWORKS_API_KEY]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
Registered model 'XGBRegressor' already exists. Creating a new version of this model...
2025/03/04 17:12:02 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: XGBRegressor, version 2
Created version '2' of model 'XGBRegressor'.
INFO:src.experiment_utils:Model logged with name: XGBRegressor


🏃 View run masked-bear-326 at: https://dagshub.com/singhvarunnn789/CDA500P1.mlflow/#/experiments/3/runs/7b63464e97ce4483be75c9b5fadfd17a
🧪 View experiment at: https://dagshub.com/singhvarunnn789/CDA500P1.mlflow/#/experiments/3


<mlflow.models.model.ModelInfo at 0x1c9a5c79010>