In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
import sys
import os

# Add the parent directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

In [7]:
import pandas as pd
from src.config import TRANSFORMED_DATA_DIR
from datetime import datetime
from src.data_utils import split_time_series_data
from sklearn.model_selection import train_test_split

# Load data
df = pd.read_parquet(TRANSFORMED_DATA_DIR / "tabular_data.parquet")
df


Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,pickup_location_id,target
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,2023-01-29,2,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2023-01-30,2,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2023-01-31,2,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2023-02-01,2,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2023-02-02,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87615,25,14,5,3,7,16,53,133,126,136,...,62,62,58,50,48,42,37,2023-12-27,263,12
87616,30,7,9,6,5,23,58,123,136,108,...,64,79,65,71,72,75,35,2023-12-28,263,19
87617,50,26,17,9,8,11,43,116,137,132,...,81,78,60,85,63,62,37,2023-12-29,263,38
87618,117,88,39,19,14,12,27,37,70,97,...,84,75,100,98,88,77,69,2023-12-30,263,59


In [17]:
# Split time-series data
X_train, y_train, X_test, y_test = split_time_series_data(
    df,
    cutoff_date=datetime(2023, 10, 1, 0, 0, 0),
    target_column="target"
)

In [18]:
# test and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.1, shuffle=False  # No shuffling for time series
)

In [19]:
print("Train shape:", X_train.shape, y_train.shape)
print("Validation shape:", X_val.shape, y_val.shape)
print("Test shape:", X_test.shape, y_test.shape)


Train shape: (57330, 674) (57330,)
Validation shape: (6370, 674) (6370,)
Test shape: (23920, 674) (23920,)


In [20]:
past_ride_columns = [c for c in X_train.columns if c.startswith("rides_")]
X_train_only_numeric = X_train[past_ride_columns]
X_val_only_numeric = X_val[past_ride_columns]
X_test_only_numeric = X_test[past_ride_columns]

In [21]:
import xgboost as xgb
model = xgb.XGBRegressor(max_depth=10)
model.fit(X_train_only_numeric, y_train, eval_set=[(X_val_only_numeric, y_val)], verbose=True)

[0]	validation_0-rmse:31.20388
[1]	validation_0-rmse:21.59961
[2]	validation_0-rmse:16.67477
[3]	validation_0-rmse:13.32255
[4]	validation_0-rmse:11.41342
[5]	validation_0-rmse:10.53580
[6]	validation_0-rmse:10.27720
[7]	validation_0-rmse:10.25744
[8]	validation_0-rmse:10.34990
[9]	validation_0-rmse:10.42710
[10]	validation_0-rmse:10.53457
[11]	validation_0-rmse:10.59112
[12]	validation_0-rmse:10.63368
[13]	validation_0-rmse:10.69076
[14]	validation_0-rmse:10.70601
[15]	validation_0-rmse:10.73637
[16]	validation_0-rmse:10.78233
[17]	validation_0-rmse:10.81048
[18]	validation_0-rmse:10.81528
[19]	validation_0-rmse:10.82829
[20]	validation_0-rmse:10.83198
[21]	validation_0-rmse:10.83191
[22]	validation_0-rmse:10.86003
[23]	validation_0-rmse:10.87377
[24]	validation_0-rmse:10.87591
[25]	validation_0-rmse:10.87570
[26]	validation_0-rmse:10.87724
[27]	validation_0-rmse:10.87326
[28]	validation_0-rmse:10.87483
[29]	validation_0-rmse:10.87740
[30]	validation_0-rmse:10.88118
[31]	validation_0-

In [22]:
from sklearn.metrics import mean_absolute_error
predictions = model.predict(X_test_only_numeric)

In [23]:
test_mae = mean_absolute_error(y_test, predictions)
print(f"Test MAE: {test_mae:.4f}")

Test MAE: 3.3114


In [24]:
from src.experiment_utils import set_mlflow_tracking, log_model_to_mlflow
from dotenv import load_dotenv
import os
load_dotenv() 

mlflow = set_mlflow_tracking()
log_model_to_mlflow(model, X_test_only_numeric, "XGBoost with Correct TS Split and Validation Set", "mean_absolute_error", score=test_mae)

INFO:src.experiment_utils:MLflow tracking URI and credentials set.
2025/02/21 18:20:04 INFO mlflow.tracking.fluent: Experiment with name 'XGBoost with Correct TS Split and Validation Set' does not exist. Creating a new experiment.
INFO:src.experiment_utils:Experiment set to: XGBoost with Correct TS Split and Validation Set
INFO:src.experiment_utils:Logged mean_absolute_error: 3.311377763748169
INFO:src.experiment_utils:Model signature inferred.
Registered model 'XGBRegressor' already exists. Creating a new version of this model...
2025/02/21 18:24:23 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: XGBRegressor, version 2
Created version '2' of model 'XGBRegressor'.
INFO:src.experiment_utils:Model logged with name: XGBRegressor


🏃 View run marvelous-carp-425 at: https://dagshub.com/vidyuthkrishna03/nyc-taxi-rides-prediction.mlflow/#/experiments/6/runs/68e4737f7cf14fccb4f49ee73006e93f
🧪 View experiment at: https://dagshub.com/vidyuthkrishna03/nyc-taxi-rides-prediction.mlflow/#/experiments/6


<mlflow.models.model.ModelInfo at 0x2053f5dd550>