In [39]:
# Report the version of the interpreter that is actually running this kernel.
# A shell escape (`!python -V`) asks whichever `python` is first on the shell
# PATH, which is not necessarily the kernel's interpreter.
import platform

print("Python", platform.python_version())

Python 3.12.1


In [40]:
# Import pandas for data manipulation and analysis
import pandas as pd

In [41]:
# Import pickle for saving and loading Python objects
import pickle

In [42]:
# Import DictVectorizer (converts feature dicts to vectors, one-hot encoding categoricals) and RMSE metric for evaluation
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import root_mean_squared_error

In [43]:
# Import MLflow and set up experiment tracking
import mlflow

# Point MLflow at the local tracking server, then select the experiment that
# the runs below are recorded under. The set_experiment call stays last so its
# return value remains the cell's displayed output.
tracking_uri = "http://localhost:5000"
experiment_name = "nyc-taxi-experiment"

mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1751795868559, experiment_id='1', last_update_time=1751795868559, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [44]:
# Function to read and preprocess the NYC taxi data
def read_dataframe(filename):
    """Load a taxi-trip parquet file and derive the columns used for modelling.

    Parameters
    ----------
    filename : str or path-like
        Local path or URL, passed unchanged to ``pd.read_parquet``. The file
        must contain ``lpep_pickup_datetime``, ``lpep_dropoff_datetime``,
        ``PULocationID`` and ``DOLocationID`` columns.

    Returns
    -------
    pandas.DataFrame
        Only the trips lasting between 1 and 60 minutes (both inclusive), with:
        * ``duration``: trip length in minutes (float);
        * ``PULocationID`` / ``DOLocationID``: converted to strings;
        * ``PU_DO``: ``"<PULocationID>_<DOLocationID>"``.
    """
    df = pd.read_parquet(filename)

    # Trip duration in minutes, computed with the vectorised timedelta accessor
    # rather than a per-row Python lambda.
    trip_time = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df['duration'] = trip_time.dt.total_seconds() / 60

    # Keep trips of 1 to 60 minutes. The explicit copy makes the result an
    # independent frame, so the column assignments below do not write into a
    # slice of the unfiltered data (which triggers SettingWithCopyWarning).
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    # Location IDs are categorical labels, not quantities: store them as strings
    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    # Combined pickup/drop-off pair used as a single categorical feature
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']

    return df

In [45]:
# Training data comes from the 2021-01 file and validation data from the
# 2021-02 file; both go through the same read_dataframe preprocessing.
train_url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet'
val_url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-02.parquet'

df_train, df_val = [read_dataframe(url) for url in (train_url, val_url)]

In [46]:
# Model inputs: the combined pickup/drop-off pair (categorical) and the trip
# distance (numerical)
categorical = ['PU_DO']
numerical = ['trip_distance']
feature_columns = categorical + numerical

# One record (dict) per trip, which is the input format DictVectorizer expects
train_dicts = df_train[feature_columns].to_dict(orient='records')
val_dicts = df_val[feature_columns].to_dict(orient='records')

# Fit the vectorizer on the training records only, then reuse the fitted
# vectorizer to transform the validation records
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [47]:
# The regression target is the trip duration; pull it out of both frames as
# plain arrays
target = 'duration'
y_train, y_val = (frame[target].values for frame in (df_train, df_val))

In [48]:
# Import xgboost for training the regression model
import xgboost as xgb

In [49]:
# Import Path for handling filesystem paths
from pathlib import Path

In [50]:
# Local folder where model artifacts are written; creating it is a no-op when
# it already exists
models_folder = Path('models')
models_folder.mkdir(parents=False, exist_ok=True)

In [51]:
# Train the tuned XGBoost model inside an MLflow run, logging the
# hyperparameters, the validation RMSE, the fitted preprocessor and the model.
import numpy as np  # not used by this cell; retained in case later cells rely on `np` (TODO confirm, then move to the import cell)
from mlflow.models.signature import infer_signature

with mlflow.start_run():
    # XGBoost's native training API consumes DMatrix objects
    train = xgb.DMatrix(X_train, label=y_train)
    valid = xgb.DMatrix(X_val, label=y_val)

    # Best hyperparameters found from tuning
    best_params = {
        'learning_rate': 0.09585355369315604,
        'max_depth': 30,
        'min_child_weight': 1.060597050922164,
        # 'reg:linear' is a deprecated alias of 'reg:squarederror'
        'objective': 'reg:squarederror',
        'reg_alpha': 0.018060244040060163,
        'reg_lambda': 0.011658731377413597,
        'seed': 42
    }

    # Log hyperparameters to MLflow
    mlflow.log_params(best_params)

    # Train the booster, evaluating on the validation set after every round.
    # NOTE(review): early_stopping_rounds (50) is larger than num_boost_round
    # (30), so early stopping can never trigger with these settings; raise
    # num_boost_round if early stopping is actually wanted.
    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=30,
        evals=[(valid, 'validation')],
        early_stopping_rounds=50
    )

    # Predict on validation set and calculate RMSE
    y_pred = booster.predict(valid)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    # Persist the fitted DictVectorizer next to the model so inference can
    # apply the same feature encoding; the path is built from models_folder
    # instead of repeating the directory name as a string.
    preprocessor_path = models_folder / "preprocessor.b"
    with open(preprocessor_path, "wb") as f_out:
        pickle.dump(dv, f_out)
    mlflow.log_artifact(str(preprocessor_path), artifact_path="preprocessor")

    # Describe the model's interface: the signature is inferred from the
    # model's inputs and its actual outputs (predictions), not from the
    # ground-truth labels.
    input_example = X_val[:2]
    signature = infer_signature(X_val, y_pred)

    # Log the trained XGBoost model to MLflow
    mlflow.xgboost.log_model(
        booster,
        name="models_mlflow",
        signature=signature,
        input_example=input_example
    )

  self.starting_round = model.num_boosted_rounds()


[0]	validation-rmse:11.44482
[1]	validation-rmse:10.77202
[1]	validation-rmse:10.77202
[2]	validation-rmse:10.18363
[2]	validation-rmse:10.18363
[3]	validation-rmse:9.67396
[3]	validation-rmse:9.67396
[4]	validation-rmse:9.23166
[4]	validation-rmse:9.23166
[5]	validation-rmse:8.84808
[5]	validation-rmse:8.84808
[6]	validation-rmse:8.51883
[6]	validation-rmse:8.51883
[7]	validation-rmse:8.23597
[7]	validation-rmse:8.23597
[8]	validation-rmse:7.99320
[8]	validation-rmse:7.99320
[9]	validation-rmse:7.78709
[9]	validation-rmse:7.78709
[10]	validation-rmse:7.61022
[10]	validation-rmse:7.61022
[11]	validation-rmse:7.45952
[11]	validation-rmse:7.45952
[12]	validation-rmse:7.33049
[12]	validation-rmse:7.33049
[13]	validation-rmse:7.22098
[13]	validation-rmse:7.22098
[14]	validation-rmse:7.12713
[14]	validation-rmse:7.12713
[15]	validation-rmse:7.04752
[15]	validation-rmse:7.04752
[16]	validation-rmse:6.98005
[16]	validation-rmse:6.98005
[17]	validation-rmse:6.92232
[17]	validation-rmse:6.92232

  xgb_model.save_model(model_data_path)


🏃 View run gifted-crane-33 at: http://localhost:5000/#/experiments/1/runs/bbc576251d204e4ca476c403e7222e39
🧪 View experiment at: http://localhost:5000/#/experiments/1
