In [2]:
# %reload_ext loads the extension on a fresh kernel and reloads it otherwise,
# so re-running this cell does not print the "already loaded" notice that
# %load_ext emits on a second run.
%reload_ext autoreload
# Mode 2: re-import all modules before executing each cell, so edits to
# project modules are picked up without restarting the kernel.
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
import sys
import os

# Add the parent of the current working directory to the Python path
# (presumably the directory that contains the `src` package - verify).
# The membership check keeps the cell idempotent: without it every re-run
# appends another duplicate entry to sys.path.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [4]:
import pandas as pd
from src.config import TRANSFORMED_DATA_DIR

# Read the tabular dataset stored as parquet under TRANSFORMED_DATA_DIR.
tabular_path = TRANSFORMED_DATA_DIR / "tabular_data.parquet"
df = pd.read_parquet(tabular_path)

In [5]:
# `.dt.hour` is always in 0..23, so the previous `% 168` and `% 24` were
# no-ops and both columns ended up as identical copies of the hour.
# The weekly feature is therefore built as hour-of-week (0..167, with 0 being
# Monday 00:00) by combining the day of week with the hour.
# The column name `week_day` is kept so existing references keep working.
df['week_day'] = df['pickup_hour'].dt.dayofweek * 24 + df['pickup_hour'].dt.hour  # Capture weekly patterns
df['hour_of_day'] = df['pickup_hour'].dt.hour  # Capture daily patterns (0..23)


In [11]:
import numpy as np 
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_absolute_error
import numpy as np

def train_arma(df, selected_lags=None, train_frac=0.9, max_p=2, max_q=2):
    """Pick an ARMA(p, q) model with lag regressors by AIC and report its MAE.

    The frame is split by row position: the first ``train_frac`` share of the
    rows is used for fitting and the remainder for testing (rows are assumed
    to already be in time order - TODO confirm against the data pipeline).
    Every order with ``p`` in ``0..max_p`` and ``q`` in ``0..max_q`` is fitted
    as ``ARIMA(order=(p, 0, q))`` on ``df['target']`` with ``selected_lags``
    as exogenous regressors, and the fit with the lowest AIC is kept.

    Args:
        df: DataFrame holding a ``target`` column and the lag columns.
        selected_lags: Names of the exogenous lag columns. Defaults to
            ``['rides_t-1', 'rides_t-24', 'rides_t-168', 'rides_t-672']``.
        train_frac: Share of rows (from the top) used for training.
        max_p: Largest AR order tried (inclusive).
        max_q: Largest MA order tried (inclusive).

    Returns:
        Tuple ``(best_model, train_mae, test_mae)`` where ``best_model`` is
        the fitted results object with the lowest AIC.

    Raises:
        KeyError: If ``target`` or any lag column is missing from ``df``.
        ValueError: If the split leaves the train or the test part empty.
        RuntimeError: If none of the candidate orders could be fitted.
    """
    if selected_lags is None:
        # Selecting key lag features
        selected_lags = ['rides_t-1', 'rides_t-24', 'rides_t-168', 'rides_t-672']
    else:
        selected_lags = list(selected_lags)

    # Fail early with a clear message instead of letting every fit raise the
    # same KeyError inside the search loop.
    missing = [col for col in ['target', *selected_lags] if col not in df.columns]
    if missing:
        raise KeyError(f"train_arma: missing required columns: {missing}")

    # Train-test split
    train_size = int(len(df) * train_frac)
    train, test = df.iloc[:train_size], df.iloc[train_size:]
    if train.empty or test.empty:
        raise ValueError(
            f"train_arma: split of {len(df)} rows with train_frac={train_frac} "
            "leaves an empty train or test set"
        )

    # ARMA Model Selection
    best_aic = np.inf
    best_order = None
    best_model = None
    last_error = None

    for p in range(max_p + 1):  # Try small values for AR
        for q in range(max_q + 1):  # Try small values for MA
            try:
                model = ARIMA(train['target'], order=(p, 0, q), exog=train[selected_lags])
                fitted = model.fit()
            except Exception as exc:
                # `Exception` (not a bare except) so Ctrl-C / kernel interrupt
                # still stops the search; the failure is reported, not hidden.
                last_error = exc
                print(f"Skipping ARMA order ({p}, {q}): {type(exc).__name__}: {exc}")
                continue
            if fitted.aic < best_aic:
                best_aic = fitted.aic
                best_order = (p, q)
                best_model = fitted

    if best_model is None:
        # Previously this fell through and crashed with an AttributeError on None.
        raise RuntimeError(
            f"train_arma: no ARMA order could be fitted (last error: {last_error!r})"
        )

    print(f"Best ARMA Order: {best_order}, AIC: {best_aic}")

    # Forecasting
    test_predictions = best_model.forecast(steps=len(test), exog=test[selected_lags])
    test_mae = mean_absolute_error(test['target'], test_predictions)

    # Train MAE Calculation (using in-sample fitted values)
    train_predictions = best_model.fittedvalues
    train_mae = mean_absolute_error(train['target'], train_predictions)

    print(f"Train MAE: {train_mae}")
    print(f"Test MAE: {test_mae}")

    return best_model, train_mae, test_mae

In [12]:
# Run the AIC-based order search on the full frame and bind the returned
# fitted model together with its train and test MAE.
best_model, train_mae, test_mae = train_arma(df)

  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'


Best ARMA Order: (1, 2), AIC: 596404.6286985173
Train MAE: 3.052895214388896
Test MAE: 5.22049574945898


In [18]:
from src.experiment_utils import set_mlflow_tracking
from dotenv import load_dotenv
import mlflow
import mlflow.sklearn  # Needed for logging non-sklearn models too
import os
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_absolute_error

# Load environment variables (python-dotenv; presumably the tracking
# credentials consumed by set_mlflow_tracking - verify)
load_dotenv()

# Set up MLflow tracking.
# NOTE(review): this rebinds the name `mlflow`, replacing the module imported
# above with whatever set_mlflow_tracking() returns. It is used below as
# `mlflow.start_run()` / `mlflow.sklearn`, so it is presumably the configured
# mlflow module itself - confirm in src.experiment_utils.
mlflow = set_mlflow_tracking()

def train_arma(df, selected_lags=None, train_frac=0.9, max_p=2, max_q=2):
    """Pick an ARMA(p, q) model by AIC, score it, and log the result to MLflow.

    The frame is split by row position: the first ``train_frac`` share of the
    rows is used for fitting and the remainder for testing (rows are assumed
    to already be in time order - TODO confirm against the data pipeline).
    Every order with ``p`` in ``0..max_p`` and ``q`` in ``0..max_q`` is fitted
    as ``ARIMA(order=(p, 0, q))`` on ``df['target']`` with ``selected_lags``
    as exogenous regressors, and the fit with the lowest AIC is kept. The
    chosen order, both MAE values and the fitted model are then logged inside
    a new MLflow run.

    Args:
        df: DataFrame holding a ``target`` column and the lag columns.
        selected_lags: Names of the exogenous lag columns. Defaults to
            ``['rides_t-1', 'rides_t-24', 'rides_t-168', 'rides_t-672']``.
        train_frac: Share of rows (from the top) used for training.
        max_p: Largest AR order tried (inclusive).
        max_q: Largest MA order tried (inclusive).

    Returns:
        Tuple ``(best_model, train_mae, test_mae)`` where ``best_model`` is
        the fitted results object with the lowest AIC.

    Raises:
        KeyError: If ``target`` or any lag column is missing from ``df``.
        ValueError: If the split leaves the train or the test part empty.
        RuntimeError: If none of the candidate orders could be fitted.
    """
    if selected_lags is None:
        # Selecting key lag features
        selected_lags = ['rides_t-1', 'rides_t-24', 'rides_t-168', 'rides_t-672']
    else:
        selected_lags = list(selected_lags)

    # Fail early with a clear message instead of letting every fit raise the
    # same KeyError inside the search loop.
    missing = [col for col in ['target', *selected_lags] if col not in df.columns]
    if missing:
        raise KeyError(f"train_arma: missing required columns: {missing}")

    # Train-test split
    train_size = int(len(df) * train_frac)
    train, test = df.iloc[:train_size], df.iloc[train_size:]
    if train.empty or test.empty:
        raise ValueError(
            f"train_arma: split of {len(df)} rows with train_frac={train_frac} "
            "leaves an empty train or test set"
        )

    # ARMA Model Selection
    best_aic = np.inf
    best_order = None
    best_model = None
    last_error = None

    for p in range(max_p + 1):  # Try small values for AR
        for q in range(max_q + 1):  # Try small values for MA
            try:
                model = ARIMA(train['target'], order=(p, 0, q), exog=train[selected_lags])
                fitted = model.fit()
            except Exception as exc:
                # `Exception` (not a bare except) so Ctrl-C / kernel interrupt
                # still stops the search; the failure is reported, not hidden.
                last_error = exc
                print(f"Skipping ARMA order ({p}, {q}): {type(exc).__name__}: {exc}")
                continue
            if fitted.aic < best_aic:
                best_aic = fitted.aic
                best_order = (p, q)
                best_model = fitted

    if best_model is None:
        # Previously this fell through and crashed with an AttributeError on
        # None. Raising here also guarantees no MLflow run is opened for a
        # search that produced nothing.
        raise RuntimeError(
            f"train_arma: no ARMA order could be fitted (last error: {last_error!r})"
        )

    print(f"Best ARMA Order: {best_order}, AIC: {best_aic}")

    # Forecasting
    test_predictions = best_model.forecast(steps=len(test), exog=test[selected_lags])
    test_mae = mean_absolute_error(test['target'], test_predictions)

    # Train MAE Calculation (using in-sample fitted values)
    train_predictions = best_model.fittedvalues
    train_mae = mean_absolute_error(train['target'], train_predictions)

    print(f"Train MAE: {train_mae}")
    print(f"Test MAE: {test_mae}")

    # Log model & metrics to MLflow
    with mlflow.start_run():
        mlflow.log_param("model_type", "ARMA")
        mlflow.log_param("best_order", best_order)
        mlflow.log_metric("train_mae", train_mae)
        mlflow.log_metric("test_mae", test_mae)

        # Log the model.
        # NOTE(review): best_model is a statsmodels results object, not an
        # sklearn estimator. The sklearn flavor is kept so existing runs and
        # loaders stay compatible; MLflow also ships a dedicated statsmodels
        # flavor that is probably the better fit - confirm before switching.
        mlflow.sklearn.log_model(best_model, "ARMA_Model")

    # best_order is already a tuple, so formatting it directly yields
    # "ARMA(1, 2)" rather than the doubled "ARMA((1, 2))".
    print(f"Logged ARMA{best_order} to MLflow with Train MAE: {train_mae}, Test MAE: {test_mae}")

    return best_model, train_mae, test_mae

# Train on the full frame with the MLflow-logging train_arma defined above and
# bind the returned fitted model together with its train and test MAE.
best_model, train_mae, test_mae = train_arma(df)


INFO:src.experiment_utils:MLflow tracking URI and credentials set.
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'


Best ARMA Order: (1, 2), AIC: 596404.6286985173
Train MAE: 3.052895214388896
Test MAE: 5.22049574945898




🏃 View run fortunate-deer-464 at: https://dagshub.com/vidyuthkrishna03/nyc-taxi-rides-prediction.mlflow/#/experiments/11/runs/9be14fd72fe44f3da4eee3bbb7818d52
🧪 View experiment at: https://dagshub.com/vidyuthkrishna03/nyc-taxi-rides-prediction.mlflow/#/experiments/11
Logged ARMA((1, 2)) to MLflow with Train MAE: 3.052895214388896, Test MAE: 5.22049574945898
