In [1]:
%load_ext autoreload
%autoreload 2
import sys
import os
import pandas as pd
from datetime import datetime
import lightgbm as lgb
from dotenv import load_dotenv
load_dotenv() 
from sklearn.metrics import mean_absolute_error
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller
from prophet import Prophet
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

from src.config import TRANSFORMED_DATA_DIR
from src.data_utils import split_time_series_data
from src.experiment_utils import set_mlflow_tracking, log_model_to_mlflow

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def split_time_series_by_cutoff_date(
    df: pd.DataFrame,
    target_column: str,
    cutoff_date: str
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Splits a time series DataFrame into training and testing sets based on a cutoff date.
    Retains only 'pickup_hour' and the target column, and aggregates by 'pickup_hour'.

    Args:
        df (pd.DataFrame): The input DataFrame containing the time series data.
        target_column (str): The name of the target column to separate from the features.
        cutoff_date (str): The cutoff date for splitting the data (e.g., "2023-08-01").

    Returns:
        Tuple[pd.DataFrame, pd.DataFrame]:
            - train_data (pd.DataFrame): Training data up to the cutoff date.
            - test_data (pd.DataFrame): Testing data after the cutoff date.
    """
    # Keep only 'pickup_hour' and target column
    df = df[["pickup_hour", target_column]]

    # Convert 'pickup_hour' to datetime if not already
    df["pickup_hour"] = pd.to_datetime(df["pickup_hour"])

    # Aggregate data by 'pickup_hour'
    df_aggregated = df.groupby("pickup_hour")[target_column].sum().reset_index()

    # Sort the DataFrame by date
    df_sorted = df_aggregated.sort_values("pickup_hour")

    # Split data into training and testing sets based on cutoff date
    train_data = df_sorted[df_sorted["pickup_hour"] <= pd.Timestamp(cutoff_date)].reset_index(drop=True)
    test_data = df_sorted[df_sorted["pickup_hour"] > pd.Timestamp(cutoff_date)].reset_index(drop=True)

    return train_data, test_data

In [3]:
from prophet import Prophet
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt

# Prepare data for Prophet
def prepare_prophet_data(df: pd.DataFrame, target_column: str) -> pd.DataFrame:
    df_prepared = df.rename(columns={"pickup_hour": "ds", target_column: "y"})
    df_prepared["ds"] = pd.to_datetime(df_prepared["ds"])  # Ensure datetime format
    return df_prepared

# Fit and forecast using Prophet
def fit_and_forecast_prophet(train_data: pd.DataFrame, test_data: pd.DataFrame) -> tuple:
    model = Prophet()
    model.fit(train_data)

    # Calculate how many periods we need to extend to match test data
    last_train_date = train_data["ds"].max()
    last_test_date = test_data["ds"].max()
    periods_needed = (last_test_date - last_train_date).days * 24  # Assuming hourly frequency

    future = model.make_future_dataframe(periods=periods_needed, freq="H")  # Extend forecast
    forecast = model.predict(future)

    return model, forecast

def evaluate_prophet(test_data: pd.Series, forecast: pd.DataFrame) -> float:
    forecast_filtered = forecast[forecast["ds"].isin(test_data.index)]

    if forecast_filtered.empty:
        raise ValueError(f"No overlap! Check formats:\n"
                         f"Test Data Range: {test_data.index.min()} to {test_data.index.max()}\n"
                         f"Forecast Range: {forecast['ds'].min()} to {forecast['ds'].max()}")

    mae = mean_absolute_error(test_data.values, forecast_filtered["yhat"].values)
    print(f"Test Set MAE: {mae:.2f}")
    return mae

In [4]:
df = pd.read_parquet(TRANSFORMED_DATA_DIR / "tabular_data.parquet")
df
df.shape
cutoff_date = "2023-08-01"
train_data, test_data = split_time_series_by_cutoff_date(df, target_column="target", cutoff_date=cutoff_date)

Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,pickup_location_id,target
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,2023-01-29,2,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2023-01-30,2,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2023-01-31,2,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2023-02-01,2,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2023-02-02,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87615,25,14,5,3,7,16,53,133,126,136,...,62,62,58,50,48,42,37,2023-12-27,263,12
87616,30,7,9,6,5,23,58,123,136,108,...,64,79,65,71,72,75,35,2023-12-28,263,19
87617,50,26,17,9,8,11,43,116,137,132,...,81,78,60,85,63,62,37,2023-12-29,263,38
87618,117,88,39,19,14,12,27,37,70,97,...,84,75,100,98,88,77,69,2023-12-30,263,59


(87620, 675)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["pickup_hour"] = pd.to_datetime(df["pickup_hour"])


In [5]:
# X_train
# y_train
# X_val
# y_val
# X_test
# y_test

In [6]:
# # Prepare training and test data for Prophet
# train_data = prepare_prophet_data(X_train.join(y_train), target_column="target")
# test_data = prepare_prophet_data(X_test.join(y_test), target_column="target")
# train_data
# test_data

In [7]:
target_column="target"
train_data_prepared = prepare_prophet_data(train_data, target_column)
test_data_prepared = prepare_prophet_data(test_data, target_column)
model, forecast = fit_and_forecast_prophet(train_data_prepared, test_data_prepared)
print(f"Test Data Range: {test_data_prepared['ds'].min()} to {test_data_prepared['ds'].max()}")
print(f"Forecast Range: {forecast['ds'].min()} to {forecast['ds'].max()}")
print(forecast[['ds', 'yhat']].tail(10)) # See last few timestamps
print(test_data_prepared.tail(10))  # See last few test timestamps


DEBUG:cmdstanpy:cmd: where.exe tbb.dll
cwd: None
DEBUG:cmdstanpy:Adding TBB (c:\Users\singh\Downloads\CDS500_Applied_ML_DS\Projects\CDA500P1\CDA500P1\Lib\site-packages\prophet\stan_model\cmdstan-2.33.1\stan\lib\stan_math\lib\tbb) to PATH
INFO:prophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: C:\Users\singh\AppData\Local\Temp\tmpiuqd6khn\mxe5t4vt.json
DEBUG:cmdstanpy:input tempfile: C:\Users\singh\AppData\Local\Temp\tmpiuqd6khn\c05wqs6y.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['C:\\Users\\singh\\Downloads\\CDS500_Applied_ML_DS\\Projects\\CDA500P1\\CDA500P1\\Lib\\site-packages\\prophet\\stan_model\\prophet_model.bin', 'random', 'seed=24648', 'data', 'file=C:\\Users\\singh\\AppData\\Local\\Temp\\tmpiuqd6khn\\mxe5t4vt.json', 'init=C:\\Users\\singh

Test Data Range: 2023-08-02 00:00:00 to 2023-12-31 00:00:00
Forecast Range: 2023-01-29 00:00:00 to 2023-12-31 00:00:00
                      ds         yhat
3823 2023-12-30 15:00:00  5525.859313
3824 2023-12-30 16:00:00  5498.256195
3825 2023-12-30 17:00:00  5459.426492
3826 2023-12-30 18:00:00  5409.322440
3827 2023-12-30 19:00:00  5347.978588
3828 2023-12-30 20:00:00  5275.512907
3829 2023-12-30 21:00:00  5192.127083
3830 2023-12-30 22:00:00  5098.105955
3831 2023-12-30 23:00:00  4993.816118
3832 2023-12-31 00:00:00  4879.703690
            ds     y
142 2023-12-22  3407
143 2023-12-23  2867
144 2023-12-24  1977
145 2023-12-25  1525
146 2023-12-26   834
147 2023-12-27  1414
148 2023-12-28  1599
149 2023-12-29  2276
150 2023-12-30  3190
151 2023-12-31  3407


In [8]:
print("Test Data Range:", test_data_prepared["ds"].min(), "to", test_data_prepared["ds"].max())
print("Forecast Range:", forecast["ds"].min(), "to", forecast["ds"].max())


Test Data Range: 2023-08-02 00:00:00 to 2023-12-31 00:00:00
Forecast Range: 2023-01-29 00:00:00 to 2023-12-31 00:00:00


In [9]:
forecast["ds"] = pd.to_datetime(forecast["ds"])
test_data_prepared["ds"] = pd.to_datetime(test_data_prepared["ds"])

# Filter forecast to contain only test set dates
forecast_filtered = forecast[forecast["ds"].isin(test_data_prepared["ds"])]

# Check alignment
print(forecast_filtered[["ds", "yhat"]].head())
print(test_data_prepared[["ds", "y"]].head())

            ds         yhat
208 2023-08-02  1842.184486
232 2023-08-03  2229.211516
256 2023-08-04  3218.555235
280 2023-08-05  5134.523728
304 2023-08-06  5178.258211
          ds     y
0 2023-08-02  2028
1 2023-08-03  2321
2 2023-08-04  3027
3 2023-08-05  4877
4 2023-08-06  4632


In [10]:
test_data_prepared = test_data_prepared.sort_values("ds").set_index("ds")
forecast_filtered = forecast_filtered.sort_values("ds").set_index("ds")

In [11]:
import numpy as np
from sklearn.metrics import mean_absolute_error

# Compare test values and forecasted values
print("Test Data (y):", test_data_prepared["y"].values[:10])
print("Predicted (yhat):", forecast_filtered["yhat"].values[:10])

# Ensure lengths match
print("Test Data Size:", len(test_data_prepared))
print("Forecast Data Size:", len(forecast_filtered))

# Compute MAE
mae = mean_absolute_error(test_data_prepared["y"].values, forecast_filtered["yhat"].values)
print(f"Test Set MAE Per Day: {mae:.2f}")

Test Data (y): [2028 2321 3027 4877 4632 1515 1342 1538 1886 2499]
Predicted (yhat): [1842.18448623 2229.21151575 3218.55523515 5134.52372761 5178.25821122
 1516.82023139 1352.60652376 1827.96760425 2214.99463377 3204.33835318]
Test Data Size: 152
Forecast Data Size: 152
Test Set MAE Per Day: 671.03


In [12]:
print(f"Test Set MAE Per Hour: {mae/24:.2f}")

Test Set MAE Per Hour: 27.96


In [16]:
import pandas as pd
import numpy as np
from prophet import Prophet
import matplotlib.pyplot as plt

# Step 1: Prepare data for Prophet
def prepare_data_for_prophet(df):
    df_prophet = pd.DataFrame()
    df_prophet['ds'] = df['pickup_hour']
    df_prophet['y'] = df['target']  # Using the most recent ride data as the target
    return df_prophet

# Step 2: Create and fit Prophet model
def fit_prophet_model(df_prophet):
    model = Prophet()
    model.fit(df_prophet)
    return model

# Step 3: Make predictions
def make_predictions(model, periods=30):
    future_dates = model.make_future_dataframe(periods=periods)
    forecast = model.predict(future_dates)
    return forecast

# Step 4: Visualize results
def visualize_results(model, forecast):
    fig1 = model.plot(forecast)
    fig2 = model.plot_components(forecast)
    plt.show()

# Step 5: Evaluate model performance (calculate MAE based on forecast overlap)
def evaluate_model(df_test, forecast):
    # Merge the test data with the forecast for overlap
    forecast_filtered = forecast[forecast['ds'].isin(df_test['ds'])]
    
    if forecast_filtered.empty:
        raise ValueError(f"No overlap! Check formats:\n"
                         f"Test Data Range: {df_test['ds'].min()} to {df_test['ds'].max()}\n"
                         f"Forecast Range: {forecast['ds'].min()} to {forecast['ds'].max()}")
    
    # Ensure proper alignment between forecast and test data by resetting indices
    forecast_filtered = forecast_filtered.set_index('ds').sort_index()
    df_test = df_test.set_index('ds').sort_index()

    # Now calculate Mean Absolute Error (MAE)
    forecast_filtered = forecast_filtered.loc[df_test.index]
    
    # Check if both dataframes have the same length and index after filtering
    if len(forecast_filtered) != len(df_test):
        raise ValueError(f"Data length mismatch! Forecast: {len(forecast_filtered)}, Test: {len(df_test)}")
    
    mae = np.mean(np.abs(forecast_filtered['yhat'].values - df_test['y'].values))
    
    print(f"Test Set MAE: {mae:.2f}")
    return mae

# Prepare data for Prophet
df_prophet = prepare_data_for_prophet(df)

# Split data into train and test sets
train_size = int(len(df_prophet) * 0.8)
df_train = df_prophet[:train_size]
df_test = df_prophet[train_size:]

# Fit model on training data
model = fit_prophet_model(df_train)

# Make predictions
forecast = make_predictions(model, periods=len(df_test))

# Evaluate model performance (calculate MAE)
evaluate_model(df_test, forecast)



DEBUG:cmdstanpy:cmd: where.exe tbb.dll
cwd: None
DEBUG:cmdstanpy:TBB already found in load path
INFO:prophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: C:\Users\singh\AppData\Local\Temp\tmpiuqd6khn\3tf7p92g.json
DEBUG:cmdstanpy:input tempfile: C:\Users\singh\AppData\Local\Temp\tmpiuqd6khn\pheftpyz.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['C:\\Users\\singh\\Downloads\\CDS500_Applied_ML_DS\\Projects\\CDA500P1\\CDA500P1\\Lib\\site-packages\\prophet\\stan_model\\prophet_model.bin', 'random', 'seed=23008', 'data', 'file=C:\\Users\\singh\\AppData\\Local\\Temp\\tmpiuqd6khn\\3tf7p92g.json', 'init=C:\\Users\\singh\\AppData\\Local\\Temp\\tmpiuqd6khn\\pheftpyz.json', 'output', 'file=C:\\Users\\singh\\AppData\\Local\\Temp\\tmpiuqd6khn\\prophet_model8h9ump5

Test Set MAE: 18.21


18.214036831083476