In [9]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" 

In [10]:
import sys
import os

# Add the parent directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))


In [6]:
import pandas as pd
from src.config import TRANSFORMED_DATA_DIR

df = pd.read_parquet(TRANSFORMED_DATA_DIR / "tabular_data.parquet")
df

Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,pickup_location_id,target
0,1,4,0,0,0,0,0,1,0,2,...,8,4,3,4,1,4,2,2023-01-29,HB101,6
1,1,1,0,0,0,0,0,0,3,3,...,3,5,4,3,0,0,1,2023-01-30,HB101,0
2,1,0,0,0,0,0,0,2,6,1,...,11,15,3,5,1,0,0,2023-01-31,HB101,0
3,0,0,0,0,0,0,0,2,4,0,...,15,12,5,1,1,0,0,2023-02-01,HB101,0
4,0,0,0,0,0,0,0,2,4,1,...,12,9,1,4,1,2,0,2023-02-02,HB101,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18055,0,0,0,0,0,0,0,0,0,0,...,7,2,9,3,2,2,2,2023-08-27,JC116,0
18056,0,0,0,0,0,0,0,0,0,0,...,12,4,3,2,0,3,1,2023-08-28,JC116,0
18057,0,0,0,0,0,0,0,1,1,4,...,13,13,10,1,3,3,0,2023-08-29,JC116,0
18058,0,0,1,0,0,0,0,1,5,1,...,13,13,10,3,0,0,1,2023-08-30,JC116,5


In [5]:
from datetime import datetime

from src.data_utils import split_time_series_data

X_train, y_train, X_test, y_test = split_time_series_data(
    df,
    cutoff_date=datetime(2023, 9, 1, 0, 0, 0),
    target_column="target"
)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(55900, 674)
(55900,)
(31720, 674)
(31720,)


In [6]:
import numpy as np

class BaselineModelPreviousHour:

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series):
        pass

    def predict(self, X_test: pd.DataFrame) -> np.array:
        return X_test["rides_t-1"]

In [7]:
model = BaselineModelPreviousHour()
predictions = model.predict(X_test)

In [8]:
from sklearn.metrics import mean_absolute_error

test_mae = mean_absolute_error(y_test, predictions)
print(f"{test_mae:.4f}")

6.7109


In [9]:
from src.experiment_utils import set_mlflow_tracking, log_model_to_mlflow
from dotenv import load_dotenv
sys.path
load_dotenv() 
mlflow = set_mlflow_tracking()


['/opt/homebrew/Cellar/python@3.11/3.11.11/Frameworks/Python.framework/Versions/3.11/lib/python311.zip',
 '/opt/homebrew/Cellar/python@3.11/3.11.11/Frameworks/Python.framework/Versions/3.11/lib/python3.11',
 '/opt/homebrew/Cellar/python@3.11/3.11.11/Frameworks/Python.framework/Versions/3.11/lib/python3.11/lib-dynload',
 '',
 '/Users/vamsisaigarapati/Documents/github/nyc_taxi_proj/.venv/lib/python3.11/site-packages',
 '/Users/vamsisaigarapati/Documents/github/nyc_taxi_proj',
 '/Users/vamsisaigarapati/Documents/github/nyc_taxi_proj']

True

INFO:src.experiment_utils:MLflow tracking URI and credentials set.


In [10]:
# log_model_to_mlflow(model, X_test, "BaselineModelPreviousHour", "mean_absolute_error", score=test_mae)

In [11]:
import numpy as np

class BaselineModelPreviousWeek:

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series):
        pass

    def predict(self, X_test: pd.DataFrame) -> np.array:
        return X_test[f"rides_t-{7*24}"]


In [12]:
model = BaselineModelPreviousWeek()
predictions = model.predict(X_test)

In [13]:
test_mae = mean_absolute_error(y_test, predictions)
print(f"{test_mae:.4f}")

5.2301


In [None]:
log_model_to_mlflow(model, X_test, "BaselineModelPreviousWeek", "mean_absolute_error", score=test_mae)

In [15]:
import numpy as np
import pandas as pd

class BaselineModelLast4Weeks:
    """
    A baseline model that predicts the average of the last 4 weeks (28 days)
    for each test instance.
    """

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series):
        """
        The fit method is not used in this baseline model as it does not learn
        from the training data.
        """
        pass

    def predict(self, X_test: pd.DataFrame) -> np.array:
        """
        Predicts the average of the last 4 weeks (28 days) for each test instance.

        Parameters:
            X_test (pd.DataFrame): The test DataFrame containing lagged features
                                   (e.g., rides_t-{7*24}, rides_t-{14*24}, etc.).

        Returns:
            np.array: An array of predictions based on the average of the last 4 weeks.
        """
        # Define the columns for the last 4 weeks
        last_4_weeks_columns = [
            f"rides_t-{7*24}",  # 1 week ago
            f"rides_t-{14*24}", # 2 weeks ago
            f"rides_t-{21*24}", # 3 weeks ago
            f"rides_t-{28*24}"  # 4 weeks ago
        ]

        # Ensure the required columns exist in the test DataFrame
        for col in last_4_weeks_columns:
            if col not in X_test.columns:
                raise ValueError(f"Missing required column: {col}")

        # Calculate the average of the last 4 weeks
        predictions = X_test[last_4_weeks_columns].mean(axis=1)

        return predictions.to_numpy()

In [16]:
model = BaselineModelLast4Weeks()
predictions = model.predict(X_test)

In [17]:
X_test[X_test['pickup_location_id']==43] 

Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-8,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,pickup_location_id
5002,12,6,1,1,0,7,8,46,41,48,...,131,74,82,62,64,34,38,12,2023-09-01,43
5003,13,7,1,0,0,4,2,11,28,35,...,122,113,80,107,67,58,39,9,2023-09-02,43
5004,14,5,3,1,0,3,7,13,12,27,...,174,166,174,142,86,63,53,13,2023-09-03,43
5005,2,2,0,0,1,5,13,36,34,39,...,169,144,90,61,45,28,31,7,2023-09-04,43
5006,3,0,0,0,1,4,5,44,61,59,...,136,145,85,46,33,28,9,8,2023-09-05,43
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5119,3,0,1,0,1,6,13,54,106,83,...,205,132,64,49,44,31,14,10,2023-12-27,43
5120,6,2,2,0,1,7,8,58,84,90,...,97,57,73,29,48,42,24,7,2023-12-28,43
5121,20,7,2,0,3,3,13,54,67,90,...,239,128,73,52,50,43,30,17,2023-12-29,43
5122,10,8,3,1,3,0,8,19,35,67,...,200,185,160,133,158,69,33,19,2023-12-30,43


In [18]:
test_mae = mean_absolute_error(y_test, predictions)
print(f"{test_mae:.4f}")

4.5444


In [19]:
# log_model_to_mlflow(model, X_test, "BaselineModelLast4Weeks", "mean_absolute_error", score=test_mae)

In [68]:
X_test[X_test['pickup_location_id']==250].sort_values(by=["pickup_hour"])['pickup_hour'].unique()

<DatetimeArray>
['2023-09-01 00:00:00', '2023-09-02 00:00:00', '2023-09-03 00:00:00',
 '2023-09-04 00:00:00', '2023-09-05 00:00:00', '2023-09-06 00:00:00',
 '2023-09-07 00:00:00', '2023-09-08 00:00:00', '2023-09-09 00:00:00',
 '2023-09-10 00:00:00',
 ...
 '2023-12-22 00:00:00', '2023-12-23 00:00:00', '2023-12-24 00:00:00',
 '2023-12-25 00:00:00', '2023-12-26 00:00:00', '2023-12-27 00:00:00',
 '2023-12-28 00:00:00', '2023-12-29 00:00:00', '2023-12-30 00:00:00',
 '2023-12-31 00:00:00']
Length: 122, dtype: datetime64[ns]

In [69]:
from datetime import timedelta
from typing import Optional

import pandas as pd
import plotly.express as px
def plot_aggregated_time_seriess(
    features: pd.DataFrame,
    targets: pd.Series,
    row_id: int,
    predictions: Optional[pd.Series] = None,
):
    """
    Plots the time series data for a specific location from NYC taxi data.

    Args:
        features (pd.DataFrame): DataFrame containing feature data, including historical ride counts and metadata.
        targets (pd.Series): Series containing the target values (e.g., actual ride counts).
        row_id (int): Index of the row to plot.
        predictions (Optional[pd.Series]): Series containing predicted values (optional).

    Returns:
        plotly.graph_objects.Figure: A Plotly figure object showing the time series plot.
    """
    # Extract the specific location's features and target
    location_features = features[features["pickup_location_id"] == row_id]
    actual_target = targets[features["pickup_location_id"] == row_id]
    print(len(location_features))
    print(len(actual_target))
    # Identify time series columns (e.g., historical ride counts)
    time_series_columns = [
        col for col in features.columns if col.startswith("rides_t-")
    ]
    
    time_series_values = [location_features[col] for col in time_series_columns] + [
        actual_target
    ]

    # Generate corresponding timestamps for the time series
    time_series_dates = pd.date_range(
        start=location_features["pickup_hour"].iloc[0]
        - timedelta(hours=len(time_series_columns)),
        end=location_features["pickup_hour"].iloc[0],
        freq="h"
    )

    # Create the plot title with relevant metadata
    title = f"Pickup Hour: {location_features['pickup_hour']}, Location ID: {location_features['pickup_location_id']}"

    print(time_series_dates)
    print(time_series_values)
    # Create the base line plot
    print(type(time_series_values))
    # print(time_series_values.shape)
    print(len(time_series_dates))
    fig = px.line(
        x=time_series_dates,
        y=time_series_values,
        template="plotly_white",
        markers=True,
        title=title,
        labels={"x": "Time", "y": "Ride Counts"},
    )

    # Add the actual target value as a green marker
    fig.add_scatter(
        x=time_series_dates[-1:],  # Last timestamp
        y=[actual_target],  # Actual target value
        line_color="green",
        mode="markers",
        marker_size=10,
        name="Actual Value",
    )

    # Optionally add the prediction as a red marker
    if predictions is not None:
        fig.add_scatter(
            x=time_series_dates[-1:],  # Last timestamp
            y=predictions[
                predictions["pickup_location_id" == row_id]
            ],  # Predicted value
            line_color="red",
            mode="markers",
            marker_symbol="x",
            marker_size=15,
            name="Prediction",
        )

    return fig




In [88]:
from datetime import timedelta
from typing import Optional
import pandas as pd
import plotly.express as px
import numpy as np

def plot_aggregated_time_seriesss(
    features: pd.DataFrame,
    targets: pd.Series,
    row_id: int,
    predictions: Optional[np.ndarray] = None,
):
    """
    Plots the time series data for a specific location from NYC taxi data.

    Args:
        features (pd.DataFrame): DataFrame containing feature data, including historical ride counts and metadata.
        targets (pd.Series): Series containing the target values (e.g., actual ride counts).
        row_id (int): Index of the row to plot.
        predictions (Optional[np.ndarray]): NumPy array containing predicted values (optional).

    Returns:
        plotly.graph_objects.Figure: A Plotly figure object showing the time series plot.
    """

    # Extract the specific location's features and target
    location_features = features[features["pickup_location_id"] == row_id]
    actual_target = targets[features["pickup_location_id"] == row_id]

    # Identify time series columns (historical ride counts)
    time_series_columns = [col for col in features.columns if col.startswith("rides_t-")]

    # Convert to a single Series for plotting
    time_series_values = pd.concat([location_features[col] for col in time_series_columns] + [actual_target], axis=0).reset_index(drop=True)

    # Generate timestamps
    time_series_dates = pd.date_range(
        start=location_features["pickup_hour"].iloc[0] - timedelta(hours=len(time_series_columns)),
        end=location_features["pickup_hour"].iloc[0],
        freq="h"
    )

    # Ensure lengths match
    min_length = min(len(time_series_dates), len(time_series_values))
    time_series_dates = time_series_dates[:min_length]
    time_series_values = time_series_values[:min_length]

    # Create plot title
    title = f"Pickup Hour: {location_features['pickup_hour'].iloc[0]}, Location ID: {row_id}"

    # Create the base line plot
    fig = px.line(
        x=time_series_dates,
        y=time_series_values,
        template="plotly_white",
        markers=True,
        title=title,
        labels={"x": "Time", "y": "Ride Counts"},
    )

    # Add the actual target value as a green marker
    fig.add_scatter(
        x=[time_series_dates[-1]],  # Last timestamp
        y=[actual_target.iloc[0]],  # Actual target value
        mode="markers",
        marker=dict(color="green", size=10),
        name="Actual Value",
    )

    # Optionally add the prediction as a red marker
    print(len(predictions))
    if predictions is not None:
        if isinstance(predictions, np.ndarray):
            predictions = pd.Series(predictions, index=features["pickup_location_id"].values)

        # Ensure filtering doesn't return an empty result
        filtered_predictions = predictions.loc[features["pickup_location_id"] == row_id]

        if filtered_predictions.empty:
            print(f"Warning: No prediction found for row_id {row_id}. Skipping prediction plot.")
        else:
            predicted_value = filtered_predictions.values[0]
            fig.add_scatter(
                x=[time_series_dates[-1]],  # Last timestamp
                y=[predicted_value],  # Predicted value
                mode="markers",
                marker=dict(color="red", symbol="x", size=15),
                name="Prediction",
            )

    return fig

In [44]:
X_test

Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-8,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,pickup_location_id
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2023-09-01,2
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2023-09-02,2
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2023-09-03,2
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2023-09-04,2
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2023-09-05,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31715,25,14,5,3,7,16,53,133,126,136,...,73,62,62,58,50,48,42,37,2023-12-27,263
31716,30,7,9,6,5,23,58,123,136,108,...,83,64,79,65,71,72,75,35,2023-12-28,263
31717,50,26,17,9,8,11,43,116,137,132,...,71,81,78,60,85,63,62,37,2023-12-29,263
31718,117,88,39,19,14,12,27,37,70,97,...,78,84,75,100,98,88,77,69,2023-12-30,263


In [45]:
y_test

0         0
1         0
2         0
3         0
4         0
         ..
31715    12
31716    19
31717    38
31718    59
31719    65
Name: target, Length: 31720, dtype: int16

In [53]:
predictions

array([  0.  ,   0.  ,   0.  , ...,  62.5 , 106.5 , 104.75])

In [85]:
X_test[X_test['pickup_location_id'] == 43].sort_values(by=["pickup_hour"])

Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-8,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,pickup_location_id
5002,12,6,1,1,0,7,8,46,41,48,...,131,74,82,62,64,34,38,12,2023-09-01,43
5003,13,7,1,0,0,4,2,11,28,35,...,122,113,80,107,67,58,39,9,2023-09-02,43
5004,14,5,3,1,0,3,7,13,12,27,...,174,166,174,142,86,63,53,13,2023-09-03,43
5005,2,2,0,0,1,5,13,36,34,39,...,169,144,90,61,45,28,31,7,2023-09-04,43
5006,3,0,0,0,1,4,5,44,61,59,...,136,145,85,46,33,28,9,8,2023-09-05,43
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5119,3,0,1,0,1,6,13,54,106,83,...,205,132,64,49,44,31,14,10,2023-12-27,43
5120,6,2,2,0,1,7,8,58,84,90,...,97,57,73,29,48,42,24,7,2023-12-28,43
5121,20,7,2,0,3,3,13,54,67,90,...,239,128,73,52,50,43,30,17,2023-12-29,43
5122,10,8,3,1,3,0,8,19,35,67,...,200,185,160,133,158,69,33,19,2023-12-30,43


In [76]:
location_features = X_test[X_test["pickup_location_id"] == 43]
actual_target = y_test[X_test["pickup_location_id"] == 43]

# Identify time series columns (historical ride counts)
time_series_columns = [col for col in X_test.columns if col.startswith("rides_t-")]

# Convert to a single Series for plotting
time_series_values = pd.concat([location_features[col] for col in time_series_columns] + [actual_target], axis=0).reset_index(drop=True)

In [84]:
len(time_series_values)

82106

In [89]:
plot_aggregated_time_seriesss(X_test, y_test, 43, predictions)


31720
