In [6]:
""" 03 â€” Train/Test Split (Time-Series Aware)
This notebook loads the engineered dataset, selects model features, and performs a **time-ordered train/test split** to prepare data for model building."""

' 03 â€” Train/Test Split (Time-Series Aware)\nThis notebook loads the engineered dataset, selects model features, and performs a **time-ordered train/test split** to prepare data for model building.'

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import joblib
import os

In [8]:
# Folders for storing processed data & models
os.makedirs("models", exist_ok=True)
os.makedirs("Data/processed", exist_ok=True)

In [9]:
""" ðŸ“Œ Load Engineered Dataset  
If processed features already exist â†’ load them.  
Else â†’ recompute them exactly like Feature Engineering file. """

' ðŸ“Œ Load Engineered Dataset  \nIf processed features already exist â†’ load them.  \nElse â†’ recompute them exactly like Feature Engineering file. '

In [10]:
# Option A: load existing engineered CSV
engineered_path = "Data/processed/engineered.csv"

if os.path.exists(engineered_path):
    data = pd.read_csv(engineered_path, parse_dates=['date'])
else:
    # Option B: compute features inline (same logic as 02_feature_engineering)
    data = pd.read_csv("Data/Uber-Jan-Feb-FOIL.csv")
    data['date'] = pd.to_datetime(data['date'])
    data = data.sort_values('date')

    data['day_of_week'] = data['date'].dt.day_name()
    data['is_weekend'] = data['day_of_week'].isin(['Saturday','Sunday']).astype(int)
    data['month'] = data['date'].dt.month
    data['day'] = data['date'].dt.day

    data['trips_rolling_mean_3'] = data['trips'].rolling(window=3).mean()
    data['trips_rolling_mean_7'] = data['trips'].rolling(window=7).mean()

    data['lag_1'] = data['trips'].shift(1)
    data['lag_2'] = data['trips'].shift(2)
    data['lag_3'] = data['trips'].shift(3)

    data = data.dropna().reset_index(drop=True)

    # Save engineered version for future fast loading
    data.to_csv(engineered_path, index=False)
    print("Saved engineered features to:", engineered_path)

print("Loaded data shape:", data.shape)
display(data.head())


Loaded data shape: (348, 13)


Unnamed: 0,dispatching_base_number,date,active_vehicles,trips,day_of_week,is_weekend,month,day,trips_rolling_mean_3,trips_rolling_mean_7,lag_1,lag_2,lag_3
0,B02764,2015-01-02,3147,19974,Friday,0,1,2,12138.0,10915.857143,6903.0,9537.0,7679.0
1,B02765,2015-01-02,196,1001,Friday,0,1,2,9292.666667,10897.142857,19974.0,6903.0,9537.0
2,B02682,2015-01-02,890,5506,Friday,0,1,2,8827.0,11431.571429,1001.0,19974.0,6903.0
3,B02617,2015-01-02,1137,7065,Friday,0,1,2,4524.0,8237.857143,5506.0,1001.0,19974.0
4,B02598,2015-01-02,785,4768,Friday,0,1,2,5779.666667,7822.0,7065.0,5506.0,1001.0


In [11]:
""" ðŸ“Œ Select Features for Model Training  
These features must match exactly with Feature Engineering output."""

' ðŸ“Œ Select Features for Model Training  \nThese features must match exactly with Feature Engineering output.'

In [12]:
features = [
    'active_vehicles', 'is_weekend', 'month', 'day',
    'trips_rolling_mean_3', 'trips_rolling_mean_7',
    'lag_1', 'lag_2', 'lag_3'
]

X = data[features].copy()
y = data['trips'].copy()

In [13]:
""" ðŸ“Œ Time-Series Train/Test Split  
- No shuffling (very important).  
- Last 20% of data = test set."""

' ðŸ“Œ Time-Series Train/Test Split  \n- No shuffling (very important).  \n- Last 20% of data = test set.'

In [14]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

print("Shapes:")
print("X_train:", X_train.shape, "| X_test:", X_test.shape)
print("y_train:", y_train.shape, "| y_test:", y_test.shape)

Shapes:
X_train: (278, 9) | X_test: (70, 9)
y_train: (278,) | y_test: (70,)


In [15]:
"""ðŸ“Œ Save Train/Test Data  
We save both CSV (easy inspection) and Joblib (fast loading, used by Streamlit app)."""

'ðŸ“Œ Save Train/Test Data  \nWe save both CSV (easy inspection) and Joblib (fast loading, used by Streamlit app).'

In [17]:
# Save as CSV
X_train.to_csv("Data/processed/X_train.csv", index=False)
X_test.to_csv("Data/processed/X_test.csv", index=False)
y_train.to_csv("Data/processed/y_train.csv", index=False)
y_test.to_csv("Data/processed/y_test.csv", index=False)

# Save as Joblib
joblib.dump(X_train, "models/X_train.joblib")
joblib.dump(X_test, "models/X_test.joblib")
joblib.dump(y_train, "models/y_train.joblib")
joblib.dump(y_test, "models/y_test.joblib")

print("Saved processed splits to Data/processed/ and models/")

Saved processed splits to Data/processed/ and models/


In [18]:
""" ðŸ“Œ Validate Time Ordering  
Ensure that:
- Last training date < First test date """

' ðŸ“Œ Validate Time Ordering  \nEnsure that:\n- Last training date < First test date '

In [19]:
train_last_date = data.iloc[len(X_train)-1]['date']
test_first_date = data.iloc[len(X_train)]['date']

print("Train last date:", train_last_date)
print("Test first date:", test_first_date)

Train last date: 2015-02-17 00:00:00
Test first date: 2015-02-17 00:00:00


In [20]:
""" âœ… Train/Test Split Summary  

- Loaded engineered dataset  
- Ensured deterministic features  
- Selected 9 final model features  
- Performed **time-aware** train/test split  
- Saved all processed files for:
  - Model Building notebook (Notebook 04)
  - Streamlit App deployment  

This prepares the dataset for building our machine learning models."""

' âœ… Train/Test Split Summary  \n\n- Loaded engineered dataset  \n- Ensured deterministic features  \n- Selected 9 final model features  \n- Performed **time-aware** train/test split  \n- Saved all processed files for:\n  - Model Building notebook (Notebook 04)\n  - Streamlit App deployment  \n\nThis prepares the dataset for building our machine learning models.'