In [1]:
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression

from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor



In [2]:


# Step 2: Load Data
data = pd.read_csv('./datasets/processed_sales_df.csv')
data.head()
# data['date_ordered'] = pd.to_datetime(data['date_ordered'])

Unnamed: 0,bike_name,bike_type,date_ordered,quantity,year,month,day,day_of_week,quantity_lag1,rolling_avg_7
0,Speedster Pro,Road Bike,2018-01-06,3,2018,1,6,5,2.0,2.714286
1,Speedster Pro,Road Bike,2018-01-07,2,2018,1,7,6,3.0,2.714286
2,Speedster Pro,Road Bike,2018-01-07,2,2018,1,7,6,2.0,2.428571
3,Speedster Pro,Road Bike,2018-01-07,2,2018,1,7,6,2.0,2.285714
4,Speedster Pro,Road Bike,2018-01-08,3,2018,1,8,0,2.0,2.285714


In [3]:


# Ensure 'date_ordered' is datetime
data['date_ordered'] = pd.to_datetime(data['date_ordered'])

# Sort data
data = data.sort_values(by=['bike_name', 'date_ordered'])

# Create lag features
data['quantity_lag1'] = data.groupby('bike_name')['quantity'].shift(1)
data['rolling_avg_7'] = data.groupby('bike_name')['quantity'].shift(1).rolling(window=7).mean()

# Drop NaNs
data = data.dropna(subset=['quantity_lag1', 'rolling_avg_7'])

# Define features and target
categorical_features = ['bike_name']
numerical_features = ['quantity', 'year', 'month', 'day', 'day_of_week', 'quantity_lag1', 'rolling_avg_7']
feature_columns = categorical_features + numerical_features
X = data[feature_columns]
y = data['quantity']



In [4]:

# Define transformers and pipeline
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
numerical_transformer = 'passthrough'
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
        ('num', numerical_transformer, numerical_features)
    ]
)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42))
])

cross_validation_scores = []

# Time series cross-validation
tscv = TimeSeriesSplit(n_splits=5)
for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    # Evaluate model (e.g., print R^2 score)
    r2 = r2_score(y_test, y_pred)
    cross_validation_scores.append(r2)

print("cross validation scores: "+str(cross_validation_scores))


cross validation scores: [0.9998876576447119, 0.9999471244146734, 0.9999993244395322, 0.9999995311082016, 0.9999995390065006]


In [5]:
# Retrain on full data
pipeline.fit(X, y)

# Future predictions
future_dates = pd.date_range(start=data['date_ordered'].max() + pd.Timedelta(days=1), periods=7, freq='D')
all_future_preds = {}
products = data['bike_name'].unique()

for product in products:
    future_data = pd.DataFrame({'date_ordered': future_dates})
    future_data['year'] = future_data['date_ordered'].dt.year
    future_data['month'] = future_data['date_ordered'].dt.month
    future_data['day'] = future_data['date_ordered'].dt.day
    future_data['day_of_week'] = future_data['date_ordered'].dt.dayofweek
    product_data = data[data['bike_name'] == product]
    # if not product_data.empty:
        # Initialize with the last known values
    quantity = product_data['quantity'].iloc[-1]
    quantity_lag1 = product_data['quantity_lag1'].iloc[-1]
    rolling_avg_7 = product_data['rolling_avg_7'].iloc[-1]
    # else:
    #     quantity = 0
    #     quantity_lag1 = 0
    #     rolling_avg_7 = 0
    predictions = []
    for i in range(len(future_dates)):
        row = {
            'bike_name': product,
            'quantity': quantity,
            'year': future_data.iloc[i]['year'],
            'month': future_data.iloc[i]['month'],
            'day': future_data.iloc[i]['day'],
            'day_of_week': future_data.iloc[i]['day_of_week'],
            'quantity_lag1': quantity_lag1,
            'rolling_avg_7': rolling_avg_7
        }
        X_future = pd.DataFrame([row])
        pred = pipeline.predict(X_future)[0]
        predictions.append(pred)
        # Update 'quantity' and lag features for next prediction
        quantity_lag1 = quantity
        quantity = pred  # Use the predicted value as the 'quantity' for the next step
        # Optionally update rolling_avg_7 (requires maintaining a window of predictions)
    all_future_preds[product] = predictions

# Print predictions
for product, preds in all_future_preds.items():
    print(f"Future Predictions for {product}:")
    for date, pred in zip(future_dates, preds):
        print(f"Date: {date.date()}, Predicted Quantity: {pred}")
    print("\n")

Future Predictions for  Speedster Pro:
Date: 2024-01-01, Predicted Quantity: 12.0
Date: 2024-01-02, Predicted Quantity: 12.0
Date: 2024-01-03, Predicted Quantity: 12.0
Date: 2024-01-04, Predicted Quantity: 12.0
Date: 2024-01-05, Predicted Quantity: 12.0
Date: 2024-01-06, Predicted Quantity: 12.0
Date: 2024-01-07, Predicted Quantity: 12.0


Future Predictions for AirMaster 300:
Date: 2024-01-01, Predicted Quantity: 14.0
Date: 2024-01-02, Predicted Quantity: 14.0
Date: 2024-01-03, Predicted Quantity: 14.0
Date: 2024-01-04, Predicted Quantity: 14.0
Date: 2024-01-05, Predicted Quantity: 14.0
Date: 2024-01-06, Predicted Quantity: 14.0
Date: 2024-01-07, Predicted Quantity: 14.0


Future Predictions for BMX Freestyle Pro:
Date: 2024-01-01, Predicted Quantity: 13.0
Date: 2024-01-02, Predicted Quantity: 13.0
Date: 2024-01-03, Predicted Quantity: 13.0
Date: 2024-01-04, Predicted Quantity: 13.0
Date: 2024-01-05, Predicted Quantity: 13.0
Date: 2024-01-06, Predicted Quantity: 13.0
Date: 2024-01-07,