In [1]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder

In [21]:


# Step 2: Load Data
data = pd.read_csv('./datasets/processed_sales_df.csv')
data.head()


Unnamed: 0,bike_name,bike_type,date_ordered,quantity,year,month,day,day_of_week,quantity_lag1,rolling_avg_7
0,Speedster Pro,Road Bike,2018-01-06,3,2018,1,6,5,2.0,2.714286
1,Speedster Pro,Road Bike,2018-01-07,2,2018,1,7,6,3.0,2.714286
2,Speedster Pro,Road Bike,2018-01-07,2,2018,1,7,6,2.0,2.428571
3,Speedster Pro,Road Bike,2018-01-07,2,2018,1,7,6,2.0,2.285714
4,Speedster Pro,Road Bike,2018-01-08,3,2018,1,8,0,2.0,2.285714


In [22]:
# Step 5: One-Hot Encoding for Categorical Data (Product)
data = pd.get_dummies(data, columns=['bike_name'], drop_first=True)
# X = data[['product_encoded', 'year', 'month', 'day', 'day_of_week', 'quantity_lag1', 'rolling_avg_7']]
X = data.drop(columns=['quantity','bike_type','date_ordered'])
print(X)
y = data['quantity']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Step 7: Model Training (Random Forest)
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Step 8: Model Evaluation
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')


       year  month  day  day_of_week  quantity_lag1  rolling_avg_7  \
0      2018      1    6            5            2.0       2.714286   
1      2018      1    7            6            3.0       2.714286   
2      2018      1    7            6            2.0       2.428571   
3      2018      1    7            6            2.0       2.285714   
4      2018      1    8            0            2.0       2.285714   
...     ...    ...  ...          ...            ...            ...   
19947  2023     12   25            0            9.0      10.142857   
19948  2023     12   26            1            9.0      10.714286   
19949  2023     12   27            2           12.0      10.571429   
19950  2023     12   28            3           13.0      10.714286   
19951  2023     12   28            3           14.0      11.142857   

       bike_name_AirMaster 300  bike_name_BMX Freestyle Pro  \
0                        False                        False   
1                        False   

In [23]:
from sklearn.metrics import r2_score

r_squared = r2_score(y_test, y_pred)
print(f'R^2 Score: {r_squared}')

R^2 Score: 0.8436716513751434


In [11]:
# Step 9: Future Prediction for all products (next 7 days example)
future_dates = pd.date_range(start=data['date_ordered'].max(), periods=7, freq='D')
future_data = pd.DataFrame({'date_ordered': future_dates})

# Prepare features for each product
all_future_preds = {}
for product in data['bike_name'].unique():
    # Create a copy for each product's prediction
    future_data_copy = future_data.copy()
    
    # Set the time-based features
    future_data_copy['year'] = future_data_copy['date_ordered'].dt.year
    future_data_copy['month'] = future_data_copy['date_ordered'].dt.month
    future_data_copy['day'] = future_data_copy['date_ordered'].dt.day
    future_data_copy['day_of_week'] = future_data_copy['date_ordered'].dt.dayofweek

    # Set lag and rolling average for the product based on recent data
    last_known_quantity = data[data['bike_name'] == product]['quantity'].iloc[-1]
    last_rolling_avg = data[data['bike_name'] == product]['rolling_avg_7'].iloc[-1]
    
    future_data_copy['quantity_lag1'] = last_known_quantity
    future_data_copy['rolling_avg_7'] = last_rolling_avg
    
    # Encode the product
    future_data_copy['product_encoded'] = le.transform([product])[0]

    # Predict the quantity for the next 7 days for this product
    future_preds = model.predict(future_data_copy[['product_encoded', 'year', 'month', 'day', 'day_of_week', 'quantity_lag1', 'rolling_avg_7']])
    
    # Store the predictions for this product
    all_future_preds[product] = future_preds


# Print future predictions for each product
for product, preds in all_future_preds.items():
    print(f"Future Predictions for {product}: {preds}")


{' Speedster Pro': array([11.02, 12.24,  3.38,  3.38,  3.25,  3.24,  3.21]), 'AirMaster 300': array([13.6 , 10.53,  3.84,  3.73,  3.26,  3.1 ,  3.28]), 'BMX Freestyle Pro': array([12.82, 10.32,  3.86,  3.73,  3.21,  3.13,  3.33]), 'Hybrid Explorer': array([11.84, 12.36,  3.45,  3.26,  3.15,  3.2 ,  3.06]), 'Mountain Climber XT': array([13.27, 15.12,  6.98,  5.66,  3.11,  3.02,  3.28]), 'Roadmaster Pro': array([12.73,  9.91,  3.49,  3.56,  3.24,  3.18,  3.43]), 'TrailBlazer XT': array([13.35, 11.94,  3.18,  3.12,  3.13,  3.08,  2.92]), 'UrbanCommuter 500': array([11.95, 11.06,  3.16,  3.06,  3.02,  2.91,  2.78])}
Future Predictions for  Speedster Pro: [11.02 12.24  3.38  3.38  3.25  3.24  3.21]
Future Predictions for AirMaster 300: [13.6  10.53  3.84  3.73  3.26  3.1   3.28]
Future Predictions for BMX Freestyle Pro: [12.82 10.32  3.86  3.73  3.21  3.13  3.33]
Future Predictions for Hybrid Explorer: [11.84 12.36  3.45  3.26  3.15  3.2   3.06]
Future Predictions for Mountain Climber XT: [

In [13]:
all_future_preds_df = pd.DataFrame(all_future_preds)
all_future_preds_df.to_csv('./datasets/sales_predictions.csv', index=False)
