In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score

In [3]:
data = pd.read_csv('./datasets/processed_sales_df.csv')
data.head()


Unnamed: 0,bike_name,bike_type,date_ordered,quantity,search_value,bike_type_search_value,year,month,day,day_of_week,quantity_lag1,rolling_avg_7,search_value_lag1,bike_type_search_value_lag1
0,Speedster Pro,Road Bike,2020-01-02,1,40,41,2020,1,2,3,1.0,1.0,40.0,41.0
1,Speedster Pro,Road Bike,2020-01-04,1,40,40,2020,1,4,5,1.0,1.0,40.0,41.0
2,Speedster Pro,Road Bike,2020-01-04,1,40,40,2020,1,4,5,1.0,1.0,40.0,40.0
3,Speedster Pro,Road Bike,2020-01-04,1,40,41,2020,1,4,5,1.0,1.0,40.0,40.0
4,Speedster Pro,Road Bike,2020-01-04,1,40,41,2020,1,4,5,1.0,1.0,40.0,41.0


In [4]:
# Step 5: Encode Categorical Data (Product)
le = LabelEncoder()
data['product_encoded'] = le.fit_transform(data['bike_name'])

X = data[['product_encoded', 'year', 'month', 'day', 'day_of_week', 'quantity_lag1', 'rolling_avg_7','search_value_lag1','bike_type_search_value_lag1']]
y = data['quantity']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)



Model

Model 1: Random Forest Regressor

In [12]:

# Step 7: Model Training (Random Forest)
model1 = RandomForestRegressor(n_estimators=100, random_state=42)
model1.fit(X_train, y_train)

# Step 8: Model Evaluation
y_pred1 = model1.predict(X_test)
mae1 = mean_absolute_error(y_test, y_pred1)
print(f'Mean Absolute Error: {1}')

r_squared1 = r2_score(y_test, y_pred1)
print(f'R^2 Score: {r_squared1}')

Mean Absolute Error: 1
R^2 Score: 0.9580490940405398


Model 2: Gradient Boosting Regressor

In [13]:
from sklearn.ensemble import GradientBoostingRegressor

model2 = GradientBoostingRegressor(n_estimators=100, random_state=42)
model2.fit(X_train, y_train)


y_pred2 = model2.predict(X_test)
mae2 = mean_absolute_error(y_test, y_pred2)
print(f'Mean Absolute Error: {mae2}')

r_squared2 = r2_score(y_test, y_pred2)
print(f'R^2 Score: {r_squared2}')

Mean Absolute Error: 0.3249279507660825
R^2 Score: 0.9499581286525434


Model 3: XGBoost Regressor

In [14]:
from xgboost import XGBRegressor

model3 = XGBRegressor(n_estimators=100, random_state=42)
model3.fit(X_train, y_train)

y_pred3 = model3.predict(X_test)
mae3 = mean_absolute_error(y_test, y_pred3)
print(f'Mean Absolute Error: {mae3}')

r_squared3 = r2_score(y_test, y_pred3)
print(f'R^2 Score: {r_squared3}')

Mean Absolute Error: 0.32126692595118506
R^2 Score: 0.9628304961841433


Model 4: Decision Tree

In [15]:
from sklearn.tree import DecisionTreeRegressor

model4 = DecisionTreeRegressor(random_state=42)
model4.fit(X_train, y_train)

y_pred4 = model4.predict(X_test)
mae4 = mean_absolute_error(y_test, y_pred4)
print(f'Mean Absolute Error: {mae4}')

r_squared4 = r2_score(y_test, y_pred4)
print(f'R^2 Score: {r_squared4}')

Mean Absolute Error: 0.33815631214795233
R^2 Score: 0.9020524094770809


In [22]:
best_model = max([r_squared1, r_squared2, r_squared3, r_squared4])
print("Best model: XGBoost regressor ")
print("Score: "+str(best_model))

Best model: XGBoost regressor 
Score: 0.9628304961841433


In [24]:
model = model3

In [25]:
# Step 9: Future Prediction for all products (next 7 days example)
future_dates = pd.date_range(start=data['date_ordered'].max(), periods=7, freq='D')
future_data = pd.DataFrame({'date_ordered': future_dates})
# Prepare features for each product
all_future_preds = {}
for product in data['bike_name'].unique():
    # Create a copy for each product's prediction
    future_data_copy = future_data.copy()
    
    # Set the time-based features
    future_data_copy['year'] = future_data_copy['date_ordered'].dt.year
    future_data_copy['month'] = future_data_copy['date_ordered'].dt.month
    future_data_copy['day'] = future_data_copy['date_ordered'].dt.day
    future_data_copy['day_of_week'] = future_data_copy['date_ordered'].dt.dayofweek

    # Set lag and rolling average for the product based on recent data
    last_known_quantity = data[data['bike_name'] == product]['quantity'].iloc[-1]
    last_rolling_avg = data[data['bike_name'] == product]['rolling_avg_7'].iloc[-1]

    last_search_val = data[data['bike_name'] == product]['search_value_lag1'].iloc[-1]
    last_bike_type_search_val = data[data['bike_name'] == product]['bike_type_search_value_lag1'].iloc[-1]
    
    future_data_copy['quantity_lag1'] = last_known_quantity
    future_data_copy['rolling_avg_7'] = last_rolling_avg

    future_data_copy['search_value_lag1'] = last_search_val
    future_data_copy['bike_type_search_value_lag1'] = last_bike_type_search_val
    
    # Encode the product
    future_data_copy['product_encoded'] = le.transform([product])[0]
    # print(future_data_copy.head())
    # Predict the quantity for the next 7 days for this product
    future_preds = model.predict(future_data_copy[['product_encoded', 'year', 'month', 'day', 'day_of_week', 'quantity_lag1', 'rolling_avg_7','search_value_lag1','bike_type_search_value_lag1']])
    
    # Store the predictions for this product
    all_future_preds[product] = future_preds



In [26]:
all_future_preds_df = pd.DataFrame(all_future_preds)

# Sum each column and create a new DataFrame with the result in one row
total_sum_row = pd.DataFrame(all_future_preds_df.sum()).T  # Transpose to get a single row
total_sum_row[total_sum_row.select_dtypes(include=['float']).columns] = total_sum_row.select_dtypes(include=['float']).astype(int)
print(total_sum_row)


    Speedster Pro  AirMaster 300  BMX Freestyle Pro  Hybrid Explorer  \
0              21             14                 28               21   

   Mountain Climber XT  Roadmaster Pro  TrailBlazer XT  UrbanCommuter 500  
0                   28              28               7                  7  


In [27]:
total_sum_row.to_csv('./datasets/sales_predictions.csv', index=False)
