In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder

In [23]:
data = pd.read_csv('./datasets/processed_sales_df.csv')
data.head()


Unnamed: 0,bike_name,bike_type,date_ordered,quantity,search_value,bike_type_search_value,year,month,day,day_of_week,quantity_lag1,rolling_avg_7,search_value_lag1,bike_type_search_value_lag1
0,Speedster Pro,Road Bike,2020-01-02,1,40,41,2020,1,2,3,1.0,1.0,40.0,41.0
1,Speedster Pro,Road Bike,2020-01-04,1,40,40,2020,1,4,5,1.0,1.0,40.0,41.0
2,Speedster Pro,Road Bike,2020-01-04,1,40,40,2020,1,4,5,1.0,1.0,40.0,40.0
3,Speedster Pro,Road Bike,2020-01-04,1,40,41,2020,1,4,5,1.0,1.0,40.0,40.0
4,Speedster Pro,Road Bike,2020-01-04,1,40,41,2020,1,4,5,1.0,1.0,40.0,41.0


In [29]:
# Step 5: Encode Categorical Data (Product)
le = LabelEncoder()
data['product_encoded'] = le.fit_transform(data['bike_name'])

X = data[['product_encoded', 'year', 'month', 'day', 'day_of_week', 'quantity_lag1', 'rolling_avg_7','search_value_lag1','bike_type_search_value_lag1']]
y = data['quantity']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Step 7: Model Training (Random Forest)
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Step 8: Model Evaluation
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')


Mean Absolute Error: 0.31780215551143703


In [30]:
from sklearn.metrics import r2_score

r_squared = r2_score(y_test, y_pred)
print(f'R^2 Score: {r_squared}')

R^2 Score: 0.9580490940405398


In [47]:
# Step 9: Future Prediction for all products (next 7 days example)
future_dates = pd.date_range(start=data['date_ordered'].max(), periods=7, freq='D')
future_data = pd.DataFrame({'date_ordered': future_dates})
# Prepare features for each product
all_future_preds = {}
for product in data['bike_name'].unique():
    # Create a copy for each product's prediction
    future_data_copy = future_data.copy()
    
    # Set the time-based features
    future_data_copy['year'] = future_data_copy['date_ordered'].dt.year
    future_data_copy['month'] = future_data_copy['date_ordered'].dt.month
    future_data_copy['day'] = future_data_copy['date_ordered'].dt.day
    future_data_copy['day_of_week'] = future_data_copy['date_ordered'].dt.dayofweek

    # Set lag and rolling average for the product based on recent data
    last_known_quantity = data[data['bike_name'] == product]['quantity'].iloc[-1]
    last_rolling_avg = data[data['bike_name'] == product]['rolling_avg_7'].iloc[-1]

    last_search_val = data[data['bike_name'] == product]['search_value_lag1'].iloc[-1]
    last_bike_type_search_val = data[data['bike_name'] == product]['bike_type_search_value_lag1'].iloc[-1]
    
    future_data_copy['quantity_lag1'] = last_known_quantity
    future_data_copy['rolling_avg_7'] = last_rolling_avg

    future_data_copy['search_value_lag1'] = last_search_val
    future_data_copy['bike_type_search_value_lag1'] = last_bike_type_search_val
    
    # Encode the product
    future_data_copy['product_encoded'] = le.transform([product])[0]
    # print(future_data_copy.head())
    # Predict the quantity for the next 7 days for this product
    future_preds = model.predict(future_data_copy[['product_encoded', 'year', 'month', 'day', 'day_of_week', 'quantity_lag1', 'rolling_avg_7','search_value_lag1','bike_type_search_value_lag1']])
    
    # Store the predictions for this product
    all_future_preds[product] = future_preds

    # print('future')
    # print(future_preds)
# Print future predictions for each product
# for product, preds in all_future_preds.items():
#     print(f"Future Predictions for {product}: {preds}")


In [49]:
all_future_preds_df = pd.DataFrame(all_future_preds)

# Sum each column and create a new DataFrame with the result in one row
total_sum_row = pd.DataFrame(all_future_preds_df.sum()).T  # Transpose to get a single row
print(total_sum_row)


    Speedster Pro  AirMaster 300  BMX Freestyle Pro  Hybrid Explorer  \
0            21.0           14.0               28.0             21.0   

   Mountain Climber XT  Roadmaster Pro  TrailBlazer XT  UrbanCommuter 500  
0                 28.0            28.0             7.0                7.0  


In [50]:
total_sum_row.to_csv('./datasets/sales_predictions.csv', index=False)
