In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder


In [2]:
data = pd.read_csv('sales_df.csv')
data.head()

Unnamed: 0,bike_name,bike_type,date_ordered,quantity
0,TrailBlazer XT,Mountain Bike,2020-12-14,2
1,UrbanCommuter 500,Hybrid Bike,2020-07-26,3
2,Mountain Climber XT,Mountain Bike,2020-10-08,5
3,TrailBlazer XT,Mountain Bike,2019-08-10,8
4,AirMaster 300,BMX,2019-12-20,11


In [4]:
from datetime import datetime


# Step 3: Data Preprocessing
data['date_ordered'] = pd.to_datetime(data['date_ordered'])
data = data.sort_values(['bike_name', 'date_ordered'])  # Sort by product and date

# Step 4: Split Data into Training and Test Set
# Train on data before 2023

# Feature Engineering on Training Data
data['year'] = data['date_ordered'].dt.year
data['month'] = data['date_ordered'].dt.month
data['day'] = data['date_ordered'].dt.day
data['day_of_week'] = data['date_ordered'].dt.dayofweek

# Add seasonality features (cosine/sine transformation)
data['month_sin'] = np.sin(2 * np.pi * data['month'] / 12)
data['month_cos'] = np.cos(2 * np.pi * data['month'] / 12)

#
# Special events for sales spikes
christmas = [datetime(year, 12, 25) for year in range(2021, 2024)]
black_friday = [datetime(year, 11, 26) for year in range(2021, 2024)]
tour_de_france = [datetime(year, 7, 1) for year in range(2021, 2024)]
summer_season = [(datetime(year, 6, 1), datetime(year, 8, 31)) for year in range(2021, 2024)]

# Now proceed with the feature engineering
data['is_christmas'] = data['date_ordered'].apply(lambda x: 1 if x in christmas else 0)
data['is_black_friday'] = data['date_ordered'].apply(lambda x: 1 if x in black_friday else 0)
data['is_tour_de_france'] = data['date_ordered'].apply(lambda x: 1 if any(abs((x - event).days) <= 30 for event in tour_de_france) else 0)
data['is_summer'] = data['date_ordered'].apply(lambda x: 1 if any(start <= x <= end for start, end in summer_season) else 0)


# Lag features for each product's previous quantity
data['quantity_lag1'] = data.groupby('bike_name')['quantity'].shift(1)
data['rolling_avg_7'] = data.groupby('bike_name')['quantity'].rolling(window=7).mean().reset_index(0, drop=True)

# Drop rows with NaN values from lag/rolling features
data = data.dropna()


# Step 5: Encode Categorical Data (Product)
le = LabelEncoder()
# Include bike type as a categorical feature
data['bike_type_encoded'] = le.fit_transform(data['bike_type'])
data['product_encoded'] = le.fit_transform(data['bike_name'])

train_data = data[data['date_ordered'] < '2023-01-01']
test_data = data[data['date_ordered'] >= '2023-01-01']

# Step 6: Define Features and Target (including event-based features)
X_train = train_data[['product_encoded', 'bike_type_encoded', 'year', 'month', 'day', 'day_of_week', 'month_sin', 'month_cos', 
                      'quantity_lag1', 'rolling_avg_7', 'is_christmas', 'is_black_friday', 'is_tour_de_france', 'is_summer']]
y_train = train_data['quantity']

X_test = test_data[['product_encoded', 'bike_type_encoded', 'year', 'month', 'day', 'day_of_week', 'month_sin', 'month_cos', 
                    'quantity_lag1', 'rolling_avg_7', 'is_christmas', 'is_black_friday', 'is_tour_de_france', 'is_summer']]
y_test = test_data['quantity']

print(X_train.head())



       product_encoded  bike_type_encoded  year  month  day  day_of_week  \
14850                0                  3  2018      1    6            5   
1553                 0                  3  2018      1    7            6   
4501                 0                  3  2018      1    7            6   
12960                0                  3  2018      1    7            6   
3646                 0                  3  2018      1    8            0   

       month_sin  month_cos  quantity_lag1  rolling_avg_7  is_christmas  \
14850        0.5   0.866025            2.0       2.714286             0   
1553         0.5   0.866025            3.0       2.714286             0   
4501         0.5   0.866025            2.0       2.428571             0   
12960        0.5   0.866025            2.0       2.285714             0   
3646         0.5   0.866025            2.0       2.285714             0   

       is_black_friday  is_tour_de_france  is_summer  
14850                0               

In [19]:
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
print("XTRAINNNNN")
print(X_train.head())
# Step 8: Model Evaluation
y_pred = model.predict(X_test)



mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')
# print('aaaaa')
# print(data.head())
# Step 9: Future Prediction for all products (next 7 days example)
future_dates = pd.date_range(start=data['date_ordered'].max(), periods=7, freq='D')
future_data = pd.DataFrame({'date_ordered': future_dates})
# print(future_data.head())


# Assuming future_dates is already defined as a date range
future_data = pd.DataFrame({'date_ordered': future_dates})

# Filter rows in 'data' where 'date_ordered' is in 'future_dates'
future_data= data[data['date_ordered'].isin(future_data['date_ordered'])]






# Prepare features for each product
all_future_preds = {}
for product in data['bike_name'].unique():
    # Create a copy for each product's prediction
    future_data_copy = future_data.copy()
    print(future_data_copy.head())

    # Predict the quantity for the next 7 days for this product
    future_preds = model.predict(future_data_copy['bike_name','bike_type','date_ordered','quantity'])
    
    # Store the predictions for this product
    all_future_preds[product] = future_preds

# Print future predictions for each product
for product, preds in all_future_preds.items():
    print(f"Future Predictions for {product}: {preds}")


XTRAINNNNN
       product_encoded  bike_type_encoded  year  month  day  day_of_week  \
14850                0                  3  2018      1    6            5   
1553                 0                  3  2018      1    7            6   
4501                 0                  3  2018      1    7            6   
12960                0                  3  2018      1    7            6   
3646                 0                  3  2018      1    8            0   

       month_sin  month_cos  quantity_lag1  rolling_avg_7  is_christmas  \
14850        0.5   0.866025            2.0       2.714286             0   
1553         0.5   0.866025            3.0       2.714286             0   
4501         0.5   0.866025            2.0       2.428571             0   
12960        0.5   0.866025            2.0       2.285714             0   
3646         0.5   0.866025            2.0       2.285714             0   

       is_black_friday  is_tour_de_france  is_summer  
14850                0    

KeyError: ('bike_name', 'bike_type', 'date_ordered', 'quantity')

In [11]:
accuracy = model.score(X_test, y_test)
print(f'Accuracy on test data: {accuracy:.2f}')

Accuracy on test data: 0.85
