In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder


In [18]:
data = pd.read_csv('sales_df.csv')
data.rename(columns={'bike_name': 'x'}, inplace=True)
data.rename(columns={'bike_type': 'y'}, inplace=True)
data.rename(columns={'quantity': 'z'}, inplace=True)
data.rename(columns={'date_ordered': 'date'}, inplace=True)
print(data.head())

                   x            y        date   z
0  BMX Freestyle Pro          BMX  2022-11-29  17
1  BMX Freestyle Pro          BMX  2021-11-22  19
2      AirMaster 300          BMX  2021-09-05   2
3     Roadmaster Pro    Road Bike  2023-07-30   8
4  UrbanCommuter 500  Hybrid Bike  2023-05-13   1


In [25]:


# Parse date column (if necessary)
data['date'] = pd.to_datetime(data['date'])

# Shift the target variables to predict future values of 'x' and 'y'
data['x_future'] = data['x'].shift(-1)  # Shifting to create next time period target for x
data['y_future'] = data['y'].shift(-1)  # Shifting to create next time period target for y

# Drop the last row with NaN values due to shifting
data = data.dropna()
print(data)
# OneHotEncode the current 'x' column (which contains strings)
one_hot_encoder = OneHotEncoder(sparse_output=False, drop='first')  
x_encoded = one_hot_encoder.fit_transform(data[['x']])

# Combine the one-hot-encoded x with other features
X = pd.concat([data[['date', 'y', 'z']], pd.DataFrame(x_encoded, index=data.index)], axis=1)
# print(X)
# Convert the date into numerical format for modeling (optional)
X['date'] = X['date'].map(pd.Timestamp.toordinal)
print('x')
# Encode the target 'x_future' column
x_future_encoded = one_hot_encoder.transform(data[['x_future']])
print(x_future_encoded)
# Prepare the final dataset
y_x = pd.DataFrame(x_future_encoded, index=data.index)  # Target for x (encoded future values)
y_y = data['y_future']                # Target for y (numerical future values)

# Split into training and test sets (make sure both X and y_x have the same features and structure)
X_train, X_test, y_x_train, y_x_test = train_test_split(X, y_x, test_size=0.2, random_state=42)
_, _, y_y_train, y_y_test = train_test_split(X, y_y, test_size=0.2, random_state=42)

# Initialize the models
model_x = RandomForestRegressor(n_estimators=100, random_state=42)
model_y = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the models
model_x.fit(X_train, y_x_train)
model_y.fit(X_train, y_y_train)

# Predict
y_x_pred = model_x.predict(X_test)
y_y_pred = model_y.predict(X_test)

# Evaluate the model for y (since x is one-hot-encoded and less interpretable)
mae_x = mean_absolute_error(y_x_test, y_x_pred)
mae_y = mean_absolute_error(y_y_test, y_y_pred)

print(f'MAE for x: {mae_x}')
print(f'MAE for y: {mae_y}')

# Visualize the predictions vs actual values for y (since x is categorical)
plt.figure(figsize=(10, 5))
plt.plot(X_test['date'], y_y_test, label='Actual y')
plt.plot(X_test['date'], y_y_pred, label='Predicted y')
plt.legend()
plt.show()


                      x              y       date   z           x_future  \
0     BMX Freestyle Pro            BMX 2022-11-29  17  BMX Freestyle Pro   
1     BMX Freestyle Pro            BMX 2021-11-22  19      AirMaster 300   
2         AirMaster 300            BMX 2021-09-05   2     Roadmaster Pro   
3        Roadmaster Pro      Road Bike 2023-07-30   8  UrbanCommuter 500   
4     UrbanCommuter 500    Hybrid Bike 2023-05-13   1      AirMaster 300   
...                 ...            ...        ...  ..                ...   
4988     Roadmaster Pro      Road Bike 2021-04-04   3     Roadmaster Pro   
4989     Roadmaster Pro      Road Bike 2023-10-01   4  BMX Freestyle Pro   
4990  BMX Freestyle Pro            BMX 2023-09-19   7     TrailBlazer XT   
4991     TrailBlazer XT  Mountain Bike 2021-05-14   3      Speedster Pro   
4992      Speedster Pro      Road Bike 2023-03-02   3    Hybrid Explorer   

           y_future  
0               BMX  
1               BMX  
2         Road Bike  

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- x_future
Feature names seen at fit time, yet now missing:
- x
