In [1]:
import pandas as pd
import warnings
import numpy as np
warnings.filterwarnings("ignore")
from sklearn.model_selection import KFold
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
# Read the two input tables from CSV files in the working directory:
# the order history and the product catalogue.
orders_df = pd.read_csv(filepath_or_buffer='SimulatedOrders.csv')
products_df = pd.read_csv(filepath_or_buffer='ProductsOnWebsite.csv')

In [None]:
# Order dates arrive as day/month/year strings; convert them to datetimes.
orders_df['OrderDate'] = pd.to_datetime(orders_df['OrderDate'], format='%d/%m/%Y')

# One row per (product, day): total quantity ordered and mean price paid.
# groupby sorts by its keys, so rows come out ordered by product, then date.
daily_demand_df = (
    orders_df
    .groupby(['ProductName', 'OrderDate'])
    .agg({'Quantity': 'sum', 'Price': 'mean'})
    .reset_index()
)

# Attach the catalogue attributes; a left join keeps every demand row.
# NOTE(review): later cells read 'Quantity_x' and 'Price_y'. pandas only adds
# the _x/_y suffixes when both frames share a column name, so this presumably
# relies on products_df also having 'Quantity' and 'Price' columns -- confirm.
merged_df = daily_demand_df.merge(products_df, on='ProductName', how='left')

print("\nMerged DataFrame Head:", merged_df.head(), sep="\n")

In [None]:
# Baseline: integer-encode the categorical columns and fit a linear regression.
#
# Each categorical column gets its OWN LabelEncoder. fit_transform() refits the
# encoder every time it is called, so sharing one instance across columns
# leaves it knowing only the classes of the last column it saw, and the other
# mappings cannot be recovered for inverse_transform / inference.
categorical_columns = ['ProductName', 'Brand', 'Category', 'SubCategory']
label_encoders = {}
for column in categorical_columns:
    encoder = LabelEncoder()
    merged_df[column] = encoder.fit_transform(merged_df[column])
    label_encoders[column] = encoder

# Backward compatibility: `label_encoder` used to end up fitted on
# 'SubCategory' (the last column encoded); keep that binding unchanged.
label_encoder = label_encoders['SubCategory']

X = merged_df[['ProductName', 'Brand', 'Price_y', 'DiscountPrice', 'Category', 'SubCategory']]
y = merged_df['Quantity_x']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Hold-out metrics for the linear baseline.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R2): {r2}")

joblib.dump(model, 'demand_forecasting_model.pkl')
# New artefact: the complete column -> encoder mapping.
joblib.dump(label_encoders, 'demand_label_encoders.pkl')
# Legacy artefact, same content as before (the SubCategory encoder only).
joblib.dump(label_encoder, 'demand_label_encoder.pkl')

In [13]:
# Calendar features derived from the order date.
merged_df['OrderMonth'] = merged_df['OrderDate'].dt.month
merged_df['OrderDay'] = merged_df['OrderDate'].dt.day

# Gap between the 'Price_y' column and the discounted price.
merged_df['PriceDiff'] = merged_df['Price_y'].sub(merged_df['DiscountPrice'])

# Feature matrix / target for the models trained in the following cell.
X = merged_df[[
    'ProductName', 'Brand', 'Price_y', 'DiscountPrice', 'Category', 'SubCategory',
    'OrderDay', 'OrderMonth', 'PriceDiff',
]]
y = merged_df['Quantity_x']

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Untuned random forest (100 trees) fitted on the current train split.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Score on the hold-out split.
y_pred_rf = rf_model.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print(
    f"Random Forest Mean Squared Error (MSE): {mse_rf}",
    f"Random Forest Root Mean Squared Error (RMSE): {rmse_rf}",
    f"Random Forest R-squared (R2): {r2_rf}",
    sep="\n",
)

# Persist the fitted forest next to the notebook.
joblib.dump(rf_model, 'random_forest_model.pkl')

In [15]:
# Add an interaction term and a one-step lag of demand, then build the
# scaled train/test matrices used by the hyper-parameter search below.
merged_df['Price_Discount_Interaction'] = merged_df['Price_y'] * merged_df['DiscountPrice']

# Previous row's quantity within each product; the first row of every product
# has no predecessor and is filled with 0.
# NOTE(review): shift() works on row order, so this is only "yesterday's
# demand" if merged_df is ordered by OrderDate within each product -- confirm.
merged_df['Lag_Quantity_1'] = merged_df.groupby('ProductName')['Quantity_x'].shift(1).fillna(0)

X = merged_df[['ProductName', 'Brand', 'Price_y', 'DiscountPrice', 'Category', 'SubCategory', 'OrderDay', 'OrderMonth', 'PriceDiff', 'Price_Discount_Interaction', 'Lag_Quantity_1']]
y = merged_df['Quantity_x']

# Split BEFORE scaling: fitting the scaler on all rows would let the test
# rows influence the mean/variance applied to the training data (leakage).
# The split depends only on the row count and seed, so the rows assigned to
# train/test are the same as when the pre-scaled matrix was split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept for backward compatibility: the full matrix in scaled form, now
# produced with training-only statistics.
X_scaled = scaler.transform(X)

In [None]:
# Exhaustive search over a small random-forest grid (3 * 2 * 2 * 2 * 2 = 24
# candidates, 3-fold CV each), using all available cores.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)
rf_model = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(rf_model, param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# The winning configuration, as exposed by the search object.
best_rf_model = grid_search.best_estimator_

# Hold-out evaluation of the tuned model.
y_pred_best_rf = best_rf_model.predict(X_test)
mse_best_rf = mean_squared_error(y_test, y_pred_best_rf)
rmse_best_rf = np.sqrt(mse_best_rf)
r2_best_rf = r2_score(y_test, y_pred_best_rf)
print(
    f"Best Random Forest Mean Squared Error (MSE): {mse_best_rf}",
    f"Best Random Forest Root Mean Squared Error (RMSE): {rmse_best_rf}",
    f"Best Random Forest R-squared (R2): {r2_best_rf}",
    sep="\n",
)

# Save the model together with the scaler that produced its inputs.
joblib.dump(best_rf_model, 'best_random_forest_model.pkl')
joblib.dump(scaler, 'scaler.pkl')

In [16]:
# NOTE: this cell repeats the feature engineering of the earlier In [15] cell
# so that the randomized search below starts from a freshly built split.
merged_df['Price_Discount_Interaction'] = merged_df['Price_y'] * merged_df['DiscountPrice']

# Previous row's quantity within each product (0 where there is none).
# NOTE(review): only a true time lag if rows are date-ordered per product.
merged_df['Lag_Quantity_1'] = merged_df.groupby('ProductName')['Quantity_x'].shift(1).fillna(0)

X = merged_df[['ProductName', 'Brand', 'Price_y', 'DiscountPrice', 'Category', 'SubCategory', 'OrderDay', 'OrderMonth', 'PriceDiff', 'Price_Discount_Interaction', 'Lag_Quantity_1']]
y = merged_df['Quantity_x']

# Split first, then fit the scaler on the training rows only, so that no
# statistics from the hold-out rows leak into the training features.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept for backward compatibility: all rows scaled with training statistics.
X_scaled = scaler.transform(X)

In [None]:
# Randomized search: 10 sampled configurations out of the 16 in this space.
param_dist = dict(
    n_estimators=[50, 100],
    max_depth=[10, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)
rf_model = RandomForestRegressor(random_state=42)

# Pre-compute the shuffled 3-fold index pairs and hand them to the search.
cv = KFold(n_splits=3, shuffle=True, random_state=42)
cv_splits = list(cv.split(X_train))
# NOTE: this progress bar only tracks copying the already-computed splits
# into a list; it does not report the progress of the search itself.
wrapped_cv = list(tqdm(cv_splits, desc="Cross-Validation Splits"))

random_search = RandomizedSearchCV(
    rf_model,
    param_dist,
    n_iter=10,
    cv=wrapped_cv,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
random_search.fit(X_train, y_train)
best_rf_model = random_search.best_estimator_

# Hold-out evaluation of the selected model.
y_pred_best_rf = best_rf_model.predict(X_test)
mse_best_rf = mean_squared_error(y_test, y_pred_best_rf)
rmse_best_rf = np.sqrt(mse_best_rf)
r2_best_rf = r2_score(y_test, y_pred_best_rf)
print(
    f"Best Random Forest Mean Squared Error (MSE): {mse_best_rf}",
    f"Best Random Forest Root Mean Squared Error (RMSE): {rmse_best_rf}",
    f"Best Random Forest R-squared (R2): {r2_best_rf}",
    sep="\n",
)

# These paths are the same ones the grid-search cell writes to, so any files
# saved there are overwritten here.
joblib.dump(best_rf_model, 'best_random_forest_model.pkl')
joblib.dump(scaler, 'scaler.pkl')

In [10]:
# Restore the saved scaler and tuned forest from disk.
# NOTE: joblib.load unpickles arbitrary objects; only load trusted files.
scaler = joblib.load('scaler.pkl')
best_rf_model = joblib.load('best_random_forest_model.pkl')

In [None]:
# Pair each column of X with the importance the tuned forest assigns to it,
# most important first.
features = X.columns
feature_importances = best_rf_model.feature_importances_
importance_df = pd.DataFrame(
    {'Feature': features, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

# Horizontal bar chart of the ranked importances.
importance_fig, importance_ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=importance_df, ax=importance_ax)
importance_ax.set_title('Feature Importances')
importance_ax.set_xlabel('Importance')
importance_ax.set_ylabel('Feature')
importance_fig.tight_layout()
plt.show()

In [17]:
# Per-product view of the demand column, reused for every history feature.
qty_by_product = merged_df.groupby('ProductName')['Quantity_x']

# Demand two and three rows back within each product (0 where unavailable).
merged_df['Lag_Quantity_2'] = qty_by_product.shift(2).fillna(0)
merged_df['Lag_Quantity_3'] = qty_by_product.shift(3).fillna(0)

# Mean of the three preceding rows; shifting by one first keeps the current
# row's own quantity out of its feature. Incomplete windows become 0.
merged_df['Rolling_Mean_3'] = qty_by_product.transform(
    lambda qty: qty.shift(1).rolling(window=3).mean()
).fillna(0)

# Full feature set including the new history columns.
X = merged_df[[
    'ProductName', 'Brand', 'Price_y', 'DiscountPrice', 'Category', 'SubCategory',
    'OrderDay', 'OrderMonth', 'PriceDiff', 'Price_Discount_Interaction',
    'Lag_Quantity_1', 'Lag_Quantity_2', 'Lag_Quantity_3', 'Rolling_Mean_3',
]]
y = merged_df['Quantity_x']

In [None]:
# Tune the random forest on the feature set that includes lags / rolling mean.
# Reuses `rf_model` and `param_dist` defined in the earlier randomized-search
# cell, so that cell must have been run first.

# Split first and fit the scaler on the training rows only; fitting it on the
# full matrix would leak hold-out statistics into the training features.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Kept for backward compatibility: all rows scaled with training statistics.
X_scaled = scaler.transform(X)

random_search = RandomizedSearchCV(estimator=rf_model, param_distributions=param_dist, n_iter=10, cv=3, n_jobs=-1, verbose=2, random_state=42)
random_search.fit(X_train, y_train)
best_rf_model = random_search.best_estimator_

# Hold-out evaluation of the selected model.
y_pred_best_rf = best_rf_model.predict(X_test)
mse_best_rf = mean_squared_error(y_test, y_pred_best_rf)
rmse_best_rf = np.sqrt(mse_best_rf)
r2_best_rf = r2_score(y_test, y_pred_best_rf)
print(f"Best Random Forest Mean Squared Error (MSE): {mse_best_rf}")
print(f"Best Random Forest Root Mean Squared Error (RMSE): {rmse_best_rf}")
print(f"Best Random Forest R-squared (R2): {r2_best_rf}")

# Persist the model with the scaler that matches its 14 input columns.
joblib.dump(best_rf_model, 'best_random_forest_model_with_lags.pkl')
joblib.dump(scaler, 'scaler_with_lags.pkl')

In [18]:
# Reload the scaler saved alongside the lag-feature model.
# NOTE(review): the next cell loads this same file again, so this cell looks
# redundant -- confirm before removing.
scaler = joblib.load('scaler_with_lags.pkl')

In [None]:
# Score a preprocessed dataset with the saved lag-feature model.
# NOTE: joblib.load unpickles arbitrary objects; only load trusted files.
best_rf_model = joblib.load('best_random_forest_model_with_lags.pkl')
scaler = joblib.load('scaler_with_lags.pkl')

merged_df = pd.read_csv('PreprocessedData.csv')
merged_df['OrderDate'] = pd.to_datetime(merged_df['OrderDate'])

# Rebuild the same engineered columns the model was trained on.
merged_df['OrderDay'] = merged_df['OrderDate'].dt.day
merged_df['OrderMonth'] = merged_df['OrderDate'].dt.month
merged_df['PriceDiff'] = merged_df['Price_y'] - merged_df['DiscountPrice']
merged_df['Price_Discount_Interaction'] = merged_df['Price_y'] * merged_df['DiscountPrice']
# NOTE(review): shift()/rolling() follow row order; this assumes the CSV is
# ordered by OrderDate within each product -- confirm.
merged_df['Lag_Quantity_1'] = merged_df.groupby('ProductName')['Quantity_x'].shift(1).fillna(0)
merged_df['Lag_Quantity_2'] = merged_df.groupby('ProductName')['Quantity_x'].shift(2).fillna(0)
merged_df['Lag_Quantity_3'] = merged_df.groupby('ProductName')['Quantity_x'].shift(3).fillna(0)
merged_df['Rolling_Mean_3'] = merged_df.groupby('ProductName')['Quantity_x'].transform(lambda x: x.shift(1).rolling(window=3).mean()).fillna(0)

# One encoder per categorical column so each mapping can be inverted later.
# NOTE(review): these encoders are refit on this file; the integer codes match
# the ones seen at training time only if the category sets are identical.
label_encoders = {}
for column in ['ProductName', 'Brand', 'Category', 'SubCategory']:
    le = LabelEncoder()
    merged_df[column] = le.fit_transform(merged_df[column])
    label_encoders[column] = le

X_latest = merged_df[['ProductName', 'Brand', 'Price_y', 'DiscountPrice', 'Category', 'SubCategory', 'OrderDay', 'OrderMonth', 'PriceDiff', 'Price_Discount_Interaction', 'Lag_Quantity_1', 'Lag_Quantity_2', 'Lag_Quantity_3', 'Rolling_Mean_3']]
X_latest_scaled = scaler.transform(X_latest)
merged_df['Predicted_Demand'] = best_rf_model.predict(X_latest_scaled)

# Take an explicit copy: assigning a column on a slice of merged_df is chained
# assignment (SettingWithCopyWarning, hidden here by the global warning filter).
predicted_demand_df = merged_df[['ProductName', 'OrderDate', 'Predicted_Demand']].copy()
# Map the integer product codes back to the original product names.
predicted_demand_df['ProductName'] = label_encoders['ProductName'].inverse_transform(predicted_demand_df['ProductName'])
print(predicted_demand_df.head())

In [None]:
# First rows of the predictions, followed by describe() summary statistics.
print(
    predicted_demand_df.head(),
    "\nSummary of Predicted Demand Data:",
    predicted_demand_df.describe(),
    sep="\n",
)

In [None]:
# Make sure the date column is datetime-typed before plotting against it.
predicted_demand_df['OrderDate'] = pd.to_datetime(predicted_demand_df['OrderDate'])
sns.set(style="whitegrid")

# --- 1. Predicted demand over time for the first five distinct products ---
sample_products = predicted_demand_df['ProductName'].unique()[:5]
sample_data = predicted_demand_df[predicted_demand_df['ProductName'].isin(sample_products)]
trend_fig, trend_ax = plt.subplots(figsize=(14, 7))
sns.lineplot(data=sample_data, x='OrderDate', y='Predicted_Demand', hue='ProductName', ax=trend_ax)
trend_ax.set_title('Predicted Demand Over Time for Sample Products')
trend_ax.set_xlabel('Order Date')
trend_ax.set_ylabel('Predicted Demand')
trend_ax.legend(title='Product Name')
plt.xticks(rotation=45)
trend_fig.tight_layout()
plt.show()

# --- 2. Histogram (with KDE overlay) of all predicted values ---
hist_fig, hist_ax = plt.subplots(figsize=(10, 6))
sns.histplot(predicted_demand_df['Predicted_Demand'], bins=30, kde=True, ax=hist_ax)
hist_ax.set_title('Distribution of Predicted Demand')
hist_ax.set_xlabel('Predicted Demand')
hist_ax.set_ylabel('Frequency')
hist_fig.tight_layout()
plt.show()

# --- 3. Ten products with the largest summed predicted demand ---
top_products = (
    predicted_demand_df
    .groupby('ProductName')['Predicted_Demand']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)
top_fig, top_ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=top_products.values, y=top_products.index, palette='viridis', ax=top_ax)
top_ax.set_title('Top 10 Products by Total Predicted Demand')
top_ax.set_xlabel('Total Predicted Demand')
top_ax.set_ylabel('Product Name')
top_fig.tight_layout()
plt.show()

In [None]:
import lightgbm as lgb

# LightGBM regressor with 100 boosting rounds, trained and scored on the
# current train/test split.
lgb_model = lgb.LGBMRegressor(
    objective='regression',
    n_estimators=100,
    random_state=42,
)
lgb_model.fit(X_train, y_train)

# Hold-out mean squared error.
y_pred_lgb = lgb_model.predict(X_test)
mse_lgb = mean_squared_error(y_test, y_pred_lgb)
print("LightGBM MSE:", mse_lgb)

In [None]:
import xgboost as xgb

# XGBoost regressor (squared-error objective), trained and scored on the
# current train/test split.
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    seed=42,
)
xgb_model.fit(X_train, y_train)

# Hold-out mean squared error.
y_pred_xgb = xgb_model.predict(X_test)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
print("XGBoost MSE:", mse_xgb)

In [None]:
# R-squared on the hold-out split for both boosted models.
r2_xgb = r2_score(y_test, y_pred_xgb)
r2_lgb = r2_score(y_test, y_pred_lgb)
print(
    f"LightGBM R-squared (R2): {r2_lgb}",
    f"XGBoost R-squared (R2): {r2_xgb}",
    sep="\n",
)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
# Small hand-written example frame.
# NOTE(review): this rebinds `predicted_demand_df`, replacing the model
# predictions built earlier in the notebook with five toy rows -- confirm
# that this is intended.
data = {
    'ProductName': ['Product A', 'Product B', 'Product C', 'Product A', 'Product B'],
    'OrderDate': pd.date_range(start='1/1/2023', periods=5, freq='D'),
    'Predicted_Demand': [20, 30, 15, 25, 35]
}
predicted_demand_df = pd.DataFrame(data)

# Numeric stand-ins for the product and the date.
predicted_demand_df['ProductID'] = predicted_demand_df['ProductName'].astype('category').cat.codes
predicted_demand_df['Month'] = predicted_demand_df['OrderDate'].dt.month
predicted_demand_df['DayOfWeek'] = predicted_demand_df['OrderDate'].dt.dayofweek
updated_features = ['ProductID', 'Month', 'DayOfWeek']
target = 'Predicted_Demand'

# All five dates fall in January, so 'Month' is constant and every
# correlation involving it is NaN.
correlation_matrix = predicted_demand_df[updated_features + [target]].corr()

# Flip the sign of every entry that is not exactly 1.
# Uses a vectorised where() instead of DataFrame.applymap, which is
# deprecated in recent pandas; the resulting values are identical.
# NOTE(review): negating the coefficients inverts what the heatmap reports --
# confirm that this "modified" view is really wanted.
modified_correlation_matrix = correlation_matrix.where(correlation_matrix == 1, -correlation_matrix)
# Annotated heatmap of the modified correlation matrix.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(modified_correlation_matrix, annot=True, cmap='coolwarm', ax=heatmap_ax)
heatmap_ax.set_title('Correlation Matrix (Modified)')
plt.show()
# Horizontal bar chart comparing the three models' mean squared error.
# NOTE(review): the values are typed in by hand rather than read from the
# mse_* variables computed in earlier cells -- keep them in sync manually.
model_names = ['Random Forest', 'LightGBM', 'XGBoost']
mse_values = [3.715208200498192, 3.8045362208255464, 3.761866133614211]

plt.figure(figsize=(14, 8))
bars = plt.barh(model_names, mse_values, color='skyblue', edgecolor='black')

# Print each bar's value just past its right-hand end.
for bar in bars:
    bar_length = bar.get_width()
    bar_middle = bar.get_y() + bar.get_height() / 2
    plt.text(bar_length + 0.005, bar_middle, f'{bar_length:.4f}',
             ha='left', va='center', fontsize=12, color='black', fontweight='bold')

# Position (first occurrence) and value of the smallest and largest MSE.
best_model_idx = min(range(len(mse_values)), key=mse_values.__getitem__)
worst_model_idx = max(range(len(mse_values)), key=mse_values.__getitem__)
min_mse = mse_values[best_model_idx]
max_mse = mse_values[worst_model_idx]

# Arrow call-outs for the best and worst bars; bar i sits at y == i.
for caption, value, row, colour in (
    ('Best Model', min_mse, best_model_idx, 'green'),
    ('Worst Model', max_mse, worst_model_idx, 'red'),
):
    plt.annotate(f'{caption}\n{value:.4f}', xy=(value, row), xytext=(value + 0.05, row + 0.2),
                 arrowprops=dict(facecolor=colour, shrink=0.05), fontsize=12, color=colour,
                 fontweight='bold', ha='center')

# Labels, grid and cosmetic settings.
plt.xlabel('Mean Squared Error', fontsize=14, labelpad=15)
plt.ylabel('Model', fontsize=14, labelpad=15)
plt.title('Model Comparison: Mean Squared Error', fontsize=18, pad=20)
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.box(False)
plt.legend(['MSE Value'], loc='upper right', fontsize=12)
plt.show()
# Bar chart of the ten largest random-forest feature importances.
plt.figure(figsize=(12, 8))

# Use the fitted model and the columns it was trained on.
# `rf_model` is only ever passed to the search objects as a template (they
# fit clones of it), so it is unfitted here and has no feature_importances_;
# `updated_features` holds the three toy-example columns, which do not match
# the model's inputs. `best_rf_model` is the lag-feature model loaded above
# and `X` is the 14-column frame built for it.
feature_importance = pd.Series(best_rf_model.feature_importances_, index=X.columns)
ax = feature_importance.nlargest(10).plot(kind='barh', color='skyblue', edgecolor='black')

# Write each bar's value just past its right-hand end.
for patch in ax.patches:
    ax.text(patch.get_width() + 0.01, patch.get_y() + patch.get_height()/2,
            f'{patch.get_width():.4f}',
            ha='left',
            va='center',
            fontsize=12,
            color='black',
            fontweight='bold')
plt.xlabel('Feature Importance', fontsize=14)
plt.title('Feature Importance: Random Forest', fontsize=16)
plt.show()