In [4]:
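# Main data features:
# Date: daily records from [start_date] to [end_date].
# Store ID and Product ID: unique identifiers for stores and products.
# Category: product category, e.g. electronics, clothing, groceries.
# Region: geographic region of the store.
# Inventory Level: stock on hand at the start of the day.
# Units Sold: units sold during the day.
# Demand Forecast: demand predicted from past trends.
# Weather Condition: daily weather affecting sales.
# Holiday/Promotion: indicator for a holiday or promotion.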
import pandas as pd

from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the retail inventory dataset. The path is relative to the directory the
# notebook is launched from.
# NOTE(review): the recorded run of this cell ended in FileNotFoundError for
# this path -- confirm where the CSV lives before relying on it.
df = pd.read_csv("../data/retail_store_inventory.csv")

# A notebook cell only renders its final bare expression, so every intermediate
# summary is printed explicitly; otherwise its result is silently discarded.
print(df.head())
print("shape:", df.shape)
df.info()  # prints its own report
print(df.describe())
print(df.describe(include='O'))
print("missing values per column:")
print(df.isnull().sum())

# Number of fully duplicated rows, left as the cell's displayed value.
df.duplicated().sum()

FileNotFoundError: [Errno 2] No such file or directory: '../data/retail_store_inventory.csv'

In [None]:
# Parse the Date column into datetimes, then derive the calendar year from it.
df['Date'] = pd.to_datetime(df['Date'])
df['Year'] = df['Date'].dt.year

# Histogram of inventory levels (30 bins) with a KDE curve on top.
inventory_ax = sns.histplot(df['Inventory Level'], bins=30, kde=True)
inventory_ax.set_title('Distribution of Inventory Levels')
inventory_ax.set_xlabel('Inventory Level')
inventory_ax.set_ylabel('Frequency')
plt.show()

# Switch to the dark grid style, then count rows per region split by category.
sns.set_style("darkgrid")
plt.figure(figsize=(12, 8))
sns.countplot(data=df, x="Region", hue="Category")
plt.show()

In [None]:
# Fix the colour cycle used by the plots drawn after this point.
custom_palette = ["red", "green", "orange", "blue"]
sns.set_palette(custom_palette)

# Price per year, one marked line per region.
plt.figure(figsize=(12, 8))
region_price_ax = sns.lineplot(data=df, x="Year", y="Price", hue="Region", marker="o")
region_price_ax.set_xlabel("Year")
region_price_ax.set_ylabel("Price")
region_price_ax.set_title("Price over years")
plt.show()

# Own price and competitor price per year, drawn on the same axes.
plt.figure(figsize=(14, 8))
for price_column in ("Price", "Competitor Pricing"):
    sns.lineplot(data=df, x="Year", y=price_column, label=price_column)
comparison_ax = plt.gca()
comparison_ax.set_xlabel("Time")
comparison_ax.set_ylabel("Values")
comparison_ax.set_title("Price vs Competitor Pricing Over Time")
comparison_ax.legend()
plt.show()

# Side-by-side box plots of the two price columns.
box_ax = sns.boxplot(data=df[["Price", "Competitor Pricing"]])
box_ax.set_xlabel("Category")
box_ax.set_ylabel("Values")
box_ax.set_title("Boxplot of Price vs Competitor Pricing")
plt.show()
# Bar plot of Units Sold per weather condition, split by category. No estimator
# is passed, so the bars use seaborn's default aggregation (the mean).
g = sns.catplot(
    x="Weather Condition",
    y="Units Sold",
    hue="Category",
    kind="bar",
    data=df
)
g.fig.suptitle("(measure by MEAN)Average Units Sold vs Weather Condition", fontsize=16)
plt.show()

# Same plot keyed by discount level. The title previously said "Total" although
# the bars are means; it now says "Average", matching the plot above.
g = sns.catplot(
    x="Discount",
    y="Units Sold",
    hue="Category",
    kind="bar",
    data=df
)

g.fig.suptitle("(measure by MEAN)Average Units Sold vs Discount by Category", fontsize=16)
plt.show()

# Raw scatter of discount against weather condition, coloured by category.
# Nothing is aggregated here, so the title no longer claims "measure by MEAN".
sns.relplot(
    x="Weather Condition",
    y="Discount",
    hue="Category",
    kind="scatter",
    data=df
)

plt.title('Discount by Weather Condition and Category')
plt.xlabel('Weather Condition')
plt.ylabel('Discount')
plt.show()

# Total units sold for every (discount, category) pair, as a flat table.
units_by_discount = df.groupby(["Discount", "Category"])["Units Sold"].sum()
df_sum = units_by_discount.reset_index()
g = sns.catplot(data=df_sum, x="Discount", y="Units Sold", hue="Category", kind="bar")
g.fig.suptitle("(measure by SUM)Total Units Sold vs Discount by Category", fontsize=16)
plt.show()

# Total units sold for every (weather condition, category) pair.
units_by_weather = df.groupby(["Weather Condition", "Category"])["Units Sold"].sum()
df_sum2 = units_by_weather.reset_index()
g = sns.catplot(data=df_sum2, x="Weather Condition", y="Units Sold", hue="Category", kind="bar")
g.fig.suptitle("(measure by SUM)Total Units Sold vs Weather Condition", fontsize=16)
plt.show()

# Row-level scatter of discount against the demand forecast.
plt.figure(figsize=(12, 6))
demand_ax = sns.scatterplot(data=df, x="Demand Forecast", y="Discount")
demand_ax.set_title("Relationship Between Demand Forecast and Discount")
demand_ax.set_xlabel("Demand Forecast")
demand_ax.set_ylabel("Discount")
plt.show()
# Work on a copy: a plain `dftocorr = df` is only an alias, so the encoding
# assignment below would overwrite the categorical columns of `df` itself and
# make the plotting code above non-repeatable on a re-run.
dftocorr = df.copy()

# Replace each categorical column with integer codes.
# The single encoder is refit for every column, so after this line `Lbl` only
# holds the classes of the last column processed.
Lbl = LabelEncoder()
columns_to_encode = ["Seasonality", "Weather Condition", "Region", "Category"]
dftocorr[columns_to_encode] = dftocorr[columns_to_encode].apply(Lbl.fit_transform)

# Drop the date and identifier columns, then compute pairwise correlations of
# the remaining columns.
dftocorr = dftocorr.drop(["Date", "Store ID", "Product ID"], axis=1)
corr_matrix = dftocorr.corr()

In [None]:
# Correlation coefficients, drawn as an annotated heatmap (two decimals per cell).
_, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    cbar=True,
    ax=heatmap_ax,
)
heatmap_ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
# Prepare the modelling data: the target is the demand forecast column and
# X keeps every other column of the encoded frame.
y = dftocorr["Demand Forecast"]
X = dftocorr.drop(columns=["Demand Forecast"])

In [None]:
# Standardise the three predictor columns used by the model.
# NOTE(review): the scaler is fit on all rows here, i.e. before the train/test
# split performed further down -- confirm that this is intended.
feature_columns = ["Units Sold", "Inventory Level", "Units Ordered"]
scaler = StandardScaler(
    copy=True,
    with_mean=True,
    with_std=True,
)
X_selected = scaler.fit_transform(dftocorr[feature_columns])

In [None]:
# Split into training and test sets: 20% of the rows are held out, with a
# seeded shuffle.
split_parts = train_test_split(
    X_selected,
    y,
    test_size=0.20,
    shuffle=True,
    random_state=44,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Build the random forest model (seeded) and print its score on both splits.
reg_moduel = RandomForestRegressor(random_state=44).fit(X_train, y_train)
for split_name, split_features, split_target in (
    ("train", X_train, y_train),
    ("test", X_test, y_test),
):
    print(f"{split_name} score ", reg_moduel.score(split_features, split_target))
import seaborn as sns

In [None]:
# Predict on the held-out rows.
y_pred = reg_moduel.predict(X_test)

# Actual-vs-predicted scatter plus the ideal "prediction == actual" line.
# The figure creation, the scatter and the reference line used to live in three
# separate cells; with per-cell figure rendering that yields an empty figure
# and two partial plots, so the whole figure is now built in a single cell.
plt.figure(figsize=(10, 6))
sns.scatterplot(x=y_test, y=y_pred, color='blue', alpha=0.4, s=80, label='Predicted vs Actual')

# Endpoints of the diagonal, taken from the range of the actual values.
actual_low, actual_high = y_test.min(), y_test.max()
plt.plot([actual_low, actual_high], [actual_low, actual_high], color='red', linestyle='--',
         label='Ideal Prediction Line')

plt.xlabel('Actual Values', fontsize=14)
plt.ylabel('Predicted Values', fontsize=14)
plt.title('Actual vs Predicted Values - Regression Model', fontsize=16)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Model predictions: put actual and predicted values side by side (the index
# comes from y_test) and show the first ten rows.
result_df = pd.DataFrame({'真实值': y_test, '预测值': y_pred})
result_df.head(10)

In [None]:
# Visualise the predictions: overlay predicted and actual target values for the
# first 200 held-out rows (fewer if the test set is smaller).
n_shown = min(200, len(y_test))
sample_positions = range(n_shown)

# Convert to plain arrays so the slice is positional regardless of the index
# carried by y_test.
predicted_values = np.asarray(y_pred)[:n_shown]
actual_values = np.asarray(y_test)[:n_shown]

plt.figure(figsize=(10, 6))
plt.plot(sample_positions, predicted_values, 'b', label='predict')
plt.plot(sample_positions, actual_values, 'r', label='test')
plt.legend(loc='upper right', fontsize=15)
# The axis labels previously described cars and prices; the x axis is the
# position within the test set and the plotted quantity is the Demand Forecast
# target.
plt.xlabel('Test sample index', fontdict={'weight': 'normal', 'size': 15})
plt.ylabel('Demand Forecast', fontdict={'weight': 'normal', 'size': 15})
plt.show()