# 👗 Fashion Retail Sales 数据分析项目

本项目基于 `Fashion_Retail_Sales.csv`，按六大数据分析步骤进行：
1. 数据清洗
2. 数据预处理
3. 探索性数据分析
4. 业务关联分析
5. 时间模式挖掘
6. 建模与验证

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Display settings.
sns.set(style='whitegrid')
plt.rcParams['figure.figsize'] = (10, 4)
# Every chart title/label in this notebook is Chinese, and matplotlib's default
# fonts have no CJK glyphs (the text would render as empty boxes). List common
# CJK-capable fonts first and keep DejaVu Sans as the always-available
# fallback. This must come after sns.set(), which resets the font settings.
plt.rcParams['font.sans-serif'] = [
    'SimHei',
    'Microsoft YaHei',
    'PingFang SC',
    'Noto Sans CJK SC',
    'WenQuanYi Micro Hei',
    'Arial Unicode MS',
    'DejaVu Sans',
]
# Many CJK fonts lack the Unicode minus sign; use the ASCII hyphen so negative
# tick labels still render.
plt.rcParams['axes.unicode_minus'] = False

# Load the raw sales data and preview the first rows.
df = pd.read_csv('Fashion_Retail_Sales.csv')
df.head()

## Step 1️⃣ 数据清洗

In [None]:
# Map the raw CSV headers onto snake_case column names.
cols = {
    'Customer Reference ID': 'customer_id',
    'Item Purchased': 'item_purchased',
    'Purchase Amount (USD)': 'purchase_amount_(usd)',
    'Date Purchase': 'date_purchase',
    'Review Rating': 'review_rating',
    'Payment Method': 'payment_method'
}
df = df.rename(columns=cols)

# Type conversion: dates are parsed as day-month-year; ratings that cannot be
# parsed as numbers become NaN.
df = df.assign(
    date_purchase=pd.to_datetime(df['date_purchase'], format='%d-%m-%Y'),
    review_rating=pd.to_numeric(df['review_rating'], errors='coerce'),
)

# Outlier handling: keep only amounts strictly between 0 and 10000, then drop
# every row that still has a missing value and renumber the index.
amount = df['purchase_amount_(usd)']
df = df[amount.gt(0) & amount.lt(10000)].dropna().reset_index(drop=True)
df.head()

## Step 2️⃣ 数据预处理

In [None]:
# Break the purchase date into separate calendar feature columns.
purchase_dt = df['date_purchase'].dt
df['purchase_year'] = purchase_dt.year
df['purchase_month'] = purchase_dt.month
# Day of week as an integer, where 0 = Monday.
df['purchase_weekday'] = purchase_dt.weekday

## Step 3️⃣ 探索性数据分析

In [None]:
# Distribution of the purchase amount: 30-bin histogram with a KDE overlay.
amounts = df['purchase_amount_(usd)']
sns.histplot(amounts, bins=30, kde=True)
plt.title('消费金额分布')
plt.show()

# The ten most frequently purchased items, as a bar chart of row counts.
top_items = df['item_purchased'].value_counts().head(10)
top_items.plot.bar()
plt.title('Top 10 商品')
plt.ylabel('销量')
plt.xticks(rotation=45)
plt.show()

## Step 4️⃣ 业务关联分析

In [None]:
# Purchase amount versus review rating.
# Ratings of 4 and above are labelled 'high'; everything else (including a
# missing rating, for which the comparison is False) is labelled 'low'.
is_high_rating = df['review_rating'].ge(4)
df['rating_group'] = is_high_rating.map({True: 'high', False: 'low'})
sns.boxplot(data=df, x='rating_group', y='purchase_amount_(usd)')
plt.title('高评分 vs 低评分客户的消费差异')
plt.show()

# Items ranked by mean review rating; keep the ten best.
mean_rating_by_item = df.groupby('item_purchased')['review_rating'].mean()
item_rating = mean_rating_by_item.sort_values(ascending=False).head(10)
item_rating.plot.bar()
plt.title('评分最高的商品')
plt.ylabel('平均评分')
plt.xticks(rotation=45)
plt.show()

## Step 5️⃣ 时间模式挖掘

In [None]:
# Monthly revenue trend: total purchase amount per calendar month.
monthly_sales = df.groupby(df['date_purchase'].dt.to_period('M'))['purchase_amount_(usd)'].sum()
# Convert the PeriodIndex to timestamps so the series plots on a date axis.
monthly_sales.index = monthly_sales.index.to_timestamp()
monthly_sales.plot(marker='o')
plt.title('月度销售趋势')
plt.grid(True)
plt.show()

# Daily revenue trend with a 7-day rolling mean.
daily_sales = df.groupby('date_purchase')['purchase_amount_(usd)'].sum()
# groupby only yields dates that have at least one purchase. Re-index onto a
# continuous daily calendar (days without sales count as 0) so that the
# row-based rolling(7) window below always covers exactly 7 calendar days.
daily_sales = daily_sales.asfreq('D', fill_value=0)
daily_sales.plot(label='每日销售', alpha=0.5)
daily_sales.rolling(7).mean().plot(label='7日平均', color='red')
plt.title('每日销售趋势 + 平滑')
plt.legend()
plt.show()

## Step 6️⃣ 建模与验证

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Build the regression data: the feature is the month's position in the series
# (0, 1, 2, ...) and the target is that month's total sales.
y = monthly_sales.to_numpy()
X = np.arange(len(monthly_sales)).reshape(-1, 1)

# Chronological split: the first 80% of months train, the remainder tests.
split_idx = int(len(X) * 0.8)
X_train, X_test = np.split(X, [split_idx])
y_train, y_test = np.split(y, [split_idx])

# Fit a linear trend on the training months and score it on the held-out ones.
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f'测试集 RMSE：{rmse:.2f}')

# Plot actual training/test values and the predictions on the date axis.
dates = monthly_sales.index
train_dates, test_dates = dates[:split_idx], dates[split_idx:]
plt.plot(train_dates, y_train, label='训练集')
plt.plot(test_dates, y_test, label='测试集')
plt.plot(test_dates, y_pred, linestyle='--', label='预测值')
plt.legend()
plt.title('月度销售额预测')
plt.show()