In [1]:
"""
Olist 电商数据分析 - 探索性数据分析 (EDA)
====================================
此脚本用于进行探索性数据分析，发现数据中的模式和趋势
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Project locations, expressed relative to the current working directory.
BASE_DIR = Path("..")
DATA_DIR = BASE_DIR.joinpath("Brazilian E-Commerce Public Dataset by Olist")
OUTPUT_DIR = BASE_DIR.joinpath("outputs", "charts")
# Create the chart directory up front so later savefig calls have a target.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Plot appearance: seaborn grid style and palette, plus a font fallback list
# that includes fonts able to render the Chinese labels used in the charts.
sns.set_style("whitegrid")
sns.set_palette("husl")
plt.rcParams.update({
    'font.sans-serif': ['Arial Unicode MS', 'SimHei', 'sans-serif'],
    # Use the ASCII hyphen for negative numbers instead of the Unicode minus.
    'axes.unicode_minus': False,
})

def load_cleaned_data(cleaned_dir=None):
    """Load every cleaned CSV table into a dictionary of DataFrames.

    Files are matched with the ``*_cleaned.csv`` pattern. The dictionary key
    is the file name with the extension and the trailing ``_cleaned`` removed.

    Args:
        cleaned_dir: Directory to scan. When omitted, falls back to
            ``BASE_DIR / "outputs" / "cleaned_data"``.

    Returns:
        dict mapping dataset name to the DataFrame read from its file. The
        dict is empty when the directory is missing or holds no matching
        file; a warning line is printed in that case.
    """
    print("加载数据...")
    if cleaned_dir is None:
        cleaned_dir = BASE_DIR / "outputs" / "cleaned_data"
    cleaned_dir = Path(cleaned_dir)

    suffix = "_cleaned"
    datasets = {}
    # glob() yields entries in filesystem-dependent order; sorting keeps the
    # load order (and therefore the log) reproducible across machines.
    for file in sorted(cleaned_dir.glob(f"*{suffix}.csv")):
        # Strip only the trailing suffix: str.replace would also remove a
        # "_cleaned" that happens to appear earlier in the file name.
        name = file.stem[:-len(suffix)]
        datasets[name] = pd.read_csv(file)
        print(f"加载 {name}: {len(datasets[name])} 行")

    if not datasets:
        # Surface the problem here instead of letting every analysis step
        # be skipped without explanation.
        print(f"⚠ 未找到清洗后的数据文件: {cleaned_dir}")

    return datasets

def analyze_orders(df_orders, df_order_items):
    """Summarise orders and chart monthly sales and order-value distribution.

    Item rows are aggregated per order (summed price, summed freight, item
    count) and left-joined onto the orders table. Only rows whose
    ``order_status`` equals ``'delivered'`` feed the charts and statistics.

    Args:
        df_orders: Orders table with ``order_id`` and ``order_status``;
            ``order_purchase_timestamp`` is used for the monthly trend when
            the column is present.
        df_order_items: Order-items table with ``order_id``, ``price``,
            ``freight_value`` and ``order_item_id``.

    Returns:
        None. Results are printed, and ``monthly_sales_trend.png`` and
        ``order_value_distribution.png`` are written to ``OUTPUT_DIR``.
    """
    separator = "=" * 60
    print("\n" + separator)
    print("订单分析")
    print(separator)

    # One row per order: total price, total freight and number of items.
    per_order = (
        df_order_items
        .groupby('order_id')
        .agg({'price': 'sum', 'freight_value': 'sum', 'order_item_id': 'count'})
        .reset_index()
    )
    df = df_orders.merge(per_order, on='order_id', how='left')

    # Restrict the analysis to delivered orders.
    is_delivered = df['order_status'] == 'delivered'
    df_delivered = df[is_delivered].copy()

    print(f"\n总订单数: {len(df)}")
    print(f"已交付订单: {len(df_delivered)}")
    print(f"订单状态分布:\n{df['order_status'].value_counts()}")

    # Monthly time series.
    if 'order_purchase_timestamp' in df_delivered.columns:
        purchase_time = pd.to_datetime(df_delivered['order_purchase_timestamp'])
        df_delivered['order_purchase_timestamp'] = purchase_time
        df_delivered['month'] = purchase_time.dt.to_period('M')

        monthly_sales = (
            df_delivered
            .groupby('month')
            .agg({'order_id': 'count', 'price': 'sum', 'freight_value': 'sum'})
            .reset_index()
        )
        monthly_sales['total_value'] = (
            monthly_sales['price'] + monthly_sales['freight_value']
        )

        print("\n月度销售趋势:")
        print(monthly_sales)

        fig, (value_ax, count_ax) = plt.subplots(2, 1, figsize=(14, 10))

        # Periods are converted to strings so they can be used as x labels.
        monthly_sales['month'] = monthly_sales['month'].astype(str)
        month_labels = monthly_sales['month']
        value_ax.plot(month_labels, monthly_sales['total_value'], marker='o')
        count_ax.bar(month_labels, monthly_sales['order_id'])

        # Both panels share the same axis decoration apart from the texts.
        for ax, title, y_label in (
            (value_ax, '月度销售额趋势', '销售额 (BRL)'),
            (count_ax, '月度订单数量', '订单数'),
        ):
            ax.set_title(title, fontsize=14, fontweight='bold')
            ax.set_xlabel('月份')
            ax.set_ylabel(y_label)
            ax.tick_params(axis='x', rotation=45)
            ax.grid(True, alpha=0.3)

        trend_path = OUTPUT_DIR / 'monthly_sales_trend.png'
        plt.tight_layout()
        plt.savefig(trend_path, dpi=300, bbox_inches='tight')
        print(f"\n✓ 图表已保存: {trend_path}")
        plt.close()

    # Distribution of per-order value, on a linear and a log1p scale.
    if 'price' in df_delivered.columns:
        fig, (linear_ax, log_ax) = plt.subplots(1, 2, figsize=(14, 5))

        linear_ax.hist(df_delivered['price'], bins=50, edgecolor='black', alpha=0.7)
        df_delivered['price_log'] = np.log1p(df_delivered['price'])
        log_ax.hist(
            df_delivered['price_log'],
            bins=50,
            edgecolor='black',
            alpha=0.7,
            color='orange',
        )

        for ax, title, x_label in (
            (linear_ax, '订单金额分布', '订单金额 (BRL)'),
            (log_ax, '订单金额分布（对数尺度）', 'Log(订单金额 (BRL))'),
        ):
            ax.set_title(title, fontsize=14, fontweight='bold')
            ax.set_xlabel(x_label)
            ax.set_ylabel('频数')
            ax.grid(True, alpha=0.3)

        distribution_path = OUTPUT_DIR / 'order_value_distribution.png'
        plt.tight_layout()
        plt.savefig(distribution_path, dpi=300, bbox_inches='tight')
        print(f"✓ 图表已保存: {distribution_path}")
        plt.close()

        print("\n订单价值统计:")
        print(df_delivered['price'].describe())

def analyze_customers(df_customers, df_orders, df_order_items):
    """Analyse customers by state and measure repeat purchasing.

    Orders are left-joined with customers (on ``customer_id``) and with
    per-order item totals (on ``order_id``). Only rows whose ``order_status``
    equals ``'delivered'`` are analysed.

    Args:
        df_customers: Customers table with ``customer_id``,
            ``customer_unique_id`` and ``customer_state``.
        df_orders: Orders table with ``order_id``, ``customer_id`` and
            ``order_status``.
        df_order_items: Order-items table with ``order_id``, ``price`` and
            ``freight_value``.

    Returns:
        None. Results are printed, and ``customer_state_analysis.png`` and
        ``customer_repeat_purchase.png`` are written to ``OUTPUT_DIR``.
    """
    print("\n" + "=" * 60)
    print("客户分析")
    print("=" * 60)

    # One row per order with summed price and freight.
    order_totals = df_order_items.groupby('order_id').agg({
        'price': 'sum',
        'freight_value': 'sum'
    }).reset_index()

    df_merged = df_orders.merge(df_customers, on='customer_id', how='left')
    df_merged = df_merged.merge(order_totals, on='order_id', how='left')

    df_delivered = df_merged[df_merged['order_status'] == 'delivered'].copy()

    # Per-state customers, orders and revenue; keep the 10 highest-revenue states.
    state_stats = df_delivered.groupby('customer_state').agg({
        'customer_unique_id': 'nunique',
        'order_id': 'count',
        'price': 'sum'
    }).reset_index()
    state_stats.columns = ['state', 'unique_customers', 'order_count', 'revenue']
    state_stats = state_stats.sort_values('revenue', ascending=False).head(10)

    print("\n前10州销售情况:")
    print(state_stats)

    # State charts.
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    axes[0].barh(state_stats['state'], state_stats['revenue'])
    axes[0].set_title('各州销售额排名 (TOP 10)', fontsize=14, fontweight='bold')
    axes[0].set_xlabel('销售额 (BRL)')
    axes[0].invert_yaxis()  # largest revenue at the top
    axes[0].grid(True, alpha=0.3)

    # NOTE(review): the states shown here are the top 10 by revenue (see the
    # sort above), so this panel is not necessarily the top 10 by customer
    # count and its bars are not ordered by customer count.
    axes[1].barh(state_stats['state'], state_stats['unique_customers'], color='orange')
    axes[1].set_title('各州客户数量 (TOP 10)', fontsize=14, fontweight='bold')
    axes[1].set_xlabel('客户数量')
    axes[1].invert_yaxis()
    axes[1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig(OUTPUT_DIR / 'customer_state_analysis.png', dpi=300, bbox_inches='tight')
    print(f"\n✓ 图表已保存: {OUTPUT_DIR / 'customer_state_analysis.png'}")
    plt.close()

    # Repeat purchases: delivered orders and spend per unique customer.
    customer_orders = df_delivered.groupby('customer_unique_id').agg({
        'order_id': 'count',
        'price': 'sum'
    }).reset_index()
    customer_orders.columns = ['customer_id', 'order_count', 'total_spent']

    repeat_customers = customer_orders[customer_orders['order_count'] > 1]
    total_customers = len(customer_orders)
    # Guard against an input without delivered orders, which would otherwise
    # raise ZeroDivisionError.
    if total_customers:
        repeat_rate = len(repeat_customers) / total_customers * 100
    else:
        repeat_rate = 0.0

    print("\n复购分析:")
    print(f"总客户数: {total_customers}")
    print(f"复购客户数: {len(repeat_customers)}")
    print(f"复购率: {repeat_rate:.2f}%")

    # Number of customers for each of the 10 smallest order counts.
    order_count_dist = customer_orders['order_count'].value_counts().sort_index().head(10)
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(order_count_dist.index.astype(str), order_count_dist.values, edgecolor='black', alpha=0.7)
    ax.set_title('客户订单数量分布', fontsize=14, fontweight='bold')
    ax.set_xlabel('订单数量')
    ax.set_ylabel('客户数')
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(OUTPUT_DIR / 'customer_repeat_purchase.png', dpi=300, bbox_inches='tight')
    print(f"✓ 图表已保存: {OUTPUT_DIR / 'customer_repeat_purchase.png'}")
    plt.close()

def analyze_products(df_products, df_order_items, df_orders):
    """Analyse revenue by product category and the item price distribution.

    Order items are left-joined with products (on ``product_id``) and with
    the order status (on ``order_id``). Only items belonging to orders whose
    status equals ``'delivered'`` are analysed.

    Args:
        df_products: Products table with ``product_id`` and a category
            column (``product_category_name_english`` is preferred,
            ``product_category_name`` is the fallback).
        df_order_items: Order-items table with ``order_id``, ``product_id``
            and ``price``.
        df_orders: Orders table with ``order_id`` and ``order_status``.

    Returns:
        None. Results are printed, and ``product_category_analysis.png``
        (when a category column exists) and ``product_price_analysis.png``
        are written to ``OUTPUT_DIR``.
    """
    print("\n" + "=" * 60)
    print("产品分析")
    print("=" * 60)

    # Attach product attributes and order status to every item row.
    df_merged = df_order_items.merge(df_products, on='product_id', how='left')
    df_merged = df_merged.merge(df_orders[['order_id', 'order_status']], on='order_id', how='left')

    df_delivered = df_merged[df_merged['order_status'] == 'delivered'].copy()

    # Prefer the English category name when it is available.
    category_col = 'product_category_name_english' if 'product_category_name_english' in df_delivered.columns else 'product_category_name'

    if category_col in df_delivered.columns:
        category_stats = df_delivered.groupby(category_col).agg({
            'order_id': 'nunique',
            'product_id': 'nunique',
            'price': 'sum'
        }).reset_index()
        category_stats.columns = ['category', 'order_count', 'product_count', 'revenue']
        category_stats = category_stats.sort_values('revenue', ascending=False).head(15)

        print("\n前15产品类别销售情况:")
        print(category_stats)

        # Horizontal bar chart of the 15 highest-revenue categories.
        fig, ax = plt.subplots(figsize=(14, 8))
        ax.barh(category_stats['category'], category_stats['revenue'])
        ax.set_title('产品类别销售额排名 (TOP 15)', fontsize=14, fontweight='bold')
        ax.set_xlabel('销售额 (BRL)')
        ax.invert_yaxis()  # largest revenue at the top
        ax.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.savefig(OUTPUT_DIR / 'product_category_analysis.png', dpi=300, bbox_inches='tight')
        print(f"\n✓ 图表已保存: {OUTPUT_DIR / 'product_category_analysis.png'}")
        plt.close()

    # Price statistics.
    print("\n产品价格统计:")
    print(df_delivered['price'].describe())

    # Price distribution, restricted to the displayed 0-500 BRL window.
    price_cap = 500
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Bin only the displayed window. Without `range`, the 100 bins are spread
    # over the full price range and only a few very wide bins fall inside the
    # 0-500 view, hiding the shape of the distribution.
    axes[0].hist(
        df_delivered['price'],
        bins=100,
        range=(0, price_cap),
        edgecolor='black',
        alpha=0.7,
    )
    axes[0].set_title('产品价格分布', fontsize=14, fontweight='bold')
    axes[0].set_xlabel('价格 (BRL)')
    axes[0].set_ylabel('频数')
    axes[0].set_xlim(0, price_cap)
    axes[0].grid(True, alpha=0.3)

    # Box plot over the same price window.
    price_sample = df_delivered[df_delivered['price'] <= price_cap]['price']
    axes[1].boxplot(price_sample, vert=True)
    axes[1].set_title('产品价格箱线图 (≤500 BRL)', fontsize=14, fontweight='bold')
    axes[1].set_ylabel('价格 (BRL)')
    axes[1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig(OUTPUT_DIR / 'product_price_analysis.png', dpi=300, bbox_inches='tight')
    print(f"✓ 图表已保存: {OUTPUT_DIR / 'product_price_analysis.png'}")
    plt.close()

def analyze_payments(df_payments):
    """Summarise payments per payment type and chart totals and order counts.

    For every ``payment_type`` the summary holds the number of distinct
    orders and the sum, mean and count of ``payment_value``, ordered by
    total value in descending order.

    Args:
        df_payments: Payments table with ``payment_type``, ``order_id`` and
            ``payment_value``.

    Returns:
        None. The summary is printed and ``payment_analysis.png`` is written
        to ``OUTPUT_DIR``.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("支付分析")
    print(rule)

    # Named aggregation yields the flat, already-labelled summary columns.
    payment_stats = (
        df_payments
        .groupby('payment_type')
        .agg(
            order_count=('order_id', 'nunique'),
            total_value=('payment_value', 'sum'),
            avg_value=('payment_value', 'mean'),
            payment_count=('payment_value', 'count'),
        )
        .reset_index()
        .sort_values('total_value', ascending=False)
    )

    print("\n支付方式统计:")
    print(payment_stats)

    # Side-by-side bars: total paid amount and number of orders per type.
    fig, (value_ax, orders_ax) = plt.subplots(1, 2, figsize=(14, 6))
    payment_types = payment_stats['payment_type']

    value_ax.bar(payment_types, payment_stats['total_value'], alpha=0.7)
    orders_ax.bar(payment_types, payment_stats['order_count'], color='orange', alpha=0.7)

    for ax, title, y_label in (
        (value_ax, '各支付方式总金额', '总金额 (BRL)'),
        (orders_ax, '各支付方式订单数', '订单数'),
    ):
        ax.set_title(title, fontsize=14, fontweight='bold')
        ax.set_xlabel('支付方式')
        ax.set_ylabel(y_label)
        ax.tick_params(axis='x', rotation=45)
        ax.grid(True, alpha=0.3)

    chart_path = OUTPUT_DIR / 'payment_analysis.png'
    plt.tight_layout()
    plt.savefig(chart_path, dpi=300, bbox_inches='tight')
    print(f"\n✓ 图表已保存: {chart_path}")
    plt.close()

def analyze_reviews(df_reviews):
    """Print review-score statistics and chart the score distribution.

    Args:
        df_reviews: Reviews table with a ``review_score`` column.

    Returns:
        None. Statistics are printed and ``review_analysis.png`` (a bar
        chart next to a pie chart of the score counts) is written to
        ``OUTPUT_DIR``.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("评价分析")
    print(rule)

    scores = df_reviews['review_score']

    print("\n评价统计:")
    print(scores.describe())

    # Number of reviews per score, ordered by score value.
    score_dist = scores.value_counts().sort_index()
    print(f"\n评分分布:\n{score_dist}")

    avg_score = scores.mean()
    print(f"\n平均评分: {avg_score:.2f}")

    # The same counts drawn twice: absolute (bars) and relative (pie).
    fig, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(14, 5))
    score_labels = score_dist.index.astype(str)
    score_counts = score_dist.values

    bar_ax.bar(score_labels, score_counts, alpha=0.7, color='steelblue')
    bar_ax.set_title('评价分数分布', fontsize=14, fontweight='bold')
    bar_ax.set_xlabel('评分')
    bar_ax.set_ylabel('数量')
    bar_ax.grid(True, alpha=0.3)

    pie_ax.pie(score_counts, labels=score_labels, autopct='%1.1f%%', startangle=90)
    pie_ax.set_title('评价分数占比', fontsize=14, fontweight='bold')

    chart_path = OUTPUT_DIR / 'review_analysis.png'
    plt.tight_layout()
    plt.savefig(chart_path, dpi=300, bbox_inches='tight')
    print(f"\n✓ 图表已保存: {chart_path}")
    plt.close()

def main():
    """Load the cleaned tables and run every analysis whose inputs exist.

    Each analysis step is skipped when any dataset it needs is absent from
    the loaded dictionary.
    """
    rule = "=" * 60
    print(rule)
    print("探索性数据分析 (EDA)")
    print(rule)

    datasets = load_cleaned_data()

    # (analysis function, dataset names it needs, in positional-argument order)
    steps = (
        (analyze_orders, ('orders', 'order_items')),
        (analyze_customers, ('customers', 'orders', 'order_items')),
        (analyze_products, ('products', 'order_items', 'orders')),
        (analyze_payments, ('order_payments',)),
        (analyze_reviews, ('order_reviews',)),
    )
    for analysis, required in steps:
        if all(name in datasets for name in required):
            analysis(*(datasets[name] for name in required))

    print("\n" + rule)
    print("✓ 探索性数据分析完成！")
    print(rule)

# Run the full analysis pipeline as soon as this code is executed.
main()


探索性数据分析 (EDA)
加载数据...
加载 customers: 99441 行
加载 sellers: 3095 行
加载 products: 32951 行
加载 order_items: 112650 行
加载 orders: 99441 行
加载 order_reviews: 99224 行
加载 order_payments: 103886 行

订单分析

总订单数: 99441
已交付订单: 96478
订单状态分布:
order_status
delivered      96478
shipped         1107
canceled         625
unavailable      609
invoiced         314
processing       301
created            5
approved           2
Name: count, dtype: int64

月度销售趋势:
      month  order_id      price  freight_value  total_value
0   2016-09         1     134.97           8.49       143.46
1   2016-10       265   40325.11        6165.55     46490.66
2   2016-12         1      10.90           8.72        19.62
3   2017-01       750  111798.36       15684.01    127482.37
4   2017-02      1653  234223.40       37015.92    271239.32
5   2017-03      2546  359198.85       55132.10    414330.95
6   2017-04      2303  340669.68       50142.72    390812.40
7   2017-05      3546  489338.25       77513.15    566851.40
8   2017-06  