In [1]:
"""
Olist 电商数据分析 - 数据可视化
====================================
此脚本生成完整的可视化报告
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Plot styling and CJK-capable fonts.
# NOTE(review): seaborn's set_style is understood to write its own
# 'font.sans-serif' list into rcParams, so the font override below must stay
# AFTER the set_style call — confirm against the installed seaborn version.
sns.set_style("whitegrid")
sns.set_palette("husl")
# Font fallback chain tried in order; the chart titles and labels in this file
# are Chinese, so at least one listed font must provide CJK glyphs.
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS', 'SimHei', 'DejaVu Sans']
# Draw the minus sign as an ASCII hyphen instead of U+2212, which some of the
# fonts above may not contain.
plt.rcParams['axes.unicode_minus'] = False

# Paths. BASE_DIR is relative, so it is resolved against the current working
# directory at the time each path is used (not against this file's location).
BASE_DIR = Path("..") 
OUTPUT_DIR = BASE_DIR / "outputs" / "charts"
# Side effect at import/cell execution: creates the chart directory if needed.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

def load_cleaned_data():
    """Load every ``*_cleaned.csv`` file from the cleaned-data directory.

    Files are read from ``BASE_DIR / "outputs" / "cleaned_data"`` in sorted
    path order so that loading (and the progress log) is deterministic across
    filesystems.

    Returns:
        dict: Maps each dataset name (the file stem with its trailing
        ``_cleaned`` marker removed) to the DataFrame read from that file.
        The dict is empty when no matching file exists; a warning is printed
        in that case.
    """
    print("加载数据...")
    cleaned_dir = BASE_DIR / "outputs" / "cleaned_data"
    suffix = "_cleaned"

    datasets = {}
    for csv_path in sorted(cleaned_dir.glob(f"*{suffix}.csv")):
        # Strip only the trailing marker; str.replace would also remove any
        # earlier "_cleaned" occurrence inside the stem.
        name = csv_path.stem[:-len(suffix)]
        frame = pd.read_csv(csv_path)
        datasets[name] = frame
        print(f"加载 {name}: {len(frame)} 行")

    if not datasets:
        # Make the root cause visible here instead of as a KeyError in a
        # later merge on an empty DataFrame.
        print(f"⚠ 未找到清洗后的数据文件: {cleaned_dir}")

    return datasets

def _style_panel(ax, title, xlabel=None, ylabel=None, label_size=10, grid_axis=None):
    """Apply the dashboard's shared title, axis-label and grid styling to one panel."""
    ax.set_title(title, fontsize=14, fontweight='bold', pad=10)
    if xlabel is not None:
        ax.set_xlabel(xlabel, fontsize=label_size)
    if ylabel is not None:
        ax.set_ylabel(ylabel, fontsize=label_size)
    if grid_axis is not None:
        ax.grid(True, alpha=0.3, axis=grid_axis)


def _plot_top_barh(ax, totals, tick_labels, tick_size, **bar_kwargs):
    """Draw a horizontal bar ranking of ``totals`` with the first entry on top."""
    positions = range(len(totals))
    ax.barh(positions, totals.values, alpha=0.7, **bar_kwargs)
    ax.set_yticks(positions)
    ax.set_yticklabels(tick_labels, fontsize=tick_size)
    # barh draws position 0 at the bottom; flip so the largest total is on top.
    ax.invert_yaxis()


def _draw_dashboard_panels(fig, df_orders, df_delivered, df_customers,
                           df_products, df_payments, df_reviews):
    """Populate ``fig`` with the ten dashboard panels on a 4x3 grid.

    ``df_delivered`` holds one row per order item of delivered orders; panels
    whose input table or column is absent are left as empty axes.
    """
    gs = fig.add_gridspec(4, 3, hspace=0.7, wspace=0.3)
    has_months = 'year_month' in df_delivered.columns

    # 1. Monthly sales trend (item price + freight).
    ax_sales = fig.add_subplot(gs[0, 0:2])
    if has_months:
        monthly_sales = df_delivered.groupby('year_month').agg({
            'price': 'sum',
            'freight_value': 'sum'
        }).reset_index()
        monthly_sales['total'] = monthly_sales['price'] + monthly_sales['freight_value']
        ax_sales.plot(monthly_sales['year_month'], monthly_sales['total'],
                      marker='o', linewidth=2, markersize=8)
        _style_panel(ax_sales, '月度销售额趋势', xlabel='月份', ylabel='销售额 (BRL)',
                     label_size=12, grid_axis='both')
        ax_sales.tick_params(axis='x', rotation=45)

    # 2. Order status distribution (all orders, not only delivered ones).
    ax_status = fig.add_subplot(gs[0, 2])
    if 'order_status' in df_orders.columns:
        status_counts = df_orders['order_status'].value_counts()
        ax_status.pie(status_counts.values, startangle=90)
        # Labels go in a legend outside the pie rather than on the wedges.
        ax_status.legend(status_counts.index, title="状态", loc="upper left",
                         bbox_to_anchor=(1, 1))
        _style_panel(ax_status, '订单状态分布')

    # 3. Payment type distribution.
    ax_pay = fig.add_subplot(gs[1, 0])
    if not df_payments.empty and 'payment_type' in df_payments.columns:
        payment_counts = df_payments['payment_type'].value_counts()
        ax_pay.bar(payment_counts.index, payment_counts.values, alpha=0.7)
        _style_panel(ax_pay, '支付方式分布', xlabel='支付方式', ylabel='订单数', grid_axis='y')
        ax_pay.tick_params(axis='x', rotation=45)

    # 4. Review score distribution, coloured by sentiment band.
    ax_review = fig.add_subplot(gs[1, 1])
    if not df_reviews.empty and 'review_score' in df_reviews.columns:
        score_counts = df_reviews['review_score'].value_counts().sort_index()
        colors = ['red' if x <= 2 else 'orange' if x == 3 else 'green'
                  for x in score_counts.index]
        ax_review.bar(score_counts.index.astype(str), score_counts.values,
                      alpha=0.7, color=colors)
        _style_panel(ax_review, '评价分数分布', xlabel='评分', ylabel='数量', grid_axis='y')

    # 5. Top 10 product categories by item revenue.
    ax_cat = fig.add_subplot(gs[1, 2])
    if not df_products.empty:
        # Prefer the English category name when that column is available.
        category_col = ('product_category_name_english'
                        if 'product_category_name_english' in df_products.columns
                        else 'product_category_name')
        if category_col in df_products.columns:
            df_with_category = df_delivered.merge(
                df_products[['product_id', category_col]],
                on='product_id',
                how='left'
            )
            top_categories = (df_with_category.groupby(category_col)['price']
                              .sum().sort_values(ascending=False).head(10))
            # Truncate long category names so the tick labels fit the panel.
            _plot_top_barh(ax_cat, top_categories,
                           [str(c)[:20] for c in top_categories.index], tick_size=8)
            _style_panel(ax_cat, '产品类别销售额 TOP 10', xlabel='销售额 (BRL)', grid_axis='x')

    # 6. Top 10 customer states by item revenue.
    ax_state = fig.add_subplot(gs[2, 0])
    if not df_customers.empty:
        df_with_state = df_delivered.merge(
            df_customers[['customer_id', 'customer_state']],
            on='customer_id',
            how='left'
        )
        state_sales = (df_with_state.groupby('customer_state')['price']
                       .sum().sort_values(ascending=False).head(10))
        _plot_top_barh(ax_state, state_sales, state_sales.index, tick_size=10,
                       color='steelblue')
        _style_panel(ax_state, '各州销售额排名 TOP 10', xlabel='销售额 (BRL)', grid_axis='x')

    # 7. Distribution of per-order totals, capped at 500 BRL for readability.
    ax_amount = fig.add_subplot(gs[2, 1])
    order_totals = df_delivered.groupby('order_id')['price'].sum()
    ax_amount.hist(order_totals[order_totals <= 500], bins=50, alpha=0.7, edgecolor='black')
    _style_panel(ax_amount, '订单金额分布 (≤500 BRL)', xlabel='订单金额 (BRL)',
                 ylabel='频数', grid_axis='y')

    # 8. Delivery time distribution.
    ax_delivery = fig.add_subplot(gs[2, 2])
    if ('order_delivered_customer_date' in df_delivered.columns
            and 'order_purchase_timestamp' in df_delivered.columns):
        # df_delivered has one row per order ITEM; the y axis counts orders,
        # so keep a single row per order before measuring delivery time.
        per_order = df_delivered.drop_duplicates(subset='order_id')
        delivery_days = (
            pd.to_datetime(per_order['order_delivered_customer_date']) -
            pd.to_datetime(per_order['order_purchase_timestamp'])
        ).dt.days.dropna()
        # Discard negative durations and the long tail beyond 50 days.
        delivery_days = delivery_days[(delivery_days >= 0) & (delivery_days <= 50)]
        ax_delivery.hist(delivery_days, bins=30, alpha=0.7, edgecolor='black', color='orange')
        _style_panel(ax_delivery, '配送时间分布', xlabel='配送天数', ylabel='订单数', grid_axis='y')

    # 9. Monthly number of distinct delivered orders.
    ax_orders = fig.add_subplot(gs[3, 0:2])
    if has_months:
        monthly_orders = df_delivered.groupby('year_month')['order_id'].nunique()
        positions = range(len(monthly_orders))
        ax_orders.bar(positions, monthly_orders.values, alpha=0.7, color='green')
        ax_orders.set_xticks(positions)
        ax_orders.set_xticklabels(monthly_orders.index, rotation=45, ha='right', fontsize=10)
        _style_panel(ax_orders, '月度订单数量', xlabel='月份', ylabel='订单数',
                     label_size=12, grid_axis='y')

    # 10. Key metrics summary rendered as a text box.
    ax_metrics = fig.add_subplot(gs[3, 2])
    ax_metrics.axis('off')

    # NOTE(review): revenue here excludes freight, whereas panel 1 includes
    # it — confirm which definition of "sales" is intended.
    total_revenue = df_delivered['price'].sum()
    total_orders = df_delivered['order_id'].nunique()
    # NOTE(review): if customer_id is unique per order in this dataset, this
    # equals the order count — confirm whether a per-person id was intended.
    total_customers = (df_delivered['customer_id'].nunique()
                       if 'customer_id' in df_delivered.columns else 0)
    avg_order_value = total_revenue / total_orders if total_orders > 0 else 0

    metrics_text = f"""
关键指标汇总

总销售额: {total_revenue:,.0f} BRL
总订单数: {total_orders:,}
总客户数: {total_customers:,}
平均订单金额: {avg_order_value:.2f} BRL
    """

    ax_metrics.text(0.1, 0.5, metrics_text, fontsize=12, verticalalignment='center',
                    bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.3))
    _style_panel(ax_metrics, '关键指标')


def create_comprehensive_dashboard(datasets):
    """Render the ten-panel overview dashboard and save it as a PNG.

    Orders are inner-joined with order items and restricted to rows whose
    ``order_status`` is ``'delivered'``; most panels are computed from that
    item-level table. The image is written to
    ``OUTPUT_DIR / 'comprehensive_dashboard.png'``.

    Args:
        datasets: Mapping of dataset name to DataFrame. ``'orders'`` and
            ``'order_items'`` are required; ``'customers'``, ``'products'``,
            ``'order_payments'`` and ``'order_reviews'`` are optional and
            their panels stay empty when the table is absent.

    Raises:
        KeyError: If a required table lacks ``order_id``, or the merged data
            lacks ``order_status`` or ``price``.
    """
    print("\n" + "=" * 60)
    print("创建综合可视化仪表板")
    print("=" * 60)

    df_orders = datasets.get('orders', pd.DataFrame())
    df_order_items = datasets.get('order_items', pd.DataFrame())
    df_customers = datasets.get('customers', pd.DataFrame())
    df_products = datasets.get('products', pd.DataFrame())
    df_payments = datasets.get('order_payments', pd.DataFrame())
    df_reviews = datasets.get('order_reviews', pd.DataFrame())

    # Fail early with a readable message (same exception type the merge and
    # column lookups would raise) before any figure is allocated.
    for table_name, frame in (('orders', df_orders), ('order_items', df_order_items)):
        if 'order_id' not in frame.columns:
            raise KeyError(f"数据集 '{table_name}' 缺失或不含 'order_id' 列")

    df_merged = df_orders.merge(df_order_items, on='order_id', how='inner')
    missing_cols = [col for col in ('order_status', 'price') if col not in df_merged.columns]
    if missing_cols:
        raise KeyError(f"合并后的订单数据缺少必需列: {missing_cols}")

    df_delivered = df_merged[df_merged['order_status'] == 'delivered'].copy()

    # Parse the purchase timestamp once and derive a 'YYYY-MM' label used by
    # the monthly panels.
    if 'order_purchase_timestamp' in df_delivered.columns:
        df_delivered['order_purchase_timestamp'] = pd.to_datetime(
            df_delivered['order_purchase_timestamp']
        )
        df_delivered['year_month'] = (
            df_delivered['order_purchase_timestamp'].dt.to_period('M').astype(str)
        )

    fig = plt.figure(figsize=(20, 16))
    try:
        _draw_dashboard_panels(fig, df_orders, df_delivered, df_customers,
                               df_products, df_payments, df_reviews)
        fig.suptitle('Olist 电商数据分析综合仪表板', fontsize=20, fontweight='bold', y=0.98)
        output_path = OUTPUT_DIR / 'comprehensive_dashboard.png'
        fig.savefig(output_path, dpi=300, bbox_inches='tight')
        print(f"\n✓ 综合仪表板已保存: {output_path}")
    finally:
        # Release the figure even when a panel or the save step raises.
        plt.close(fig)

def create_correlation_heatmap(datasets):
    """Plot and save the correlation matrix of price, freight and product size.

    Order items are inner-joined with products on ``product_id``; rows with a
    missing value in any of the six numeric feature columns are dropped before
    the correlation is computed. The heatmap is written to
    ``OUTPUT_DIR / 'correlation_heatmap.png'``.

    Args:
        datasets: Mapping of dataset name to DataFrame; the ``'order_items'``
            and ``'products'`` entries are used.
    """
    print("\n创建相关性分析图...")

    items = datasets.get('order_items', pd.DataFrame())
    products = datasets.get('products', pd.DataFrame())

    # Numeric features whose pairwise correlation is shown.
    feature_cols = [
        'price',
        'freight_value',
        'product_weight_g',
        'product_length_cm',
        'product_height_cm',
        'product_width_cm',
    ]

    item_features = items.merge(products, on='product_id', how='inner')
    correlations = item_features[feature_cols].dropna().corr()

    target = OUTPUT_DIR / 'correlation_heatmap.png'
    heatmap_options = dict(
        annot=True,
        fmt='.2f',
        cmap='coolwarm',
        center=0,
        square=True,
        linewidths=1,
        cbar_kws={"shrink": 0.8},
    )

    plt.figure(figsize=(10, 8))
    sns.heatmap(correlations, **heatmap_options)
    plt.title('产品特征相关性分析', fontsize=16, fontweight='bold', pad=20)
    plt.tight_layout()
    plt.savefig(target, dpi=300, bbox_inches='tight')
    print(f"✓ 相关性热力图已保存: {target}")
    plt.close()

def main():
    """Run the visualization pipeline: load the cleaned data, then build each chart."""
    banner = "=" * 60

    print(banner)
    print("数据可视化")
    print(banner)

    # Load the cleaned datasets once and hand them to every chart builder.
    datasets = load_cleaned_data()

    for build_chart in (create_comprehensive_dashboard, create_correlation_heatmap):
        build_chart(datasets)

    print("\n" + banner)
    print("✓ 数据可视化完成！")
    print(banner)

# Run the pipeline only when executed directly (script or notebook cell), so
# importing this module does not trigger data loading and chart generation.
if __name__ == "__main__":
    main()

数据可视化
加载数据...
加载 customers: 99441 行
加载 sellers: 3095 行
加载 products: 32951 行
加载 order_items: 112650 行
加载 orders: 99441 行
加载 order_reviews: 99224 行
加载 order_payments: 103886 行

创建综合可视化仪表板

✓ 综合仪表板已保存: ../outputs/charts/comprehensive_dashboard.png

创建相关性分析图...
✓ 相关性热力图已保存: ../outputs/charts/correlation_heatmap.png

✓ 数据可视化完成！
