In [1]:
"""
Olist 电商数据分析 - 高级分析
====================================
此脚本包含高级分析：RFM分析、客户细分、关联分析等
"""

import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Project paths, resolved relative to the current working directory.
BASE_DIR = Path("..")
OUTPUT_DIR = BASE_DIR.joinpath("outputs", "reports")
# Create the report directory up front so later CSV writes cannot fail on it.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)


def load_cleaned_data():
    """Load every cleaned CSV file into a dictionary of DataFrames.

    Scans ``BASE_DIR / "outputs" / "cleaned_data"`` for files matching
    ``*_cleaned.csv`` and reads each one with ``pd.read_csv``.

    Returns:
        dict: Maps the dataset name (the file stem with ``"_cleaned"``
        removed) to the DataFrame read from that file.
    """
    print("加载数据...")
    source_dir = BASE_DIR.joinpath("outputs", "cleaned_data")

    frames = {}
    for csv_path in source_dir.glob("*_cleaned.csv"):
        key = csv_path.stem.replace("_cleaned", "")
        frame = pd.read_csv(csv_path)
        frames[key] = frame
        print(f"加载 {key}: {len(frame)} 行")

    return frames

def rfm_analysis(df_customers, df_orders, df_order_items):
    """Score customers on Recency, Frequency and Monetary value, then segment them.

    Only orders whose ``order_status`` is ``'delivered'`` are considered.
    Results are written to ``rfm_analysis.csv`` and
    ``customer_segmentation.csv`` inside ``OUTPUT_DIR``.

    Args:
        df_customers: Customer table; joined to orders on ``customer_id``
            and grouped by ``customer_unique_id``.
        df_orders: Order table with ``order_id``, ``customer_id``,
            ``order_status`` and ``order_purchase_timestamp``.
        df_order_items: Order line items with ``order_id``, ``price`` and
            ``freight_value``.

    Returns:
        tuple: ``(rfm, segment_stats)`` where ``rfm`` has one row per
        customer (its ``customer_id`` column holds the
        ``customer_unique_id`` values) and ``segment_stats`` has one row
        per segment.
    """
    divider = "=" * 60
    print("\n" + divider)
    print("RFM分析 - 客户价值分析")
    print(divider)

    # Collapse line items to one row of totals per order before joining.
    order_totals = (
        df_order_items
        .groupby('order_id')
        .agg({'price': 'sum', 'freight_value': 'sum'})
        .reset_index()
    )
    orders_full = (
        df_orders
        .merge(df_customers, on='customer_id', how='inner')
        .merge(order_totals, on='order_id', how='inner')
    )

    # Restrict the analysis to delivered orders.
    delivered = orders_full.loc[orders_full['order_status'] == 'delivered'].copy()
    delivered['order_purchase_timestamp'] = pd.to_datetime(
        delivered['order_purchase_timestamp']
    )

    # Recency is measured against the latest purchase in the delivered set.
    snapshot = delivered['order_purchase_timestamp'].max()

    def days_since_last_order(timestamps):
        return (snapshot - timestamps.max()).days

    rfm = (
        delivered
        .groupby('customer_unique_id')
        .agg({
            'order_purchase_timestamp': days_since_last_order,  # Recency
            'order_id': 'count',                                # Frequency
            'price': 'sum',                                     # Monetary
        })
        .reset_index()
    )
    rfm.columns = ['customer_id', 'recency', 'frequency', 'monetary']

    # Quintile scores (1-5). Ranking with method='first' breaks ties so the
    # five bin edges are always distinct. Recency labels are reversed
    # because a smaller recency should score higher.
    score_specs = (
        ('R_score', 'recency', [5, 4, 3, 2, 1]),
        ('F_score', 'frequency', [1, 2, 3, 4, 5]),
        ('M_score', 'monetary', [1, 2, 3, 4, 5]),
    )
    for score_col, metric_col, bin_labels in score_specs:
        ranked = rfm[metric_col].rank(method='first')
        binned = pd.qcut(ranked, q=5, labels=bin_labels, duplicates='drop')
        rfm[score_col] = binned.astype(int)

    rfm['RFM_score'] = rfm['R_score'] + rfm['F_score'] + rfm['M_score']

    def classify(scores):
        """Map one row of R/F/M scores to its segment label."""
        r = scores['R_score']
        f = scores['F_score']
        m = scores['M_score']
        if r >= 4 and f >= 4 and m >= 4:
            return 'VIP客户'
        if r >= 3 and f >= 3 and m >= 3:
            return '高价值客户'
        if r >= 2 and f >= 2:
            return '潜力客户'
        if r >= 3:
            return '新客户'
        return '流失客户'

    rfm['segment'] = rfm.apply(classify, axis=1)

    # Per-segment summary.
    segment_stats = (
        rfm
        .groupby('segment')
        .agg(
            customer_count=('customer_id', 'count'),
            total_monetary=('monetary', 'sum'),
            avg_monetary=('monetary', 'mean'),
            avg_frequency=('frequency', 'mean'),
            avg_recency=('recency', 'mean'),
        )
        .reset_index()
    )

    print("\n客户细分统计:")
    print(segment_stats.to_string(index=False))

    # Persist both tables.
    rfm.to_csv(OUTPUT_DIR / 'rfm_analysis.csv', index=False)
    segment_stats.to_csv(OUTPUT_DIR / 'customer_segmentation.csv', index=False)

    print(f"\n✓ RFM分析结果已保存")

    return rfm, segment_stats

def customer_lifetime_value(df_customers, df_orders, df_order_items):
    """Compute per-customer lifetime value figures from delivered orders.

    The table, sorted by ``total_value`` descending, is written to
    ``customer_lifetime_value.csv`` inside ``OUTPUT_DIR``.

    Args:
        df_customers: Customer table; joined to orders on ``customer_id``
            and grouped by ``customer_unique_id``.
        df_orders: Order table with ``order_id``, ``customer_id``,
            ``order_status`` and ``order_purchase_timestamp``.
        df_order_items: Order line items with ``order_id``, ``price`` and
            ``freight_value``.

    Returns:
        pandas.DataFrame: One row per customer, in group order (not
        sorted). ``customer_id`` holds the ``customer_unique_id`` values;
        ``avg_order_frequency`` is ``lifetime_days / order_count``.
    """
    divider = "=" * 60
    print("\n" + divider)
    print("客户生命周期价值分析 (CLV)")
    print(divider)

    # Collapse line items to one row of totals per order before joining.
    order_totals = (
        df_order_items
        .groupby('order_id')
        .agg({'price': 'sum', 'freight_value': 'sum'})
        .reset_index()
    )
    orders_full = (
        df_orders
        .merge(df_customers, on='customer_id', how='inner')
        .merge(order_totals, on='order_id', how='inner')
    )

    delivered = orders_full.loc[orders_full['order_status'] == 'delivered'].copy()
    delivered['order_purchase_timestamp'] = pd.to_datetime(
        delivered['order_purchase_timestamp']
    )

    # One row per customer: first/last purchase, order count and spend totals.
    clv = (
        delivered
        .groupby('customer_unique_id')
        .agg(
            first_order=('order_purchase_timestamp', 'min'),
            last_order=('order_purchase_timestamp', 'max'),
            order_count=('order_purchase_timestamp', 'count'),
            total_revenue=('price', 'sum'),
            total_freight=('freight_value', 'sum'),
        )
        .reset_index()
        .rename(columns={'customer_unique_id': 'customer_id'})
    )

    clv['total_value'] = clv['total_revenue'].add(clv['total_freight'])
    clv['lifetime_days'] = clv['last_order'].sub(clv['first_order']).dt.days
    clv['avg_order_value'] = clv['total_value'].div(clv['order_count'])
    clv['avg_order_frequency'] = clv['lifetime_days'].div(clv['order_count'])

    # Ranked copy used for the report and the CSV export.
    ranked = clv.sort_values('total_value', ascending=False)

    print(f"\n总客户数: {len(clv)}")
    print(f"平均CLV: {clv['total_value'].mean():.2f} BRL")
    print(f"中位数CLV: {clv['total_value'].median():.2f} BRL")

    print("\nTOP 20 高价值客户:")
    report_cols = ['customer_id', 'order_count', 'total_value', 'lifetime_days']
    print(ranked.head(20)[report_cols].to_string(index=False))

    ranked.to_csv(OUTPUT_DIR / 'customer_lifetime_value.csv', index=False)
    print(f"\n✓ CLV分析结果已保存")

    return clv

def product_association_analysis(df_order_items, max_orders=10000, top_n=20,
                                 output_dir=None):
    """Count how often two distinct products appear in the same order.

    Each order contributes a given product pair at most once. Repeated rows
    of the same product inside one order (several units of one item) are
    collapsed first, so they neither create self-pairs ``(p, p)`` nor
    inflate the count of a real pair.

    The top pairs are written to ``product_associations.csv``.

    Args:
        df_order_items: Order line items with ``order_id`` and
            ``product_id`` columns.
        max_orders: Number of orders (in ``order_id`` group order) to scan;
            ``None`` scans every order. Defaults to 10000 to keep the
            analysis fast.
        top_n: Number of most frequent pairs to report and save.
        output_dir: Directory for the CSV. Defaults to the module-level
            ``OUTPUT_DIR``.

    Returns:
        list: ``[((product_a, product_b), count), ...]`` ordered by count
        descending, with ``product_a < product_b`` in every pair.
    """
    from collections import Counter
    from itertools import combinations

    print("\n" + "=" * 60)
    print("产品关联分析")
    print("=" * 60)

    out_dir = OUTPUT_DIR if output_dir is None else Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Distinct products per order; duplicates within an order are dropped here.
    baskets = df_order_items.groupby('order_id')['product_id'].unique()
    if max_orders is not None:
        baskets = baskets.head(max_orders)

    # Stream pairs straight into the counter instead of materialising a list.
    pair_counts = Counter()
    for products in baskets:
        if len(products) < 2:
            continue
        # Sorting first makes every pair canonical: (a, b) with a < b.
        pair_counts.update(combinations(sorted(products), 2))

    top_pairs = pair_counts.most_common(top_n)

    print(f"\n最常见的产品组合 (TOP {top_n}):")
    for pair, count in top_pairs:
        print(f"产品对: {pair[0][:10]}... & {pair[1][:10]}... | 共现次数: {count}")

    # Persist the result.
    pd.DataFrame(top_pairs, columns=['product_pair', 'co_occurrence_count']).to_csv(
        out_dir / 'product_associations.csv', index=False
    )

    print(f"\n✓ 产品关联分析结果已保存")

    return top_pairs

def logistics_analysis(df_orders, output_dir=None):
    """Analyse delivery time and on-time rate for delivered orders.

    Keeps orders whose ``order_status`` is ``'delivered'`` and whose
    purchase and delivery timestamps both parse, computes
    ``delivery_days``, and writes the per-order result to
    ``logistics_analysis.csv``. When ``order_estimated_delivery_date`` is
    present, an ``on_time`` flag is also computed and exported; when it is
    absent the flag is simply omitted.

    Args:
        df_orders: Order table with ``order_id``, ``order_status``,
            ``order_purchase_timestamp`` and
            ``order_delivered_customer_date``; optionally
            ``order_estimated_delivery_date``.
        output_dir: Directory for the CSV. Defaults to the module-level
            ``OUTPUT_DIR``.

    Returns:
        pandas.DataFrame: The delivered orders with ``delivery_days`` and,
        if it could be computed, ``on_time``.
    """
    print("\n" + "=" * 60)
    print("物流效率分析")
    print("=" * 60)

    out_dir = OUTPUT_DIR if output_dir is None else Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    df = df_orders.copy()

    # Parse timestamps; unparseable values become NaT instead of raising.
    date_cols = ['order_purchase_timestamp', 'order_delivered_customer_date',
                 'order_estimated_delivery_date']
    for col in date_cols:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors='coerce')

    # Only delivered orders with both timestamps available.
    df_delivered = df[df['order_status'] == 'delivered'].copy()
    df_delivered = df_delivered.dropna(subset=['order_delivered_customer_date',
                                               'order_purchase_timestamp'])

    # Whole days between purchase and delivery.
    df_delivered['delivery_days'] = (
        df_delivered['order_delivered_customer_date'] -
        df_delivered['order_purchase_timestamp']
    ).dt.days

    # Columns to export; 'on_time' is added only when it can be computed.
    export_cols = ['order_id', 'delivery_days']

    if 'order_estimated_delivery_date' in df_delivered.columns:
        df_delivered['on_time'] = (
            df_delivered['order_delivered_customer_date'] <=
            df_delivered['order_estimated_delivery_date']
        )
        on_time_rate = df_delivered['on_time'].mean() * 100
        print(f"\n准时交付率: {on_time_rate:.2f}%")
        export_cols.append('on_time')

    print(f"\n配送时间统计:")
    print(df_delivered['delivery_days'].describe())

    # include_lowest keeps same-day (0-day) deliveries in the first bucket,
    # and the open-ended last edge keeps deliveries over 100 days in '30+天'.
    delivery_time_dist = pd.cut(
        df_delivered['delivery_days'],
        bins=[0, 5, 10, 15, 20, 30, float('inf')],
        labels=['0-5天', '6-10天', '11-15天', '16-20天', '21-30天', '30+天'],
        include_lowest=True,
    ).value_counts().sort_index()

    print(f"\n配送时间分布:")
    print(delivery_time_dist)

    # Persist the per-order result.
    df_delivered[export_cols].to_csv(
        out_dir / 'logistics_analysis.csv', index=False
    )

    print(f"\n✓ 物流分析结果已保存")

    return df_delivered

def main():
    """Load the cleaned datasets and run each analysis whose inputs exist.

    RFM and CLV run only when ``customers``, ``orders`` and ``order_items``
    are all loaded; the product-association and logistics analyses each
    need a single dataset.
    """
    banner = "=" * 60
    print(banner)
    print("高级分析")
    print(banner)

    datasets = load_cleaned_data()

    # Customer-level analyses need all three tables.
    customer_inputs = ('customers', 'orders', 'order_items')
    if all(key in datasets for key in customer_inputs):
        customers, orders, order_items = (datasets[key] for key in customer_inputs)
        rfm_analysis(customers, orders, order_items)
        customer_lifetime_value(customers, orders, order_items)

    if 'order_items' in datasets:
        product_association_analysis(datasets['order_items'])

    if 'orders' in datasets:
        logistics_analysis(datasets['orders'])

    print("\n" + banner)
    print("✓ 高级分析完成！")
    print(banner)

# Run the whole analysis pipeline when this cell is executed.
main()


高级分析
加载数据...
加载 customers: 99441 行
加载 sellers: 3095 行
加载 products: 32951 行
加载 order_items: 112650 行
加载 orders: 99441 行
加载 order_reviews: 99224 行
加载 order_payments: 103886 行

RFM分析 - 客户价值分析

客户细分统计:
segment  customer_count  total_monetary  avg_monetary  avg_frequency  avg_recency
  VIP客户            6474      1774913.54    274.160263       1.171764    89.894038
    新客户           11189      1533255.86    137.032430       1.000000   132.765752
   流失客户           22328      3144042.13    140.811632       1.021050   446.445898
   潜力客户           39241      4219784.03    107.535079       1.019571   202.008053
  高价值客户           14126      2549502.55    180.482978       1.054509   152.745151

✓ RFM分析结果已保存

客户生命周期价值分析 (CLV)

总客户数: 93358
平均CLV: 165.17 BRL
中位数CLV: 107.78 BRL

TOP 20 高价值客户:
                     customer_id  order_count  total_value  lifetime_days
0a0a92112bd4c708ca5fde585afaa872            1     13664.08              0
da122df9eeddfedc1dc1f5349a1a690c            2      7571.63       