In [1]:
"""
Olist 电商数据分析 - 数据加载
====================================
此脚本用于加载所有CSV数据集并进行初步检查
"""

import pandas as pd
import numpy as np
from pathlib import Path

# Paths: the dataset folder is resolved relative to the parent of the
# current working directory.
BASE_DIR = Path("..") 
DATA_DIR = BASE_DIR / "Brazilian E-Commerce Public Dataset by Olist"

# Configure pandas display options: max_columns=None removes the cap on
# how many columns are printed; width=None leaves the line width to pandas.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

def load_all_datasets(data_dir=None):
    """Load every Olist CSV dataset into a dict of DataFrames.

    Progress and per-file results are printed to stdout. Loading is
    best-effort: a file that cannot be read is reported and skipped, so
    the returned dict may contain fewer entries than there are files.

    Args:
        data_dir: Directory that contains the CSV files (``str`` or
            ``Path``). When omitted, the module-level ``DATA_DIR`` is
            used, which matches the previous behavior.

    Returns:
        dict mapping a short dataset name (e.g. ``'orders'``) to the
        DataFrame read from the corresponding CSV file.
    """
    source_dir = DATA_DIR if data_dir is None else Path(data_dir)

    print("=" * 60)
    print("开始加载数据集...")
    print("=" * 60)

    datasets = {}

    # Short dataset name -> CSV file name inside the data directory.
    data_files = {
        'customers': 'olist_customers_dataset.csv',
        'orders': 'olist_orders_dataset.csv',
        'order_items': 'olist_order_items_dataset.csv',
        'products': 'olist_products_dataset.csv',
        'order_payments': 'olist_order_payments_dataset.csv',
        'order_reviews': 'olist_order_reviews_dataset.csv',
        'sellers': 'olist_sellers_dataset.csv',
        'geolocation': 'olist_geolocation_dataset.csv',
        'category_translation': 'product_category_name_translation.csv',
    }

    for name, filename in data_files.items():
        filepath = source_dir / filename
        print(f"\n正在加载 {name}...")
        try:
            # Only the read itself is guarded, so a problem in the
            # reporting below is never misreported as a load failure.
            df = pd.read_csv(filepath, low_memory=False)
        except Exception as e:
            # Deliberately broad: one unreadable file must not abort the
            # remaining loads; the failure is reported and skipped.
            print(f"✗ 加载 {name} 失败: {e}")
            continue

        datasets[name] = df
        print(f"✓ 成功加载 {name}: {len(df)} 行, {len(df.columns)} 列")
        # map(str, ...) keeps the join safe for non-string column labels.
        print(f"  列名: {', '.join(map(str, df.columns))}")

    print("\n" + "=" * 60)
    print(f"数据集加载完成！共加载 {len(datasets)} 个数据集")
    print("=" * 60)

    return datasets

def basic_info(datasets):
    """Print a short profile of every dataset.

    For each entry the report shows shape, deep memory usage in MB,
    column names, duplicated-row count, total missing values, and one
    line per column that has at least one missing value.

    Args:
        datasets: dict mapping dataset name to DataFrame.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("数据集基本信息")
    print(banner)

    for name, df in datasets.items():
        n_rows, n_cols = df.shape
        megabytes = df.memory_usage(deep=True).sum() / 1024**2
        # Per-column missing counts, computed once and reused below.
        missing_per_col = df.isnull().sum()

        print(f"\n【{name.upper()}】")
        print(f"形状: {n_rows} 行 × {n_cols} 列")
        print(f"内存使用: {megabytes:.2f} MB")
        print(f"列名: {list(df.columns)}")
        print(f"重复行数: {df.duplicated().sum()}")
        print(f"缺失值总数: {missing_per_col.sum()}")

        for col, dtype in df.dtypes.items():
            missing = missing_per_col[col]
            if missing <= 0:
                continue
            print(f"- {col}: {dtype} (缺失: {missing})")

def sample_data(datasets, n=5):
    """Print the first ``n`` rows of every dataset.

    Args:
        datasets: dict mapping dataset name to DataFrame.
        n: Number of leading rows to show per dataset (default 5).
    """
    banner = "=" * 60
    divider = "-" * 60

    print("\n" + banner)
    print(f"样本数据预览 (前{n}行)")
    print(banner)

    for name, df in datasets.items():
        preview = df.head(n)
        print(f"\n【{name.upper()}】")
        print(preview)
        print(divider)

def main():
    """Run the whole pipeline: load, summarize, then preview.

    Returns:
        The dict of loaded DataFrames produced by ``load_all_datasets``.
    """
    loaded = load_all_datasets()
    # Reporting steps run in order on the same loaded dict.
    for report in (basic_info, sample_data):
        report(loaded)
    return loaded
    
# Run the pipeline immediately and bind the result to a module-level name.
datasets = main()

print("\n✓ 数据加载脚本执行完成！")

开始加载数据集...

正在加载 customers...
✓ 成功加载 customers: 99441 行, 5 列
  列名: customer_id, customer_unique_id, customer_zip_code_prefix, customer_city, customer_state

正在加载 orders...
✓ 成功加载 orders: 99441 行, 8 列
  列名: order_id, customer_id, order_status, order_purchase_timestamp, order_approved_at, order_delivered_carrier_date, order_delivered_customer_date, order_estimated_delivery_date

正在加载 order_items...
✓ 成功加载 order_items: 112650 行, 7 列
  列名: order_id, order_item_id, product_id, seller_id, shipping_limit_date, price, freight_value

正在加载 products...
✓ 成功加载 products: 32951 行, 9 列
  列名: product_id, product_category_name, product_name_lenght, product_description_lenght, product_photos_qty, product_weight_g, product_length_cm, product_height_cm, product_width_cm

正在加载 order_payments...
✓ 成功加载 order_payments: 103886 行, 5 列
  列名: order_id, payment_sequential, payment_type, payment_installments, payment_value

正在加载 order_reviews...
✓ 成功加载 order_reviews: 99224 行, 7 列
  列名: review_id, order_id, review_