In [2]:
import json
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

# Run configuration.
# Path of the line-delimited JSON input that main() reads record by record.
DATA_PATH: str = "/home/zym/DataMining/project/data/steam_games.ndjson"
# Directory that receives the CSV table and the PNG chart written by main().
SAVE_DIR: str = "project/results/owners_analysis"
# When true, main() creates SAVE_DIR (including parents) before writing to it.
OS_MKDIR: bool = True


def _classify_owner_value(value):
    """Map a raw ``estimated_owners`` value to the label that gets counted.

    Args:
        value: Whatever the record holds under ``estimated_owners`` (or
            ``None`` when the key is absent).

    Returns:
        - the value with spaces and commas removed, when it is a string
          containing ``"-"`` (the range-like form);
        - ``"INVALID: <value>"`` for any other truthy value;
        - ``"MISSING"`` for falsy values (``None``, ``""``, ``0``, ...).
    """
    if isinstance(value, str) and "-" in value:
        # Normalise formatting so "0 - 20,000" and "0-20000" count as one value.
        return value.replace(" ", "").replace(",", "")
    if value:
        return f"INVALID: {value}"
    return "MISSING"


def main():
    """Tabulate and chart the distinct ``estimated_owners`` values in the data.

    Reads ``DATA_PATH`` line by line as JSON, classifies each record's
    ``estimated_owners`` value, counts the resulting labels, writes the counts
    to a CSV file and a horizontal bar chart (top 20 labels) under
    ``SAVE_DIR``, and prints a short summary.

    Lines that are not valid JSON, or whose JSON value is not an object, are
    skipped and do not contribute to the totals.

    Raises:
        OSError: If the input file cannot be opened or the outputs cannot be
            written (for example when ``SAVE_DIR`` does not exist and
            ``OS_MKDIR`` is false).
    """
    # Create the output directory up front so the later writes cannot fail on it.
    if OS_MKDIR:
        import os

        os.makedirs(SAVE_DIR, exist_ok=True)

    # Load the data, one JSON document per line.
    raw_values = []
    with open(DATA_PATH, "r", encoding="utf-8") as f:
        for line in f:
            # Keep the try body minimal: only parsing can raise here.
            try:
                game = json.loads(line)
            except json.JSONDecodeError:
                continue
            # A valid JSON line may still be a list/number/string; those have
            # no fields to inspect, so skip them instead of crashing on .get().
            if not isinstance(game, dict):
                continue
            raw_values.append(_classify_owner_value(game.get("estimated_owners")))

    # Count occurrences of each label.
    counter = Counter(raw_values)
    total_records = len(raw_values)
    unique_count = len(counter)

    # Build the statistics table, most frequent label first.
    df_stats = pd.DataFrame.from_dict(counter, orient="index", columns=["count"])
    df_stats.index.name = "original_value"
    df_stats = df_stats.sort_values("count", ascending=False)

    # Save the table; utf-8-sig adds a BOM so spreadsheet tools detect UTF-8.
    stats_file = f"{SAVE_DIR}/owner_value_distribution.csv"
    df_stats.to_csv(stats_file, encoding="utf-8-sig")

    # Draw on an explicitly created Axes. Without ax=, DataFrame.plot opens a
    # figure of its own, which left the 12x8 figure empty, unsaved and unclosed.
    fig, ax = plt.subplots(figsize=(12, 8))
    df_stats.head(20).plot(kind="barh", legend=False, ax=ax)  # top 20 labels
    ax.set_title(
        f"Original Value Distribution of estimated_owners\n(Total {unique_count} unique values)"
    )
    ax.set_xlabel("Occurrence Count")
    ax.set_ylabel("Original Value Pattern")
    ax.invert_yaxis()  # barh draws the first row at the bottom; put it on top
    plot_file = f"{SAVE_DIR}/owner_value_distribution.png"
    fig.savefig(plot_file, dpi=300, bbox_inches="tight")
    plt.close(fig)

    # Print the result summary.
    print(
        f"""
    ========== 分析结果 ==========
    总数据条目：{total_records:,}
    唯一值数量：{unique_count:,}
    
    前5个常见值：
    {df_stats.head(5).to_string(header=False)}
    
    完整结果已保存：
    - 数据文件：{stats_file}
    - 可视化图表：{plot_file}
    """
    )


# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


    总数据条目：70,198
    唯一值数量：15
    
    前5个常见值：
    original_value       
0-20000         43485
0-0              9366
20000-50000      7636
50000-100000     3583
100000-200000    2374
    
    完整结果已保存：
    - 数据文件：project/results/owners_analysis/owner_value_distribution.csv
    - 可视化图表：project/results/owners_analysis/owner_value_distribution.png
    


<Figure size 1200x800 with 0 Axes>