In [None]:
import json
from pathlib import Path

import matplotlib.pyplot as plt

# Load the dataset. The code below iterates over it as a sequence of records,
# each providing a 'conversations' collection and a 'tags' mapping.
dataset_name = "./sc2-04-deduplicated.json"
with open(dataset_name, "r", encoding="utf-8") as f:
    data = json.load(f)

# Extract one column of values per variable of interest.
n_conversations = [len(d['conversations']) for d in data]
# NOTE: `time` would shadow the stdlib module if it were imported in this
# namespace; the name is kept so existing references keep working.
time = [d['tags']['time'] for d in data]
minerals = [d['tags']['minerals'] for d in data]
vespene = [d['tags']['vespene'] for d in data]
supply_army = [d['tags']['supply_army'] for d in data]
supply_workers = [d['tags']['supply_workers'] for d in data]
n_structures = [d['tags']['n_structures'] for d in data]
n_abilities = [d['tags']['n_abilities'] for d in data]

# Create the figure with a 2x4 grid of subplots, flattened to a 1-D array so
# the axes can be indexed alongside the variables.
fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(20, 10))
axes = axes.flatten()

# (title label, values) pairs, in plotting order.
variables = [
    ("n_conversations", n_conversations),
    ("time", time),
    ("minerals", minerals),
    ("vespene", vespene),
    ("supply_army", supply_army),
    ("supply_workers", supply_workers),
    ("n_structures", n_structures),
    ("n_abilities", n_abilities),
]

# Draw one histogram per variable. The loop variable is deliberately NOT
# called `data`, so the loaded dataset is not overwritten by the last column.
for ax, (name, values) in zip(axes, variables):
    ax.hist(values, bins='auto', edgecolor='black', alpha=0.7)

    # Title and axis labels.
    ax.set_title(f'Distribution of {name}')
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')

    # Horizontal grid lines only.
    ax.grid(axis='y', alpha=0.5)

# Adjust spacing so titles and labels do not overlap.
fig.tight_layout()

# Save the figure at 300 DPI. Only the dataset's stem goes into the file name:
# interpolating the raw "./name.json" path would place a "/" in the output
# name and point into a non-existent "distribution_plots-." directory.
output_path = f'distribution_plots-{Path(dataset_name).stem}.png'
fig.savefig(output_path, dpi=300, bbox_inches='tight')
plt.close(fig)