In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler
import json

# Load the raw samples from disk.
with open("./sc2-04.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Tag fields that make up each feature vector, in column order.
# Assumes every one of these tags holds a numeric value.
feature_keys = (
    "time",
    "minerals",
    "vespene",
    "supply_army",
    "supply_workers",
    "n_structures",
    "n_abilities",
)

# Build one feature row per sample from its "tags" mapping.
tags_vectors = [
    [sample["tags"][key] for key in feature_keys]
    for sample in data
]

# Standardize the feature columns with StandardScaler before clustering.
scaler = StandardScaler()
tags_vectors = scaler.fit_transform(tags_vectors)

In [None]:
from sklearn.cluster import DBSCAN

# DBSCAN hyper-parameters; eps and min_samples need to be tuned to the data.
EPS = 0.5
MIN_SAMPLES = 5
# Upper bound on the number of samples kept from any single cluster.
MAX_PER_CLUSTER = 100
# Fixed seed so the random down-sampling gives the same subset on every run.
SEED = 42

dbscan = DBSCAN(eps=EPS, min_samples=MIN_SAMPLES)
labels = dbscan.fit_predict(tags_vectors)

# Dedicated, seeded generator instead of the unseeded global np.random state.
rng = np.random.default_rng(SEED)

# Group sample indices by cluster label and cap the size of large clusters.
sampled_indices = []
for label in np.unique(labels):
    cluster_indices = np.flatnonzero(labels == label)
    # Label -1 marks DBSCAN noise points; those are always kept in full.
    # Real clusters larger than the cap are randomly down-sampled to exactly
    # MAX_PER_CLUSTER members (a fixed cap, not a proportion).
    if label != -1 and len(cluster_indices) > MAX_PER_CLUSTER:
        cluster_indices = rng.choice(cluster_indices, MAX_PER_CLUSTER, replace=False)
    # Convert to plain Python ints so they index the `data` list directly.
    sampled_indices.extend(cluster_indices.tolist())

# Keep the retained samples in their original dataset order rather than
# grouped by cluster in random draw order.
sampled_indices.sort()

# Collect the retained samples and report the size reduction.
filtered_data = [data[i] for i in sampled_indices]
print(f"Original data size: {len(data)}")
print(f"Filtered data size: {len(filtered_data)}")

In [None]:
# Persist the filtered (cluster-down-sampled) samples as pretty-printed UTF-8 JSON.
# NOTE(review): the filename says "deduplicated", but the data is down-sampled
# per cluster rather than strictly de-duplicated.
serialized = json.dumps(filtered_data, ensure_ascii=False, indent=4)
with open("./sc2-04-deduplicated.json", "w", encoding="utf-8") as out_file:
    out_file.write(serialized)