In [2]:
import pandas as pd
import numpy as np
from itertools import product

csv = "../tables/A_phys_table_4D.csv"  # 改成你的路径
PREC = 6                     # 生成时用的 prec

df = pd.read_csv(csv)

# 仅保留关键信息，并对网格坐标做一致的量化
keys = ["mu_DM","sigma_DM","beta_DM","xi_DM"]
df = df[keys + ["A_phys"]].copy()
for c in keys:
    df[c] = pd.to_numeric(df[c], errors="coerce").round(PREC)

# 1) 基本规模检查
N = len(df)
mu_u, sg_u, bt_u, xi_u = (np.sort(df[c].unique()) for c in keys)
expected = len(mu_u) * len(sg_u) * len(bt_u) * len(xi_u)
print(f"#rows={N}, uniques=({len(mu_u)}, {len(sg_u)}, {len(bt_u)}, {len(xi_u)}), expected={expected}, diff={N-expected}")

# 2) 查重复：同一键出现 >1 次的行
dups = df.duplicated(subset=keys, keep=False)
dup_df = (df[dups]
          .groupby(keys, as_index=False)
          .size()
          .sort_values("size", ascending=False))
print(f"duplicate combos: {len(dup_df)} (show top 10)")
print(dup_df.head(10))

# 2.1 汇总在哪些切片重复（帮助你快速定位是哪个块被写重了）
if len(dup_df):
    print("\nby mu (重复条数汇总):")
    print(dup_df.groupby("mu_DM")["size"].sum().sort_values(ascending=False).head(10))
    print("\nby sigma (重复条数汇总):")
    print(dup_df.groupby("sigma_DM")["size"].sum().sort_values(ascending=False).head(10))
    print("\nby beta (重复条数汇总):")
    print(dup_df.groupby("beta_DM")["size"].sum().sort_values(ascending=False).head(10))
    print("\nby xi (重复条数汇总):")
    print(dup_df.groupby("xi_DM")["size"].sum().sort_values(ascending=False).head(10))

# 3) 查缺失：构造满笛卡尔积，与现有键做反连接
grid = pd.MultiIndex.from_product(
    [mu_u, sg_u, bt_u, xi_u],
    names=keys
).to_frame(index=False)

merged = grid.merge(df[keys], on=keys, how="left", indicator=True)
missing = merged[merged["_merge"] == "left_only"][keys]
print(f"\nmissing combos: {len(missing)} (show top 10)")
print(missing.head(10))


#rows=1005000, uniques=(100, 100, 100, 1), expected=1000000, diff=5000
duplicate combos: 5000 (show top 10)
      mu_DM  sigma_DM   beta_DM  xi_DM  size
0      12.0  0.100000  1.181818      0     2
3330   12.0  0.265657  2.919192      0     2
3337   12.0  0.269697  1.767677      0     2
3336   12.0  0.269697  1.585859      0     2
3335   12.0  0.269697  1.565657      0     2
3334   12.0  0.269697  1.383838      0     2
3333   12.0  0.269697  1.323232      0     2
3332   12.0  0.269697  1.262626      0     2
3331   12.0  0.269697  1.181818      0     2
3329   12.0  0.265657  2.737374      0     2

by mu (重复条数汇总):
mu_DM
12.0    10000
Name: size, dtype: int64

by sigma (重复条数汇总):
sigma_DM
0.213131    200
0.285859    200
0.196970    200
0.201010    200
0.205051    200
0.350505    200
0.326263    200
0.217172    200
0.221212    200
0.225253    200
Name: size, dtype: int64

by beta (重复条数汇总):
beta_DM
1.767677    152
1.383838    152
1.323232    152
1.262626    150
1.929293    150
1.585859    15

In [5]:
import pandas as pd
keys = ["mu_DM","sigma_DM","beta_DM","xi_DM"]
df = pd.read_csv("../tables/A_phys_table_4D.csv")
df_clean = df.drop_duplicates(subset=keys, keep="first")
print("old:", len(df), "new:", len(df_clean))  # 应从 1,005,000 变 1,000,000
df_clean.to_csv("../tables/A_phys_table_4D_new.csv", index=False)


old: 1005000 new: 1000000
