In [None]:
import pandas as pd
import numpy as np

In [2]:



# Read the source workbook from its hard-coded absolute Windows path.
src_path = "G:\\25国赛\\C题\\PythonProject\\code2\\聚类.xlsx"
df_raw = pd.read_excel(src_path)

# Rename the two columns used below to short names that are easier to type.
column_aliases = {"孕妇BMI": "BMI", "检测孕期": "first_time"}
df = df_raw.rename(columns=column_aliases).copy()

# Coerce the timing column to numbers once; unparseable cells become NaN.
first_time_numeric = pd.to_numeric(df["first_time"], errors="coerce")

# Heuristic unit detection: when the median exceeds 40 the values are divided
# by 7 (presumably they were recorded in days -- confirm against the data);
# otherwise they are used unchanged as weeks.
if first_time_numeric.median() > 40:
    first_time_numeric = first_time_numeric / 7.0
df["weeks"] = first_time_numeric

# Keep rows with 6 <= weeks <= 35 (NaN weeks fail the test and are removed),
# then keep only the two analysis columns, drop rows with a missing value and
# renumber the index from 0.
weeks_in_range = df["weeks"].between(6, 35)
df = df.loc[weeks_in_range, ["BMI", "weeks"]].dropna().reset_index(drop=True)

# Initialise the BMI quintile groups: five equal-frequency bins labelled Q1..Q5.
labels = ["Q1", "Q2", "Q3", "Q4", "Q5"]
bmi_quintile, bmi_bins = pd.qcut(df["BMI"], q=5, labels=labels, retbins=True)
df["BMI组"] = bmi_quintile

# Build one summary row per quintile: head count, observed BMI range,
# median BMI and median of the `weeks` column.
group_info_rows = []
for lab in labels:
    sub = df.loc[df["BMI组"] == lab]
    bmi_low = sub["BMI"].min()
    bmi_high = sub["BMI"].max()
    summary_row = {
        "BMI组": lab,
        "人数": int(sub.shape[0]),
        "BMI范围": f"[{bmi_low:.4f}, {bmi_high:.4f}]",
        "BMI中位数": float(sub["BMI"].median()),
        "首次达标周(中位数)": float(sub["weeks"].median()),
    }
    group_info_rows.append(summary_row)

# Order the summary by median BMI and renumber the rows.
group_info = pd.DataFrame(group_info_rows)
group_info = group_info.sort_values("BMI中位数").reset_index(drop=True)

# Scan a window of candidate weeks to find the one with the lowest risk.
# W_MIN and W_MAX bound the candidate grid; W_MIN is also the baseline from
# which the waiting penalty in risk() starts to accrue.
W_MIN = 10.0
W_MAX = 25.0

# Weights of the two risk terms used in risk():
#   C_FAIL scales the failure share (1 - success probability);
#   C_WAIT scales each week waited beyond W_MIN.
C_FAIL, C_WAIT = 1.0, 0.04


def empirical_success_prob(weeks_array, w):
    """Return the empirical share of entries in *weeks_array* that are <= *w*.

    Args:
        weeks_array: Array-like of week values. NumPy arrays, pandas Series,
            lists and tuples are all accepted; the input is converted to a
            float array before the comparison.
        w: Threshold week.

    Returns:
        The fraction of entries not exceeding ``w`` as a Python float, or
        NaN when ``weeks_array`` is empty. NaN entries compare as False and
        therefore count as "not reached".
    """
    # Coerce first so plain Python sequences support the element-wise
    # comparison below instead of raising TypeError.
    weeks = np.asarray(weeks_array, dtype=float)
    if weeks.size == 0:
        return np.nan
    return float(np.mean(weeks <= w))

def risk(w, weeks_array):
    """Return the risk score of choosing week *w* for the given sample.

    The score is the sum of two terms:
      * ``C_FAIL * (1 - p)``, where ``p`` is the empirical share of
        ``weeks_array`` entries that are <= ``w``;
      * ``C_WAIT * max(0, w - W_MIN)``, a linear penalty for every week
        beyond ``W_MIN``.

    Returns ``np.inf`` when ``p`` is NaN (which empirical_success_prob
    reports for an empty sample), so such a week is never picked as a minimum.
    """
    success_rate = empirical_success_prob(weeks_array, w)
    if np.isnan(success_rate):
        return np.inf

    # Risk function: failure term plus waiting term.
    failure_cost = C_FAIL * (1.0 - success_rate)
    waiting_cost = C_WAIT * max(0.0, w - W_MIN)
    return failure_cost + waiting_cost


# Candidate weeks: 151 evenly spaced points on [W_MIN, W_MAX], rounded to two
# decimals.
w_grid = np.linspace(W_MIN, W_MAX, 151).round(2)

# The parameter summary is identical for every group, so format it once.
params_text = f"({C_FAIL},{C_WAIT},{W_MIN},{W_MAX})"

# For each BMI group, evaluate the risk at every candidate week and keep the
# first grid point where the risk is smallest.
opt_rows = []
for lab in labels:
    sub = df.loc[df["BMI组"] == lab]
    wk_arr = sub["weeks"].to_numpy()

    risks = np.array([risk(w, wk_arr) for w in w_grid])
    idx = int(risks.argmin())
    w_star = float(w_grid[idx])
    p_star = empirical_success_prob(wk_arr, w_star)

    bmi_low = sub["BMI"].min()
    bmi_high = sub["BMI"].max()
    best_row = {
        "BMI组": lab,
        "BMI范围": f"[{bmi_low:.4f}, {bmi_high:.4f}]",
        "人数": int(sub.shape[0]),
        "推荐最优NIPT周数": round(w_star, 2),
        "该周达标概率估计": round(p_star, 3),
        "风险函数最小值": round(float(risks[idx]), 3),
        "参数(C_FAIL,C_WAIT,W_MIN,W_MAX)": params_text,
    }
    opt_rows.append(best_row)

# Present the groups ordered by their recommended week.
opt_table = pd.DataFrame(opt_rows)
opt_table = opt_table.sort_values("推荐最优NIPT周数").reset_index(drop=True)


# Output file names, relative to the current working directory.
out_csv1, out_csv2 = "BMI_五分位分组信息.csv", "BMI组_最优NIPT周_与达标概率.csv"
# Defined here but not written to anywhere in this section.
out_xlsx = "问题二_分组与最优NIPT.xlsx"

# Write both result tables as CSV without the index column.
for result_table, csv_path in ((group_info, out_csv1), (opt_table, out_csv2)):
    result_table.to_csv(csv_path, index=False)