In [1]:


import pandas as pd
import numpy as np

# Input workbooks (absolute Windows paths) and the output directory setting.
# OUT_DIR is defined here but not referenced by the rest of this cell.
PATH2 = r"G:\25国赛\C题\PythonProject\code3\工作簿2_筛选结果(1).xlsx"
PATH4 = r"G:\25国赛\C题\PythonProject\code3\工作簿4.xlsx"
OUT_DIR = ""

# Hyperparameters
THR = 0.040            # a record qualifies when its yfrac is >= THR
W_MIN = 10.0           # first week of the evaluation grid; delay cost is measured from here
W_MAX = 25.0           # last week of the evaluation grid
STEP = 0.1             # grid spacing in weeks
C_FAIL = 1.0           # weight of the (1 - success probability) term in the risk
C_DELAY = 0.04         # weight per week beyond W_MIN in the risk
QUAL_THRESHOLD = 0.60  # minimum workbook-4 proportion for a week to be eligible


def to_weeks(series: pd.Series) -> pd.Series:
    """Coerce a column to numeric and express it in weeks.

    Non-numeric entries become NaN. When the median of the coerced values is
    above 40 the whole column is divided by 7 (i.e. it is treated as days);
    otherwise the coerced values are returned as they are.
    """
    numeric = pd.to_numeric(series, errors="coerce")
    # A NaN median (no usable values) compares False, so nothing is rescaled.
    looks_like_days = numeric.median() > 40
    if looks_like_days:
        return numeric / 7.0
    return numeric


# Load the first sheet of workbook 2, map the Chinese headers to short names
# (age, gestational age at test, maternal BMI, raw read count, Y-chromosome
# concentration, number of pregnancies), derive gestational age in weeks and
# keep only rows that have both a BMI and a usable gestational age.
df2 = (
    pd.read_excel(PATH2, sheet_name=0)
    .rename(columns={
        "年龄": "age",
        "检测孕期": "ga",
        "孕妇BMI": "bmi",
        "原始读段数": "reads",
        "Y染色体浓度": "yfrac",
        "怀孕次数": "grav",
    })
    .assign(ga_weeks=lambda frame: to_weeks(frame["ga"]))
    .dropna(subset=["bmi", "ga_weeks"])
    .reset_index(drop=True)
)

# Quantile-based BMI groups (up to five).
# `duplicates="drop"` lets pandas merge tied quantile edges, which yields fewer
# than five bins; passing a fixed list of five labels would then raise
# "Bin labels must be one fewer than the number of bin edges". The edges are
# therefore computed first and the labels are sized to the bins that survive.
bmi_bins = pd.qcut(df2["bmi"], q=5, retbins=True, duplicates="drop")[1]
labels = [f"Q{i}" for i in range(1, len(bmi_bins))]
df2["bmi_group"] = pd.qcut(df2["bmi"], q=5, labels=labels, duplicates="drop")

# Candidate weeks W_MIN..W_MAX inclusive; the small epsilon keeps the end point
# in the range and the rounding removes arange's accumulated float noise.
week_grid = np.round(np.arange(W_MIN, W_MAX + 1e-9, STEP), 2)


def empirical_success_prob(weeks_array: np.ndarray, w: float) -> float:
    """Return the fraction of entries in ``weeks_array`` that are <= ``w``.

    An empty array has no defined fraction, so NaN is returned for it.
    """
    if len(weeks_array) > 0:
        reached_by_w = weeks_array <= w
        return float(np.mean(reached_by_w))
    return np.nan


def risk_curve_from_weeks(weeks_array: np.ndarray, week_grid: np.ndarray):
    """Evaluate the empirical success curve and the risk on every grid week.

    Returns a pair ``(F, R)`` of arrays aligned with ``week_grid``:
    ``F[i]`` is the fraction of ``weeks_array`` that is <= ``week_grid[i]`` and
    ``R[i] = C_FAIL * (1 - F[i]) + C_DELAY * max(0, week_grid[i] - W_MIN)``.
    ``C_FAIL``, ``C_DELAY`` and ``W_MIN`` are read from module level.
    """
    success_values = []
    for week in week_grid:
        success_values.append(empirical_success_prob(weeks_array, week))
    success = np.array(success_values)

    failure_cost = C_FAIL * (1.0 - success)
    delay_cost = C_DELAY * np.maximum(0.0, week_grid - W_MIN)
    return success, failure_cost + delay_cost


# Per BMI group: success curve (F2), risk curve (R2) and sample count (n2)
# computed from the workbook-2 gestational weeks.
w2_curves = {}
for lab in labels:
    in_group = df2["bmi_group"] == lab
    group_weeks = df2.loc[in_group, "ga_weeks"].to_numpy()
    success_curve, risk_curve = risk_curve_from_weeks(group_weeks, week_grid)
    w2_curves[lab] = {"F2": success_curve, "R2": risk_curve, "n2": len(group_weeks)}

# Load the first sheet of workbook 4, rename the Chinese headers (including the
# patient code -> "pid"), and coerce gestational age, Y fraction and BMI to
# numeric values.
df4 = (
    pd.read_excel(PATH4, sheet_name=0)
    .rename(columns={
        "孕妇代码": "pid",
        "年龄": "age",
        "检测孕期": "ga",
        "孕妇BMI": "bmi",
        "原始读段数": "reads",
        "Y染色体浓度": "yfrac",
        "怀孕次数": "grav",
    })
    .assign(
        ga_weeks=lambda frame: to_weeks(frame["ga"]),
        yfrac=lambda frame: pd.to_numeric(frame["yfrac"], errors="coerce"),
        bmi=lambda frame: pd.to_numeric(frame["bmi"], errors="coerce"),
    )
)

# Reuse the workbook-2 BMI edges so both workbooks share the same groups.
# The outermost edges are widened by 1e-6 (presumably so values sitting exactly
# on a boundary are not lost to rounding).
edges = bmi_bins.copy()
edges[0] = edges[0] - 1e-6
edges[-1] = edges[-1] + 1e-6
df4["bmi_group"] = pd.cut(df4["bmi"], bins=edges, labels=labels, include_lowest=True)

# Rows without a gestational week, Y fraction or BMI group (including BMI values
# outside the widened edges) are excluded from the record-level statistics.
df4v = df4.dropna(subset=["ga_weeks", "yfrac", "bmi_group"]).copy()

# Per BMI group: P4[i] is the share of the group's workbook-4 records that were
# taken at or before week_grid[i] AND have yfrac >= THR; n4 is the record count.
# A group without records gets an all-NaN curve.
w4_curves = {}
for lab in labels:
    records = df4v[df4v["bmi_group"] == lab]
    record_count = len(records)
    if record_count > 0:
        ga_values = records["ga_weeks"].to_numpy()
        y_values = records["yfrac"].to_numpy()
        curve = np.array(
            [np.mean((ga_values <= w) & (y_values >= THR)) for w in week_grid],
            dtype=float,
        )
    else:
        curve = np.full(week_grid.shape, np.nan, dtype=float)
    w4_curves[lab] = {"P4": curve, "n4": record_count}

# For every BMI group pick the grid week with the lowest workbook-2 risk (R2)
# among the weeks whose workbook-4 proportion (P4) reaches QUAL_THRESHOLD.
rows_final = []
for i, lab in enumerate(labels):
    R2 = w2_curves[lab]["R2"]
    P4 = w4_curves[lab]["P4"]

    # Start from the "no eligible week" row; it is completed below when an
    # eligible week exists. Key order fixes the CSV column order.
    row = {
        "BMI组": lab,
        "工作簿2_样本数": w2_curves[lab]["n2"],
        "工作簿4_记录数": w4_curves[lab]["n4"],
        "合格阈值(记录级)": QUAL_THRESHOLD,
        "是否有合格周": "否",
        "合格最优周": np.nan,
        "合格最优周_R2(来自W2)": np.nan,
        "合格最优周_P4(来自W4_记录级)": np.nan,
        "BMI组下限": bmi_bins[i],
        "BMI组上限": bmi_bins[i + 1],
    }

    # A week is eligible only if P4 reaches the threshold AND R2 is defined.
    # R2 is all-NaN when the group has no workbook-2 samples; np.argmin would
    # then return the first NaN position and report a "best week" with NaN risk.
    ok = (P4 >= QUAL_THRESHOLD) & np.isfinite(R2)
    if np.any(ok):
        # Ineligible weeks are pushed to +inf so argmin can never select them.
        k = int(np.argmin(np.where(ok, R2, np.inf)))
        row.update({
            "是否有合格周": "是",
            "合格最优周": float(week_grid[k]),
            "合格最优周_R2(来自W2)": float(R2[k]),
            "合格最优周_P4(来自W4_记录级)": float(P4[k]),
        })
    rows_final.append(row)

# Assemble the per-group recommendations, order them by group label and write
# them to a CSV file (relative path, so it lands in the working directory).
out_final = "第三问_step3_合格且风险最低_推荐周_按记录计数_阈值60.csv"
final_table = pd.DataFrame(rows_final)
final_table = final_table.sort_values("BMI组").reset_index(drop=True)
final_table.to_csv(out_final, index=False)



In [None]:
#误差检测

import pandas as pd, numpy as np, os

# Input workbooks (absolute Windows paths) and the output directory setting.
PATH2 = r"G:\25国赛\C题\PythonProject\code3\工作簿2_筛选结果(1).xlsx"
PATH4 = r"G:\25国赛\C题\PythonProject\code3\工作簿4.xlsx"
OUT_DIR = ""

# Hyperparameters
THR = 0.040            # a record qualifies when its yfrac is >= THR
W_MIN = 10.0           # first week of the evaluation grid; delay cost is measured from here
W_MAX = 25.0           # last week of the evaluation grid
STEP = 0.1             # grid spacing in weeks
C_FAIL = 1.0           # weight of the (1 - success probability) term in the risk
C_DELAY = 0.04         # weight per week beyond W_MIN in the risk
QUAL_THRESHOLD = 0.60  # minimum workbook-4 proportion for a week to be eligible
B = 500                # bootstrap resample count

# Seeded generator so the bootstrap draws are reproducible.
rng = np.random.default_rng(12345)

def to_weeks(series: pd.Series) -> pd.Series:
    """Coerce a column to numeric and express it in weeks.

    Non-numeric entries become NaN. When the median of the coerced values is
    above 40 the whole column is divided by 7 (i.e. it is treated as days);
    otherwise the coerced values are returned as they are.
    """
    numeric = pd.to_numeric(series, errors="coerce")
    # A NaN median (no usable values) compares False, so nothing is rescaled.
    if numeric.median() > 40:
        return numeric / 7.0
    return numeric

def empirical_success_prob(weeks_array: np.ndarray, w: float) -> float:
    """Return the fraction of entries in ``weeks_array`` that are <= ``w``.

    An empty array has no defined fraction, so NaN is returned for it.
    """
    has_samples = len(weeks_array) != 0
    return float(np.mean(weeks_array <= w)) if has_samples else np.nan

def risk_curve_from_weeks(weeks_array: np.ndarray, week_grid: np.ndarray):
    """Evaluate the empirical success curve and the risk on every grid week.

    Returns a pair ``(F, R)`` of arrays aligned with ``week_grid``:
    ``F[i]`` is the fraction of ``weeks_array`` that is <= ``week_grid[i]`` and
    ``R[i] = C_FAIL * (1 - F[i]) + C_DELAY * max(0, week_grid[i] - W_MIN)``.
    ``C_FAIL``, ``C_DELAY`` and ``W_MIN`` are read from module level.
    """
    success = np.array(
        [empirical_success_prob(weeks_array, week) for week in week_grid]
    )
    weeks_past_start = np.maximum(0.0, week_grid - W_MIN)
    risk = C_FAIL * (1.0 - success) + C_DELAY * weeks_past_start
    return success, risk

# Load the first sheet of workbook 2, map the Chinese headers to short names
# (age, gestational age at test, maternal BMI, raw read count, Y-chromosome
# concentration, number of pregnancies), derive gestational age in weeks and
# keep only rows that have both a BMI and a usable gestational age.
df2 = (
    pd.read_excel(PATH2, sheet_name=0)
    .rename(columns={
        "年龄": "age",
        "检测孕期": "ga",
        "孕妇BMI": "bmi",
        "原始读段数": "reads",
        "Y染色体浓度": "yfrac",
        "怀孕次数": "grav",
    })
    .assign(ga_weeks=lambda frame: to_weeks(frame["ga"]))
    .dropna(subset=["bmi", "ga_weeks"])
    .reset_index(drop=True)
)

# Quantile-based BMI groups (up to five).
# `duplicates="drop"` lets pandas merge tied quantile edges, which yields fewer
# than five bins; passing a fixed list of five labels would then raise
# "Bin labels must be one fewer than the number of bin edges". The edges are
# therefore computed first and the labels are sized to the bins that survive.
bmi_bins = pd.qcut(df2["bmi"], q=5, retbins=True, duplicates="drop")[1]
labels = [f"Q{i}" for i in range(1, len(bmi_bins))]
df2["bmi_group"] = pd.qcut(df2["bmi"], q=5, labels=labels, duplicates="drop")

# Workbook-2 gestational weeks per BMI group; these are what the bootstrap resamples.
wks_by_group = {lab: df2.loc[df2["bmi_group"] == lab, "ga_weeks"].to_numpy() for lab in labels}

# Candidate weeks W_MIN..W_MAX inclusive; the small epsilon keeps the end point
# in the range and the rounding removes arange's accumulated float noise.
week_grid = np.round(np.arange(W_MIN, W_MAX + 1e-9, STEP), 2)


# Load the first sheet of workbook 4, rename the Chinese headers (including the
# patient code -> "pid"), and coerce gestational age, Y fraction and BMI to
# numeric values.
df4 = (
    pd.read_excel(PATH4, sheet_name=0)
    .rename(columns={
        "孕妇代码": "pid",
        "年龄": "age",
        "检测孕期": "ga",
        "孕妇BMI": "bmi",
        "原始读段数": "reads",
        "Y染色体浓度": "yfrac",
        "怀孕次数": "grav",
    })
    .assign(
        ga_weeks=lambda frame: to_weeks(frame["ga"]),
        yfrac=lambda frame: pd.to_numeric(frame["yfrac"], errors="coerce"),
        bmi=lambda frame: pd.to_numeric(frame["bmi"], errors="coerce"),
    )
)

# Reuse the workbook-2 BMI edges so both workbooks share the same groups.
# The outermost edges are widened by 1e-6 (presumably so values sitting exactly
# on a boundary are not lost to rounding).
edges = bmi_bins.copy()
edges[0] -= 1e-6
edges[-1] += 1e-6
df4["bmi_group"] = pd.cut(df4["bmi"], bins=edges, labels=labels, include_lowest=True)

# Rows without a gestational week, Y fraction or BMI group (including BMI values
# outside the widened edges) are excluded from the record-level statistics.
df4v = df4.dropna(subset=["ga_weeks", "yfrac", "bmi_group"]).copy()

# Per BMI group: P4[i] is the share of the group's workbook-4 records that were
# taken at or before week_grid[i] AND have yfrac >= THR; n_rec is the record
# count. A group without records gets an all-NaN curve.
P4_curves = {}
for lab in labels:
    records = df4v[df4v["bmi_group"] == lab]
    record_count = len(records)
    if record_count > 0:
        ga_values = records["ga_weeks"].to_numpy()
        y_values = records["yfrac"].to_numpy()
        curve = np.array(
            [np.mean((ga_values <= w) & (y_values >= THR)) for w in week_grid],
            dtype=float,
        )
    else:
        curve = np.full(week_grid.shape, np.nan, dtype=float)
    P4_curves[lab] = {"P4": curve, "n_rec": record_count}


# Bootstrap the recommended week per BMI group: resample the workbook-2 weeks
# with replacement, rebuild the risk curve, and re-pick the lowest-risk week
# among the weeks that qualify on the (fixed) workbook-4 curve.

# The qualification masks depend only on the fixed workbook-4 curves, so they
# are computed once instead of once per bootstrap draw.
ok_by_group = {lab: P4_curves[lab]["P4"] >= QUAL_THRESHOLD for lab in labels}

boot_records = []
# The iteration count comes from the configured B rather than a literal 500,
# so changing B in the config actually changes the number of resamples.
for b in range(1, B + 1):
    for lab in labels:
        wk_orig = wks_by_group[lab]
        n = len(wk_orig)
        best_week = np.nan  # stays NaN for empty groups or groups with no qualifying week
        if n > 0:
            # Draw the resample indices before checking the mask so the random
            # stream (and therefore every group's result) is the same as when
            # the draw was unconditional for non-empty groups.
            idx = rng.integers(0, n, size=n)
            ok = ok_by_group[lab]
            if np.any(ok):
                F2, R2 = risk_curve_from_weeks(wk_orig[idx], week_grid)
                # Non-qualifying weeks are pushed to +inf so argmin never selects them.
                k = int(np.argmin(np.where(ok, R2, np.inf)))
                best_week = float(week_grid[k])
        boot_records.append({"iter": b, "BMI组": lab, "最优周_boot": best_week})

boot_df = pd.DataFrame(boot_records)

# Summarise the bootstrap distribution of the best week for each BMI group:
# number of usable iterations, mean, sample standard deviation, and the
# 2.5% / 50% / 97.5% quantiles. Statistics that cannot be computed stay NaN.
summary_rows = []
for lab in labels:
    in_group = boot_df["BMI组"] == lab
    best_weeks = boot_df.loc[in_group, "最优周_boot"].dropna()
    valid_count = len(best_weeks)

    if valid_count > 0:
        mean_week = float(best_weeks.mean())
        p025 = float(best_weeks.quantile(0.025))
        p500 = float(best_weeks.quantile(0.500))
        p975 = float(best_weeks.quantile(0.975))
    else:
        mean_week = p025 = p500 = p975 = np.nan
    # The sample standard deviation (ddof=1) needs at least two values.
    std_week = float(best_weeks.std(ddof=1)) if valid_count > 1 else np.nan

    summary_rows.append({
        "BMI组": lab,
        "有效自助迭代数": int(valid_count),
        "最优周_均值": mean_week,
        "最优周_标准差": std_week,
        "最优周_p2.5": p025,
        "最优周_中位数": p500,
        "最优周_p97.5": p975,
    })

summary_df = pd.DataFrame(summary_rows)
summary_df = summary_df.sort_values("BMI组").reset_index(drop=True)

# Write the raw bootstrap draws and their per-group summary as CSV files.
# os.makedirs("") raises FileNotFoundError, so the directory is only created
# when OUT_DIR is non-empty; an empty OUT_DIR means the working directory.
if OUT_DIR:
    os.makedirs(OUT_DIR, exist_ok=True)
# os.path.join("", name) returns name unchanged, so the default paths are the
# bare file names; a configured OUT_DIR is now actually honoured.
out_boot = os.path.join(OUT_DIR, "第三问_step3_bootstrap_最优周分布_阈值60.csv")
out_sum = os.path.join(OUT_DIR, "第三问_step3_bootstrap_最优周汇总_阈值60.csv")
boot_df.to_csv(out_boot, index=False)
summary_df.to_csv(out_sum, index=False)

# Cell output: a preview of the summary plus the two output paths.
summary_df.head(10), out_sum, out_boot
