In [1]:
import os
import json
import copy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from collections import defaultdict, Counter

from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import KFold

### Config

In [5]:
# Reproducibility: seed NumPy's global RNG up front.
SEED = 42
np.random.seed(SEED)

# Which dataset / target to load: 'add', 'onset' or 'hist'.
target_var = "onset"

# Number of cross-validation folds.
N_FOLDS = 5

# Folder that receives every CSV / PNG report written by this notebook.
OUT_DIR = "/content/drive/MyDrive/reports/ts2vec-04-lgbm-yuntao"

In [6]:
########## For Colab ##########
!pip install ts2vec
from ts2vec import TS2Vec

########## Personal ##########
from google.colab import drive
drive.mount('/content/drive')
with open(f'/content/drive/MyDrive/datasets/dataset_{target_var}.json') as f:
    content = f.read()
    data = json.loads(content)

########## Enterprise ##########
# import gcsfs
# fs = gcsfs.GCSFileSystem()
# with fs.open('gs://modoo-eod/users/datasets/dataset_hist.json') as f:
#     content = f.read()
#     data = json.loads(content)

######### Local ##########
# with open("../datasets/dataset_onset.json") as f:
#     content=f.read()
#     data=json.loads(content)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
# Tabular view of the raw measurements (used for the raw count and the imputation means).
df = pd.DataFrame.from_records(data)
print(len(df), "Measurements (raw)")

# %% Basic cleaning & static imputation
# Mean of static[0] / static[1] over all raw rows where the value is present.
# The variable names suggest age and BMI — TODO confirm against the dataset schema.
static_rows = df["static"]
age_mean = np.mean([row[0] for row in static_rows if pd.notna(row[0])])
bmi_mean = np.mean([row[1] for row in static_rows if pd.notna(row[1])])

cleaned_data = []
for measurement in data:
    # Skip measurements that lack UC windows or FHR windows.
    if not (len(measurement["uc_windows"]) > 0 and len(measurement["fhr_windows"]) > 0):
        continue

    # Fill missing static[0] / static[1] on a copy so the raw record stays untouched.
    filled_static = measurement["static"].copy()
    for pos, fallback in ((0, age_mean), (1, bmi_mean)):
        if pd.isna(filled_static[pos]):
            filled_static[pos] = fallback

    # Shallow copy of the record with the imputed static vector swapped in.
    cleaned_data.append({**measurement, "static": filled_static})

cleaned_df = pd.DataFrame(cleaned_data)
print(len(cleaned_df), "Cleaned Measurements")

# Gestational age in weeks, 1-based (assumes the last static entry is gest_age_days).
cleaned_df["gest_age_weeks"] = [(vec[-1] // 7) + 1 for vec in cleaned_df["static"]]

print("Gestational age weeks distribution:")
print(cleaned_df["gest_age_weeks"].value_counts().sort_index())

# List-of-dicts form used by the cross-validation loop, plus its size and row indices.
base_records = cleaned_df.to_dict(orient="records")
N = len(base_records)
indices = np.arange(N)

# %% Build feature helpers
# Order in which the per-measurement feature groups are concatenated.
FEATURE_ORDER = ["uc_raw", "fhr_raw", "fhr_windows", "uc_windows", "static"]
# Expected flattened length of each feature group.
FEATURE_DIMS = {
    "uc_raw": 320,
    "fhr_raw": 320,
    "fhr_windows": 24,
    "uc_windows": 20,
    "static": 8,
}


def to_feature_vec(m):
    """Flatten one measurement into a single float32 feature vector.

    Every key listed in ``FEATURE_ORDER`` is read from ``m``, cast to float32
    and flattened; the pieces are concatenated in that order.

    Args:
        m: mapping that holds an array-like value for every key in
            ``FEATURE_ORDER``.

    Returns:
        1-D float32 array whose length is the sum of ``FEATURE_DIMS`` values.

    Raises:
        AssertionError: if a flattened group does not have the length declared
            in ``FEATURE_DIMS``.
    """
    def _flat(key):
        # Cast + flatten one group and check it against its declared size.
        flat = np.asarray(m[key], dtype=np.float32).reshape(-1)
        expected = FEATURE_DIMS[key]
        assert flat.size == expected, f"{key} dim mismatch: {flat.size} vs {expected}"
        return flat

    pieces = [_flat(key) for key in FEATURE_ORDER]
    return np.concatenate(pieces).astype(np.float32, copy=False)


def make_xy(measurements):
    """Turn a list of measurement records into model-ready arrays.

    Args:
        measurements: records accepted by ``to_feature_vec`` that also carry
            the keys ``"target"`` and ``"gest_age_weeks"``.

    Returns:
        Tuple ``(X, y, weeks)``: the stacked feature rows, the flattened
        float32 targets and the int32 gestational-week values.
    """
    feature_rows = [to_feature_vec(rec) for rec in measurements]
    X = np.stack(feature_rows, axis=0)

    target_values = [rec["target"] for rec in measurements]
    y = np.asarray(target_values, dtype=np.float32).ravel()

    week_values = [rec["gest_age_weeks"] for rec in measurements]
    weeks = np.asarray(week_values, dtype=np.int32)

    return X, y, weeks


# %% 5-fold CV
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

# Every fold writes a CSV into OUT_DIR, so make sure the folder exists up front.
os.makedirs(OUT_DIR, exist_ok=True)

all_true = []
all_pred = []
all_weeks = []
fold_metrics = []
rows_week_fold = []
all_pred_rows = []   # per-fold frames that end up in prediction.csv

# Index the measurements by patient id ("mobile") once, instead of rescanning
# base_records for every patient in every fold.
records_by_patient = defaultdict(list)
for m in base_records:
    records_by_patient[m['mobile']].append(m)

# Patients in first-appearance order. list(set(...)) has no guaranteed order
# (string hashes are randomised per interpreter session), so fold membership
# could change between kernel restarts even with a fixed SEED.
patients = list(records_by_patient)

for fold, (tr_idx, te_idx) in enumerate(kf.split(np.arange(len(patients))), start=1):

    print(f"\n===== Fold {fold}/{N_FOLDS} =====")

    # Split at patient level so that no patient contributes to both sides.
    train_patients = [patients[i] for i in tr_idx]
    test_patients = [patients[i] for i in te_idx]

    # Deep copies: the records are overwritten below (embeddings, aggregated
    # windows) and base_records must stay pristine for the next fold.
    train_recs = [copy.deepcopy(m) for p in train_patients for m in records_by_patient[p]]
    test_recs = [copy.deepcopy(m) for p in test_patients for m in records_by_patient[p]]

    print(f"Train : {len(train_recs)} from {len(train_patients)} patients")
    print(f"Test  : {len(test_recs)} from {len(test_patients)} patients")

    # --- 1. Prepare the TS2Vec inputs (a trailing channel axis is appended) ---
    train_uc_raw = np.expand_dims(np.array([m["uc_raw"] for m in train_recs], dtype=np.float32), axis=2)
    train_fhr_raw = np.expand_dims(np.array([m["fhr_raw"] for m in train_recs], dtype=np.float32), axis=2)
    test_uc_raw = np.expand_dims(np.array([m["uc_raw"] for m in test_recs], dtype=np.float32), axis=2)
    test_fhr_raw = np.expand_dims(np.array([m["fhr_raw"] for m in test_recs], dtype=np.float32), axis=2)

    print("  Train UC shape :", train_uc_raw.shape)
    print("  Train FHR shape:", train_fhr_raw.shape)
    print("  Test  UC shape :", test_uc_raw.shape)
    print("  Test  FHR shape:", test_fhr_raw.shape)

    # --- 2. TS2Vec encoder (one model instance encodes both signals) ---
    # NOTE(review): no ts_model.fit(...) call is made anywhere in this cell, so
    # the embeddings come from whatever weights the constructor sets up —
    # confirm that skipping the training step is intended.
    ts_model = TS2Vec(
        input_dims=1,
        output_dims=320,
        device=0,       # use the GPU when CUDA is available; "cuda:0" works as well
        batch_size=32,
    )

    # UC embeddings
    print("  [Fold %d] TS2Vec encoding UC..." % fold)
    train_uc_embed = ts_model.encode(train_uc_raw, encoding_window="full_series")
    test_uc_embed = ts_model.encode(test_uc_raw, encoding_window="full_series")

    # FHR embeddings
    print("  [Fold %d] TS2Vec encoding FHR..." % fold)
    train_fhr_embed = ts_model.encode(train_fhr_raw, encoding_window="full_series")
    test_fhr_embed = ts_model.encode(test_fhr_raw, encoding_window="full_series")

    # Replace the raw series of every record with its embedding.
    for rec, uc_e, fhr_e in zip(train_recs, train_uc_embed, train_fhr_embed):
        rec["uc_raw"] = uc_e
        rec["fhr_raw"] = fhr_e
    for rec, uc_e, fhr_e in zip(test_recs, test_uc_embed, test_fhr_embed):
        rec["uc_raw"] = uc_e
        rec["fhr_raw"] = fhr_e

    # --- 3. Aggregate window-level features (mean over the windows of a record) ---
    for rec in train_recs + test_recs:
        uc_w = np.array([list(w.values()) for w in rec["uc_windows"]], dtype=np.float32)
        fhr_w = np.array([list(w.values()) for w in rec["fhr_windows"]], dtype=np.float32)
        rec["uc_windows"] = uc_w.mean(axis=0)
        rec["fhr_windows"] = fhr_w.mean(axis=0)
        assert len(rec["uc_windows"]) == FEATURE_DIMS["uc_windows"]
        assert len(rec["fhr_windows"]) == FEATURE_DIMS["fhr_windows"]

    # --- 4. Build the feature matrices ---
    X_tr, y_tr, weeks_tr = make_xy(train_recs)
    X_te, y_te, weeks_te = make_xy(test_recs)

    print(f"[Fold {fold}] X_tr={X_tr.shape}, X_te={X_te.shape}")

    # --- 5. Train LightGBM ---
    lgbm = LGBMRegressor(
        n_estimators=2000,
        learning_rate=0.03,
        subsample=0.9,
        colsample_bytree=0.8,
        random_state=SEED,
        n_jobs=-1,
    )

    lgbm.fit(X_tr, y_tr)

    # --- 6. Predict & evaluate ---
    y_tr_pred = lgbm.predict(X_tr)
    y_te_pred = lgbm.predict(X_te)

    fold_train_mae = mean_absolute_error(y_tr, y_tr_pred)
    fold_test_mae = mean_absolute_error(y_te, y_te_pred)
    fold_test_rmse = np.sqrt(mean_squared_error(y_te, y_te_pred))
    fold_test_r2 = r2_score(y_te, y_te_pred)

    print(f"[Fold {fold}] Train MAE = {fold_train_mae:.3f}")
    print(f"[Fold {fold}] Test  MAE = {fold_test_mae:.3f}, RMSE = {fold_test_rmse:.3f}, R² = {fold_test_r2:.3f}")

    fold_metrics.append({
        "fold": fold,
        "train_mae": fold_train_mae,
        "test_mae": fold_test_mae,
        "test_rmse": fold_test_rmse,
        "test_r2": fold_test_r2,
    })

    # Keep the out-of-fold points for the global scatter plot.
    all_true.append(y_te)
    all_pred.append(y_te_pred)
    all_weeks.append(weeks_te)

    # Rows for prediction.csv
    df_pred_fold = pd.DataFrame({
        "fold": fold,
        "gest_age_weeks": weeks_te.astype(int),
        "y_true": y_te.astype(float),
        "y_pred": y_te_pred.astype(float),
    })
    all_pred_rows.append(df_pred_fold)

    # --- 7. MAE per gestational week ---
    abs_err = np.abs(y_te_pred - y_te)
    rows_this_fold = []
    print(f"[Fold {fold}] Test MAE by gestational week:")
    # np.unique returns the distinct weeks already sorted, each present at least once.
    for w in np.unique(weeks_te):
        mask_w = (weeks_te == w)
        n_w = int(mask_w.sum())
        mae_w = float(abs_err[mask_w].mean())
        rows_this_fold.append({
            "fold": fold,
            "week": int(w),
            "n_samples": n_w,
            "mae": mae_w,
        })
        print(f"  Week {int(w):2d}: MAE={mae_w:.3f} (n={n_w})")

    rows_week_fold.extend(rows_this_fold)

    # Per-fold, per-week CSV (optional artefact)
    df_fold_week = pd.DataFrame(rows_this_fold)
    fold_week_csv = os.path.join(OUT_DIR, f"fold_{fold:02d}_week_mae.csv")
    df_fold_week.to_csv(fold_week_csv, index=False)
    print(f"[Saved] {fold_week_csv}")

# %% 汇总 prediction.csv
def _save_report(frame, filename, prefix=""):
    """Write ``frame`` to OUT_DIR/filename without the index, log the path and return it."""
    path = os.path.join(OUT_DIR, filename)
    frame.to_csv(path, index=False)
    print(f"{prefix}[Saved] {path}")
    return path


# Out-of-fold predictions of every fold, stacked into one table.
df_pred_all = pd.concat(all_pred_rows, ignore_index=True)
pred_csv_path = _save_report(df_pred_all, "prediction.csv", prefix="\n")

# %% Aggregate results
# Flatten the per-fold arrays into one vector each.
all_true, all_pred, all_weeks = [
    np.concatenate(chunks) for chunks in (all_true, all_pred, all_weeks)
]

# Overall metrics, one row per fold.
df_folds = pd.DataFrame(fold_metrics)
fold_summary_csv = _save_report(df_folds, "fold_overall_metrics.csv")

print("\nOverall across folds:")
print(df_folds)

# One row per (fold, week).
df_week_fold = pd.DataFrame(rows_week_fold)
week_fold_csv = _save_report(df_week_fold, "week_mae_per_fold.csv")

# Mean / std of the per-fold MAE for every week, plus sample and fold counts.
week_aggregations = {
    "mae_mean": ("mae", "mean"),
    "mae_std": ("mae", "std"),
    "n_total": ("n_samples", "sum"),
    "n_folds": ("mae", "count"),
}
df_week_summary = df_week_fold.groupby("week").agg(**week_aggregations).reset_index()

week_summary_csv = _save_report(df_week_summary, "week_mae_summary.csv")

print("\nWeek-wise MAE summary:")
print(df_week_summary)

# %% 可视化 1：per-week MAE (bar + SD)
# Figure 1: mean per-week MAE across folds, with the between-fold SD as error bars.
fig, ax = plt.subplots(figsize=(10, 5))
x = df_week_summary["week"].values
y = df_week_summary["mae_mean"].values
yerr = df_week_summary["mae_std"].values

ax.bar(x, y, yerr=yerr, capsize=3)
ax.set_xlabel("Gestational Age (weeks)")
ax.set_ylabel("MAE (days-to-onset)")
# The fold count comes from N_FOLDS so the title stays correct if it is changed.
ax.set_title(f"Per-week MAE across {N_FOLDS}-fold CV (TS2Vec + LGBM)")
ax.set_xticks(x)
fig.tight_layout()
bar_path = os.path.join(OUT_DIR, "week_mae_bar_with_sd.png")
fig.savefig(bar_path, dpi=200)
plt.close(fig)
print(f"[Saved] {bar_path}")

# %% Figure 2: true vs predicted scatter over all out-of-fold predictions
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(all_true, all_pred, s=8, alpha=0.5)
# Diagonal spanning the joint range of targets and predictions.
mn = float(min(all_true.min(), all_pred.min()))
mx = float(max(all_true.max(), all_pred.max()))
ax.plot([mn, mx], [mn, mx], "r--", label="Ideal")

# Metrics pooled over every out-of-fold prediction.
overall_mae = mean_absolute_error(all_true, all_pred)
overall_rmse = np.sqrt(mean_squared_error(all_true, all_pred))
overall_r2 = r2_score(all_true, all_pred)

ax.set_xlabel("True days to onset")
ax.set_ylabel("Predicted days to onset")
ax.set_title(f"{N_FOLDS}-fold CV — True vs Pred (TS2Vec + LGBM)\nMAE={overall_mae:.2f}, RMSE={overall_rmse:.2f}, R²={overall_r2:.3f}")
ax.legend(loc="upper left")
fig.tight_layout()
scatter_path = os.path.join(OUT_DIR, "cv_true_vs_pred_scatter.png")
fig.savefig(scatter_path, dpi=200)
plt.close(fig)
print(f"[Saved] {scatter_path}")


3681 Measurements (raw)
3661 Cleaned Measurements
Gestational age weeks distribution:
gest_age_weeks
29     25
30     72
31    105
32    157
33    254
34    313
35    371
36    443
37    503
38    544
39    463
40    321
41     89
42      1
Name: count, dtype: int64

===== Fold 1/5 =====
Train : 2857 from 45 patients
Test  : 804 from 12 patients
  Train UC shape : (2857, 2048, 1)
  Train FHR shape: (2857, 2048, 1)
  Test  UC shape : (804, 2048, 1)
  Test  FHR shape: (804, 2048, 1)
  [Fold 1] TS2Vec encoding UC...
  [Fold 1] TS2Vec encoding FHR...
[Fold 1] X_tr=(2857, 692), X_te=(804, 692)
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.052193 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 170946
[LightGBM] [Info] Number of data points in the train set: 2857, number of used features: 679
[LightGBM] [Info] Start training from score 25.800959




[Fold 1] Train MAE = 0.004
[Fold 1] Test  MAE = 7.805, RMSE = 9.392, R² = 0.719
[Fold 1] Test MAE by gestational week:
  Week 30: MAE=21.813 (n=2)
  Week 31: MAE=4.376 (n=8)
  Week 32: MAE=10.630 (n=25)
  Week 33: MAE=10.863 (n=75)
  Week 34: MAE=9.196 (n=81)
  Week 35: MAE=8.404 (n=84)
  Week 36: MAE=7.101 (n=93)
  Week 37: MAE=7.767 (n=98)
  Week 38: MAE=8.376 (n=107)
  Week 39: MAE=7.686 (n=100)
  Week 40: MAE=4.774 (n=88)
  Week 41: MAE=3.714 (n=42)
  Week 42: MAE=2.773 (n=1)
[Saved] /content/drive/MyDrive/reports/ts2vec-04-lgbm-yuntao/fold_01_week_mae.csv

===== Fold 2/5 =====
Train : 3015 from 45 patients
Test  : 646 from 12 patients
  Train UC shape : (3015, 2048, 1)
  Train FHR shape: (3015, 2048, 1)
  Test  UC shape : (646, 2048, 1)
  Test  FHR shape: (646, 2048, 1)
  [Fold 2] TS2Vec encoding UC...
  [Fold 2] TS2Vec encoding FHR...
[Fold 2] X_tr=(3015, 692), X_te=(646, 692)
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.042384 seconds.




[Fold 2] Train MAE = 0.016
[Fold 2] Test  MAE = 8.803, RMSE = 12.131, R² = 0.529
[Fold 2] Test MAE by gestational week:
  Week 29: MAE=9.311 (n=4)
  Week 30: MAE=8.934 (n=9)
  Week 31: MAE=19.314 (n=22)
  Week 32: MAE=18.335 (n=42)
  Week 33: MAE=13.798 (n=42)
  Week 34: MAE=10.281 (n=55)
  Week 35: MAE=8.820 (n=66)
  Week 36: MAE=9.241 (n=82)
  Week 37: MAE=8.671 (n=84)
  Week 38: MAE=6.439 (n=94)
  Week 39: MAE=4.466 (n=83)
  Week 40: MAE=3.204 (n=52)
  Week 41: MAE=1.658 (n=11)
[Saved] /content/drive/MyDrive/reports/ts2vec-04-lgbm-yuntao/fold_02_week_mae.csv

===== Fold 3/5 =====
Train : 3016 from 46 patients
Test  : 645 from 11 patients
  Train UC shape : (3016, 2048, 1)
  Train FHR shape: (3016, 2048, 1)
  Test  UC shape : (645, 2048, 1)
  Test  FHR shape: (645, 2048, 1)
  [Fold 3] TS2Vec encoding UC...
  [Fold 3] TS2Vec encoding FHR...
[Fold 3] X_tr=(3016, 692), X_te=(645, 692)
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.043184 seconds.



[Fold 3] Train MAE = 0.012
[Fold 3] Test  MAE = 5.515, RMSE = 6.552, R² = 0.842
[Fold 3] Test MAE by gestational week:
  Week 29: MAE=5.117 (n=6)
  Week 30: MAE=3.208 (n=12)
  Week 31: MAE=2.395 (n=5)
  Week 32: MAE=6.539 (n=9)
  Week 33: MAE=5.088 (n=34)
  Week 34: MAE=5.196 (n=57)
  Week 35: MAE=5.856 (n=56)
  Week 36: MAE=6.401 (n=72)
  Week 37: MAE=7.515 (n=116)
  Week 38: MAE=5.657 (n=122)
  Week 39: MAE=4.778 (n=92)
  Week 40: MAE=2.793 (n=52)
  Week 41: MAE=1.020 (n=12)
[Saved] /content/drive/MyDrive/reports/ts2vec-04-lgbm-yuntao/fold_03_week_mae.csv

===== Fold 4/5 =====
Train : 2710 from 46 patients
Test  : 951 from 11 patients
  Train UC shape : (2710, 2048, 1)
  Train FHR shape: (2710, 2048, 1)
  Test  UC shape : (951, 2048, 1)
  Test  FHR shape: (951, 2048, 1)
  [Fold 4] TS2Vec encoding UC...
  [Fold 4] TS2Vec encoding FHR...
[Fold 4] X_tr=(2710, 692), X_te=(951, 692)
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.099686 seconds.
You



[Fold 4] Train MAE = 0.013
[Fold 4] Test  MAE = 7.428, RMSE = 9.602, R² = 0.802
[Fold 4] Test MAE by gestational week:
  Week 29: MAE=16.036 (n=15)
  Week 30: MAE=16.691 (n=49)
  Week 31: MAE=11.468 (n=58)
  Week 32: MAE=8.440 (n=68)
  Week 33: MAE=8.809 (n=66)
  Week 34: MAE=5.703 (n=77)
  Week 35: MAE=7.153 (n=96)
  Week 36: MAE=7.821 (n=101)
  Week 37: MAE=7.651 (n=114)
  Week 38: MAE=6.873 (n=120)
  Week 39: MAE=4.094 (n=102)
  Week 40: MAE=1.663 (n=72)
  Week 41: MAE=2.691 (n=13)
[Saved] /content/drive/MyDrive/reports/ts2vec-04-lgbm-yuntao/fold_04_week_mae.csv

===== Fold 5/5 =====
Train : 3046 from 46 patients
Test  : 615 from 11 patients
  Train UC shape : (3046, 2048, 1)
  Train FHR shape: (3046, 2048, 1)
  Test  UC shape : (615, 2048, 1)
  Test  FHR shape: (615, 2048, 1)
  [Fold 5] TS2Vec encoding UC...
  [Fold 5] TS2Vec encoding FHR...
[Fold 5] X_tr=(3046, 692), X_te=(615, 692)
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.089241 seco



[Fold 5] Train MAE = 0.017
[Fold 5] Test  MAE = 5.648, RMSE = 6.525, R² = 0.834
[Fold 5] Test MAE by gestational week:
  Week 31: MAE=5.861 (n=12)
  Week 32: MAE=7.493 (n=13)
  Week 33: MAE=7.853 (n=37)
  Week 34: MAE=7.461 (n=43)
  Week 35: MAE=6.625 (n=69)
  Week 36: MAE=6.459 (n=95)
  Week 37: MAE=6.013 (n=91)
  Week 38: MAE=5.280 (n=101)
  Week 39: MAE=4.518 (n=86)
  Week 40: MAE=2.538 (n=57)
  Week 41: MAE=0.921 (n=11)
[Saved] /content/drive/MyDrive/reports/ts2vec-04-lgbm-yuntao/fold_05_week_mae.csv

[Saved] /content/drive/MyDrive/reports/ts2vec-04-lgbm-yuntao/prediction.csv
[Saved] /content/drive/MyDrive/reports/ts2vec-04-lgbm-yuntao/fold_overall_metrics.csv

Overall across folds:
   fold  train_mae  test_mae  test_rmse   test_r2
0     1   0.004162  7.805046   9.391822  0.719041
1     2   0.016328  8.802806  12.131069  0.528662
2     3   0.012418  5.514725   6.551848  0.842275
3     4   0.013271  7.427822   9.602374  0.801596
4     5   0.016802  5.648214   6.525060  0.833889
[Sav

## add

In [None]:
# %% Imports
import os
import json
import copy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from collections import defaultdict, Counter

from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import KFold

from ts2vec import TS2Vec

# %% Config
# Reproducibility: seed NumPy's global RNG up front.
SEED = 42
np.random.seed(SEED)

# Which dataset / target to load: 'add', 'onset' or 'hist'.
target_var = "add"

# Number of cross-validation folds.
N_FOLDS = 5

# Input: the JSON dataset that matches `target_var`.
DATA_ROOT = r"F:\E3A Healthcare\TS2VEC+LGBM\datasets"
json_path = os.path.join(DATA_ROOT, f"dataset_{target_var}.json")

# Output: folder for all CSV / PNG reports, created if it does not exist yet.
OUT_DIR = r"F:\E3A Healthcare\TS2VEC+LGBM\cv_ts2vec_lgbm_5fold_add"
os.makedirs(OUT_DIR, exist_ok=True)

# %% Load data
# Read the whole dataset (a JSON list of measurement records) into memory.
with open(json_path, mode="r", encoding="utf-8") as f:
    data = json.load(f)

# Tabular view of the raw measurements (used for the raw count and the imputation means).
df = pd.DataFrame.from_records(data)
print(len(df), "Measurements (raw)")

# %% Basic cleaning & static imputation
# Mean of static[0] / static[1] over all raw rows where the value is present.
# The variable names suggest age and BMI — TODO confirm against the dataset schema.
static_rows = df["static"]
age_mean = np.mean([row[0] for row in static_rows if pd.notna(row[0])])
bmi_mean = np.mean([row[1] for row in static_rows if pd.notna(row[1])])

cleaned_data = []
for measurement in data:
    # Skip measurements that lack UC windows or FHR windows.
    if not (len(measurement["uc_windows"]) > 0 and len(measurement["fhr_windows"]) > 0):
        continue

    # Fill missing static[0] / static[1] on a copy so the raw record stays untouched.
    filled_static = measurement["static"].copy()
    for pos, fallback in ((0, age_mean), (1, bmi_mean)):
        if pd.isna(filled_static[pos]):
            filled_static[pos] = fallback

    # Shallow copy of the record with the imputed static vector swapped in.
    cleaned_data.append({**measurement, "static": filled_static})

cleaned_df = pd.DataFrame(cleaned_data)
print(len(cleaned_df), "Cleaned Measurements")

# Gestational age in weeks, 1-based (assumes the last static entry is gest_age_days).
cleaned_df["gest_age_weeks"] = [(vec[-1] // 7) + 1 for vec in cleaned_df["static"]]

print("Gestational age weeks distribution:")
print(cleaned_df["gest_age_weeks"].value_counts().sort_index())

# List-of-dicts form used by the cross-validation loop, plus its size and row indices.
base_records = cleaned_df.to_dict(orient="records")
N = len(base_records)
indices = np.arange(N)

# %% Build feature helpers
# Order in which the per-measurement feature groups are concatenated.
FEATURE_ORDER = ["uc_raw", "fhr_raw", "fhr_windows", "uc_windows", "static"]
# Expected flattened length of each feature group.
FEATURE_DIMS = {
    "uc_raw": 320,
    "fhr_raw": 320,
    "fhr_windows": 24,
    "uc_windows": 20,
    "static": 8,
}


def to_feature_vec(m):
    """Flatten one measurement into a single float32 feature vector.

    Every key listed in ``FEATURE_ORDER`` is read from ``m``, cast to float32
    and flattened; the pieces are concatenated in that order.

    Args:
        m: mapping that holds an array-like value for every key in
            ``FEATURE_ORDER``.

    Returns:
        1-D float32 array whose length is the sum of ``FEATURE_DIMS`` values.

    Raises:
        AssertionError: if a flattened group does not have the length declared
            in ``FEATURE_DIMS``.
    """
    def _flat(key):
        # Cast + flatten one group and check it against its declared size.
        flat = np.asarray(m[key], dtype=np.float32).reshape(-1)
        expected = FEATURE_DIMS[key]
        assert flat.size == expected, f"{key} dim mismatch: {flat.size} vs {expected}"
        return flat

    pieces = [_flat(key) for key in FEATURE_ORDER]
    return np.concatenate(pieces).astype(np.float32, copy=False)


def make_xy(measurements):
    """Turn a list of measurement records into model-ready arrays.

    Args:
        measurements: records accepted by ``to_feature_vec`` that also carry
            the keys ``"target"`` and ``"gest_age_weeks"``.

    Returns:
        Tuple ``(X, y, weeks)``: the stacked feature rows, the flattened
        float32 targets and the int32 gestational-week values.
    """
    feature_rows = [to_feature_vec(rec) for rec in measurements]
    X = np.stack(feature_rows, axis=0)

    target_values = [rec["target"] for rec in measurements]
    y = np.asarray(target_values, dtype=np.float32).ravel()

    week_values = [rec["gest_age_weeks"] for rec in measurements]
    weeks = np.asarray(week_values, dtype=np.int32)

    return X, y, weeks


# %% 5-fold CV
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

all_true = []
all_pred = []
all_weeks = []
fold_metrics = []
rows_week_fold = []
all_pred_rows = []   # per-fold frames that end up in prediction.csv

# Split by patient rather than by individual measurement: a record-level
# KFold puts measurements of the same patient on both sides of the split,
# which leaks patient-specific information into the test fold.
# Records are grouped by their "mobile" id in first-appearance order; a record
# without that key becomes its own group, which reproduces the record-level split.
records_by_patient = defaultdict(list)
for row, m in enumerate(base_records):
    records_by_patient[m.get("mobile", ("__row__", row))].append(m)
patients = list(records_by_patient)

for fold, (tr_idx, te_idx) in enumerate(kf.split(np.arange(len(patients))), start=1):
    print(f"\n===== Fold {fold}/{N_FOLDS} =====")

    # Deep copies: the records are overwritten below (embeddings, aggregated
    # windows) and base_records must stay pristine for the next fold.
    train_recs = [copy.deepcopy(m) for i in tr_idx for m in records_by_patient[patients[i]]]
    test_recs = [copy.deepcopy(m) for i in te_idx for m in records_by_patient[patients[i]]]

    print(f"Train samples: {len(train_recs)}, Test samples: {len(test_recs)}")

    # --- 1. Prepare the TS2Vec inputs (a trailing channel axis is appended) ---
    train_uc_raw = np.expand_dims(np.array([m["uc_raw"] for m in train_recs], dtype=np.float32), axis=2)
    train_fhr_raw = np.expand_dims(np.array([m["fhr_raw"] for m in train_recs], dtype=np.float32), axis=2)
    test_uc_raw = np.expand_dims(np.array([m["uc_raw"] for m in test_recs], dtype=np.float32), axis=2)
    test_fhr_raw = np.expand_dims(np.array([m["fhr_raw"] for m in test_recs], dtype=np.float32), axis=2)

    print("  Train UC shape :", train_uc_raw.shape)
    print("  Train FHR shape:", train_fhr_raw.shape)
    print("  Test  UC shape :", test_uc_raw.shape)
    print("  Test  FHR shape:", test_fhr_raw.shape)

    # --- 2. TS2Vec encoder (one model instance encodes both signals) ---
    # NOTE(review): no ts_model.fit(...) call is made anywhere in this cell, so
    # the embeddings come from whatever weights the constructor sets up —
    # confirm that skipping the training step is intended.
    ts_model = TS2Vec(
        input_dims=1,
        output_dims=320,
        device=0,       # use the GPU when CUDA is available; "cuda:0" works as well
        batch_size=32,
    )

    # UC embeddings
    print("  [Fold %d] TS2Vec encoding UC..." % fold)
    train_uc_embed = ts_model.encode(train_uc_raw, encoding_window="full_series")
    test_uc_embed = ts_model.encode(test_uc_raw, encoding_window="full_series")

    # FHR embeddings
    print("  [Fold %d] TS2Vec encoding FHR..." % fold)
    train_fhr_embed = ts_model.encode(train_fhr_raw, encoding_window="full_series")
    test_fhr_embed = ts_model.encode(test_fhr_raw, encoding_window="full_series")

    # Replace the raw series of every record with its embedding.
    for rec, uc_e, fhr_e in zip(train_recs, train_uc_embed, train_fhr_embed):
        rec["uc_raw"] = uc_e
        rec["fhr_raw"] = fhr_e
    for rec, uc_e, fhr_e in zip(test_recs, test_uc_embed, test_fhr_embed):
        rec["uc_raw"] = uc_e
        rec["fhr_raw"] = fhr_e

    # --- 3. Aggregate window-level features (mean over the windows of a record) ---
    for rec in train_recs + test_recs:
        uc_w = np.array([list(w.values()) for w in rec["uc_windows"]], dtype=np.float32)
        fhr_w = np.array([list(w.values()) for w in rec["fhr_windows"]], dtype=np.float32)
        rec["uc_windows"] = uc_w.mean(axis=0)
        rec["fhr_windows"] = fhr_w.mean(axis=0)
        assert len(rec["uc_windows"]) == FEATURE_DIMS["uc_windows"]
        assert len(rec["fhr_windows"]) == FEATURE_DIMS["fhr_windows"]

    # --- 4. Build the feature matrices ---
    X_tr, y_tr, weeks_tr = make_xy(train_recs)
    X_te, y_te, weeks_te = make_xy(test_recs)

    print(f"[Fold {fold}] X_tr={X_tr.shape}, X_te={X_te.shape}")

    # --- 5. Train LightGBM ---
    lgbm = LGBMRegressor(
        n_estimators=2000,
        learning_rate=0.03,
        subsample=0.9,
        colsample_bytree=0.8,
        random_state=SEED,
        n_jobs=-1,
    )

    lgbm.fit(X_tr, y_tr)

    # --- 6. Predict & evaluate ---
    y_tr_pred = lgbm.predict(X_tr)
    y_te_pred = lgbm.predict(X_te)

    fold_train_mae = mean_absolute_error(y_tr, y_tr_pred)
    fold_test_mae = mean_absolute_error(y_te, y_te_pred)
    fold_test_rmse = np.sqrt(mean_squared_error(y_te, y_te_pred))
    fold_test_r2 = r2_score(y_te, y_te_pred)

    print(f"[Fold {fold}] Train MAE = {fold_train_mae:.3f}")
    print(f"[Fold {fold}] Test  MAE = {fold_test_mae:.3f}, RMSE = {fold_test_rmse:.3f}, R² = {fold_test_r2:.3f}")

    fold_metrics.append({
        "fold": fold,
        "train_mae": fold_train_mae,
        "test_mae": fold_test_mae,
        "test_rmse": fold_test_rmse,
        "test_r2": fold_test_r2,
    })

    # Keep the out-of-fold points for the global scatter plot.
    all_true.append(y_te)
    all_pred.append(y_te_pred)
    all_weeks.append(weeks_te)

    # Rows for prediction.csv
    df_pred_fold = pd.DataFrame({
        "fold": fold,
        "gest_age_weeks": weeks_te.astype(int),
        "y_true": y_te.astype(float),
        "y_pred": y_te_pred.astype(float),
    })
    all_pred_rows.append(df_pred_fold)

    # --- 7. MAE per gestational week ---
    abs_err = np.abs(y_te_pred - y_te)
    rows_this_fold = []
    print(f"[Fold {fold}] Test MAE by gestational week:")
    # np.unique returns the distinct weeks already sorted, each present at least once.
    for w in np.unique(weeks_te):
        mask_w = (weeks_te == w)
        n_w = int(mask_w.sum())
        mae_w = float(abs_err[mask_w].mean())
        rows_this_fold.append({
            "fold": fold,
            "week": int(w),
            "n_samples": n_w,
            "mae": mae_w,
        })
        print(f"  Week {int(w):2d}: MAE={mae_w:.3f} (n={n_w})")

    rows_week_fold.extend(rows_this_fold)

    # Per-fold, per-week CSV (optional artefact)
    df_fold_week = pd.DataFrame(rows_this_fold)
    fold_week_csv = os.path.join(OUT_DIR, f"fold_{fold:02d}_week_mae.csv")
    df_fold_week.to_csv(fold_week_csv, index=False)
    print(f"[Saved] {fold_week_csv}")

# %% 汇总 prediction.csv
def _save_report(frame, filename, prefix=""):
    """Write ``frame`` to OUT_DIR/filename without the index, log the path and return it."""
    path = os.path.join(OUT_DIR, filename)
    frame.to_csv(path, index=False)
    print(f"{prefix}[Saved] {path}")
    return path


# Out-of-fold predictions of every fold, stacked into one table.
df_pred_all = pd.concat(all_pred_rows, ignore_index=True)
pred_csv_path = _save_report(df_pred_all, "prediction.csv", prefix="\n")

# %% Aggregate results
# Flatten the per-fold arrays into one vector each.
all_true, all_pred, all_weeks = [
    np.concatenate(chunks) for chunks in (all_true, all_pred, all_weeks)
]

# Overall metrics, one row per fold.
df_folds = pd.DataFrame(fold_metrics)
fold_summary_csv = _save_report(df_folds, "fold_overall_metrics.csv")

print("\nOverall across folds:")
print(df_folds)

# One row per (fold, week).
df_week_fold = pd.DataFrame(rows_week_fold)
week_fold_csv = _save_report(df_week_fold, "week_mae_per_fold.csv")

# Mean / std of the per-fold MAE for every week, plus sample and fold counts.
week_aggregations = {
    "mae_mean": ("mae", "mean"),
    "mae_std": ("mae", "std"),
    "n_total": ("n_samples", "sum"),
    "n_folds": ("mae", "count"),
}
df_week_summary = df_week_fold.groupby("week").agg(**week_aggregations).reset_index()

week_summary_csv = _save_report(df_week_summary, "week_mae_summary.csv")

print("\nWeek-wise MAE summary:")
print(df_week_summary)

# %% 可视化 1：per-week MAE (bar + SD)
# NOTE(review): the axis labels below say "onset" although this cell sets
# target_var = "add" — confirm the wording matches the target being predicted.

# Figure 1: mean per-week MAE across folds, with the between-fold SD as error bars.
fig, ax = plt.subplots(figsize=(10, 5))
x = df_week_summary["week"].values
y = df_week_summary["mae_mean"].values
yerr = df_week_summary["mae_std"].values

ax.bar(x, y, yerr=yerr, capsize=3)
ax.set_xlabel("Gestational Age (weeks)")
ax.set_ylabel("MAE (days-to-onset)")
# The fold count comes from N_FOLDS so the title stays correct if it is changed.
ax.set_title(f"Per-week MAE across {N_FOLDS}-fold CV (TS2Vec + LGBM)")
ax.set_xticks(x)
fig.tight_layout()
bar_path = os.path.join(OUT_DIR, "week_mae_bar_with_sd.png")
fig.savefig(bar_path, dpi=200)
plt.close(fig)
print(f"[Saved] {bar_path}")

# %% Figure 2: true vs predicted scatter over all out-of-fold predictions
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(all_true, all_pred, s=8, alpha=0.5)
# Diagonal spanning the joint range of targets and predictions.
mn = float(min(all_true.min(), all_pred.min()))
mx = float(max(all_true.max(), all_pred.max()))
ax.plot([mn, mx], [mn, mx], "r--", label="Ideal")

# Metrics pooled over every out-of-fold prediction.
overall_mae = mean_absolute_error(all_true, all_pred)
overall_rmse = np.sqrt(mean_squared_error(all_true, all_pred))
overall_r2 = r2_score(all_true, all_pred)

ax.set_xlabel("True days to onset")
ax.set_ylabel("Predicted days to onset")
ax.set_title(f"{N_FOLDS}-fold CV — True vs Pred (TS2Vec + LGBM)\nMAE={overall_mae:.2f}, RMSE={overall_rmse:.2f}, R²={overall_r2:.3f}")
ax.legend(loc="upper left")
fig.tight_layout()
scatter_path = os.path.join(OUT_DIR, "cv_true_vs_pred_scatter.png")
fig.savefig(scatter_path, dpi=200)
plt.close(fig)
print(f"[Saved] {scatter_path}")
