In [29]:
# step1_split.py
import numpy as np, pandas as pd
from pathlib import Path
FEAT_CSV = r"../result/32k_none_features.csv"
df = pd.read_csv(FEAT_CSV)

# —— 构造“更贴合类别”的特征 —— #
base_cols = [
    "rms","kurtosis","crest_factor","p2p","shape_factor",
    "spec_entropy","spec_centroid","spec_rolloff95","rms_freq","freq_std",
    "BPFO_ratio","BPFI_ratio","BSF_ratio"
]
eps = 1e-9
df["sum_ratio"] = df["BPFO_ratio"] + df["BPFI_ratio"] + df["BSF_ratio"]
df["OR_score"] = df["BPFO_ratio"]/(df["BPFI_ratio"]+df["BSF_ratio"]+eps)
df["IR_score"] = df["BPFI_ratio"]/(df["BPFO_ratio"]+df["BSF_ratio"]+eps)
df["B_score"]  = df["BSF_ratio"] /(df["BPFO_ratio"]+df["BPFI_ratio"]+eps)
feat_cols = base_cols + ["sum_ratio","OR_score","IR_score","B_score"]

# —— 防御：缺失列填0（例如个别机理比值缺失）—— #
for c in feat_cols:
    if c not in df.columns:
        df[c] = 0.0

# —— 按“文件分层”：每个label至少保留1个文件到测试集 —— #
file_label = (df.groupby("file")["label"]
                .agg(lambda s: s.mode().iloc[0]).reset_index())
rng = np.random.default_rng(42)
test_files = []
for lab, g in file_label.groupby("label"):
    files = g["file"].tolist()
    k = max(1, int(np.ceil(0.2*len(files))))  # 约20%文件，至少1个
    test_files += rng.choice(files, size=k, replace=False).tolist()
test_files = set(test_files)
df["is_test"] = df["file"].isin(test_files)

# —— 导出 —— #
Path("../data/splits").mkdir(parents=True, exist_ok=True)
df[["file","label","is_test"]].drop_duplicates()\
  .to_csv("../data/splits/files_split.csv", index=False)

drop_cols = {"label","file","pos","size_code","load","rpm"}
X_tr = df[~df["is_test"]][feat_cols].fillna(0.0)
y_tr = df[~df["is_test"]]["label"].astype(str)
X_te = df[ df["is_test"]][feat_cols].fillna(0.0)
y_te = df[ df["is_test"]]["label"].astype(str)

# —— 新增：导出训练集分组键（按文件），供 GroupKFold 使用 —— #
groups_tr = df[~df["is_test"]]["file"].astype(str).reset_index(drop=True)
groups_te = df[ df["is_test"]]["file"].astype(str).reset_index(drop=True)

X_tr.to_csv("../data/splits/X_tr.csv", index=False)
y_tr.to_csv("../data/splits/y_tr.csv", index=False, header=True)
pd.DataFrame({"group": groups_tr}).to_csv("../data/splits/groups_tr.csv", index=False)

X_te.to_csv("../data/splits/X_te.csv", index=False)
y_te.to_csv("../data/splits/y_te.csv", index=False, header=True)
pd.DataFrame({"group": groups_te}).to_csv("../data/splits/groups_te.csv", index=False)

print("✅ Step1 done. 保存到 ../data/splits/")


✅ Step1 done. 保存到 ../data/splits/


In [33]:
# step2_train_lgbm.py
import pandas as pd, numpy as np, joblib, lightgbm as lgb
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import StratifiedGroupKFold
import pathlib

X_tr = pd.read_csv("../data/splits/X_tr.csv")
y_tr = pd.read_csv("../data/splits/y_tr.csv")["label"].astype(str)
groups_tr = pd.read_csv("../data/splits/groups_tr.csv")["group"].astype(str)

X_te = pd.read_csv("../data/splits/X_te.csv")
y_te = pd.read_csv("../data/splits/y_te.csv")["label"].astype(str)

# —— 标准化 —— #
scaler = StandardScaler(with_mean=False)
X_tr_s = pd.DataFrame(scaler.fit_transform(X_tr), columns=X_tr.columns, index=X_tr.index)
X_te_s = pd.DataFrame(scaler.transform(X_te),   columns=X_te.columns, index=X_te.index)

# —— 模型定义（与你一致）—— #
clf = lgb.LGBMClassifier(
  objective="multiclass",
  learning_rate=0.03,
  n_estimators=4000,
  num_leaves=96,
  max_depth=-1,
  min_child_samples=30,
  feature_fraction=0.85,
  bagging_fraction=0.85,
  bagging_freq=1,
  lambda_l1=0.2,
  lambda_l2=2.0,
  min_gain_to_split=0.0,
  class_weight="balanced",
  force_col_wise=True,
  random_state=42, n_jobs=-1
)

# —— 可选：对训练集做“分组K折”验证，确保准确率是真的 —— #
DO_CV = True
cv_summary = None
if DO_CV:
    skf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
    accs, f1s = [], []
    for tr_idx, va_idx in skf.split(X_tr_s, y_tr, groups_tr):
        X_tr_fold, y_tr_fold = X_tr_s.iloc[tr_idx], y_tr.iloc[tr_idx]
        X_va_fold, y_va_fold = X_tr_s.iloc[va_idx], y_tr.iloc[va_idx]
        clf_fold = lgb.LGBMClassifier(**clf.get_params())
        clf_fold.fit(
            X_tr_fold, y_tr_fold,
            eval_set=[(X_va_fold, y_va_fold)],
            eval_metric=["multi_logloss","multi_error"],
            callbacks=[lgb.early_stopping(100), lgb.log_evaluation(0)]
        )
        pred = clf_fold.predict(X_va_fold)
        accs.append(accuracy_score(y_va_fold, pred))
        f1s.append(f1_score(y_va_fold, pred, average="macro", zero_division=0))
    cv_summary = pd.DataFrame({
        "fold": np.arange(1, len(accs)+1),
        "accuracy": accs, "macro_f1": f1s
    })
    pathlib.Path("../data/models").mkdir(parents=True, exist_ok=True)
    cv_summary.to_csv("../data/models/cv_fold_metrics.csv", index=False)
    pd.DataFrame({
        "metric":["accuracy","macro_f1"],
        "mean":[np.mean(accs), np.mean(f1s)],
        "std":[np.std(accs, ddof=1), np.std(f1s, ddof=1)]
    }).to_csv("../data/models/cv_mean_std.csv", index=False)

# —— 在整个训练集上拟合最终模型（用于对外报告和后续迁移）—— #
clf.fit(
    X_tr_s, y_tr,
    eval_set=[(X_te_s, y_te)],
    eval_metric=["multi_logloss","multi_error"],
    callbacks=[lgb.early_stopping(100), lgb.log_evaluation(50)]
)

# —— 保存 —— #
output_file_path = "../data/models/LightGBM.joblib"
pathlib.Path(output_file_path).parent.mkdir(parents=True, exist_ok=True)
out = {"model": clf, "scaler": scaler, "features": list(X_tr.columns)}
joblib.dump(out, output_file_path)

# —— 保存特征重要性（和你一致）—— #
importance_path = "../data/models/lgbm_importance.csv"
pd.Series(clf.feature_importances_, index=X_tr.columns)\
  .sort_values(ascending=False).to_csv(importance_path)

print("✅ Step2 done. 模型/重要性已保存到 ../data/models/")
if cv_summary is not None:
    print("CV mean±std:",
          f"Acc={np.mean(accs):.4f}±{np.std(accs, ddof=1):.4f},",
          f"MacroF1={np.mean(f1s):.4f}±{np.std(f1s, ddof=1):.4f}")

[LightGBM] [Info] Total Bins 4331
[LightGBM] [Info] Number of data points in the train set: 3335, number of used features: 17
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[3]	valid_0's multi_logloss: 1.18044	valid_0's multi_error: 0.0169492
[LightGBM] [Info] Total Bins 4331
[LightGBM] [Info] Number of data points in the train set: 3183, number of used features: 17
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[13]	valid_0's multi_logloss: 0.900357	valid_0's mul

In [34]:
# step3_eval.py
import pandas as pd, numpy as np, joblib, pathlib
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt

pathlib.Path("../data/models").mkdir(parents=True, exist_ok=True)

pack = joblib.load("../data/models/LightGBM.joblib")
clf, scaler, feat_cols = pack["model"], pack["scaler"], pack["features"]

X_te = pd.read_csv("../data/splits/X_te.csv")[feat_cols]
y_te = pd.read_csv("../data/splits/y_te.csv")["label"].astype(str)
X_te_s = pd.DataFrame(scaler.transform(X_te), columns=feat_cols, index=X_te.index)

# —— 预测 —— #
y_pred = clf.predict(X_te_s)
proba  = clf.predict_proba(X_te_s)  # 保存给第三问/可解释性
np.save("../data/models/y_proba_test.npy", proba)

# —— 指标 —— #
acc = accuracy_score(y_te, y_pred)
macro_f1 = f1_score(y_te, y_pred, average="macro", zero_division=0)
weighted_f1 = f1_score(y_te, y_pred, average="weighted", zero_division=0)
labels_sorted = sorted(y_te.unique())
cm = confusion_matrix(y_te, y_pred, labels=labels_sorted)

print(f"Accuracy: {acc:.4f}")
print(f"Macro-F1: {macro_f1:.4f} | Weighted-F1: {weighted_f1:.4f}")
print("Confusion Matrix (label order={}):\n{}".format(labels_sorted, cm))
print(classification_report(y_te, y_pred, digits=4, zero_division=0))

# —— 导出表格 —— #
pd.DataFrame(cm, index=labels_sorted, columns=labels_sorted)\
  .to_csv("../data/models/confusion_matrix.csv")
rep = classification_report(y_te, y_pred, output_dict=True, zero_division=0)
pd.DataFrame(rep).T.to_csv("../data/models/classification_report.csv")

# —— 可视化：混淆矩阵热力图 —— #
plt.figure(figsize=(5.2,4.6))
plt.imshow(cm, interpolation='nearest')
plt.title("Confusion Matrix (Test, Group Holdout)")
plt.xticks(range(len(labels_sorted)), labels_sorted)
plt.yticks(range(len(labels_sorted)), labels_sorted)
plt.xlabel("Predicted"); plt.ylabel("True")
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        plt.text(j, i, cm[i, j], ha="center", va="center")
plt.tight_layout()
plt.savefig("../data/models/confusion_matrix.png", dpi=220)
plt.close()

# —— Permutation Importance（更客观的特征贡献）—— #
# 注意：只在测试集上做，度量=accuracy
perm = permutation_importance(clf, X_te_s, y_te, n_repeats=10,
                              scoring="accuracy", random_state=42, n_jobs=-1)
pi = pd.Series(perm.importances_mean, index=feat_cols).sort_values(ascending=False)
pi.to_csv("../data/models/permutation_importance.csv")

# 可视化Top20
topN = min(20, len(pi))
plt.figure(figsize=(6, 0.35*topN + 1.2))
pi.head(topN)[::-1].plot(kind="barh")
plt.title("Permutation Importance (Top-20) on Test")
plt.tight_layout()
plt.savefig("../data/models/permutation_importance_top20.png", dpi=220)
plt.close()

# —— 每类样本数（报告里需要）—— #
pd.Series(y_te).value_counts().rename_axis("label").reset_index(name="count")\
  .to_csv("../data/models/test_class_counts.csv", index=False)

print("✅ Step3 done. 评估结果与图片已导出到 ../data/models/")


Accuracy: 0.9700
Macro-F1: 0.9766 | Weighted-F1: 0.9702
Confusion Matrix (label order=['B', 'IR', 'N', 'OR']):
[[355   6   0   3]
 [  0 364   0   0]
 [  0   0  61   0]
 [ 31   0   0 515]]
              precision    recall  f1-score   support

           B     0.9197    0.9753    0.9467       364
          IR     0.9838    1.0000    0.9918       364
           N     1.0000    1.0000    1.0000        61
          OR     0.9942    0.9432    0.9680       546

    accuracy                         0.9700      1335
   macro avg     0.9744    0.9796    0.9766      1335
weighted avg     0.9713    0.9700    0.9702      1335






















✅ Step3 done. 评估结果与图片已导出到 ../data/models/


### 选特征 + 切分

In [27]:
# step1_split.py
import numpy as np, pandas as pd
from pathlib import Path

FEAT_CSV = "../result/features.csv"
df = pd.read_csv(FEAT_CSV)

# —— 构造“更贴合类别”的特征 —— #
base_cols = [
    "rms","kurtosis","crest_factor","p2p","shape_factor",
    "spec_entropy","spec_centroid","spec_rolloff95","rms_freq","freq_std",
    "BPFO_ratio","BPFI_ratio","BSF_ratio"
]
eps = 1e-9
df["sum_ratio"] = df["BPFO_ratio"] + df["BPFI_ratio"] + df["BSF_ratio"]
df["OR_score"] = df["BPFO_ratio"]/(df["BPFI_ratio"]+df["BSF_ratio"]+eps)
df["IR_score"] = df["BPFI_ratio"]/(df["BPFO_ratio"]+df["BSF_ratio"]+eps)
df["B_score"]  = df["BSF_ratio"] /(df["BPFO_ratio"]+df["BPFI_ratio"]+eps)
feat_cols = base_cols + ["sum_ratio","OR_score","IR_score","B_score"]

# —— 按“文件分层”：每个label至少保留1个文件到测试集 —— #
file_label = (df.groupby("file")["label"]
                .agg(lambda s: s.mode().iloc[0]).reset_index())
rng = np.random.default_rng(42)
test_files = []
for lab, g in file_label.groupby("label"):
    files = g["file"].tolist()
    k = max(1, int(np.ceil(0.2*len(files))))  # 约20%文件，至少1个
    test_files += rng.choice(files, size=k, replace=False).tolist()

test_files = set(test_files)
df["is_test"] = df["file"].isin(test_files)

Path("../data/splits").mkdir(parents=True, exist_ok=True)
df[["file","label","is_test"]].drop_duplicates().to_csv("../data/splits/files_split.csv", index=False)

drop_cols = {"label","file","pos","size_code","load","rpm"}
X_tr = df[~df["is_test"]][feat_cols].fillna(0.0)
y_tr = df[~df["is_test"]]["label"].astype(str)
X_te = df[df["is_test"]][feat_cols].fillna(0.0)
y_te = df[df["is_test"]]["label"].astype(str)

X_tr.to_csv("../data/splits/X_tr.csv", index=False)
y_tr.to_csv("../data/splits/y_tr.csv", index=False, header=True)
X_te.to_csv("../data/splits/X_te.csv", index=False)
y_te.to_csv("../data/splits/y_te.csv", index=False, header=True)
print("✅ Step1 done. 保存到 ../data/splits/")


FileNotFoundError: [Errno 2] No such file or directory: '../result/features.csv'

### 评估与可视化输出

In [26]:
# step3_eval.py
import pandas as pd, numpy as np, joblib
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# pack = joblib.load("../data/models/LightGBM.joblib")
pack = joblib.load("../data/32k_none_models/LightGBM.joblib")
clf, scaler, feat_cols = pack["model"], pack["scaler"], pack["features"]

X_te = pd.read_csv("../data/splits/X_te.csv")[feat_cols]
y_te = pd.read_csv("../data/splits/y_te.csv")["label"].astype(str)
X_te_s = pd.DataFrame(scaler.transform(X_te), columns=feat_cols, index=X_te.index)

y_pred = clf.predict(X_te_s)
acc = accuracy_score(y_te, y_pred)
macro_f1 = f1_score(y_te, y_pred, average="macro", zero_division=0)
weighted_f1 = f1_score(y_te, y_pred, average="weighted", zero_division=0)
cm = confusion_matrix(y_te, y_pred, labels=sorted(y_te.unique()))

print(f"Accuracy: {acc:.4f}")
print(f"Macro-F1: {macro_f1:.4f} | Weighted-F1: {weighted_f1:.4f}")
print("Confusion Matrix (label order={}):\n{}".format(sorted(y_te.unique()), cm))
print(classification_report(y_te, y_pred, digits=4, zero_division=0))

# 导出评估结果（便于写报告/画图）
pd.DataFrame(cm, index=sorted(y_te.unique()), columns=sorted(y_te.unique()))\
  .to_csv("../data/models/confusion_matrix.csv")
print("✅ Step3 done. 评估结果已打印并导出。")


Accuracy: 0.5778
Macro-F1: 0.5766 | Weighted-F1: 0.4876
Confusion Matrix (label order=['B', 'IR', 'N', 'OR']):
[[  0  35   0  81]
 [  0 103   0  12]
 [  0   0  20   0]
 [  0  51   0 122]]
              precision    recall  f1-score   support

           B     0.0000    0.0000    0.0000       116
          IR     0.5450    0.8957    0.6776       115
           N     1.0000    1.0000    1.0000        20
          OR     0.5674    0.7052    0.6289       173

    accuracy                         0.5778       424
   macro avg     0.5281    0.6502    0.5766       424
weighted avg     0.4265    0.5778    0.4876       424

✅ Step3 done. 评估结果已打印并导出。


### LightGBM 未处理RPM 第三次调参 准确率97%

In [21]:
# step2_train_lgbm.py
import pandas as pd, numpy as np, joblib, lightgbm as lgb
from sklearn.preprocessing import StandardScaler
import pathlib

X_tr = pd.read_csv("../data/splits/X_tr.csv")
y_tr = pd.read_csv("../data/splits/y_tr.csv")["label"].astype(str)
X_te = pd.read_csv("../data/splits/X_te.csv")
y_te = pd.read_csv("../data/splits/y_te.csv")["label"].astype(str)

# 可选标准化（量纲更稳）；保持 DataFrame 以免特征名警告
scaler = StandardScaler(with_mean=False)
X_tr_s = pd.DataFrame(scaler.fit_transform(X_tr), columns=X_tr.columns, index=X_tr.index)
X_te_s = pd.DataFrame(scaler.transform(X_te),   columns=X_te.columns, index=X_te.index)
# 精度
ok = 0.93
if ok :
    clf = lgb.LGBMClassifier(
      objective="multiclass",
      learning_rate=0.03,          # 小学习率 + 大迭代配合早停
      n_estimators=4000,           # 交给 early_stopping 截停
      num_leaves=96,               # 64~128 常见甜点区
      max_depth=-1,                # 不限深，靠叶子数与正则控复杂度
      min_child_samples=30,        # 20~40 更稳
      feature_fraction=0.85,       # =colsample_bytree
      bagging_fraction=0.85,       # =subsample
      bagging_freq=1,
      lambda_l1=0.2,               # 稀疏一点
      lambda_l2=2.0,               # 抑制过拟合
      min_gain_to_split=0.0,
      class_weight="balanced",
      force_col_wise=True,         # 消除你日志里提示的小开销
      random_state=42, n_jobs=-1
    )
    # 早停 + 控日志
    clf.fit(
        X_tr_s, y_tr,
        eval_set=[(X_te_s, y_te)],
        eval_metric=["multi_logloss","multi_error"],
        callbacks=[lgb.early_stopping(100), lgb.log_evaluation(50)]
    )
else :
    # 更快的参数
    clf = lgb.LGBMClassifier(
        objective="multiclass",
        n_estimators=1200, learning_rate=0.03,
        num_leaves=64, max_depth=-1,
        subsample=0.9, colsample_bytree=0.9,
        reg_lambda=1.0, class_weight="balanced",
        random_state=42, n_jobs=-1
    )
    clf.fit(
        X_tr_s, y_tr,
        eval_set=[(X_te_s, y_te)],
        eval_metric="multi_logloss",
        callbacks=[lgb.early_stopping(100), lgb.log_evaluation(50)]
    )


# 定义输出文件的路径
output_file_path = "../data/models/LightGBM.joblib"

# 使用 pathlib 自动创建父目录（如果不存在）
pathlib.Path(output_file_path).parent.mkdir(parents=True, exist_ok=True)

# 现在可以安全地保存了
out = {"model": clf, "scaler": scaler, "features": list(X_tr.columns)}
joblib.dump(out, output_file_path) # 使用刚才定义的路径变量

# 对重要性文件也做同样处理
importance_path = "../data/models/lgbm_importance.csv"
pd.Series(clf.feature_importances_, index=X_tr.columns)\
  .sort_values(ascending=False).to_csv(importance_path)

print("✅ Step2 done. 模型/重要性已保存到 ../data/models/")

[LightGBM] [Info] Total Bins 4331
[LightGBM] [Info] Number of data points in the train set: 1280, number of used features: 17
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
Training until validation scores don't improve for 100 rounds
[50]	valid_0's multi_logloss: 0.624909	valid_0's multi_error: 0.188679
[100]	valid_0's multi_logloss: 0.543021	valid_0's multi_error: 0.191038
[150]	valid_0's multi_logloss: 0.533183	valid_0's multi_error: 0.188679
Early stopping, best iteration is:
[61]	valid_0's multi_logloss: 0.58798	valid_0's multi_error: 0.183962
✅ Step2 done. 模型/重要性已保存到 ../data/models/


### 超参数

In [None]:
# import json
# from pathlib import Path
# import numpy as np
# import pandas as pd
# import joblib
# import lightgbm as lgb

# from sklearn.model_selection import GroupKFold, GroupShuffleSplit, RandomizedSearchCV
# from sklearn.preprocessing import StandardScaler
# from sklearn.pipeline import Pipeline
# from sklearn.metrics import (
#     accuracy_score, f1_score, classification_report, confusion_matrix
# )

# # -------------------- 配置 --------------------
# RNG = 42
# FEAT_CSV = "../result/features.csv"
# OUT_SPLIT_DIR = Path("../date/splits"); OUT_SPLIT_DIR.mkdir(parents=True, exist_ok=True)
# OUT_MODEL_DIR = Path("../data/models"); OUT_MODEL_DIR.mkdir(parents=True, exist_ok=True)

# # “文件级”测试集比例（每类至少 1 个文件进测试）
# TEST_FILE_RATIO = 0.20

# # 基础特征列（与你的 features.csv 对齐）
# BASE_COLS = [
#     "rms","kurtosis","crest_factor","p2p","shape_factor",
#     "spec_entropy","spec_centroid","spec_rolloff95","rms_freq","freq_std",
#     "BPFO_ratio","BPFI_ratio","BSF_ratio"
# ]

# # -------------------- 读数据 & 构造特征 --------------------
# df = pd.read_csv(FEAT_CSV)

# # 断言必须列
# must_have = set(BASE_COLS) | {"label","file"}
# missing = [c for c in must_have if c not in df.columns]
# if missing:
#     raise RuntimeError(f"features.csv 缺少必要列: {missing}")

# # 派生特征（与“类别”机理强相关）
# eps = 1e-9
# df["sum_ratio"] = df["BPFO_ratio"] + df["BPFI_ratio"] + df["BSF_ratio"]
# df["OR_score"] = df["BPFO_ratio"]/(df["BPFI_ratio"]+df["BSF_ratio"]+eps)
# df["IR_score"] = df["BPFI_ratio"]/(df["BPFO_ratio"]+df["BSF_ratio"]+eps)
# df["B_score"]  = df["BSF_ratio"] /(df["BPFO_ratio"]+df["BPFI_ratio"]+eps)

# FEAT_COLS = BASE_COLS + ["sum_ratio","OR_score","IR_score","B_score"]

# # -------------------- 按“文件分层”切分（确保每类测试集中至少1个文件） --------------------
# file_label = (df.groupby("file")["label"]
#                 .agg(lambda s: s.mode().iloc[0])     # 每个文件的主标签
#                 .reset_index())

# rng = np.random.default_rng(RNG)
# test_files = []
# for lab, g in file_label.groupby("label"):
#     files = g["file"].tolist()
#     k = max(1, int(np.ceil(TEST_FILE_RATIO * len(files))))
#     test_files += rng.choice(files, size=k, replace=False).tolist()
# test_files = set(test_files)

# df["is_test"] = df["file"].isin(test_files)
# df[["file","label","is_test"]].drop_duplicates()\
#   .to_csv(OUT_SPLIT_DIR / "files_split.csv", index=False)

# # 训练/测试集（严格“按文件”划分）
# drop_cols = {"label","file","pos","size_code","load","rpm"}
# train_df = df[~df["is_test"]].copy()
# test_df  = df[ df["is_test"]].copy()

# X_tr = train_df[FEAT_COLS].fillna(0.0)
# y_tr = train_df["label"].astype(str)
# X_te = test_df[FEAT_COLS].fillna(0.0)
# y_te = test_df["label"].astype(str)

# # 便于复现也导出切分后的X/y
# X_tr.to_csv(OUT_SPLIT_DIR / "X_tr.csv", index=False)
# y_tr.to_csv(OUT_SPLIT_DIR / "y_tr.csv", index=False, header=True)
# X_te.to_csv(OUT_SPLIT_DIR / "X_te.csv", index=False)
# y_te.to_csv(OUT_SPLIT_DIR / "y_te.csv", index=False, header=True)
# print(f"✅ Split done. 训练样本: {len(X_tr)} | 测试样本: {len(X_te)} | 测试文件数: {len(test_files)}")

# # -------------------- 自动调参（GroupKFold，按文件分组） --------------------
# groups = train_df["file"].astype(str)

# pipe = Pipeline([
#     ("scaler", StandardScaler(with_mean=False)),   # LGBM不敏感，但量纲更稳
#     ("clf", lgb.LGBMClassifier(
#         objective="multiclass",
#         class_weight="balanced",
#         force_col_wise=True,   # 消除 col-wise overhead 提示
#         n_jobs=-1,
#         random_state=RNG
#     ))
# ])

# param_dist = {
#     "clf__learning_rate":    [0.02, 0.03, 0.04, 0.05, 0.06],
#     "clf__n_estimators":     [700, 900, 1100, 1300, 1600],
#     "clf__num_leaves":       [64, 80, 96, 112, 128, 160],
#     "clf__min_child_samples":[20, 30, 40, 50, 60],
#     "clf__feature_fraction": [0.75, 0.80, 0.85, 0.90, 0.95],
#     "clf__bagging_fraction": [0.75, 0.80, 0.85, 0.90, 0.95],
#     "clf__lambda_l1":        [0.0, 0.2, 0.5, 1.0],
#     "clf__lambda_l2":        [0.5, 1.0, 2.0, 3.0, 5.0],
#     "clf__min_gain_to_split":[0.0, 0.01, 0.02],
#     "clf__max_depth":        [-1, 10, 12],
# }

# gkf = GroupKFold(n_splits=5)
# search = RandomizedSearchCV(
#     estimator=pipe,
#     param_distributions=param_dist,
#     n_iter=40,                 # 可调：越大越稳
#     scoring="f1_macro",
#     cv=gkf,                    # ✅ 传实例，不要 .split(...)
#     n_jobs=-1, verbose=1, random_state=RNG
# )
# search.fit(X_tr, y_tr, groups=groups)       # ✅ groups 传给 fit

# print("Best params:", search.best_params_)
# print("CV best f1_macro:", search.best_score_)

# # 保存搜索关键信息（轻量）
# (Path(OUT_MODEL_DIR) / "LGBM_best_params.json").write_text(
#     json.dumps(search.best_params_, ensure_ascii=False, indent=2)
# )
# pd.DataFrame(search.cv_results_).to_csv(OUT_MODEL_DIR / "LGBM_cv_results.csv", index=False)
# joblib.dump(search.best_estimator_, OUT_MODEL_DIR / "LGBM_best_estimator.joblib")

# # -------------------- 用最佳参数“早停重训”最终模型 --------------------
# best_pipe: Pipeline = search.best_estimator_
# best = best_pipe.named_steps["clf"].get_params()

# # 放大迭代数，交给 early_stopping 决定停止点
# best["n_estimators"] = max(2000, int(best.get("n_estimators", 1000)*3))
# final_clf = lgb.LGBMClassifier(**best, random_state=RNG, n_jobs=-1)

# # 训练集内部再按“文件分组”切 90/10 做早停验证
# gss = GroupShuffleSplit(n_splits=1, train_size=0.9, random_state=RNG)
# tr_idx, va_idx = next(gss.split(X_tr, y_tr, groups=groups))
# X_tr_i, X_va_i = X_tr.iloc[tr_idx], X_tr.iloc[va_idx]
# y_tr_i, y_va_i = y_tr.iloc[tr_idx], y_tr.iloc[va_idx]

# scaler = StandardScaler(with_mean=False)
# X_tr_s = pd.DataFrame(scaler.fit_transform(X_tr_i), columns=FEAT_COLS, index=X_tr_i.index)
# X_va_s = pd.DataFrame(scaler.transform(X_va_i),   columns=FEAT_COLS, index=X_va_i.index)

# final_clf.fit(
#     X_tr_s, y_tr_i,
#     eval_set=[(X_va_s, y_va_i)],
#     eval_metric=["multi_logloss","multi_error"],
#     callbacks=[lgb.early_stopping(100), lgb.log_evaluation(50)]
# )

# # 保存最终模型包 + 特征重要性
# joblib.dump(
#     {"model": final_clf, "scaler": scaler, "features": FEAT_COLS, "params": best, "cv_best": search.best_score_},
#     OUT_MODEL_DIR / "LightGBM_best.joblib"
# )
# imp = pd.Series(final_clf.feature_importances_, index=FEAT_COLS).sort_values(ascending=False)
# imp.to_csv(OUT_MODEL_DIR / "lgbm_importance.csv")
# print("✅ Auto-tune + Final training done. 已保存模型与特征重要性。")

# # -------------------- 在测试集评估 --------------------
# X_te_s = pd.DataFrame(scaler.transform(X_te), columns=FEAT_COLS, index=X_te.index)
# y_pred = final_clf.predict(X_te_s)

# acc = accuracy_score(y_te, y_pred)
# macro_f1 = f1_score(y_te, y_pred, average="macro", zero_division=0)
# weighted_f1 = f1_score(y_te, y_pred, average="weighted", zero_division=0)

# labels_order = sorted(y_te.unique().tolist())
# cm = confusion_matrix(y_te, y_pred, labels=labels_order)

# print("\n=== Test Evaluation ===")
# print(f"Accuracy   : {acc:.4f}")
# print(f"Macro-F1   : {macro_f1:.4f}")
# print(f"Weighted-F1: {weighted_f1:.4f}")
# print("Labels order:", labels_order)
# print("Confusion Matrix:\n", cm)
# print(classification_report(y_te, y_pred, digits=4, zero_division=0))

# # 导出评估结果
# pd.DataFrame(cm, index=labels_order, columns=labels_order)\
#   .to_csv(OUT_MODEL_DIR / "confusion_matrix.csv")
# (Path(OUT_MODEL_DIR) / "metrics.json").write_text(json.dumps({
#     "accuracy": acc, "macro_f1": macro_f1, "weighted_f1": weighted_f1,
#     "labels_order": labels_order
# }, ensure_ascii=False, indent=2))

# print("✅ 全流程完成。产物已写入 result/splits/ 与 result/models/")
