In [9]:
# svm_classification.py
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

EMBEDDING_LLM = "data/text_embeddings.npy"
DATA_FILE = "data/data.csv"

EMBEDDING_TFIDF = "data/text_embeddings_tfidf.npy"
EMBEDDING_DETECTIVE = "data/detective_emb_384.npy"


X_LLM = np.load(EMBEDDING_LLM)
X_TFIDF = np.load(EMBEDDING_TFIDF)
X_DETECTIVE = np.load(EMBEDDING_DETECTIVE)



Y = pd.read_csv(DATA_FILE)["label"].values




In [10]:
print(f"embedding shape: {X.shape}, label: {np.unique(y)}")

unique, counts = np.unique(y, return_counts=True)
label_dist = dict(zip(unique, counts))
print("Label Distribution:")
for label, count in label_dist.items():
    percent = count / counts.sum() * 100
    print(f"  Label {label}: {count} samples ({percent:.2f}%)")

embedding shape: (10000, 384), label: [0 1]
Label Distribution:
  Label 0: 6455 samples (64.55%)
  Label 1: 3545 samples (35.45%)


### Nested CV
1. 需要一个模型构造函数 make_xxx(params)；函数返回一个可 .fit(X,y)、.predict(X) 的模型实例
2. 需要定义该模型的超参网格 PARAM_GRID_xxx；要求：每个超参至少 3 个值；不超过 3 个超参
3. 我们计算了基础的confusion matrix, F1, acc, presion, recall. 我目前想不到什么更好的。 可能可以考虑：ROC曲线+AUC计算

In [11]:
import numpy as np
from sklearn.svm import SVC
from itertools import product
from collections import Counter
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier
import warnings, re
warnings.filterwarnings(
    "ignore",
    category=UserWarning,
    message=r"X does not have valid feature names.*"
)

# ============== 可配置区域 ==============
PARAM_GRID_SVC = {
    "num_leaves": [31, 63, 127],
    "min_data_in_leaf": [20, 50, 100],
    "learning_rate": [0.02, 0.05, 0.1]
}

OUTER_K = 10
INNER_K = 3
SEED = 42
POS_LABEL = 0

# ---- 构造模型 ----
def make_svc(params):
    # 哪些超参数是允许的（屏蔽掉多余的参数）
    allowed = {"num_leaves", "min_data_in_leaf", "learning_rate"}
    # 去除allowed中的值，放在一个新的字典 kwargs 中
    kwargs = {k: v for k, v in params.items() if k in allowed}
    return LGBMClassifier(
        objective="binary",
        boosting_type="gbdt",
        data_sample_strategy="goss", 
        feature_fraction=0.8,        
        force_col_wise=True,
        verbosity=-1,
        n_estimators=500,
        random_state=42,
        n_jobs=-1,

        min_data_in_leaf=kwargs["min_data_in_leaf"],
        num_leaves= kwargs["num_leaves"],
        learning_rate=kwargs["learning_rate"],
    )


# ======================================


# ---- kfold indices ----
def simple_kfold_indices(n_samples, k, rng):
    indices = np.arange(n_samples)
    rng.shuffle(indices)
    # 第k折中包含的index
    return np.array_split(indices, k)


# ---- Evaluation matrix ----
def confusion_matrix_binary(y_true, y_pred, pos=POS_LABEL):
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    tp = int(np.sum((y_true == pos) & (y_pred == pos)))
    fp = int(np.sum((y_true != pos) & (y_pred == pos)))
    tn = int(np.sum((y_true != pos) & (y_pred != pos)))
    fn = int(np.sum((y_true == pos) & (y_pred != pos)))
    return tp, fp, tn, fn

def precision(tp, fp, tn, fn):
    denom = tp + fp
    return tp / denom if denom > 0 else 0.0

def recall(tp, fp, tn, fn):
    denom = tp + fn
    return tp / denom if denom > 0 else 0.0

def accuracy(tp, fp, tn, fn):
    total = tp + fp + tn + fn
    return (tp + tn) / total if total > 0 else 0.0

def f1(tp, fp, tn, fn):
    p = precision(tp, fp, tn, fn)
    r = recall(tp, fp, tn, fn)
    denom = p + r
    return 2 * p * r / denom if denom > 0 else 0.0

def compute_metrics(y_true, y_pred, pos=POS_LABEL):
    tp, fp, tn, fn = confusion_matrix_binary(y_true, y_pred, pos)
    return {
        "tp": tp, "fp": fp, "tn": tn, "fn": fn,
        "precision": precision(tp, fp, tn, fn),
        "recall": recall(tp, fp, tn, fn),
        "accuracy": accuracy(tp, fp, tn, fn),
        "f1": f1(tp, fp, tn, fn),
    }

# ---- 生成所有参数组合 ----
def param_grid_iter(param_grid):
    keys = list(param_grid.keys())
    for values in product(*[param_grid[k] for k in keys]):
        yield dict(zip(keys, values))


# ---- 内层交叉验证：选择最优参数 ----
def inner_cv_select_params(X, y, train_idx, inner_k, param_grid, rng, make_model_fn):
    # 在给定的训练样本 train_idx 上，用 inner_k 折交叉验证，对 param_grid 中所有候选超参数组合进行测试

    # len(folds)为inner_k，即inner_k组
    folds = simple_kfold_indices(len(train_idx), inner_k, rng)
    folds = [train_idx[f] for f in folds] # 相对索引转换成全局索引

    best_score = -np.inf
    best_param = None

    for params in param_grid_iter(param_grid):
        fold_scores = []
        for f in range(inner_k):
            val_idx = folds[f]
            tr_idx = np.concatenate([folds[j] for j in range(inner_k) if j != f])

            model = make_model_fn(params)
            model.fit(X[tr_idx], y[tr_idx])
            pred = model.predict(X[val_idx])
            # 以 F1 为验证分数
            tp, fp, tn, fn = confusion_matrix_binary(y[val_idx], pred, POS_LABEL)
            fold_scores.append(f1(tp, fp, tn, fn))
        
        # 核心比较部分，只有大于average的才会被计入best
        avg_f1 = float(np.mean(fold_scores))
        if avg_f1 > best_score:
            best_score = avg_f1
            best_param = params

    return best_param


# ---- 嵌套交叉验证 ----
def nested_cv(
    X, y,
    outer_k=OUTER_K,
    inner_k=INNER_K,
    param_grid=PARAM_GRID_SVC,
    make_model_fn=make_svc,
    seed=SEED
):
    rng = np.random.default_rng(seed)
    folds = simple_kfold_indices(len(X), outer_k, rng)

    outer_metrics = []            # 每折指标字典
    chosen_params_each_fold = []  # 每折最优超参
    conf_sums = {"tp":0, "fp":0, "tn":0, "fn":0}

    for i in range(outer_k):
        test_idx = folds[i]
        train_idx = np.concatenate([folds[j] for j in range(outer_k) if j != i])

        # 内层：用 F1 选最优超参
        best_param = inner_cv_select_params(X, y, train_idx, inner_k, param_grid, rng, make_model_fn)
        chosen_params_each_fold.append(best_param)

        # 外层：训练并在测试折上评估所有指标
        model = make_model_fn(best_param)
        model.fit(X[train_idx], y[train_idx])
        pred = model.predict(X[test_idx])

        mets = compute_metrics(y[test_idx], pred, POS_LABEL)
        outer_metrics.append(mets)

        # 累加混淆矩阵
        for k in ("tp","fp","tn","fn"):
            conf_sums[k] += mets[k]

        print(f"[Outer Fold {i+1}/{outer_k}] "
              f"best_params={best_param} | "
              f"Acc={mets['accuracy']:.4f} P={mets['precision']:.4f} "
              f"R={mets['recall']:.4f} F1={mets['f1']:.4f} | "
              f"Confusion(TP/FP/TN/FN)=({mets['tp']},{mets['fp']},{mets['tn']},{mets['fn']})")

    # 逐折均值/标准差
    def agg_mean_std(key):
        vals = np.array([m[key] for m in outer_metrics], dtype=float)
        return float(vals.mean()), float(vals.std())

    mean_acc, std_acc = agg_mean_std("accuracy")
    mean_p, std_p = agg_mean_std("precision")
    mean_r, std_r = agg_mean_std("recall")
    mean_f1, std_f1 = agg_mean_std("f1")

    print("\n=== Final (Outer CV) Summary ===")
    print(f"Confusion Matrix Sum over folds: TP={conf_sums['tp']} FP={conf_sums['fp']} TN={conf_sums['tn']} FN={conf_sums['fn']}")
    print(f"Accuracy : mean={mean_acc:.4f}, std={std_acc:.4f}")
    print(f"Precision: mean={mean_p:.4f},  std={std_p:.4f}")
    print(f"Recall   : mean={mean_r:.4f},  std={std_r:.4f}")
    print(f"F1-score : mean={mean_f1:.4f}, std={std_f1:.4f}")

    # 统计最优超参出现频次（仅展示）
    selections = Counter([tuple(sorted(p.items())) for p in chosen_params_each_fold])
    print("\n=== Chosen Hyperparameters Across Folds ===")
    for combo, count in selections.items():
        print(f"{dict(combo)}: chosen {count} times")

    return {
        "outer_metrics": outer_metrics,                      # 每折完整指标
        "confusion_sum": conf_sums,                          # 混淆矩阵累计
        "mean_std": {                                        # 各指标均值/方差
            "accuracy": (mean_acc, std_acc),
            "precision": (mean_p, std_p),
            "recall": (mean_r, std_r),
            "f1": (mean_f1, std_f1),
        },
        "chosen_params_each_fold": chosen_params_each_fold,  # 每折最优超参
    }

In [12]:
results = nested_cv(X_LLM, Y)
print(results["mean_std"]["f1"])

[Outer Fold 1/10] best_params={'num_leaves': 31, 'min_data_in_leaf': 100, 'learning_rate': 0.02} | Acc=0.7020 P=0.7293 R=0.8819 F1=0.7984 | Confusion(TP/FP/TN/FN)=(590,219,112,79)
[Outer Fold 2/10] best_params={'num_leaves': 127, 'min_data_in_leaf': 20, 'learning_rate': 0.02} | Acc=0.7210 P=0.7379 R=0.8973 F1=0.8098 | Confusion(TP/FP/TN/FN)=(594,211,127,68)
[Outer Fold 3/10] best_params={'num_leaves': 127, 'min_data_in_leaf': 50, 'learning_rate': 0.02} | Acc=0.7140 P=0.7289 R=0.8960 F1=0.8038 | Confusion(TP/FP/TN/FN)=(586,218,128,68)
[Outer Fold 4/10] best_params={'num_leaves': 127, 'min_data_in_leaf': 20, 'learning_rate': 0.02} | Acc=0.7100 P=0.7212 R=0.8960 F1=0.7992 | Confusion(TP/FP/TN/FN)=(577,223,133,67)
[Outer Fold 5/10] best_params={'num_leaves': 127, 'min_data_in_leaf': 20, 'learning_rate': 0.02} | Acc=0.7010 P=0.7085 R=0.8901 F1=0.7890 | Confusion(TP/FP/TN/FN)=(559,230,142,69)
[Outer Fold 6/10] best_params={'num_leaves': 127, 'min_data_in_leaf': 20, 'learning_rate': 0.02} | A

In [13]:
results = nested_cv(X_TFIDF, Y)
print(results["mean_std"]["f1"])

[Outer Fold 1/10] best_params={'num_leaves': 127, 'min_data_in_leaf': 20, 'learning_rate': 0.02} | Acc=0.7430 P=0.7588 R=0.9028 F1=0.8246 | Confusion(TP/FP/TN/FN)=(604,192,139,65)
[Outer Fold 2/10] best_params={'num_leaves': 127, 'min_data_in_leaf': 20, 'learning_rate': 0.02} | Acc=0.7090 P=0.7339 R=0.8792 F1=0.8000 | Confusion(TP/FP/TN/FN)=(582,211,127,80)
[Outer Fold 3/10] best_params={'num_leaves': 127, 'min_data_in_leaf': 20, 'learning_rate': 0.02} | Acc=0.7300 P=0.7382 R=0.9098 F1=0.8151 | Confusion(TP/FP/TN/FN)=(595,211,135,59)
[Outer Fold 4/10] best_params={'num_leaves': 127, 'min_data_in_leaf': 20, 'learning_rate': 0.02} | Acc=0.7250 P=0.7327 R=0.9022 F1=0.8086 | Confusion(TP/FP/TN/FN)=(581,212,144,63)
[Outer Fold 5/10] best_params={'num_leaves': 127, 'min_data_in_leaf': 20, 'learning_rate': 0.05} | Acc=0.7170 P=0.7143 R=0.9156 F1=0.8025 | Confusion(TP/FP/TN/FN)=(575,230,142,53)
[Outer Fold 6/10] best_params={'num_leaves': 127, 'min_data_in_leaf': 20, 'learning_rate': 0.02} | A

In [14]:
results = nested_cv(X_DETECTIVE, Y)
print(results["mean_std"]["f1"])

[Outer Fold 1/10] best_params={'num_leaves': 127, 'min_data_in_leaf': 20, 'learning_rate': 0.1} | Acc=0.8160 P=0.8551 R=0.8729 F1=0.8639 | Confusion(TP/FP/TN/FN)=(584,99,232,85)
[Outer Fold 2/10] best_params={'num_leaves': 127, 'min_data_in_leaf': 20, 'learning_rate': 0.1} | Acc=0.8280 P=0.8646 R=0.8776 F1=0.8711 | Confusion(TP/FP/TN/FN)=(581,91,247,81)
[Outer Fold 3/10] best_params={'num_leaves': 31, 'min_data_in_leaf': 100, 'learning_rate': 0.02} | Acc=0.8240 P=0.8525 R=0.8838 F1=0.8679 | Confusion(TP/FP/TN/FN)=(578,100,246,76)
[Outer Fold 4/10] best_params={'num_leaves': 127, 'min_data_in_leaf': 20, 'learning_rate': 0.02} | Acc=0.8070 P=0.8540 R=0.8447 F1=0.8493 | Confusion(TP/FP/TN/FN)=(544,93,263,100)
[Outer Fold 5/10] best_params={'num_leaves': 63, 'min_data_in_leaf': 100, 'learning_rate': 0.02} | Acc=0.7940 P=0.8276 R=0.8487 F1=0.8381 | Confusion(TP/FP/TN/FN)=(533,111,261,95)
[Outer Fold 6/10] best_params={'num_leaves': 63, 'min_data_in_leaf': 20, 'learning_rate': 0.1} | Acc=0.8