# SVM

In [None]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Input locations: one label table plus one embedding matrix per text
# representation evaluated in this notebook.
DATA_FILE = "data/data.csv"
EMBEDDING_FILE = "data/text_embeddings.npy"              # MiniLM
EMBEDDING_FILE_TFIDF = "data/text_embeddings_tfidf.npy"  # Tf-idf
EMBEDDING_FILE_EC = "data/detective_emb_384.npy"         # encoder

# MiniLM features and the labels (taken from the "label" column of the CSV).
# NOTE(review): every matrix below is presumably row-aligned with the CSV;
# nothing here verifies that -- confirm when regenerating the files.
X = np.load(EMBEDDING_FILE)
y = pd.read_csv(DATA_FILE)["label"].values

# Tf-idf features.
X_tfidf = np.load(EMBEDDING_FILE_TFIDF)

# Encoder features.
X_encoder = np.load(EMBEDDING_FILE_EC)

In [8]:
# Sanity check: shape of the MiniLM matrix and the set of distinct labels.
print(f"embedding shape: {X.shape}, label: {np.unique(y)}")

# Class balance: absolute count and percentage share for every label.
unique, counts = np.unique(y, return_counts=True)
label_dist = dict(zip(unique, counts))
total = counts.sum()
print("Label Distribution:")
for label, count in zip(unique, counts):
    percent = count / total * 100
    print(f"  Label {label}: {count} samples ({percent:.2f}%)")

embedding shape: (10000, 384), label: [0 1]
Label Distribution:
  Label 0: 6455 samples (64.55%)
  Label 1: 3545 samples (35.45%)


In [None]:
import numpy as np
from sklearn.svm import SVC
from itertools import product
from collections import Counter

# ============== Configurable Section ==============
# Hyperparameter search space: every combination of the listed values is
# tried by the inner cross-validation loop.
PARAM_GRID_SVC = dict(
    C=[0.1, 1.0, 10.0, 15.0, 20],
    gamma=[0.0001, 0.001, 0.01, 0.1, 1.0],
    kernel=["rbf"],
)

# Default number of outer (evaluation) and inner (model-selection) folds.
OUTER_K, INNER_K = 10, 3
# Seed for the random generator that shuffles samples into folds.
SEED = 42
# Label counted as the "positive" class by the metric helpers.
# NOTE(review): in the label distribution printed earlier in this notebook,
# label 0 is the majority class -- confirm it is the intended positive class.
POS_LABEL = 0

# ---- Build model ----
def make_svc(params):
    """Build an unfitted ``SVC`` from a hyperparameter dict.

    Only the keys ``C``, ``kernel`` and ``gamma`` are forwarded to the
    constructor; any other entry in ``params`` is ignored.

    Args:
        params: mapping of hyperparameter name -> value.

    Returns:
        A new ``SVC`` instance configured with the accepted hyperparameters.
    """
    supported = ("C", "kernel", "gamma")
    svc_args = {}
    for name, value in params.items():
        if name in supported:
            svc_args[name] = value

    # gamma is dropped for the linear kernel (per the scikit-learn docs it is
    # only a coefficient for the rbf/poly/sigmoid kernels).
    if svc_args.get("kernel") == "linear":
        svc_args.pop("gamma", None)

    return SVC(**svc_args)

# ======================================


# ---- kfold indices ----
def simple_kfold_indices(n_samples, k, rng):
    """Shuffle ``0 .. n_samples-1`` and cut the result into ``k`` folds.

    The split is purely random (labels are not consulted). When
    ``n_samples`` is not divisible by ``k`` the fold sizes differ, as
    decided by ``np.array_split``.

    Args:
        n_samples: number of samples to distribute.
        k: number of folds.
        rng: random generator providing an in-place ``shuffle`` method; its
            state is advanced by the shuffle.

    Returns:
        A list of ``k`` integer index arrays, one per fold.
    """
    order = np.arange(n_samples)
    rng.shuffle(order)  # in-place permutation
    folds = np.array_split(order, k)
    return folds


# ---- Evaluation metrics ----
def confusion_matrix_binary(y_true, y_pred, pos=POS_LABEL):
    """Count TP / FP / TN / FN for a one-vs-rest view of the labels.

    Every label equal to ``pos`` is treated as positive; every other label
    is treated as negative.

    Args:
        y_true: array-like of ground-truth labels.
        y_pred: array-like of predicted labels.
        pos: label regarded as the positive class.

    Returns:
        Tuple ``(tp, fp, tn, fn)`` of plain Python ints.
    """
    truth = np.asarray(y_true)
    guess = np.asarray(y_pred)

    # Element-wise masks, computed once and combined below.
    truth_pos, truth_neg = truth == pos, truth != pos
    guess_pos, guess_neg = guess == pos, guess != pos

    tp = int(np.sum(truth_pos & guess_pos))
    fp = int(np.sum(truth_neg & guess_pos))
    tn = int(np.sum(truth_neg & guess_neg))
    fn = int(np.sum(truth_pos & guess_neg))
    return tp, fp, tn, fn

def precision(tp, fp, tn, fn):
    """Return TP / (TP + FP), or 0.0 when nothing was predicted positive.

    ``tn`` and ``fn`` are unused; they are accepted so that all metric
    helpers share the same ``(tp, fp, tn, fn)`` signature.
    """
    predicted_positive = tp + fp
    if predicted_positive > 0:
        return tp / predicted_positive
    return 0.0

def recall(tp, fp, tn, fn):
    """Return TP / (TP + FN), or 0.0 when there are no actual positives.

    ``fp`` and ``tn`` are unused; they are accepted so that all metric
    helpers share the same ``(tp, fp, tn, fn)`` signature.
    """
    actual_positive = tp + fn
    if actual_positive > 0:
        return tp / actual_positive
    return 0.0

def accuracy(tp, fp, tn, fn):
    """Return (TP + TN) / total, or 0.0 when all four counts are zero."""
    n_total = tp + fp + tn + fn
    if n_total > 0:
        return (tp + tn) / n_total
    return 0.0

def f1(tp, fp, tn, fn):
    """Return the harmonic mean of precision and recall.

    Yields 0.0 when precision and recall are both zero, which avoids a
    division by zero.
    """
    prec = precision(tp, fp, tn, fn)
    rec = recall(tp, fp, tn, fn)
    total = prec + rec
    if total > 0:
        return 2 * prec * rec / total
    return 0.0

def compute_metrics(y_true, y_pred, pos=POS_LABEL):
    """Compute confusion counts and the derived scores for one prediction set.

    Args:
        y_true: array-like of ground-truth labels.
        y_pred: array-like of predicted labels.
        pos: label regarded as the positive class.

    Returns:
        Dict with the integer counts ``tp``, ``fp``, ``tn``, ``fn`` followed
        by the float scores ``precision``, ``recall``, ``accuracy``, ``f1``.
    """
    counts = confusion_matrix_binary(y_true, y_pred, pos)
    report = dict(zip(("tp", "fp", "tn", "fn"), counts))

    # Every scorer takes the four counts in the same (tp, fp, tn, fn) order.
    scorers = (
        ("precision", precision),
        ("recall", recall),
        ("accuracy", accuracy),
        ("f1", f1),
    )
    for name, score_fn in scorers:
        report[name] = score_fn(*counts)
    return report

# ---- Generate all parameter combinations ----
def param_grid_iter(param_grid):
    """Lazily yield every combination of a hyperparameter grid.

    Args:
        param_grid: mapping of hyperparameter name -> iterable of candidate
            values.

    Yields:
        One dict per point of the Cartesian product, mapping each name to a
        single value. Keys follow the grid's own order, and the value of the
        last key varies fastest.
    """
    names = tuple(param_grid.keys())
    candidate_lists = [param_grid[name] for name in names]
    for combo in product(*candidate_lists):
        yield dict(zip(names, combo))


# ---- Inner cross-validation: select best params ----
def inner_cv_select_params(X, y, train_idx, inner_k, param_grid, rng, make_model_fn):
    """Pick the hyperparameter combination with the best mean inner-CV F1.

    The rows listed in ``train_idx`` are shuffled with ``rng`` and split
    into ``inner_k`` folds. Every combination generated from ``param_grid``
    is fitted ``inner_k`` times (each fold held out once) and scored with
    F1, treating the module-level ``POS_LABEL`` as the positive class.

    Args:
        X: feature array, indexable by integer index arrays along rows.
        y: label array, indexable the same way as ``X``.
        train_idx: integer array of the row indices available for selection.
        inner_k: number of inner folds.
        param_grid: mapping of hyperparameter name -> list of candidates.
        rng: random generator handed to ``simple_kfold_indices``; its state
            is advanced by one shuffle.
        make_model_fn: callable turning a params dict into an unfitted
            model exposing ``fit`` and ``predict``.

    Returns:
        The params dict with the highest mean F1 over the inner folds. On
        ties the combination generated first wins. ``None`` if the grid
        yields no combination.
    """
    # Fold members are positions within train_idx; map them back to row
    # indices of X / y.
    folds = [train_idx[f] for f in simple_kfold_indices(len(train_idx), inner_k, rng)]

    # The train/validation splits are the same for every candidate, so build
    # them once instead of re-concatenating inside the grid loop.
    splits = [
        (np.concatenate([folds[j] for j in range(inner_k) if j != f]), folds[f])
        for f in range(inner_k)
    ]

    best_score = -np.inf
    best_param = None

    for params in param_grid_iter(param_grid):
        fold_scores = []
        for tr_idx, val_idx in splits:
            model = make_model_fn(params)
            model.fit(X[tr_idx], y[tr_idx])
            pred = model.predict(X[val_idx])
            # Use F1 as the validation score
            tp, fp, tn, fn = confusion_matrix_binary(y[val_idx], pred, POS_LABEL)
            fold_scores.append(f1(tp, fp, tn, fn))

        # Strict '>' means only a higher average replaces the current best,
        # so the first-seen combination is kept on ties.
        avg_f1 = float(np.mean(fold_scores))
        if avg_f1 > best_score:
            best_score = avg_f1
            best_param = params

    return best_param


# ---- Nested cross-validation ----
def nested_cv(
    X, y,
    outer_k=OUTER_K,
    inner_k=INNER_K,
    param_grid=PARAM_GRID_SVC,
    make_model_fn=make_svc,
    seed=SEED
):
    """Run nested k-fold cross-validation and report the outer-fold metrics.

    For each of the ``outer_k`` folds, hyperparameters are selected with
    ``inner_cv_select_params`` on the remaining folds, a fresh model is
    fitted on those folds with the selected hyperparameters, and it is
    evaluated on the held-out fold. Per-fold results and a final summary
    are printed. All metrics treat the module-level ``POS_LABEL`` as the
    positive class.

    Args:
        X: array-like of features, one row per sample.
        y: array-like of labels with the same number of entries as ``X``.
        outer_k: number of outer (evaluation) folds.
        inner_k: number of inner (model-selection) folds.
        param_grid: mapping of hyperparameter name -> list of candidates.
        make_model_fn: callable turning a params dict into an unfitted
            model exposing ``fit`` and ``predict``.
        seed: seed for the random generator; the same generator drives the
            outer split and every inner split.

    Returns:
        Dict with keys ``outer_metrics`` (list of per-fold metric dicts),
        ``confusion_sum`` (counts summed over folds), ``mean_std``
        (``(mean, std)`` per metric, std being NumPy's default population
        standard deviation) and ``chosen_params_each_fold``.

    Raises:
        ValueError: if ``X`` and ``y`` contain different numbers of samples.
    """
    # Coerce to arrays so the integer-array indexing below is positional for
    # any array-like input (e.g. lists or a pandas Series).
    X = np.asarray(X)
    y = np.asarray(y)
    # Features and labels come from separate sources; a silent length
    # mismatch would pair samples with the wrong labels.
    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same number of samples, got {len(X)} and {len(y)}"
        )

    rng = np.random.default_rng(seed)
    folds = simple_kfold_indices(len(X), outer_k, rng)

    outer_metrics = []            # Metrics dict for each fold
    chosen_params_each_fold = []  # Best hyperparameters per fold
    conf_sums = {"tp":0, "fp":0, "tn":0, "fn":0}

    for i in range(outer_k):
        test_idx = folds[i]
        train_idx = np.concatenate([folds[j] for j in range(outer_k) if j != i])

        # Inner: use F1 to choose the best hyperparameters
        best_param = inner_cv_select_params(X, y, train_idx, inner_k, param_grid, rng, make_model_fn)
        chosen_params_each_fold.append(best_param)

        # Outer: refit on the full outer-training split, then evaluate all
        # metrics on the held-out test fold
        model = make_model_fn(best_param)
        model.fit(X[train_idx], y[train_idx])
        pred = model.predict(X[test_idx])

        mets = compute_metrics(y[test_idx], pred, POS_LABEL)
        outer_metrics.append(mets)

        # Accumulate confusion matrix counts
        for k in ("tp","fp","tn","fn"):
            conf_sums[k] += mets[k]

        print(f"[Outer Fold {i+1}/{outer_k}] "
              f"best_params={best_param} | "
              f"Acc={mets['accuracy']:.4f} P={mets['precision']:.4f} "
              f"R={mets['recall']:.4f} F1={mets['f1']:.4f} | "
              f"Confusion(TP/FP/TN/FN)=({mets['tp']},{mets['fp']},{mets['tn']},{mets['fn']})")

    # Mean / std across folds
    def agg_mean_std(key):
        vals = np.array([m[key] for m in outer_metrics], dtype=float)
        return float(vals.mean()), float(vals.std())

    mean_acc, std_acc = agg_mean_std("accuracy")
    mean_p, std_p = agg_mean_std("precision")
    mean_r, std_r = agg_mean_std("recall")
    mean_f1, std_f1 = agg_mean_std("f1")

    print("\n=== Final (Outer CV) Summary ===")
    print(f"Confusion Matrix Sum over folds: TP={conf_sums['tp']} FP={conf_sums['fp']} TN={conf_sums['tn']} FN={conf_sums['fn']}")
    print(f"Accuracy : mean={mean_acc:.4f}, std={std_acc:.4f}")
    print(f"Precision: mean={mean_p:.4f},  std={std_p:.4f}")
    print(f"Recall   : mean={mean_r:.4f},  std={std_r:.4f}")
    print(f"F1-score : mean={mean_f1:.4f}, std={std_f1:.4f}")

    # Count the frequency of chosen hyperparameters (display only); dicts are
    # not hashable, so each one is keyed by its sorted (name, value) pairs
    selections = Counter([tuple(sorted(p.items())) for p in chosen_params_each_fold])
    print("\n=== Chosen Hyperparameters Across Folds ===")
    for combo, count in selections.items():
        print(f"{dict(combo)}: chosen {count} times")

    return {
        "outer_metrics": outer_metrics,                      # Full metrics for each fold
        "confusion_sum": conf_sums,                          # Accumulated confusion matrix
        "mean_std": {                                        # Mean / std of each metric
            "accuracy": (mean_acc, std_acc),
            "precision": (mean_p, std_p),
            "recall": (mean_r, std_r),
            "f1": (mean_f1, std_f1),
        },
        "chosen_params_each_fold": chosen_params_each_fold,  # Best hyperparameters per fold
    }


## MiniLM

In [12]:
results = nested_cv(X, y)
print(results["mean_std"]["f1"])

[Outer Fold 1/10] best_params={'C': 10.0, 'gamma': 0.1, 'kernel': 'rbf'} | Acc=0.7390 P=0.7423 R=0.9342 F1=0.8273 | Confusion(TP/FP/TN/FN)=(625,217,114,44)
[Outer Fold 2/10] best_params={'C': 10.0, 'gamma': 0.1, 'kernel': 'rbf'} | Acc=0.7280 P=0.7316 R=0.9305 F1=0.8191 | Confusion(TP/FP/TN/FN)=(616,226,112,46)
[Outer Fold 3/10] best_params={'C': 10.0, 'gamma': 0.1, 'kernel': 'rbf'} | Acc=0.7420 P=0.7380 R=0.9388 F1=0.8264 | Confusion(TP/FP/TN/FN)=(614,218,128,40)
[Outer Fold 4/10] best_params={'C': 10.0, 'gamma': 0.1, 'kernel': 'rbf'} | Acc=0.7460 P=0.7361 R=0.9441 F1=0.8272 | Confusion(TP/FP/TN/FN)=(608,218,138,36)
[Outer Fold 5/10] best_params={'C': 15.0, 'gamma': 0.1, 'kernel': 'rbf'} | Acc=0.7290 P=0.7164 R=0.9411 F1=0.8135 | Confusion(TP/FP/TN/FN)=(591,234,138,37)
[Outer Fold 6/10] best_params={'C': 10.0, 'gamma': 0.1, 'kernel': 'rbf'} | Acc=0.7190 P=0.7120 R=0.9336 F1=0.8079 | Confusion(TP/FP/TN/FN)=(591,239,128,42)
[Outer Fold 7/10] best_params={'C': 10.0, 'gamma': 0.1, 'kernel'

## TF-IDF

In [None]:
results = nested_cv(X_tfidf, y)
print(results["mean_std"]["f1"])

[Outer Fold 1/10] best_params={'C': 1.0, 'gamma': 1.0, 'kernel': 'rbf'} | Acc=0.7440 P=0.7497 R=0.9268 F1=0.8289 | Confusion(TP/FP/TN/FN)=(620,207,124,49)
[Outer Fold 2/10] best_params={'C': 1.0, 'gamma': 1.0, 'kernel': 'rbf'} | Acc=0.7380 P=0.7439 R=0.9215 F1=0.8232 | Confusion(TP/FP/TN/FN)=(610,210,128,52)
[Outer Fold 3/10] best_params={'C': 1.0, 'gamma': 1.0, 'kernel': 'rbf'} | Acc=0.7240 P=0.7299 R=0.9174 F1=0.8130 | Confusion(TP/FP/TN/FN)=(600,222,124,54)
[Outer Fold 4/10] best_params={'C': 1.0, 'gamma': 1.0, 'kernel': 'rbf'} | Acc=0.7340 P=0.7345 R=0.9193 F1=0.8166 | Confusion(TP/FP/TN/FN)=(592,214,142,52)
[Outer Fold 5/10] best_params={'C': 1.0, 'gamma': 1.0, 'kernel': 'rbf'} | Acc=0.7300 P=0.7221 R=0.9268 F1=0.8117 | Confusion(TP/FP/TN/FN)=(582,224,148,46)
[Outer Fold 6/10] best_params={'C': 1.0, 'gamma': 1.0, 'kernel': 'rbf'} | Acc=0.7210 P=0.7196 R=0.9163 F1=0.8061 | Confusion(TP/FP/TN/FN)=(580,226,141,53)
[Outer Fold 7/10] best_params={'C': 15.0, 'gamma': 1.0, 'kernel': 'rbf

## DeTeCtive

In [11]:
results = nested_cv(X_encoder, y)
print(results["mean_std"]["f1"])

[Outer Fold 1/10] best_params={'C': 20, 'gamma': 1.0, 'kernel': 'rbf'} | Acc=0.8280 P=0.8515 R=0.8999 F1=0.8750 | Confusion(TP/FP/TN/FN)=(602,105,226,67)
[Outer Fold 2/10] best_params={'C': 20, 'gamma': 1.0, 'kernel': 'rbf'} | Acc=0.8380 P=0.8655 R=0.8943 F1=0.8796 | Confusion(TP/FP/TN/FN)=(592,92,246,70)
[Outer Fold 3/10] best_params={'C': 10.0, 'gamma': 1.0, 'kernel': 'rbf'} | Acc=0.8280 P=0.8463 R=0.9006 F1=0.8726 | Confusion(TP/FP/TN/FN)=(589,107,239,65)
[Outer Fold 4/10] best_params={'C': 10.0, 'gamma': 1.0, 'kernel': 'rbf'} | Acc=0.8270 P=0.8574 R=0.8773 F1=0.8672 | Confusion(TP/FP/TN/FN)=(565,94,262,79)
[Outer Fold 5/10] best_params={'C': 20, 'gamma': 1.0, 'kernel': 'rbf'} | Acc=0.7980 P=0.8123 R=0.8822 F1=0.8458 | Confusion(TP/FP/TN/FN)=(554,128,244,74)
[Outer Fold 6/10] best_params={'C': 20, 'gamma': 1.0, 'kernel': 'rbf'} | Acc=0.8320 P=0.8394 R=0.9084 F1=0.8725 | Confusion(TP/FP/TN/FN)=(575,110,257,58)
[Outer Fold 7/10] best_params={'C': 15.0, 'gamma': 1.0, 'kernel': 'rbf'} |