## KNN

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Input files. The label column of DATA_FILE is reused for all three feature
# matrices (presumably the rows are aligned one-to-one -- verify upstream).
DATA_FILE = "data/data.csv"
EMBEDDING_FILE = "data/text_embeddings.npy"              # MiniLM
EMBEDDING_FILE_TFIDF = "data/text_embeddings_tfidf.npy"  # Tf-idf
EMBEDDING_FILE_EC = "data/detective_emb_384.npy"         # detective

# MiniLM features and the shared labels
X = np.load(EMBEDDING_FILE)
y = pd.read_csv(DATA_FILE)["label"].values

# Tf-idf features
X_tfidf = np.load(EMBEDDING_FILE_TFIDF)

# detective features
X_encoder = np.load(EMBEDDING_FILE_EC)

In [36]:
# Sanity check: shape of the MiniLM feature matrix and the distinct labels.
print(f"embedding shape: {X.shape}, label: {np.unique(y)}")

# Per-label sample counts and their share of the whole dataset.
unique, counts = np.unique(y, return_counts=True)
label_dist = dict(zip(unique, counts))
total = counts.sum()
print("Label Distribution:")
for label, count in zip(unique, counts):
    percent = count / total * 100
    print(f"  Label {label}: {count} samples ({percent:.2f}%)")

embedding shape: (10000, 384), label: [0 1]
Label Distribution:
  Label 0: 6455 samples (64.55%)
  Label 1: 3545 samples (35.45%)


In [None]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from itertools import product
from collections import Counter

# ============== Configurable Section ==============
# Hyperparameter grid explored by the inner CV loop. Despite the "SVC" in the
# name, the keys are KNeighborsClassifier arguments. Key order and value order
# determine the order in which candidates are tried.
PARAM_GRID_SVC = dict(
    n_neighbors=[1, 2, 5, 10, 50, 100, 150],
    weights=["uniform", "distance"],
    metric=["cosine", "euclidean"],
)

OUTER_K, INNER_K = 10, 3  # number of outer (evaluation) / inner (selection) folds
SEED = 42                 # seed of the random generator that shuffles the folds
POS_LABEL = 0             # label counted as "positive" by the metrics

# ---- Build Model ----
def make_svc(params):
    """Build an unfitted KNeighborsClassifier from one hyperparameter set.

    Note: despite its name, this factory returns a k-nearest-neighbours
    classifier, not an SVC.

    Args:
        params: mapping of hyperparameter name -> value. Only "n_neighbors",
            "weights" and "metric" are forwarded; every other key is dropped.

    Returns:
        A new, unfitted KNeighborsClassifier.
    """
    forwarded = ("n_neighbors", "weights", "metric")
    knn_kwargs = {name: value for name, value in params.items() if name in forwarded}

    # Cosine distance is paired with an explicit brute-force neighbour search.
    if knn_kwargs.get("metric") == "cosine":
        knn_kwargs["algorithm"] = "brute"

    return KNeighborsClassifier(**knn_kwargs)

# ======================================


# ---- kfold indices ----
def simple_kfold_indices(n_samples, k, rng):
    """Randomly partition the positions 0..n_samples-1 into k folds.

    Args:
        n_samples: number of samples to split.
        k: number of folds.
        rng: random generator providing an in-place ``shuffle`` method
            (its state is advanced by this call).

    Returns:
        A list of k index arrays; fold sizes differ by at most one because
        the split is done with ``np.array_split``. No stratification is done.
    """
    shuffled = np.arange(n_samples)
    rng.shuffle(shuffled)  # in place
    return np.array_split(shuffled, k)


# ---- Evaluation metrics ----
def confusion_matrix_binary(y_true, y_pred, pos=POS_LABEL):
    """Count the four confusion-matrix cells for a one-vs-rest view of `pos`.

    Args:
        y_true: ground-truth labels (array-like).
        y_pred: predicted labels (array-like, same length as y_true).
        pos: label treated as the positive class; every other label counts
            as negative.

    Returns:
        Tuple of Python ints ``(tp, fp, tn, fn)``.
    """
    truth = np.asarray(y_true)
    guess = np.asarray(y_pred)

    truth_pos, truth_neg = truth == pos, truth != pos
    guess_pos, guess_neg = guess == pos, guess != pos

    tp = int(np.sum(truth_pos & guess_pos))
    fp = int(np.sum(truth_neg & guess_pos))
    tn = int(np.sum(truth_neg & guess_neg))
    fn = int(np.sum(truth_pos & guess_neg))
    return tp, fp, tn, fn

def precision(tp, fp, tn, fn):
    """Return tp / (tp + fp), or 0.0 when nothing was predicted positive.

    `tn` and `fn` are accepted only so all metric functions share one signature.
    """
    predicted_pos = tp + fp
    if predicted_pos > 0:
        return tp / predicted_pos
    return 0.0

def recall(tp, fp, tn, fn):
    """Return tp / (tp + fn), or 0.0 when there are no actual positives.

    `fp` and `tn` are accepted only so all metric functions share one signature.
    """
    actual_pos = tp + fn
    if actual_pos > 0:
        return tp / actual_pos
    return 0.0

def accuracy(tp, fp, tn, fn):
    """Return the fraction of correct predictions, or 0.0 for an empty sample."""
    n_total = tp + fp + tn + fn
    if n_total > 0:
        return (tp + tn) / n_total
    return 0.0

def f1(tp, fp, tn, fn):
    """Return the harmonic mean of precision and recall.

    Yields 0.0 when precision and recall are both zero, so the division is
    never performed with a zero denominator.
    """
    prec = precision(tp, fp, tn, fn)
    rec = recall(tp, fp, tn, fn)
    pr_sum = prec + rec
    if pr_sum > 0:
        return 2 * prec * rec / pr_sum
    return 0.0

def compute_metrics(y_true, y_pred, pos=POS_LABEL):
    """Compute confusion counts and the derived scores for one prediction set.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels.
        pos: label treated as the positive class.

    Returns:
        Dict with the keys "tp", "fp", "tn", "fn", "precision", "recall",
        "accuracy" and "f1" (in that order).
    """
    cells = confusion_matrix_binary(y_true, y_pred, pos)
    report = dict(zip(("tp", "fp", "tn", "fn"), cells))

    # Every scorer takes the same (tp, fp, tn, fn) argument list.
    scorers = (
        ("precision", precision),
        ("recall", recall),
        ("accuracy", accuracy),
        ("f1", f1),
    )
    for metric_name, scorer in scorers:
        report[metric_name] = scorer(*cells)
    return report

# ---- Generate all parameter combinations ----
def param_grid_iter(param_grid):
    """Lazily yield every combination of the grid as a ``{name: value}`` dict.

    Combinations follow the Cartesian-product order of the value lists, with
    the last key varying fastest. An empty grid yields a single empty dict;
    a grid containing an empty value list yields nothing.
    """
    names = list(param_grid.keys())
    value_lists = [param_grid[name] for name in names]
    yield from (dict(zip(names, combo)) for combo in product(*value_lists))


# ---- Inner cross-validation: select best parameters ----
def inner_cv_select_params(X, y, train_idx, inner_k, param_grid, rng, make_model_fn):
    """Pick the hyperparameter combination with the best mean inner-CV F1.

    The samples in `train_idx` are shuffled into `inner_k` folds. Every
    combination produced from `param_grid` is trained on inner_k - 1 folds
    and scored on the held-out fold with F1 (positive class = POS_LABEL).

    Args:
        X: feature matrix, indexable by an integer index array.
        y: label array, indexable by an integer index array.
        train_idx: global row indices (NumPy array) available for selection.
        inner_k: number of inner folds.
        param_grid: mapping of hyperparameter name -> list of candidate values.
        rng: random generator used to shuffle the inner folds.
        make_model_fn: callable turning one parameter dict into an unfitted
            model exposing ``fit`` and ``predict``.

    Returns:
        The parameter dict with the highest mean F1 across the inner folds.
        On ties the combination that was tried first is kept.

    Raises:
        ValueError: if `param_grid` produces no candidate combination.
    """
    # Fold positions are relative to train_idx; map them back to global indices.
    folds = [train_idx[f] for f in simple_kfold_indices(len(train_idx), inner_k, rng)]

    # The (train, validation) index splits do not depend on the hyperparameters,
    # so build them once instead of re-concatenating for every candidate.
    splits = [
        (np.concatenate([fold for j, fold in enumerate(folds) if j != f]), folds[f])
        for f in range(inner_k)
    ]

    best_score = -np.inf
    best_param = None

    for params in param_grid_iter(param_grid):
        fold_scores = []
        for tr_idx, val_idx in splits:
            model = make_model_fn(params)
            model.fit(X[tr_idx], y[tr_idx])
            pred = model.predict(X[val_idx])
            # Use F1 as the validation score
            tp, fp, tn, fn = confusion_matrix_binary(y[val_idx], pred, POS_LABEL)
            fold_scores.append(f1(tp, fp, tn, fn))

        # A candidate replaces the current best only if its mean F1 is strictly
        # higher, so the earliest candidate wins ties.
        avg_f1 = float(np.mean(fold_scores))
        if avg_f1 > best_score:
            best_score = avg_f1
            best_param = params

    if best_param is None:
        # Without this, the caller would fail later with an unrelated error
        # when it tries to build a model from None.
        raise ValueError("param_grid produced no hyperparameter combinations")

    return best_param


# ---- Nested cross-validation ----
def nested_cv(
    X, y,
    outer_k=OUTER_K,
    inner_k=INNER_K,
    param_grid=PARAM_GRID_SVC,
    make_model_fn=make_svc,
    seed=SEED
):
    """Run nested cross-validation, print a report and return the results.

    For each of the `outer_k` outer folds, hyperparameters are selected by an
    inner `inner_k`-fold CV on the remaining folds (mean F1). A model with
    the selected parameters is then fitted on all remaining folds and
    evaluated on the held-out outer fold.

    Args:
        X: feature matrix with one row per sample.
        y: label array with one entry per row of X.
        outer_k: number of outer (evaluation) folds.
        inner_k: number of inner (model-selection) folds.
        param_grid: mapping of hyperparameter name -> list of candidates.
        make_model_fn: callable turning a parameter dict into an unfitted
            model exposing ``fit`` and ``predict``.
        seed: seed for the random generator that shuffles outer and inner folds.

    Returns:
        Dict with the keys
        "outer_metrics" (list of per-fold metric dicts),
        "confusion_sum" (confusion counts summed over the outer folds),
        "mean_std" (metric name -> (mean, std) over the outer folds; std is
        NumPy's default population standard deviation) and
        "chosen_params_each_fold" (selected parameter dict per outer fold).

    Raises:
        ValueError: if X and y do not contain the same number of samples.
    """
    n_samples = len(X)
    # If y were longer than X, the extra labels would be ignored without any
    # error and the evaluation would silently run on mismatched inputs.
    if len(y) != n_samples:
        raise ValueError(
            f"X and y must have the same number of samples, got {n_samples} and {len(y)}"
        )

    rng = np.random.default_rng(seed)
    folds = simple_kfold_indices(n_samples, outer_k, rng)

    outer_metrics = []            # Metrics for each outer fold
    chosen_params_each_fold = []  # Best hyperparameters per fold
    conf_sums = {"tp": 0, "fp": 0, "tn": 0, "fn": 0}

    for i, test_idx in enumerate(folds):
        train_idx = np.concatenate([fold for j, fold in enumerate(folds) if j != i])

        # Inner loop: use F1 to select best hyperparameters (test fold is never seen)
        best_param = inner_cv_select_params(X, y, train_idx, inner_k, param_grid, rng, make_model_fn)
        chosen_params_each_fold.append(best_param)

        # Outer loop: refit on the full outer-training set, evaluate on the test fold
        model = make_model_fn(best_param)
        model.fit(X[train_idx], y[train_idx])
        pred = model.predict(X[test_idx])

        mets = compute_metrics(y[test_idx], pred, POS_LABEL)
        outer_metrics.append(mets)

        # Accumulate confusion matrix counts
        for cell in conf_sums:
            conf_sums[cell] += mets[cell]

        print(f"[Outer Fold {i+1}/{outer_k}] "
              f"best_params={best_param} | "
              f"Acc={mets['accuracy']:.4f} P={mets['precision']:.4f} "
              f"R={mets['recall']:.4f} F1={mets['f1']:.4f} | "
              f"Confusion(TP/FP/TN/FN)=({mets['tp']},{mets['fp']},{mets['tn']},{mets['fn']})")

    # Mean and standard deviation across folds
    def agg_mean_std(key):
        vals = np.array([m[key] for m in outer_metrics], dtype=float)
        return float(vals.mean()), float(vals.std())

    mean_acc, std_acc = agg_mean_std("accuracy")
    mean_p, std_p = agg_mean_std("precision")
    mean_r, std_r = agg_mean_std("recall")
    mean_f1, std_f1 = agg_mean_std("f1")

    print("\n=== Final (Outer CV) Summary ===")
    print(f"Confusion Matrix Sum over folds: TP={conf_sums['tp']} FP={conf_sums['fp']} TN={conf_sums['tn']} FN={conf_sums['fn']}")
    print(f"Accuracy : mean={mean_acc:.4f}, std={std_acc:.4f}")
    print(f"Precision: mean={mean_p:.4f},  std={std_p:.4f}")
    print(f"Recall   : mean={mean_r:.4f},  std={std_r:.4f}")
    print(f"F1-score : mean={mean_f1:.4f}, std={std_f1:.4f}")

    # Count frequency of chosen hyperparameter combinations (for display only).
    # Dicts are unhashable, so each one is keyed by its sorted item tuple.
    selections = Counter(tuple(sorted(p.items())) for p in chosen_params_each_fold)
    print("\n=== Chosen Hyperparameters Across Folds ===")
    for combo, count in selections.items():
        print(f"{dict(combo)}: chosen {count} times")

    return {
        "outer_metrics": outer_metrics,                      # Full metrics for each fold
        "confusion_sum": conf_sums,                          # Accumulated confusion matrix
        "mean_std": {                                        # Mean/std of each metric
            "accuracy": (mean_acc, std_acc),
            "precision": (mean_p, std_p),
            "recall": (mean_r, std_r),
            "f1": (mean_f1, std_f1),
        },
        "chosen_params_each_fold": chosen_params_each_fold,  # Best hyperparameters for each fold
    }


## MiniLM

In [38]:
# Nested CV on the MiniLM embeddings; the final print shows the (mean, std)
# of the F1 score over the outer folds.
results = nested_cv(X, y)
print(results["mean_std"]["f1"])

[Outer Fold 1/10] best_params={'n_neighbors': 100, 'weights': 'distance', 'metric': 'cosine'} | Acc=0.6760 P=0.6967 R=0.9133 F1=0.7904 | Confusion(TP/FP/TN/FN)=(611,266,65,58)
[Outer Fold 2/10] best_params={'n_neighbors': 50, 'weights': 'distance', 'metric': 'cosine'} | Acc=0.7090 P=0.7149 R=0.9320 F1=0.8092 | Confusion(TP/FP/TN/FN)=(617,246,92,45)
[Outer Fold 3/10] best_params={'n_neighbors': 50, 'weights': 'distance', 'metric': 'cosine'} | Acc=0.7030 P=0.7078 R=0.9297 F1=0.8037 | Confusion(TP/FP/TN/FN)=(608,251,95,46)
[Outer Fold 4/10] best_params={'n_neighbors': 50, 'weights': 'distance', 'metric': 'cosine'} | Acc=0.7120 P=0.7119 R=0.9286 F1=0.8059 | Confusion(TP/FP/TN/FN)=(598,242,114,46)
[Outer Fold 5/10] best_params={'n_neighbors': 50, 'weights': 'distance', 'metric': 'cosine'} | Acc=0.6940 P=0.6876 R=0.9395 F1=0.7941 | Confusion(TP/FP/TN/FN)=(590,268,104,38)
[Outer Fold 6/10] best_params={'n_neighbors': 50, 'weights': 'distance', 'metric': 'cosine'} | Acc=0.6970 P=0.6960 R=0.925

## TF-IDF

In [39]:
# Nested CV on the TF-IDF features; the final print shows the (mean, std)
# of the F1 score over the outer folds.
results = nested_cv(X_tfidf, y)
print(results["mean_std"]["f1"])

[Outer Fold 1/10] best_params={'n_neighbors': 100, 'weights': 'distance', 'metric': 'cosine'} | Acc=0.7270 P=0.7519 R=0.8834 F1=0.8124 | Confusion(TP/FP/TN/FN)=(591,195,136,78)
[Outer Fold 2/10] best_params={'n_neighbors': 100, 'weights': 'distance', 'metric': 'cosine'} | Acc=0.7040 P=0.7299 R=0.8776 F1=0.7970 | Confusion(TP/FP/TN/FN)=(581,215,123,81)
[Outer Fold 3/10] best_params={'n_neighbors': 100, 'weights': 'distance', 'metric': 'cosine'} | Acc=0.6930 P=0.7161 R=0.8792 F1=0.7893 | Confusion(TP/FP/TN/FN)=(575,228,118,79)
[Outer Fold 4/10] best_params={'n_neighbors': 100, 'weights': 'distance', 'metric': 'cosine'} | Acc=0.7120 P=0.7219 R=0.8991 F1=0.8008 | Confusion(TP/FP/TN/FN)=(579,223,133,65)
[Outer Fold 5/10] best_params={'n_neighbors': 100, 'weights': 'distance', 'metric': 'cosine'} | Acc=0.6940 P=0.7033 R=0.8869 F1=0.7845 | Confusion(TP/FP/TN/FN)=(557,235,137,71)
[Outer Fold 6/10] best_params={'n_neighbors': 100, 'weights': 'distance', 'metric': 'cosine'} | Acc=0.7090 P=0.7221

## DeTeCtive

In [40]:
# Nested CV on the DeTeCtive encoder embeddings; the final print shows the
# (mean, std) of the F1 score over the outer folds.
results = nested_cv(X_encoder, y)
print(results["mean_std"]["f1"])

[Outer Fold 1/10] best_params={'n_neighbors': 100, 'weights': 'uniform', 'metric': 'cosine'} | Acc=0.8190 P=0.8427 R=0.8969 F1=0.8689 | Confusion(TP/FP/TN/FN)=(600,112,219,69)
[Outer Fold 2/10] best_params={'n_neighbors': 50, 'weights': 'uniform', 'metric': 'cosine'} | Acc=0.8280 P=0.8520 R=0.8958 F1=0.8733 | Confusion(TP/FP/TN/FN)=(593,103,235,69)
[Outer Fold 3/10] best_params={'n_neighbors': 50, 'weights': 'uniform', 'metric': 'cosine'} | Acc=0.8340 P=0.8466 R=0.9113 F1=0.8778 | Confusion(TP/FP/TN/FN)=(596,108,238,58)
[Outer Fold 4/10] best_params={'n_neighbors': 50, 'weights': 'uniform', 'metric': 'cosine'} | Acc=0.8140 P=0.8368 R=0.8835 F1=0.8595 | Confusion(TP/FP/TN/FN)=(569,111,245,75)
[Outer Fold 5/10] best_params={'n_neighbors': 100, 'weights': 'distance', 'metric': 'cosine'} | Acc=0.8020 P=0.8152 R=0.8854 F1=0.8489 | Confusion(TP/FP/TN/FN)=(556,126,246,72)
[Outer Fold 6/10] best_params={'n_neighbors': 100, 'weights': 'uniform', 'metric': 'cosine'} | Acc=0.8300 P=0.8350 R=0.911