Date: 16th Nov 2025

# Imports

In [25]:
# =========================
# Cell 1: Imports & CONFIG
# =========================

import os
import json
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, train_test_split, KFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    StandardScaler,
    MinMaxScaler,
    MaxAbsScaler,
    RobustScaler,
    Normalizer,
)
from sklearn.decomposition import PCA
from pathlib import Path
from time import time
from sklearn.metrics import f1_score

# Config

In [26]:
CONFIG = {
    # --- Meta / experiment info ---
    "EXP_NAME": "experiment_35A",  # short name for this run; written to the experiment logs
    "EXPERIMENTER": "Zuha",              # "Zuha" | "Maryam" | "Maham"; part of log and submission file names
    "MODEL_FAMILY": "svm",          # "lightgbm" | "xgboost" | "mlp_early" | "mlp_two_tower"
                                          # "logreg" | "svm" | "rf" | "catboost" | "ensemble"

    # --- Paths (update DATA_DIR when you know Kaggle path) ---
    # On Kaggle, you'll usually have something like: "/kaggle/input/dss-ml-competition/"
    "DATA_DIR": "./data",                # folder containing TRAIN_FILENAME and TEST_FILENAME
    "TRAIN_FILENAME": "train_p1.json",   # file name inside DATA_DIR
    "TEST_FILENAME": "test.json",        # file name inside DATA_DIR
    "SUBMISSION_OUTPUT_DIR": "./submissions",  # where to save CSV submissions

    # --- Target / ID / feature mode ---
    "TARGET_COLUMN": "label",            # name of target in train JSON
    "ID_COLUMN": "id",                   # for train: hashed id, for test: integer
    "FEATURE_MODE": "img+text",          # "img+text" | "img_only" | "text_only"
                                         # (later you can try ablations)

    # --- Reproducibility ---
    "SEED": 42,                          # any integer; try 42, 2025, 7, etc.
    "NP_RANDOM_SEED": 42,                # keep same as SEED or change if you want
                                         # NOTE(review): not read by any code visible in this notebook

    # --- Train / validation strategy (for local eval) ---
    # NOTE(review): nothing visible in this notebook branches on VALIDATION_SCHEME or
    # the VAL_* keys below; they are only copied into the experiment log, and
    # run_cv_and_fit_full always runs K-fold CV.
    "VALIDATION_SCHEME": "cv_only",      # "cv_only"          → only K-fold CV
                                         # "holdout_only"     → single train/val split
                                         # "holdout+cv"       → both split + CV

    # If using holdout (for VALIDATION_SCHEME != "cv_only"):
    "VAL_SIZE": 0.2,                     # fraction of train data for validation (e.g. 0.1, 0.2)
    "VAL_STRATIFY": True,                # True to stratify by label in train/val split
    "VAL_RANDOM_STATE": 42,              # seed for train/val split

    # --- Cross-validation (K-fold) ---
    "USE_STRATIFIED_KFOLD": True,        # almost always True for classification
    "N_FOLDS": 7,                        # typical: 3, 5, 7, 10
    "SHUFFLE_FOLDS": True,               # shuffle before splitting into folds
    "FOLDS_RANDOM_STATE": 42,            # seed for fold splitting

    # --- Scaling / normalization ---
    "SCALER_TYPE": "minmax",               # "none"     → no scaling (good for tree models)
                                         # "standard" → StandardScaler (zero mean, unit variance)
                                         # "minmax"   → MinMaxScaler (0–1)
                                         # "maxabs"   → MaxAbsScaler ([-1, 1] for sparse-like)
                                         # "robust"   → RobustScaler (robust to outliers)
                                         # "l2_norm"  → normalize each sample to unit L2 norm

    # --- Missing values handling (numeric) ---
    "MISSING_VALUE_STRATEGY": "mean",    # "none"     → no imputer; preprocessing raises ValueError on NaN/inf
                                         # "mean"     → fill NaNs with column means
                                         # "median"   → fill with column medians
                                         # "constant" → fill with constant (see MISSING_FILL_VALUE)
                                         # "zero"     → fill with 0.0

    "MISSING_FILL_VALUE": 0.0,           # used only when strategy == "constant" ("zero" always fills 0.0)

    # --- PCA dimensionality reduction ---
    "USE_PCA": True,                    # True → apply PCA after scaling (if any)
    "PCA_N_COMPONENTS": 64,             # e.g. 64, 128, 256, 512; build_pca caps it at the feature count
    "PCA_WHITEN": True,                 # True → decorrelate & scale to unit variance
    "PCA_SVD_SOLVER": "full",            # "auto" | "full" | "randomized"
    "PCA_RANDOM_STATE": 42,              # for randomized solver, etc.

    # --- Class imbalance handling (general) ---
    # Only "balanced" is acted on by get_linear_svm_params; any other value
    # leaves class_weight=None for the LinearSVC run in this notebook.
    "IMBALANCE_MODE": "balanced",            # "none"           → do nothing
                                         # "balanced"       → use class_weight='balanced' (for LR/SVM/MLP)
                                         # "scale_pos_weight" → for tree models like XGB/LGB
                                         # "is_unbalance"   → LightGBM's built-in option

    # --- Threshold for converting probabilities → class labels ---
    # Applied by run_cv_and_fit_full only when the model exposes predict_proba;
    # otherwise the labels from model.predict() are used as-is.
    "DECISION_THRESHOLD": 0.5,           # default 0.5; you can tune (0.4–0.6, etc.)

    # --- Logging / runs ---
    "SAVE_LOCAL_LOGS": True,             # NOTE(review): not checked by the logging helpers visible here
    "LOGS_DIR": "./logs",                # folder for logs/CSVs if you want
}

In [27]:
# Create the output folders up front so later cells can write submissions and logs.
# exist_ok=True makes this cell safe to re-run.
Path(CONFIG["SUBMISSION_OUTPUT_DIR"]).mkdir(parents=True, exist_ok=True)
Path(CONFIG["LOGS_DIR"]).mkdir(parents=True, exist_ok=True)

# Load data

## utils

In [28]:
# =========================
# Cell 2: Data Loading Utils
# =========================

def load_json_list(path):
    """
    Load a JSON file and return its contents as a Python list.

    Two layouts are accepted:
    - a single JSON document: a top-level array is returned as-is, any other
      top-level value (e.g. one object) is wrapped in a one-element list;
    - JSON Lines: one JSON value per line, blank lines are skipped.

    Args:
        path: str or Path of the file to read.

    Returns:
        list of parsed values; [] if the file is empty or whitespace-only.

    Raises:
        json.JSONDecodeError: if the content is neither a valid JSON document
            nor valid JSON Lines.
    """
    # "utf-8-sig" reads plain UTF-8 unchanged but drops a leading byte-order
    # mark, which json.loads() would otherwise reject.
    text = Path(path).read_text(encoding="utf-8-sig").strip()
    if not text:
        return []

    # Try the whole file as one JSON document first.
    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        # Fallback: JSON Lines. Split on the newline character only:
        # str.splitlines() also breaks on characters such as U+2028 that may
        # appear unescaped inside a JSON string, which would cut a record in two.
        stripped_lines = (line.strip() for line in text.split("\n"))
        return [json.loads(line) for line in stripped_lines if line]

    return data if isinstance(data, list) else [data]


def build_feature_matrix_from_records(records, feature_mode="img+text",
                                      id_key="id", target_key="label", is_train=True):
    """
    Turn a list of records into a feature matrix, labels and ids.

    Each record is a dict such as:
    {
      "id": "a9d8c7...",
      "label": 0,
      "image_embedding": [...floats...],
      "text_embedding": [...floats...]
    }

    Args:
        records: list of record dicts.
        feature_mode: "img+text" (image then text, concatenated), "img_only"
            or "text_only".
        id_key: key holding the record id.
        target_key: key holding the label (read only when is_train=True).
        is_train: whether labels should be collected.

    Returns:
        X: np.ndarray of float32, one row per record.
        y: np.ndarray of int64 labels if is_train=True, else None.
        ids: list of record ids, in input order.

    Raises:
        ValueError: unknown feature_mode, empty `records`, or a record whose
            feature vector shape differs from the first record's (typically a
            missing or truncated embedding); the message names the record id.
        KeyError: a record lacks id_key (or target_key when is_train=True).
    """
    # Which embedding fields make up a feature row, in concatenation order.
    fields_by_mode = {
        "img+text": ("image_embedding", "text_embedding"),
        "img_only": ("image_embedding",),
        "text_only": ("text_embedding",),
    }
    if feature_mode not in fields_by_mode:
        raise ValueError(f"Unknown FEATURE_MODE: {feature_mode}")
    fields = fields_by_mode[feature_mode]

    if len(records) == 0:
        # np.stack() would fail on an empty list with a message that does not
        # point at the real cause, so fail here instead.
        raise ValueError("No records given; cannot build a feature matrix.")

    ids = []
    features = []
    labels = [] if is_train else None
    expected_shape = None  # shape of the first record's feature vector

    for rec in records:
        rid = rec[id_key]
        # Only the embeddings needed for this mode are converted.
        parts = [np.array(rec.get(name, []), dtype=np.float32) for name in fields]
        feat = parts[0] if len(parts) == 1 else np.concatenate(parts, axis=0)

        if expected_shape is None:
            expected_shape = feat.shape
        elif feat.shape != expected_shape:
            raise ValueError(
                f"Record {rid!r} has feature shape {feat.shape}, expected {expected_shape} "
                f"(FEATURE_MODE={feature_mode!r}); is an embedding missing or truncated?"
            )

        ids.append(rid)
        features.append(feat)
        if is_train:
            labels.append(rec[target_key])

    X = np.stack(features, axis=0)
    y = np.array(labels, dtype=np.int64) if is_train else None

    return X, y, ids

In [29]:
# =========================
# Cell 3: Load Train & Test
# =========================

def load_train_test(config=CONFIG):
    """
    Read the train and test files named in `config` and build feature arrays.

    Reads DATA_DIR/TRAIN_FILENAME and DATA_DIR/TEST_FILENAME with
    load_json_list, converts both with build_feature_matrix_from_records
    (using FEATURE_MODE, ID_COLUMN and TARGET_COLUMN), and prints sample
    counts, shapes, the train label distribution and NaN/inf counts.

    Returns:
      (X_train, y_train, train_ids, X_test, test_ids)
    """
    root = Path(config["DATA_DIR"])
    train_file = root / config["TRAIN_FILENAME"]
    test_file = root / config["TEST_FILENAME"]

    print(f"Loading train from: {train_file}")
    train_records = load_json_list(train_file)

    print(f"Loading test from:  {test_file}")
    test_records = load_json_list(test_file)

    print(f"Train samples: {len(train_records)}")
    print(f"Test samples:  {len(test_records)}")

    # Both matrices are built with the same column settings; only the test
    # call skips label collection (target_key is ignored when is_train=False).
    builder_kwargs = {
        "feature_mode": config["FEATURE_MODE"],
        "id_key": config["ID_COLUMN"],
        "target_key": config["TARGET_COLUMN"],
    }
    train_X, train_y, train_ids = build_feature_matrix_from_records(
        train_records, is_train=True, **builder_kwargs
    )
    test_X, _, test_ids = build_feature_matrix_from_records(
        test_records, is_train=False, **builder_kwargs
    )

    print(f"X_train shape: {train_X.shape}")  # (n_train, n_features)
    print(f"X_test shape:  {test_X.shape}")   # (n_test, n_features)

    # Label distribution of the training set
    classes, class_counts = np.unique(train_y, return_counts=True)
    print("Label distribution (train):", dict(zip(classes, class_counts)))

    # Count non-finite entries in each matrix: (train, test)
    nan_train, nan_test = (np.isnan(arr).sum() for arr in (train_X, test_X))
    inf_train, inf_test = (np.isinf(arr).sum() for arr in (train_X, test_X))

    print(f"NaNs in X_train: {nan_train}, NaNs in X_test: {nan_test}")
    print(f"Infs in X_train: {inf_train}, Infs in X_test: {inf_test}")

    return train_X, train_y, train_ids, test_X, test_ids


## Load train and test data

In [30]:
# Load train/test into notebook-level arrays that later cells read
# (X_train, X_test for preprocessing; y_train and test_ids for CV, logging and submission).
# You can comment this out while debugging the helpers to skip the file reads.
X_train, y_train, train_ids, X_test, test_ids = load_train_test(CONFIG)

Loading train from: data\train_p1.json
Loading test from:  data\test.json
Train samples: 1530
Test samples:  500
X_train shape: (1530, 1024)
X_test shape:  (500, 1024)
Label distribution (train): {np.int64(0): np.int64(1326), np.int64(1): np.int64(204)}
NaNs in X_train: 0, NaNs in X_test: 0
Infs in X_train: 0, Infs in X_test: 0


# data preprocessing

## utils

In [31]:
# =========================
# Cell 4: Preprocessing (Imputer + Scaler + PCA)
# =========================

def build_imputer(config):
    """
    Create the missing-value imputer selected by config["MISSING_VALUE_STRATEGY"].

    Returns:
      None for "none"; otherwise an unfitted SimpleImputer using the column
      mean ("mean"), the column median ("median"), the constant 0.0 ("zero"),
      or config["MISSING_FILL_VALUE"] ("constant").

    Raises:
      ValueError: for any other strategy name.
    """
    mode = config["MISSING_VALUE_STRATEGY"]

    if mode == "none":
        return None
    if mode in ("mean", "median"):
        # The config name is also the SimpleImputer strategy name.
        return SimpleImputer(strategy=mode)
    if mode == "zero":
        return SimpleImputer(strategy="constant", fill_value=0.0)
    if mode == "constant":
        return SimpleImputer(strategy="constant", fill_value=config["MISSING_FILL_VALUE"])

    raise ValueError(f"Unknown MISSING_VALUE_STRATEGY: {mode}")


def build_scaler(config):
    """
    Create the scaler/normalizer selected by config["SCALER_TYPE"].

    Returns:
      None for "none", otherwise a new, unfitted transformer.

    Raises:
      ValueError: for an unrecognised scaler type.
    """
    kind = config["SCALER_TYPE"]
    if kind == "none":
        return None

    # Name -> factory. Factories are called lazily so only the selected
    # transformer is ever constructed.
    factories = {
        "standard": lambda: StandardScaler(),       # zero mean, unit variance per feature
        "minmax": lambda: MinMaxScaler(),           # each feature scaled to [0, 1]
        "maxabs": lambda: MaxAbsScaler(),           # each feature scaled to [-1, 1] by its max abs value
        "robust": lambda: RobustScaler(),           # median & IQR based, robust to outliers
        "l2_norm": lambda: Normalizer(norm="l2"),   # each sample (row) scaled to unit L2 norm
    }
    make_scaler = factories.get(kind)
    if make_scaler is None:
        raise ValueError(f"Unknown SCALER_TYPE: {kind}")
    return make_scaler()


def build_pca(config, n_features: int):
    """
    Create an unfitted PCA from the PCA_* settings in `config`.

    Returns None when config["USE_PCA"] is falsy. Otherwise the requested
    PCA_N_COMPONENTS is capped at `n_features`, since PCA cannot produce more
    components than there are input features.
    """
    if not config["USE_PCA"]:
        return None

    pca_kwargs = {
        "n_components": min(config["PCA_N_COMPONENTS"], n_features),
        "whiten": config["PCA_WHITEN"],
        "svd_solver": config["PCA_SVD_SOLVER"],
        "random_state": config["PCA_RANDOM_STATE"],
    }
    return PCA(**pca_kwargs)


def fit_preprocessor(X_train: np.ndarray, config=CONFIG):
    """
    Fit the preprocessing chain (imputer -> scaler -> PCA) on X_train.

    Each stage is built from `config` and skipped when its builder returns
    None. The input is copied to float32 first, so X_train is not modified.

    Returns:
      X_train_proc: transformed training features
      preprocessors: dict with keys 'imputer', 'scaler', 'pca' holding the
                     fitted objects (or None for skipped stages)

    Raises:
      ValueError: if no imputer is configured and X_train contains NaN or inf.
    """
    data = X_train.astype(np.float32, copy=True)

    # 1) Missing values
    imputer = build_imputer(config)
    if imputer is None:
        # No imputer means the data is claimed to be clean: fail fast if not.
        if not np.isfinite(data).all():
            raise ValueError(
                "NaN or inf detected in X_train but MISSING_VALUE_STRATEGY='none'. "
                "Change strategy to 'mean'/'median'/'zero' or clean data."
            )
    else:
        data = imputer.fit_transform(data)

    # 2) Scaling / normalization
    scaler = build_scaler(config)
    if scaler is not None:
        data = scaler.fit_transform(data)

    # 3) PCA (sized from the feature count after the earlier stages)
    pca = build_pca(config, n_features=data.shape[1])
    if pca is None:
        print("PCA not used; shape remains:", data.shape)
    else:
        data = pca.fit_transform(data)
        print(f"PCA applied: new shape = {data.shape}")

    return data, {"imputer": imputer, "scaler": scaler, "pca": pca}


def transform_with_preprocessor(X: np.ndarray, preprocessors: dict):
    """
    Run new data (val/test) through already-fitted preprocessing stages.

    X: np.ndarray [n_samples, n_features_orig]
    preprocessors: dict from fit_preprocessor(); the 'imputer', 'scaler' and
                   'pca' entries are applied in that order, and a missing or
                   None entry is skipped.

    Returns:
      X_proc: transformed copy of X (X itself is left untouched)

    Raises:
      ValueError: if there is no imputer and X contains NaN or inf.
    """
    data = X.astype(np.float32, copy=True)

    # 1) Missing values
    fitted_imputer = preprocessors.get("imputer")
    if fitted_imputer is None:
        if not np.isfinite(data).all():
            raise ValueError(
                "NaN or inf detected in X but no imputer fitted (MISSING_VALUE_STRATEGY='none')."
            )
    else:
        data = fitted_imputer.transform(data)

    # 2) Scaling / normalization, then 3) PCA
    for stage_name in ("scaler", "pca"):
        stage = preprocessors.get(stage_name)
        if stage is not None:
            data = stage.transform(data)

    return data

## preprocess the data

In [32]:
# Fit the imputer/scaler/PCA on the full training matrix, then apply the
# fitted transformers to the test matrix.
# NOTE(review): run_cv_and_fit_full is later given X_train_proc and splits it
# into folds, so every validation fold has already been seen by the scaler/PCA
# fitted here. CV scores may therefore be somewhat optimistic compared with
# refitting the preprocessing inside each fold.
X_train_proc, PREPROCESSORS = fit_preprocessor(X_train, CONFIG)
X_test_proc = transform_with_preprocessor(X_test, PREPROCESSORS)

print("X_train_proc shape:", X_train_proc.shape)
print("X_test_proc shape: ", X_test_proc.shape)

PCA applied: new shape = (1530, 64)
X_train_proc shape: (1530, 64)
X_test_proc shape:  (500, 64)


# train and test

## utils

In [33]:
# =========================
# Cell 5: Cross-validation utilities
# =========================

def get_cv_splits(y, config=CONFIG):
    """
    Build the cross-validation splits described by `config`.

    Uses StratifiedKFold when config["USE_STRATIFIED_KFOLD"] is true, plain
    KFold otherwise, with N_FOLDS splits and optional shuffling.

    Args:
        y: array-like of labels; its length defines the number of samples and
           its values drive stratification.
        config: CONFIG-style dict (N_FOLDS, SHUFFLE_FOLDS, FOLDS_RANDOM_STATE,
                USE_STRATIFIED_KFOLD).

    Returns:
        list of (train_idx, val_idx) tuples of index arrays.
    """
    n_folds = config["N_FOLDS"]
    shuffle = config["SHUFFLE_FOLDS"]
    # scikit-learn raises ValueError if random_state is set while shuffle is
    # False, so the seed is only forwarded when it is actually used.
    random_state = config["FOLDS_RANDOM_STATE"] if shuffle else None

    splitter_cls = StratifiedKFold if config["USE_STRATIFIED_KFOLD"] else KFold
    splitter = splitter_cls(
        n_splits=n_folds,
        shuffle=shuffle,
        random_state=random_state,
    )
    # The splitters only need the number of rows from X, so a zero placeholder
    # shaped like y is enough. KFold accepts y and ignores it.
    return list(splitter.split(np.zeros_like(y), y))


def run_cv_and_fit_full(build_model_fn, X, y, X_test, config=CONFIG, model_name="model"):
    """
    Cross-validate a model, then refit it on all of X and predict X_test.

    - build_model_fn: zero-argument callable returning a *fresh* unfitted model
    - X, y: preprocessed training data (e.g. X_train_proc, y_train)
    - X_test: preprocessed test data (e.g. X_test_proc)
    - config: CONFIG dict (fold settings and DECISION_THRESHOLD)
    - model_name: label used in the printed progress lines

    Labels come from thresholding predict_proba[:, 1] at DECISION_THRESHOLD
    when the model has predict_proba; otherwise model.predict() is used and
    the threshold has no effect.

    Returns:
      results: dict with:
        - "cv_f1_scores"   per-fold macro F1
        - "cv_f1_mean"
        - "cv_f1_std"
        - "test_pred"      (binary 0/1, int)
        - "test_proba"     (or None if not available)
        - "train_time_sec" wall time of the CV loop (the final full fit is not included)
    """
    threshold = config["DECISION_THRESHOLD"]

    def _predict(model, features):
        """Return (labels, positive-class probabilities or None) for `features`."""
        if hasattr(model, "predict_proba"):
            proba = model.predict_proba(features)[:, 1]
            return (proba >= threshold).astype(int), proba
        # fallback: direct class prediction
        return model.predict(features), None

    splits = get_cv_splits(y, config)
    n_splits = len(splits)

    fold_scores = []
    started = time()

    for fold_no, (fit_idx, held_idx) in enumerate(splits, start=1):
        print(f"\n=== {model_name}: Fold {fold_no}/{n_splits} ===")
        X_fit, X_held = X[fit_idx], X[held_idx]
        y_fit, y_held = y[fit_idx], y[held_idx]

        fold_model = build_model_fn()
        fold_model.fit(X_fit, y_fit)

        held_pred, _ = _predict(fold_model, X_held)
        score = f1_score(y_held, held_pred, average="macro")
        print(f"Fold {fold_no} F1 Macro: {score:.4f}")
        fold_scores.append(score)

    train_time_sec = time() - started

    cv_mean = float(np.mean(fold_scores))
    cv_std = float(np.std(fold_scores))
    print("\n=== CV Summary ===")
    print(f"{model_name}: F1 Macro mean = {cv_mean:.4f}, std = {cv_std:.4f}")
    print(f"Training + CV time: {train_time_sec:.1f} sec")

    # Final model: fit on the full training data and predict on test
    full_model = build_model_fn()
    full_model.fit(X, y)
    test_pred, test_proba = _predict(full_model, X_test)

    return {
        "cv_f1_scores": fold_scores,
        "cv_f1_mean": cv_mean,
        "cv_f1_std": cv_std,
        "test_pred": test_pred.astype(int),
        "test_proba": test_proba,
        "train_time_sec": train_time_sec,
    }


In [34]:
# =========================
# Cell 6: Submission helper
# =========================

def save_submission(test_ids, test_pred, config=CONFIG, suffix=""):
    """
    Write a two-column submission CSV (row_id, target) and return its path.

    test_ids: list of ids from test.json (must match Kaggle id column)
    test_pred: numpy array of 0/1 predictions (cast to int before writing)
    config: CONFIG dict; SUBMISSION_OUTPUT_DIR is the target folder and
            MODEL_FAMILY / EXPERIMENTER form the base file name
    suffix: optional tag appended to the file name,
            e.g. "lgbm_v1", "xgb_pca256", etc.

    The file is <MODEL_FAMILY>_<EXPERIMENTER>[_<suffix>].csv; an existing file
    with the same name is overwritten.
    """
    submission = pd.DataFrame({
        "row_id": test_ids,       # Kaggle expects row_id as per manual
        "target": test_pred.astype(int),
    })

    target_dir = Path(config["SUBMISSION_OUTPUT_DIR"])
    target_dir.mkdir(parents=True, exist_ok=True)

    stem = f"{config['MODEL_FAMILY']}_{config['EXPERIMENTER']}"
    if suffix:
        stem = f"{stem}_{suffix}"
    out_path = target_dir / f"{stem}.csv"

    submission.to_csv(out_path, index=False)
    print(f"Saved submission to: {out_path}")
    return out_path


In [35]:
# =========================
# Cell 7: Logging helpers
# =========================

import csv
import json


def _append_row_to_csv(path: Path, fieldnames, row_dict):
    """
    Internal helper: append a row to a CSV.
    - Creates file with header if it doesn't exist.
    - Preserves column order via fieldnames.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    file_exists = path.exists()

    with path.open("a", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        if not file_exists:
            writer.writeheader()
        writer.writerow(row_dict)


def log_experiment_master(
    config,
    results,
    submission_path=None,
    notes="",
    public_lb_f1_macro=None,
    private_lb_f1_macro=None,
    train_size=None,
):
    """
    Append ONE row describing this experiment to
    <LOGS_DIR>/experiments_log_<EXPERIMENTER>.csv.

    config: CONFIG dict
    results: dict from run_cv_and_fit_full (must contain:
             "cv_f1_mean", "cv_f1_std", "cv_f1_scores", "train_time_sec")
    submission_path: path of the submission CSV (string or Path)
    notes: free text (e.g. "no PCA; SCALER=standard; first lgbm run")
    public_lb_f1_macro / private_lb_f1_macro:
        - leave as None to write an empty cell and fill it in later.
    train_size: number of training samples to record. When None, falls back
        to len(y_train) of the notebook-level y_train, so the data-loading
        cell must have been run first.
    """
    # Single quotes inside the replacement field: reusing the outer quote
    # character in an f-string is a SyntaxError before Python 3.12.
    log_path = Path(config["LOGS_DIR"]) / f"experiments_log_{config['EXPERIMENTER']}.csv"

    # Columns copied verbatim from CONFIG (same for ALL models).
    config_cols = [
        "EXP_NAME",
        "EXPERIMENTER",
        "MODEL_FAMILY",
        "FEATURE_MODE",

        "SEED",
        "N_FOLDS",
        "VALIDATION_SCHEME",
        "VAL_SIZE",
        "VAL_STRATIFY",
        "USE_STRATIFIED_KFOLD",
        "SHUFFLE_FOLDS",
        "FOLDS_RANDOM_STATE",

        "SCALER_TYPE",
        "MISSING_VALUE_STRATEGY",
        "MISSING_FILL_VALUE",

        "USE_PCA",
        "PCA_N_COMPONENTS",
        "PCA_WHITEN",
        "PCA_SVD_SOLVER",

        "IMBALANCE_MODE",
        "DECISION_THRESHOLD",
    ]

    # Columns computed from the run itself.
    result_cols = [
        "train_size",
        "cv_f1_macro_mean",
        "cv_f1_macro_std",
        "cv_f1_macro_per_fold",   # we store as JSON string
        "public_lb_f1_macro",
        "private_lb_f1_macro",

        "train_time_sec",
        "submission_file",
        "notes",

        "config_json",            # full CONFIG snapshot as JSON
    ]

    fieldnames = config_cols + result_cols

    if train_size is None:
        # Backward-compatible fallback to the notebook-level training labels.
        train_size = len(y_train)

    def _score_or_blank(value):
        """Leaderboard scores are optional; an unset score becomes an empty cell."""
        return float(value) if value is not None else ""

    row = {key: config[key] for key in config_cols}
    row.update({
        "train_size": int(train_size),

        "cv_f1_macro_mean": float(results["cv_f1_mean"]),
        "cv_f1_macro_std": float(results["cv_f1_std"]),
        # float() keeps the dump working if the scores are numpy scalars.
        "cv_f1_macro_per_fold": json.dumps([float(s) for s in results["cv_f1_scores"]]),

        "public_lb_f1_macro": _score_or_blank(public_lb_f1_macro),
        "private_lb_f1_macro": _score_or_blank(private_lb_f1_macro),

        "train_time_sec": float(results["train_time_sec"]),
        "submission_file": str(submission_path) if submission_path is not None else "",
        "notes": notes,

        # default=str keeps the snapshot working if CONFIG ever holds a
        # non-JSON value such as a Path.
        "config_json": json.dumps(config, default=str),
    })

    _append_row_to_csv(log_path, fieldnames, row)
    print(f"[LOG] Appended master experiment log → {log_path}")


def log_experiment_model_specific(
    model_family,
    config,
    params_dict,
    results,
    submission_path=None,
    notes="",
    public_lb_f1_macro=None,
    private_lb_f1_macro=None,
    train_size=None,
):
    """
    Append ONE row to <LOGS_DIR>/<model_family>_<EXPERIMENTER>.csv.

    The row holds a fixed set of general columns followed by one column per
    hyperparameter in `params_dict` (sorted by name).

    model_family: string used in the file name, e.g. "lightgbm", "xgboost",
                  "mlp_early", ... (the MODEL_FAMILY column itself is taken
                  from config["MODEL_FAMILY"])
    config: CONFIG dict
    params_dict: the hyperparameter dict used to construct the model
                 (e.g. the 'params' dict inside build_lgbm_model)
    results: dict from run_cv_and_fit_full
    submission_path: path to submission CSV
    notes: free text stored with the row
    public_lb_f1_macro / private_lb_f1_macro: optional leaderboard scores;
        None writes an empty cell.
    train_size: number of training samples to record. When None, falls back
        to len(y_train) of the notebook-level y_train.

    NOTE: _append_row_to_csv writes the header only when it creates the file,
    so if the set of hyperparameter names changes between runs, new rows will
    no longer line up with the existing header of the same file.
    """
    # Single quotes inside the replacement field: reusing the outer quote
    # character in an f-string is a SyntaxError before Python 3.12.
    model_log_path = Path(config["LOGS_DIR"]) / f"{model_family}_{config['EXPERIMENTER']}.csv"

    # General columns copied from CONFIG, in the order they appear in the sheet.
    config_cols = [
        "EXP_NAME",
        "EXPERIMENTER",
        "MODEL_FAMILY",
        "SEED",
        "N_FOLDS",
        "FEATURE_MODE",
        "SCALER_TYPE",
        "USE_PCA",
        "PCA_N_COMPONENTS",
        "IMBALANCE_MODE",
    ]
    # General columns computed from the run.
    result_cols = [
        "train_size",
        "cv_f1_macro_mean",
        "cv_f1_macro_std",
        "public_lb_f1_macro",
        "private_lb_f1_macro",
        "train_time_sec",
        "submission_file",
        "notes",
    ]

    # Model-specific hyperparam column names (sorted for a stable order)
    param_cols = sorted(params_dict.keys())

    fieldnames = config_cols + result_cols + param_cols

    if train_size is None:
        # Backward-compatible fallback to the notebook-level training labels.
        train_size = len(y_train)

    def _score_or_blank(value):
        """Leaderboard scores are optional; an unset score becomes an empty cell."""
        return float(value) if value is not None else ""

    row = {key: config[key] for key in config_cols}
    row.update({
        "train_size": int(train_size),
        "cv_f1_macro_mean": float(results["cv_f1_mean"]),
        "cv_f1_macro_std": float(results["cv_f1_std"]),
        "public_lb_f1_macro": _score_or_blank(public_lb_f1_macro),
        "private_lb_f1_macro": _score_or_blank(private_lb_f1_macro),
        "train_time_sec": float(results["train_time_sec"]),
        "submission_file": str(submission_path) if submission_path is not None else "",
        "notes": notes,
    })

    # Add each hyperparam into the row
    for name in param_cols:
        row[name] = params_dict[name]

    _append_row_to_csv(model_log_path, fieldnames, row)
    print(f"[LOG] Appended model-specific log → {model_log_path}")


## train and test the model

In [36]:
# =========================
# Linear SVM: params + builder + run + log
# =========================

from sklearn.svm import LinearSVC

def get_linear_svm_params(config=CONFIG):
    """
    Return the keyword arguments used to construct LinearSVC for this run.

    Reads config["SEED"] (random_state) and config["IMBALANCE_MODE"]
    (class_weight="balanced" only when the mode is "balanced", else None).
    Everything else is edited by hand in the dict below.

    Tuning notes for the LinearSVC parameters:

    Core optimization / regularization:
      - C                : float > 0
                           • smaller → stronger regularization (simpler model)
                           • typical grid: [0.01, 0.1, 1.0, 10.0, 100.0]

      - penalty          : {"l2", "l1"}
                           • "l2" is the standard choice
                           • "l1" gives sparser weights and is only allowed with
                             loss="squared_hinge" and dual=False

      - loss             : {"squared_hinge", "hinge"}
                           • "squared_hinge" is the default
                           • "hinge" is the original SVM loss; it needs
                             penalty="l2" and dual=True

      - dual             : bool
                           • True  → solve the dual problem
                           • False → solve the primal problem; scikit-learn
                             recommends this when n_samples > n_features

      - tol              : float > 0
                           • stopping tolerance; smaller = more precise but slower
                           • typical: [1e-3, 1e-4, 1e-5]

    Data / formulation:
      - fit_intercept    : True learns a bias term, False forces the
                           hyperplane through the origin
      - intercept_scaling: only relevant when fit_intercept=True; rarely changed
      - class_weight     : None | "balanced" (inverse class frequency) | dict
      - multi_class      : "ovr" or "crammer_singer"; makes no difference for
                           a binary problem

    Misc:
      - max_iter         : raise it if you get convergence warnings
                           (typical: 1000, 5000, 10000, 20000)
      - random_state     : seed for reproducibility
      - verbose          : 0 = silent; >0 makes liblinear print to the cell
                           output (the "[LibLinear]" prefix in the fold logs)
    """
    # hook to enable "balanced" from CONFIG; any other mode means no weighting
    class_weight = "balanced" if config["IMBALANCE_MODE"] == "balanced" else None

    # Current choice: L1-penalised squared hinge, solved in the primal.
    # This is one of the valid penalty/loss/dual combinations (see docstring).
    return {
        # --- core optimization ---
        "C": 8.0,                      # explore: 0.01, 0.1, 1.0, 10.0, 100.0
        "penalty": "l1",               # "l2" (standard) or "l1" (sparser, with constraints)
        "loss": "squared_hinge",       # "squared_hinge" (default) or "hinge"
        "dual": False,                 # must be False with penalty="l1"
        "tol": 1e-4,                   # explore: 1e-3, 1e-4, 1e-5

        # --- data / formulation ---
        "fit_intercept": True,         # True (default) or False
        "intercept_scaling": 1.0,      # rarely changed; relevant when fit_intercept=True

        "multi_class": "ovr",          # irrelevant for a binary target

        # --- class imbalance ---
        "class_weight": class_weight,  # None, "balanced", or a dict such as {0: 1.0, 1: 2.0}

        # --- misc ---
        "max_iter": 5000,              # if you see convergence warnings, increase this
        "random_state": config["SEED"],
        "verbose": 1,
    }


def build_linear_svm_model(config=CONFIG):
    """Return a new, unfitted LinearSVC configured by get_linear_svm_params(config)."""
    return LinearSVC(**get_linear_svm_params(config))

# ======= RUN ONE LinearSVC EXPERIMENT =======

linsvm_params = get_linear_svm_params(CONFIG)

# Build the log note from CONFIG and the actual hyperparameters, so it cannot
# drift from the settings that were really used for this run.
linsvm_pca_note = f"PCA{CONFIG['PCA_N_COMPONENTS']}" if CONFIG["USE_PCA"] else "no PCA"
linsvm_notes = (
    f"LinearSVC (penalty={linsvm_params['penalty']}, C={linsvm_params['C']}) + "
    f"{linsvm_pca_note}, scaler={CONFIG['SCALER_TYPE']}."
)

linsvm_results = run_cv_and_fit_full(
    build_linear_svm_model,
    X_train_proc,
    y_train,
    X_test_proc,
    CONFIG,
    model_name="LinearSVM",
)

# Tag the file with EXP_NAME: without a suffix every run writes the same
# <MODEL_FAMILY>_<EXPERIMENTER>.csv, overwriting the file that earlier log
# rows point to.
sub_path = save_submission(
    test_ids,
    linsvm_results["test_pred"],
    CONFIG,
    suffix=CONFIG["EXP_NAME"],
)

log_experiment_master(
    config=CONFIG,
    results=linsvm_results,
    submission_path=sub_path,
    notes=linsvm_notes,
)

log_experiment_model_specific(
    model_family=CONFIG["MODEL_FAMILY"],
    config=CONFIG,
    params_dict=linsvm_params,
    results=linsvm_results,
    submission_path=sub_path,
    notes=linsvm_notes,
)



=== LinearSVM: Fold 1/7 ===
[LibLinear]Fold 1 F1 Macro: 0.6343

=== LinearSVM: Fold 2/7 ===
[LibLinear]Fold 2 F1 Macro: 0.6265

=== LinearSVM: Fold 3/7 ===
[LibLinear]Fold 3 F1 Macro: 0.6186

=== LinearSVM: Fold 4/7 ===
[LibLinear]Fold 4 F1 Macro: 0.6017

=== LinearSVM: Fold 5/7 ===
[LibLinear]Fold 5 F1 Macro: 0.6441

=== LinearSVM: Fold 6/7 ===
[LibLinear]Fold 6 F1 Macro: 0.6566

=== LinearSVM: Fold 7/7 ===
[LibLinear]Fold 7 F1 Macro: 0.6591

=== CV Summary ===
LinearSVM: F1 Macro mean = 0.6344, std = 0.0192
Training + CV time: 0.2 sec
[LibLinear]Saved submission to: submissions\svm_Zuha.csv
[LOG] Appended master experiment log → logs\experiments_log_Zuha.csv
[LOG] Appended model-specific log → logs\svm_Zuha.csv
