Date: 16th Nov 2025

# Imports

In [None]:
# =========================
# Cell 1: Imports & CONFIG
# =========================

import os
import json
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, train_test_split, KFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    StandardScaler,
    MinMaxScaler,
    MaxAbsScaler,
    RobustScaler,
    Normalizer,
)
from sklearn.decomposition import PCA
from pathlib import Path
from time import time
from sklearn.metrics import f1_score

# Config

In [None]:
# Central experiment configuration. The loader, preprocessing, CV and
# submission helpers in this notebook all take this dict (or default to it).
# NOTE(review): some keys below are not read by any code visible in this
# notebook section; they are flagged individually so nobody assumes that
# changing them has an effect.
CONFIG = {
    # --- Meta / experiment info ---
    "EXP_NAME": "exp_lgbm_baseline_v1",  # short run name; save_submission() uses it as the CSV filename prefix
                                          # NOTE(review): name says "lgbm" but the run cell in this notebook trains an MLP — confirm
    "EXPERIMENTER": "Zuha",              # "Zuha" | "Maryam" | "Maham"
    "MODEL_FAMILY": "lightgbm",          # "lightgbm" | "xgboost" | "mlp_early" | "mlp_two_tower"
                                          # "logreg" | "svm" | "rf" | "catboost" | "ensemble"
                                          # NOTE(review): informational only as far as the visible code goes (never read)

    # --- Paths (update DATA_DIR when you know Kaggle path) ---
    # On Kaggle, you'll usually have something like: "/kaggle/input/dss-ml-competition/"
    "DATA_DIR": "./data",                # folder containing train_part1.json & test.json
    "TRAIN_FILENAME": "train_part1.json",
    "TEST_FILENAME": "test.json",
    "SUBMISSION_OUTPUT_DIR": "./submissions",  # where to save CSV submissions (created on startup)

    # --- Target / ID / feature mode ---
    "TARGET_COLUMN": "label",            # name of target in train JSON
    "ID_COLUMN": "id",                   # for train: hashed id, for test: integer
    "FEATURE_MODE": "img+text",          # "img+text" | "img_only" | "text_only"
                                         # (later you can try ablations)

    # --- Reproducibility ---
    "SEED": 42,                          # any integer; try 42, 2025, 7, etc. (used as the MLP random_state)
    "NP_RANDOM_SEED": 42,                # keep same as SEED or change if you want
                                         # NOTE(review): no visible code seeds NumPy with this value — confirm

    # --- Train / validation strategy (for local eval) ---
    "VALIDATION_SCHEME": "cv_only",      # "cv_only"          → only K-fold CV
                                         # "holdout_only"     → single train/val split
                                         # "holdout+cv"       → both split + CV
                                         # NOTE(review): the visible run_cv_and_fit_full() always runs K-fold CV
                                         # and never reads this key or the VAL_* keys below — confirm

    # If using holdout (for VALIDATION_SCHEME != "cv_only"):
    "VAL_SIZE": 0.2,                     # fraction of train data for validation (e.g. 0.1, 0.2)
    "VAL_STRATIFY": True,                # True to stratify by label in train/val split
    "VAL_RANDOM_STATE": 42,              # seed for train/val split

    # --- Cross-validation (K-fold) ---
    "USE_STRATIFIED_KFOLD": True,        # True → StratifiedKFold, False → plain KFold
    "N_FOLDS": 5,                        # typical: 3, 5, 7, 10
    "SHUFFLE_FOLDS": True,               # shuffle before splitting into folds
    "FOLDS_RANDOM_STATE": 42,            # seed for fold splitting (only meaningful when SHUFFLE_FOLDS is True)

    # --- Scaling / normalization ---
    "SCALER_TYPE": "standard",           # "none"     → no scaling (good for tree models)
                                         # "standard" → StandardScaler (zero mean, unit variance) NEEDED FOR MLP
                                         # "minmax"   → MinMaxScaler (0–1)
                                         # "maxabs"   → MaxAbsScaler ([-1, 1] for sparse-like)
                                         # "robust"   → RobustScaler (robust to outliers)
                                         # "l2_norm"  → normalize each sample to unit L2 norm

    # --- Missing values handling (numeric) ---
    "MISSING_VALUE_STRATEGY": "none",    # "none"     → no imputer; preprocessing raises ValueError on NaN/inf
                                         # "mean"     → fill NaNs with column means
                                         # "median"   → fill with column medians
                                         # "constant" → fill with constant (see MISSING_FILL_VALUE)
                                         # "zero"     → fill with 0.0

    "MISSING_FILL_VALUE": 0.0,           # used only when strategy == "constant";
                                         # "zero" always fills with 0.0 regardless of this value

    # --- PCA dimensionality reduction ---
    "USE_PCA": False,                    # True → apply PCA after scaling (if any)
    "PCA_N_COMPONENTS": 256,             # e.g. 64, 128, 256, 512; capped at the number of input features
    "PCA_WHITEN": False,                 # True → decorrelate & scale to unit variance
    "PCA_SVD_SOLVER": "auto",            # "auto" | "full" | "randomized"
    "PCA_RANDOM_STATE": 42,              # for randomized solver, etc.

    # --- Class imbalance handling (general) ---
    "IMBALANCE_MODE": "none",            # "none"           → do nothing
                                         # "balanced"       → class_weight='balanced' (LR/SVM/RF-style estimators;
                                         #                    sklearn's MLPClassifier has NO class_weight parameter)
                                         # "scale_pos_weight" → for tree models like XGB/LGB
                                         # "is_unbalance"   → LightGBM's built-in option

    # --- Threshold for converting probabilities → class labels ---
    "DECISION_THRESHOLD": 0.5,           # predicted class is 1 when P(class 1) >= threshold; tune (0.4–0.6, etc.)

    # --- Logging / runs ---
    "SAVE_LOCAL_LOGS": True,             # later: saves experiments_master.csv etc.
                                         # NOTE(review): not read by any visible code yet
    "LOGS_DIR": "./logs",                # folder for logs/CSVs if you want (created on startup)
}

In [None]:
# Create the output folders up front (no error if they already exist)
for _dir_key in ("SUBMISSION_OUTPUT_DIR", "LOGS_DIR"):
    Path(CONFIG[_dir_key]).mkdir(parents=True, exist_ok=True)

# load data

## utils

In [None]:
# =========================
# Cell 2: Data Loading Utils
# =========================

def load_json_list(path):
    """
    Read a JSON file and return its content as a list of records.

    Accepted layouts:
    - one JSON array            -> returned as-is
    - one JSON value / object   -> wrapped in a single-element list
    - JSON Lines                -> one parsed object per non-blank line

    An empty (or whitespace-only) file yields an empty list.
    Raises json.JSONDecodeError if a JSON Lines row cannot be parsed.
    """
    with Path(path).open("r", encoding="utf-8") as handle:
        raw = handle.read().strip()

    if not raw:
        return []

    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        # Not a single JSON document, so treat the content as JSON Lines
        stripped_rows = (row.strip() for row in raw.splitlines())
        return [json.loads(row) for row in stripped_rows if row]

    # A lone object/value is wrapped so callers always get a list
    return parsed if isinstance(parsed, list) else [parsed]


def build_feature_matrix_from_records(records, feature_mode="img+text",
                                      id_key="id", target_key="label", is_train=True):
    """
    Turn a list of records into a dense feature matrix.

    Each record is expected to look like:
    {
      "id": "a9d8c7...",
      "label": 0,
      "image_embedding": [...floats...],
      "text_embedding": [...floats...]
    }

    Args:
      records: iterable of dicts as above.
      feature_mode: "img+text" (image then text, concatenated) | "img_only" | "text_only".
      id_key: key holding the record id.
      target_key: key holding the label (only read when is_train=True).
      is_train: whether labels should be collected.

    Returns:
      X: np.ndarray [n_samples, n_features], dtype float32
      y: np.ndarray [n_samples], dtype int64 (if is_train=True, else None)
      ids: list of ids (for linking back to samples / submission)

    Raises:
      ValueError: unknown feature_mode, no records, or a record whose feature
        vector has a different shape than the first record's.
      KeyError: a record lacks id_key (or target_key when is_train=True).
    """
    # Embedding fields that make up one feature row, in concatenation order
    mode_to_keys = {
        "img+text": ("image_embedding", "text_embedding"),
        "img_only": ("image_embedding",),
        "text_only": ("text_embedding",),
    }
    # Validate once up front so a bad mode is reported even for empty input
    if feature_mode not in mode_to_keys:
        raise ValueError(f"Unknown FEATURE_MODE: {feature_mode}")
    embedding_keys = mode_to_keys[feature_mode]

    ids = []
    features = []
    labels = [] if is_train else None
    expected_shape = None

    for rec in records:
        rid = rec[id_key]

        # Only convert the embeddings this mode needs. A missing key or an
        # explicit JSON null both count as an empty embedding.
        parts = []
        for key in embedding_keys:
            raw = rec.get(key)
            parts.append(np.array([] if raw is None else raw, dtype=np.float32))
        feat = parts[0] if len(parts) == 1 else np.concatenate(parts, axis=0)

        # Fail with the offending id instead of an opaque np.stack error
        if expected_shape is None:
            expected_shape = feat.shape
        elif feat.shape != expected_shape:
            raise ValueError(
                f"Record {rid!r} has feature shape {feat.shape}, expected "
                f"{expected_shape} (missing or malformed embedding?)"
            )

        ids.append(rid)
        features.append(feat)
        if is_train:
            labels.append(rec[target_key])

    if not features:
        raise ValueError("No records given; cannot build a feature matrix.")

    X = np.stack(features, axis=0)
    y = np.array(labels, dtype=np.int64) if is_train else None

    return X, y, ids

In [None]:
# =========================
# Cell 3: Load Train & Test
# =========================

def load_train_test(config=CONFIG):
    """
    Load the train and test JSON files named in config and build feature matrices.

    Prints sample counts, matrix shapes, the train label distribution and
    NaN/inf counts as a quick sanity report.

    Returns:
      X_train, y_train, train_ids, X_test, test_ids
    """
    root = Path(config["DATA_DIR"])
    train_path = root / config["TRAIN_FILENAME"]
    test_path = root / config["TEST_FILENAME"]

    print(f"Loading train from: {train_path}")
    train_records = load_json_list(train_path)

    print(f"Loading test from:  {test_path}")
    test_records = load_json_list(test_path)

    print(f"Train samples: {len(train_records)}")
    print(f"Test samples:  {len(test_records)}")

    # Both splits are built with the same feature/id/target settings;
    # target_key is simply ignored for the test split (is_train=False).
    shared_kwargs = {
        "feature_mode": config["FEATURE_MODE"],
        "id_key": config["ID_COLUMN"],
        "target_key": config["TARGET_COLUMN"],
    }
    X_train, y_train, train_ids = build_feature_matrix_from_records(
        train_records, is_train=True, **shared_kwargs
    )
    X_test, _, test_ids = build_feature_matrix_from_records(
        test_records, is_train=False, **shared_kwargs
    )

    print(f"X_train shape: {X_train.shape}")  # (n_train, n_features)
    print(f"X_test shape:  {X_test.shape}")   # (n_test, n_features)

    # Class balance of the training labels
    classes, class_counts = np.unique(y_train, return_counts=True)
    label_dist = {cls: cnt for cls, cnt in zip(classes, class_counts)}
    print("Label distribution (train):", label_dist)

    # Count non-finite entries; preprocessing decides what to do about them
    n_nan_train, n_inf_train = np.isnan(X_train).sum(), np.isinf(X_train).sum()
    n_nan_test, n_inf_test = np.isnan(X_test).sum(), np.isinf(X_test).sum()

    print(f"NaNs in X_train: {n_nan_train}, NaNs in X_test: {n_nan_test}")
    print(f"Infs in X_train: {n_inf_train}, Infs in X_test: {n_inf_test}")

    return X_train, y_train, train_ids, X_test, test_ids


## load train test

In [None]:
# Run the loader once; later cells use these five module-level names
# (comment this out while debugging if the load is slow)
(X_train, y_train, train_ids, X_test, test_ids) = load_train_test(config=CONFIG)

# data preprocessing

## utils

In [None]:
# =========================
# Cell 4: Preprocessing (Imputer + Scaler + PCA)
# =========================

def build_imputer(config):
    """
    Build the missing-value imputer selected by config["MISSING_VALUE_STRATEGY"].

    Returns None for "none", otherwise an unfitted SimpleImputer.
    "zero" always fills with 0.0; "constant" fills with config["MISSING_FILL_VALUE"].
    Raises ValueError for any other strategy name.
    """
    strategy = config["MISSING_VALUE_STRATEGY"]

    if strategy == "none":
        return None
    if strategy in ("mean", "median"):
        # sklearn uses the same strategy names
        return SimpleImputer(strategy=strategy)
    if strategy == "zero":
        return SimpleImputer(strategy="constant", fill_value=0.0)
    if strategy == "constant":
        return SimpleImputer(strategy="constant", fill_value=config["MISSING_FILL_VALUE"])

    raise ValueError(f"Unknown MISSING_VALUE_STRATEGY: {strategy}")


def build_scaler(config):
    """
    Build the scaler/normalizer selected by config["SCALER_TYPE"].

    Returns None for "none", otherwise an unfitted sklearn transformer.
    Raises ValueError for an unrecognised type.
    """
    kind = config["SCALER_TYPE"]
    if kind == "none":
        return None

    # Lambdas keep construction lazy: only the selected scaler is instantiated
    factories = {
        "standard": lambda: StandardScaler(),       # zero mean, unit variance per feature
        "minmax": lambda: MinMaxScaler(),           # each feature scaled to [0, 1]
        "maxabs": lambda: MaxAbsScaler(),           # each feature scaled to [-1, 1] by max abs value
        "robust": lambda: RobustScaler(),           # median & IQR based, robust to outliers
        "l2_norm": lambda: Normalizer(norm="l2"),   # each sample (row) scaled to unit L2 norm
    }
    make_scaler = factories.get(kind)
    if make_scaler is None:
        raise ValueError(f"Unknown SCALER_TYPE: {kind}")
    return make_scaler()


def build_pca(config, n_features: int):
    """
    Build an unfitted PCA from the PCA_* config keys, or None when USE_PCA is falsy.

    n_features is the width of the matrix PCA will be fitted on; the requested
    component count is capped at it.
    """
    if not config["USE_PCA"]:
        return None

    # PCA cannot produce more components than there are input features
    capped_components = min(config["PCA_N_COMPONENTS"], n_features)

    return PCA(
        n_components=capped_components,
        whiten=config["PCA_WHITEN"],
        svd_solver=config["PCA_SVD_SOLVER"],
        random_state=config["PCA_RANDOM_STATE"],
    )


def fit_preprocessor(X_train: np.ndarray, config=CONFIG):
    """
    Fit the preprocessing chain (imputer -> scaler -> PCA) on X_train.

    Each stage is optional and chosen by config; a disabled stage is stored as None.

    Returns:
      X_train_proc: transformed training features
      preprocessors: dict with keys 'imputer', 'scaler', 'pca'

    Raises:
      ValueError: no imputer is configured and X_train contains NaN or inf.
    """
    data = X_train.astype(np.float32, copy=True)  # work on a float32 copy

    # Stage 1: missing values
    imputer = build_imputer(config)
    if imputer is None:
        # No imputer configured means the data must already be finite (fail fast)
        if not np.isfinite(data).all():
            raise ValueError(
                "NaN or inf detected in X_train but MISSING_VALUE_STRATEGY='none'. "
                "Change strategy to 'mean'/'median'/'zero' or clean data."
            )
    else:
        data = imputer.fit_transform(data)

    # Stage 2: scaling / normalization
    scaler = build_scaler(config)
    if scaler is not None:
        data = scaler.fit_transform(data)

    # Stage 3: PCA, sized against the current feature count
    pca = build_pca(config, n_features=data.shape[1])
    if pca is None:
        print("PCA not used; shape remains:", data.shape)
    else:
        data = pca.fit_transform(data)
        print(f"PCA applied: new shape = {data.shape}")

    return data, {"imputer": imputer, "scaler": scaler, "pca": pca}


def transform_with_preprocessor(X: np.ndarray, preprocessors: dict):
    """
    Run new data (val/test) through already-fitted preprocessing stages.

    X: np.ndarray [n_samples, n_features_orig]
    preprocessors: dict as returned by fit_preprocessor(); a stage that is
      missing or None is skipped.

    Returns:
      X_proc: transformed copy of X

    Raises:
      ValueError: no imputer is present and X contains NaN or inf.
    """
    data = X.astype(np.float32, copy=True)  # work on a float32 copy

    # Stage 1: missing values
    imputer = preprocessors.get("imputer")
    if imputer is None:
        if not np.isfinite(data).all():
            raise ValueError(
                "NaN or inf detected in X but no imputer fitted (MISSING_VALUE_STRATEGY='none')."
            )
    else:
        data = imputer.transform(data)

    # Stages 2 and 3: scaling, then PCA, in the order they were fitted
    for stage_name in ("scaler", "pca"):
        stage = preprocessors.get(stage_name)
        if stage is not None:
            data = stage.transform(data)

    return data

## preprocess the data

In [None]:
# Fit imputer/scaler/PCA on the full training matrix, then reuse the fitted
# objects on the test matrix.
# NOTE(review): the stages are fitted on all training rows before any CV split,
# so validation folds later share these statistics — fold scores may be
# slightly optimistic.
X_train_proc, PREPROCESSORS = fit_preprocessor(X_train, config=CONFIG)
X_test_proc = transform_with_preprocessor(X_test, preprocessors=PREPROCESSORS)

print("X_train_proc shape:", X_train_proc.shape)
print("X_test_proc shape: ", X_test_proc.shape)

# train and test

## utils

In [None]:
# =========================
# Cell 5: Cross-validation utilities
# =========================

def get_cv_splits(y, config=CONFIG):
    """
    Build K-fold (or StratifiedKFold) splits based on CONFIG.

    Reads N_FOLDS, SHUFFLE_FOLDS, FOLDS_RANDOM_STATE and USE_STRATIFIED_KFOLD.

    Args:
      y: label array; used for stratification and to size the splits.
      config: CONFIG-style dict.

    Returns:
      list of (train_idx, val_idx) tuples of index arrays.
    """
    n_folds = config["N_FOLDS"]
    shuffle = config["SHUFFLE_FOLDS"]
    # sklearn raises ValueError when random_state is set while shuffle=False,
    # so the seed is only passed when shuffling is enabled.
    random_state = config["FOLDS_RANDOM_STATE"] if shuffle else None

    splitter_cls = StratifiedKFold if config["USE_STRATIFIED_KFOLD"] else KFold
    splitter = splitter_cls(
        n_splits=n_folds,
        shuffle=shuffle,
        random_state=random_state,
    )

    # Only the number of samples matters for X, so a zero placeholder is enough.
    # Plain KFold accepts y and ignores it.
    return list(splitter.split(np.zeros_like(y), y))


def run_cv_and_fit_full(build_model_fn, X, y, X_test, config=CONFIG, model_name="model"):
    """
    Cross-validate a model, then refit it on all training data and predict the test set.

    Args:
      build_model_fn: zero-argument callable returning a *fresh* untrained model
      X, y: preprocessed training data (e.g. X_train_proc, y_train)
      X_test: preprocessed test data (e.g. X_test_proc)
      config: CONFIG dict (fold settings and DECISION_THRESHOLD are read)
      model_name: label used in printed output only

    Returns:
      dict with:
        - "cv_f1_scores": per-fold macro F1
        - "cv_f1_mean", "cv_f1_std"
        - "test_pred" (int labels)
        - "test_proba" (class-1 probabilities, or None if the model has no predict_proba)
        - "train_time_sec": wall time of the CV loop only (the final full fit is not timed)
    """
    folds = get_cv_splits(y, config)
    cutoff = config["DECISION_THRESHOLD"]
    n_folds_total = len(folds)

    def _predict(estimator, features):
        """Return (labels, class-1 probabilities or None) for one estimator."""
        if not hasattr(estimator, "predict_proba"):
            # No probabilities available: take the estimator's own class prediction
            return estimator.predict(features), None
        proba = estimator.predict_proba(features)[:, 1]
        return (proba >= cutoff).astype(int), proba

    fold_scores = []
    t_start = time()

    for fold_no, (train_rows, val_rows) in enumerate(folds, start=1):
        print(f"\n=== {model_name}: Fold {fold_no}/{n_folds_total} ===")
        X_fit, X_holdout = X[train_rows], X[val_rows]
        y_fit, y_holdout = y[train_rows], y[val_rows]

        fold_model = build_model_fn()
        fold_model.fit(X_fit, y_fit)

        holdout_pred, _ = _predict(fold_model, X_holdout)

        fold_f1 = f1_score(y_holdout, holdout_pred, average="macro")
        print(f"Fold {fold_no} F1 Macro: {fold_f1:.4f}")
        fold_scores.append(fold_f1)

    train_time_sec = time() - t_start

    cv_mean = float(np.mean(fold_scores))
    cv_std = float(np.std(fold_scores))
    print("\n=== CV Summary ===")
    print(f"{model_name}: F1 Macro mean = {cv_mean:.4f}, std = {cv_std:.4f}")
    print(f"Training + CV time: {train_time_sec:.1f} sec")

    # Final model: refit on every training row, then score the test set
    full_model = build_model_fn()
    full_model.fit(X, y)
    test_pred, test_proba = _predict(full_model, X_test)

    return {
        "cv_f1_scores": fold_scores,
        "cv_f1_mean": cv_mean,
        "cv_f1_std": cv_std,
        "test_pred": test_pred.astype(int),
        "test_proba": test_proba,
        "train_time_sec": train_time_sec,
    }


In [None]:
# =========================
# Cell 6: Submission helper
# =========================

def save_submission(test_ids, test_pred, config=CONFIG, suffix=""):
    """
    Write a two-column submission CSV (row_id, target) and return its path.

    test_ids: list of ids from test.json (must match Kaggle id column)
    test_pred: numpy array of 0/1 predictions
    config: CONFIG dict; SUBMISSION_OUTPUT_DIR and EXP_NAME are read
    suffix: optional tag appended to the filename, e.g. "lgbm_v1", "xgb_pca256"

    The file is named "<EXP_NAME>_<suffix>.csv", or "<EXP_NAME>.csv" without a suffix.
    """
    submission = pd.DataFrame({
        "row_id": test_ids,       # Kaggle expects row_id as per manual
        "target": test_pred.astype(int),
    })

    target_dir = Path(config["SUBMISSION_OUTPUT_DIR"])
    target_dir.mkdir(parents=True, exist_ok=True)

    exp_name = config["EXP_NAME"]
    stem = f"{exp_name}_{suffix}" if suffix else exp_name
    out_path = target_dir / f"{stem}.csv"

    submission.to_csv(out_path, index=False)
    print(f"Saved submission to: {out_path}")
    return out_path


## train and test the model

In [None]:
# =========================
# Cell 9: MLP (early fusion) using sklearn
# =========================

from sklearn.neural_network import MLPClassifier


def build_mlp_early_model(config=CONFIG):
    """
    Build an untrained sklearn MLPClassifier for the early-fusion features.

    MLPClassifier params:
      - hidden_layer_sizes: tuple of layer sizes, e.g. (512, 256)
      - activation: "relu" | "tanh" | "logistic" | "identity"
      - solver: "adam" (good default) | "lbfgs" | "sgd"
      - alpha: L2 regularization term (1e-5–1e-3 typical)
      - batch_size: "auto" or int
      - learning_rate_init: initial LR (1e-4–1e-2)
      - max_iter: epochs/iterations; set high (100–500+) and rely on early_stopping
      - early_stopping: True/False (splits off validation part internally)
    NOTE:
      - MLP is sensitive to scaling; SCALER_TYPE should NOT be "none".
      - MLPClassifier has no class_weight parameter, so IMBALANCE_MODE="balanced"
        cannot be applied here; a warning is emitted and the model is built
        without class weighting.

    Args:
      config: CONFIG dict; SEED and IMBALANCE_MODE are read.

    Returns:
      A fresh, unfitted MLPClassifier.
    """
    import warnings  # local import: only needed for the imbalance notice below

    hidden_layer_sizes = (512, 256)  # e.g. (512,), (512, 256), (256, 128, 64)

    params = {
        "hidden_layer_sizes": hidden_layer_sizes,
        "activation": "relu",          # "relu" | "tanh" | "logistic" | "identity"
        "solver": "adam",              # "adam" good default
        "alpha": 1e-4,                 # L2 penalty; try 1e-5–1e-3
        "batch_size": "auto",          # or int like 128, 256
        "learning_rate_init": 1e-3,    # base LR; try 1e-4–1e-2
        "max_iter": 200,               # max epochs/iterations
        "early_stopping": True,        # uses internal val split
        "n_iter_no_change": 10,        # patience for early stopping
        "random_state": config["SEED"],
    }

    # Class imbalance: passing class_weight to MLPClassifier raises TypeError
    # (unexpected keyword argument), so tell the user instead of crashing.
    if config["IMBALANCE_MODE"] == "balanced":
        warnings.warn(
            "IMBALANCE_MODE='balanced' is not supported by sklearn's MLPClassifier "
            "(no class_weight parameter); building the MLP without class weighting. "
            "Consider resampling or tuning DECISION_THRESHOLD instead.",
            stacklevel=2,
        )

    model = MLPClassifier(**params)
    return model

In [None]:
# Cross-validate the early-fusion MLP, refit on all training rows, then write the submission
mlp_results = run_cv_and_fit_full(
    build_model_fn=build_mlp_early_model,
    X=X_train_proc,
    y=y_train,
    X_test=X_test_proc,
    config=CONFIG,
    model_name="MLP_Early",
)
sub_path = save_submission(
    test_ids,
    test_pred=mlp_results["test_pred"],
    config=CONFIG,
    suffix="mlp_early_v1",
)