In [2]:
import os
import json
import numpy as np
import pandas as pd

from pathlib import Path

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression

from xgboost import XGBClassifier


# =========================
# CONFIG
# =========================
# Random seed passed to the CV splitter and to both model builders below.
SEED = 42

# TODO: update these paths if your dataset files are named differently
PATH_TRAIN1 = "./data/train_p1.json"      # first training file (always loaded)
PATH_TRAIN2 = "./data/train_part2.json"      # second training file; main() skips it with a warning if absent
PATH_TEST   = "./data/test.json"  # records to predict for the submission

# Number of stratified cross-validation folds.
N_FOLDS = 5


# =========================
# UTILS: Loading JSON
# =========================
def load_json_list(path):
    """
    Load a JSON file and return its contents as a Python list.

    Two layouts are accepted:
    - a single JSON document: a top-level list is returned as-is, any other
      top-level value is wrapped in a one-element list;
    - JSON Lines: one JSON value per line, blank lines ignored.

    An empty (or whitespace-only) file yields an empty list.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is neither a valid JSON
            document nor valid JSON Lines.
    """
    # "utf-8-sig" decodes plain UTF-8 unchanged but drops a leading BOM,
    # which json.loads would otherwise reject outright.
    text = Path(path).read_text(encoding="utf-8-sig").strip()
    if not text:
        return []

    # Try the whole text as one JSON document first.
    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        # Fallback: JSON Lines. Split on "\n" only -- text mode has already
        # normalised "\r\n"/"\r", and str.splitlines() would also break on
        # U+2028/U+2029/U+0085, which may legally appear unescaped inside
        # JSON strings.
        stripped_lines = (raw.strip() for raw in text.split("\n"))
        return [json.loads(line) for line in stripped_lines if line]

    return data if isinstance(data, list) else [data]


# =========================
# Build feature matrices
# =========================
def build_train_dataset(train_paths):
    """
    Read every training file in `train_paths` and assemble the model inputs.

    Each record contributes one feature row made of its "image_embedding"
    followed by its "text_embedding" (both cast to float32), and one integer
    label taken from its "label" field.

    Returns:
        X: 2-D float32 array, one row per record (the original note expects
           512 + 512 = 1024 columns -- not enforced here).
        y: 1-D int64 array of labels, aligned with the rows of X.
    """
    all_records = []
    for train_path in train_paths:
        print(f"Loading train file: {train_path}")
        loaded = load_json_list(train_path)
        print(f"  -> {len(loaded)} records")
        all_records += loaded

    feature_rows, labels = [], []
    for record in all_records:
        image_vec = np.asarray(record["image_embedding"], dtype=np.float32)
        text_vec = np.asarray(record["text_embedding"], dtype=np.float32)
        # Image part first, text part second.
        feature_rows.append(np.concatenate((image_vec, text_vec)))
        labels.append(record["label"])

    X = np.vstack(feature_rows)
    y = np.asarray(labels, dtype=np.int64)

    print(f"Train X shape: {X.shape}, y shape: {y.shape}")
    return X, y


def build_test_dataset(test_path):
    """
    Read the test file and assemble the inputs needed for prediction.

    Features are built exactly like the training rows: "image_embedding"
    followed by "text_embedding", both cast to float32.

    Returns:
        X_test: 2-D float32 array, one row per test record.
        test_ids: array holding each record's "id" value, in file order.
    """
    print(f"Loading test file: {test_path}")
    test_records = load_json_list(test_path)
    print(f"  -> {len(test_records)} test records")

    feature_rows, record_ids = [], []
    for record in test_records:
        image_vec = np.asarray(record["image_embedding"], dtype=np.float32)
        text_vec = np.asarray(record["text_embedding"], dtype=np.float32)
        feature_rows.append(np.concatenate((image_vec, text_vec)))
        record_ids.append(record["id"])

    X_test = np.vstack(feature_rows)
    test_ids = np.array(record_ids)
    print(f"Test X shape: {X_test.shape}")
    return X_test, test_ids


# =========================
# Model Builders
# =========================
def build_xgb_model(y):
    """
    Create an unfitted XGBClassifier whose positive-class weight reflects
    the imbalance found in `y`.

    `scale_pos_weight` is set to (#labels == 0) / (#labels == 1); when `y`
    contains no positives the weight falls back to 1.0 to avoid dividing
    by zero. The chosen weight and both counts are printed.
    """
    n_pos = np.sum(y == 1)
    n_neg = np.sum(y == 0)
    spw = n_neg / n_pos if n_pos != 0 else 1.0

    print(f"XGB scale_pos_weight = {spw:.4f} (neg={n_neg}, pos={n_pos})")

    # Hyper-parameters gathered in one place, then unpacked into the estimator.
    params = {
        "n_estimators": 500,
        "max_depth": 7,
        "learning_rate": 0.03,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "min_child_weight": 1,
        "reg_lambda": 1.0,
        "reg_alpha": 0.0,
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "tree_method": "hist",
        "n_jobs": -1,
        "random_state": SEED,
        "scale_pos_weight": spw,
    }
    return XGBClassifier(**params)


def build_logreg_model():
    """
    Create an unfitted LogisticRegression that uses balanced class weights,
    the liblinear solver and the shared SEED.
    """
    settings = {
        "C": 1.0,
        "max_iter": 2000,
        "class_weight": "balanced",
        "solver": "liblinear",  # robust for small-ish dims
        "random_state": SEED,
    }
    return LogisticRegression(**settings)


# =========================
# Training + CV + Threshold tuning
# =========================
def train_with_cv_and_predict(X, y, X_test):
    """
    Cross-validate an XGBoost + logistic-regression blend, tune the decision
    threshold on out-of-fold predictions, and label the test set.

    For each of the N_FOLDS stratified folds both models are fitted on the
    training split; their positive-class probabilities are averaged with
    equal weight for the validation split (stored as out-of-fold
    predictions) and for the whole test set. Test probabilities are then
    averaged across folds.

    The threshold is chosen from 61 evenly spaced values in [0.2, 0.8] as
    the one with the highest macro F1 on the out-of-fold predictions; on a
    tie the lowest such threshold wins.

    Returns:
        final_test_preds: int array of 0/1 test predictions.
        best_f1: macro F1 reached on the out-of-fold predictions.
        best_thr: the threshold that produced it.
    """
    splitter = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

    oof_pred = np.zeros(len(y), dtype=float)
    per_fold_test_probs = []

    fold_no = 0
    for train_rows, valid_rows in splitter.split(X, y):
        fold_no += 1
        print(f"\n=== Fold {fold_no}/{N_FOLDS} ===")
        X_fit, y_fit = X[train_rows], y[train_rows]
        X_hold, y_hold = X[valid_rows], y[valid_rows]

        # Gradient-boosted trees; the validation split is handed over as
        # eval_set, and the builder above sets no early-stopping option.
        booster = build_xgb_model(y_fit)
        booster.fit(X_fit, y_fit, eval_set=[(X_hold, y_hold)], verbose=False)
        hold_prob_xgb = booster.predict_proba(X_hold)[:, 1]
        test_prob_xgb = booster.predict_proba(X_test)[:, 1]

        # Linear model trained on the same split.
        linear = build_logreg_model()
        linear.fit(X_fit, y_fit)
        hold_prob_lr = linear.predict_proba(X_hold)[:, 1]
        test_prob_lr = linear.predict_proba(X_test)[:, 1]

        # Equal-weight blend of the two probability estimates.
        hold_prob = 0.5 * hold_prob_xgb + 0.5 * hold_prob_lr
        test_prob = 0.5 * test_prob_xgb + 0.5 * test_prob_lr

        oof_pred[valid_rows] = hold_prob
        per_fold_test_probs.append(test_prob)

        # Quick sanity metric for this fold at the default 0.5 cut-off.
        hold_labels_05 = (hold_prob >= 0.5).astype(int)
        f1_fold = f1_score(y_hold, hold_labels_05, average="macro")
        print(f"Fold {fold_no} F1 (thr=0.5): {f1_fold:.5f}")

    test_pred_mean = np.mean(per_fold_test_probs, axis=0)

    # Score every candidate threshold, then take the first maximum.
    candidates = np.linspace(0.2, 0.8, 61)  # step 0.01
    macro_scores = [
        f1_score(y, (oof_pred >= cut).astype(int), average="macro")
        for cut in candidates
    ]
    winner = int(np.argmax(macro_scores))
    best_thr = candidates[winner]
    best_f1 = macro_scores[winner]

    print(f"\nBest OOF Macro F1 = {best_f1:.5f} at threshold = {best_thr:.3f}")

    final_test_preds = (test_pred_mean >= best_thr).astype(int)
    return final_test_preds, best_f1, best_thr


# =========================
# MAIN
# =========================
def main():
    """
    Run the whole pipeline: load the data, standardize the features,
    cross-validate and predict, then write `submission.csv`.

    The second training file is optional; if it is missing a warning is
    printed and only the first file is used.
    """
    if os.path.exists(PATH_TRAIN2):
        train_paths = [PATH_TRAIN1, PATH_TRAIN2]
    else:
        train_paths = [PATH_TRAIN1]
        print(f"WARNING: {PATH_TRAIN2} not found, training only on part1")

    X_train, y_train = build_train_dataset(train_paths)
    X_test, test_ids = build_test_dataset(PATH_TEST)

    # Scaling statistics come from the training rows only and are reused
    # unchanged for the test rows.
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    test_preds, oof_f1, best_thr = train_with_cv_and_predict(
        X_train_scaled, y_train, X_test_scaled
    )

    # One row per test record, ordered by id before writing.
    submission = pd.DataFrame(
        {"row_id": test_ids, "target": test_preds.astype(int)}
    ).sort_values("row_id")
    submission.to_csv("submission.csv", index=False)

    print("\nSaved submission.csv")
    print(f"OOF Macro F1 (for reference): {oof_f1:.5f} @ thr={best_thr:.3f}")


# Runs the full pipeline as soon as this cell/module is executed
# (there is no `if __name__ == "__main__"` guard).
main()


Loading train file: ./data/train_p1.json
  -> 1530 records
Loading train file: ./data/train_part2.json
  -> 1531 records
Train X shape: (3061, 1024), y shape: (3061,)
Loading test file: ./data/test.json
  -> 500 test records
Test X shape: (500, 1024)

=== Fold 1/5 ===
XGB scale_pos_weight = 6.6981 (neg=2130, pos=318)
Fold 1 F1 (thr=0.5): 0.68525

=== Fold 2/5 ===
XGB scale_pos_weight = 6.7013 (neg=2131, pos=318)
Fold 2 F1 (thr=0.5): 0.68024

=== Fold 3/5 ===
XGB scale_pos_weight = 6.7013 (neg=2131, pos=318)
Fold 3 F1 (thr=0.5): 0.63706

=== Fold 4/5 ===
XGB scale_pos_weight = 6.6771 (neg=2130, pos=319)
Fold 4 F1 (thr=0.5): 0.61798

=== Fold 5/5 ===
XGB scale_pos_weight = 6.6771 (neg=2130, pos=319)
Fold 5 F1 (thr=0.5): 0.64765

Best OOF Macro F1 = 0.65853 at threshold = 0.540

Saved submission.csv
OOF Macro F1 (for reference): 0.65853 @ thr=0.540
