
# CatBoost SHROOM 2024 — Model-Aware


## Installs & Imports

In [None]:
%pip -q install bert-score

In [None]:
import os, re, json, random
import numpy as np
import pandas as pd
from collections import Counter
from tqdm import tqdm

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score
from scipy.stats import spearmanr
from sklearn.calibration import IsotonicRegression
from sklearn.linear_model import Ridge
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
import joblib

from sklearn.metrics import accuracy_score, f1_score
from scipy.stats import spearmanr

import torch
from sentence_transformers import SentenceTransformer
from bert_score import score as bertscore_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import transformers

# Hugging Face model identifiers for the sentence encoder and the NLI classifier.
SBERT_NAME    = "sentence-transformers/all-MiniLM-L6-v2"
NLI_MODEL     = "roberta-large-mnli"

# Cross-validation / reproducibility settings and the directory for saved fold models.
RANDOM_SEED = 42
N_FOLDS = 5
OUT_DIR = "artifacts_upgrade_complete"
os.makedirs(OUT_DIR, exist_ok=True)

# Seed every RNG the notebook can touch: Python, NumPy and PyTorch.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Run the transformer models on GPU when one is available, otherwise on CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Data reading 

In [3]:
# Kaggle input directory; the trailing slash is kept so file names can be appended directly.
file_path = "/kaggle/input/shroom-aware/"
path_val_model_aware = f"{file_path}val.model-aware.v2.json"
path_test_model_aware = f"{file_path}test.model-aware.json"

def load_data(path):
    """Read a JSON file and return its content as a DataFrame.

    Args:
        path: Path to a JSON file whose top-level value can be passed to
            ``pd.DataFrame`` (e.g. a list of record dicts).

    Returns:
        pd.DataFrame built from the parsed JSON.
    """
    # JSON is UTF-8 by specification; do not rely on the platform default
    # encoding, which would corrupt or reject non-ASCII text on some systems.
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return pd.DataFrame(data)

val_df = load_data(path_val_model_aware)
test_df = load_data(path_test_model_aware)

# Single definition of the label encoding (1 = hallucination).
label_mapping = {'Hallucination': 1, 'Not Hallucination': 0}

# Normalise whitespace and casing BEFORE mapping, so stray variants such as
# " hallucination" do not silently turn into NaN targets.
val_df['label'] = val_df['label'].astype(str).str.strip().str.title()
val_df['label_num'] = val_df['label'].map(label_mapping)
test_labels = test_df['label'].astype(str).str.strip().str.title()

y_train = val_df['label'].map(label_mapping)
y_test = test_labels.map(label_mapping)

# Fail fast with a readable message instead of propagating NaN labels.
for split_name, raw, encoded in (("val", val_df['label'], y_train), ("test", test_labels, y_test)):
    if encoded.isna().any():
        unknown = sorted(set(raw[encoded.isna()]))
        raise ValueError(f"Unmapped labels in {split_name} split: {unknown}")

# The labelled validation split is the training source for the CatBoost models.
source_df = val_df

## Feature extraction

In [None]:
# Load the encoders from the identifiers declared in the config cell, so the
# model choice lives in exactly one place.
sbert_model = SentenceTransformer(SBERT_NAME)

_nli_tok = AutoTokenizer.from_pretrained(NLI_MODEL)
_nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL).to(device)
# Inference only: switch off dropout and other train-time behaviour.
_nli_model.eval()


In [6]:
# Restrict transformers logging to errors so warnings do not flood the cell outputs.
transformers.logging.set_verbosity_error()


def compute_bleu(reference, hypothesis):
    """Sentence-level BLEU of ``hypothesis`` against one ``reference`` string.

    Both strings are tokenised on whitespace; NLTK's ``method4`` smoothing
    is applied.
    """
    return sentence_bleu(
        [reference.split()],
        hypothesis.split(),
        smoothing_function=SmoothingFunction().method4,
    )

def extract_nli_logits(premise, hypothesis):
    """Return the raw NLI logits for one (premise, hypothesis) pair as a NumPy array.

    The pair is tokenised with truncation to 512 tokens, run through the
    module-level NLI model without gradient tracking, and the squeezed logits
    are moved to CPU.
    """
    encoded = _nli_tok(premise, hypothesis, return_tensors='pt', truncation=True, max_length=512)
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}
    with torch.no_grad():
        raw_logits = _nli_model(**on_device).logits
    return raw_logits.squeeze().detach().cpu().numpy()

def extract_features(df, is_test=False, batch_size=16):
    """Build the per-example feature table consumed by the CatBoost models.

    Features, in column order:
      * ``len_src``, ``len_hyp``, ``overlap_ratio``, ``len_ratio`` - whitespace-token
        lexical statistics of ``src`` vs ``hyp``.
      * ``sbert_cosine`` - cosine similarity of normalised SBERT embeddings of
        ``src`` and ``hyp``.
      * ``bertscore_F1`` - BERTScore F1 of ``hyp`` scored against ``src``.
      * ``entailment_logit``, ``neutral_logit``, ``contradiction_logit``,
        ``nli_margin`` - raw NLI logits with ``tgt`` as premise and ``hyp`` as
        hypothesis; the margin is entailment minus contradiction.
      * ``tgt_len`` - whitespace-token length of ``tgt``.
      * ``task``, ``model_id`` - categorical columns copied from ``df["task"]`` /
        ``df["model"]``, or a constant placeholder when the column is absent.

    Args:
        df: DataFrame with at least ``src``, ``tgt`` and ``hyp`` columns.
        is_test: Unused; kept so existing callers keep working.
        batch_size: Batch size for the SBERT, NLI and BERTScore passes.

    Returns:
        pd.DataFrame with one row per row of ``df``, in the same order, indexed
        0..n-1 regardless of the index of ``df``.
    """
    srcs = df["src"].astype(str).tolist()
    tgts = df["tgt"].astype(str).tolist()
    hyps = df["hyp"].astype(str).tolist()
    n = len(df)

    def lexical_feats(src, hyp):
        s_tok, h_tok = src.split(), hyp.split()
        len_src, len_hyp = len(s_tok), len(h_tok)
        # The epsilons avoid division by zero for empty strings.
        overlap = len(set(s_tok) & set(h_tok)) / (len(set(h_tok)) + 1e-6)
        return len_src, len_hyp, overlap, (len_hyp + 1e-6) / (len_src + 1e-6)

    features = pd.DataFrame(
        [lexical_feats(s, h) for s, h in zip(srcs, hyps)],
        columns=["len_src", "len_hyp", "overlap_ratio", "len_ratio"],
    )

    sbert_cosines = np.full(n, np.nan)
    for i in tqdm(range(0, n, batch_size), desc="SBERT batches"):
        s = sbert_model.encode(srcs[i:i+batch_size], normalize_embeddings=True)
        h = sbert_model.encode(hyps[i:i+batch_size], normalize_embeddings=True)
        # Embeddings are unit-normalised, so the row-wise dot product is the cosine.
        sbert_cosines[i:i+batch_size] = np.sum(s * h, axis=1)

    entail, neutral, contra = np.full(n, np.nan), np.full(n, np.nan), np.full(n, np.nan)
    _nli_model.eval()
    for i in tqdm(range(0, n, batch_size), desc="NLI batches"):
        batch_prem = tgts[i:i+batch_size]
        batch_hyp = hyps[i:i+batch_size]
        enc = _nli_tok(batch_prem, batch_hyp, return_tensors="pt", padding=True, truncation=True, max_length=256)
        enc = {k: v.to(device) for k, v in enc.items()}
        with torch.no_grad():
            logits = _nli_model(**enc).logits.detach().cpu().numpy()
        # NOTE(review): column order 0=contradiction, 1=neutral, 2=entailment is
        # assumed here; confirm against the NLI model's id2label config.
        entail[i:i+batch_size] = logits[:, 2]
        neutral[i:i+batch_size] = logits[:, 1]
        contra[i:i+batch_size] = logits[:, 0]

    if n:
        _, _, F = bertscore_score(hyps, srcs, lang="en", verbose=False, batch_size=batch_size)
        bertscores = F.numpy()
    else:
        # Nothing to score; skip the BERTScore call on empty input.
        bertscores = np.empty(0, dtype=np.float32)

    features["sbert_cosine"] = sbert_cosines
    features["bertscore_F1"] = bertscores
    features["entailment_logit"] = entail
    features["neutral_logit"] = neutral
    features["contradiction_logit"] = contra
    features["nli_margin"] = entail - contra

    # Assign positionally (lists / arrays), never as Series carrying df's index:
    # `features` is indexed 0..n-1, so a filtered or re-indexed `df` would
    # otherwise be aligned by label and yield NaN or shuffled values.
    features["tgt_len"] = [len(t.split()) for t in tgts]
    features["task"] = df["task"].to_numpy() if "task" in df.columns else "unknown_task"
    features["model_id"] = df["model"].to_numpy() if "model" in df.columns else "unknown_model"

    return features



## Build matrices

In [None]:
# Feature matrix and integer target vector for the labelled (training) split.
X_all = extract_features(source_df)
y_all = source_df["label_num"].astype(int).to_numpy()

In [8]:
# Positional indices of the categorical columns that are present in X_all.
cat_features = [
    X_all.columns.get_loc(col_name)
    for col_name in ("task", "model_id")
    if col_name in X_all.columns
]

print("X shape:", X_all.shape, "| y shape:", y_all.shape, "| cat_features:", cat_features)


X shape: (501, 13) | y shape: (501,) | cat_features: [11, 12]


## CatBoost

In [32]:
# Stratified K-fold cross-validation: train one CatBoost classifier per fold,
# collect out-of-fold (OOF) scores, and save each fold model to OUT_DIR.
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_SEED)

# OOF score for every row of X_all; each entry is filled exactly once, by the
# fold in which that row is held out.
oof_score = np.zeros(len(X_all), dtype=float)
fold_metrics = []

for fold, (tr, va) in enumerate(skf.split(X_all, y_all), 1):

    # tr / va are positional indices, hence .iloc on the frame.
    X_tr, X_va = X_all.iloc[tr], X_all.iloc[va]
    y_tr, y_va = y_all[tr], y_all[va]

    train_pool = Pool(X_tr, y_tr, cat_features=cat_features if cat_features else None)
    valid_pool = Pool(X_va, y_va, cat_features=cat_features if cat_features else None)

    model = CatBoostClassifier(
        loss_function="Logloss",
        depth=8, learning_rate=0.05, l2_leaf_reg=6.0,
        iterations=2000, random_seed=RANDOM_SEED,
        eval_metric="AUC", verbose=False,
        od_type="Iter", od_wait=200
    )

    # NOTE(review): the held-out fold doubles as the eval_set that selects the
    # best iteration (use_best_model=True), so the fold scores below are likely
    # optimistic; confirm whether a separate early-stopping split is wanted.
    model.fit(train_pool, eval_set=valid_pool, use_best_model=True)

    # Store out-of-fold scores: column 1 of predict_proba (presumably the
    # probability of label 1, i.e. Hallucination).
    oof_score[va] = model.predict_proba(X_va)[:, 1]

    # Grid-search the threshold (0.05 to 0.95, 19 points) that maximises macro-F1
    # on this fold; ties keep the first (lowest) threshold because of the strict ">".
    # NOTE(review): the threshold is tuned on the same fold it is evaluated on,
    # so the reported f1_macro is a per-fold upper bound.
    best_f1, best_thr = 0, 0.5
    for thr in np.linspace(0.05, 0.95, 19):
        f1 = f1_score(y_va, (oof_score[va] >= thr).astype(int), average="macro")
        if f1 > best_f1:
            best_f1, best_thr = f1, thr

    fold_metrics.append({
        "fold": fold,
        "f1_macro": best_f1,
        "thr": best_thr
    })

    # Persist the fold model; predict_unified reloads these files by fold number.
    model.save_model(os.path.join(OUT_DIR, f"unified_model_fold{fold}.cbm"))

# Cell output: one row per fold with its best macro-F1 and threshold.
pd.DataFrame(fold_metrics)


Unnamed: 0,fold,f1_macro,thr
0,1,0.676022,0.4
1,2,0.865131,0.45
2,3,0.739583,0.65
3,4,0.767673,0.65
4,5,0.761386,0.5


In [40]:
def predict_unified(df, threshold=None):
    """Score ``df`` with the average of all saved fold models.

    Args:
        df: DataFrame accepted by ``extract_features``. If it has an ``id``
            column, those ids are carried into the result.
        threshold: Optional decision threshold. When given, ``pred`` is 1 where
            the averaged score is >= threshold and 0 otherwise; when ``None``,
            ``pred`` is ``None`` for every row.

    Returns:
        pd.DataFrame with columns ``id``, ``score`` (mean of the fold models'
        ``predict_proba`` column 1) and ``pred``, one row per row of ``df``.
    """
    X_te = extract_features(df, is_test=True)

    preds = np.zeros(len(X_te))

    # Average predictions from all saved folds
    for fold in range(1, N_FOLDS + 1):
        model = CatBoostClassifier()
        model.load_model(os.path.join(OUT_DIR, f"unified_model_fold{fold}.cbm"))
        preds += model.predict_proba(X_te)[:, 1] / N_FOLDS

    yhat = (preds >= threshold).astype(int) if threshold is not None else None

    # Take ids from the input frame: the feature frame never has an "id"
    # column, so looking there always fell back to 0..n-1. Use positional
    # values so a non-default index on df cannot misalign the rows.
    if "id" in df.columns:
        ids = df["id"].to_numpy()
    else:
        ids = np.arange(len(X_te))

    return pd.DataFrame({
        "id": ids,
        "score": preds,
        "pred": yhat
    })


In [41]:
# Pick one global decision threshold: the grid point with the highest macro-F1
# over all out-of-fold scores (the first such point wins on ties).
threshold_grid = np.linspace(0.05, 0.95, 19)
grid_f1 = [
    f1_score(y_all, (oof_score >= candidate).astype(int), average="macro")
    for candidate in threshold_grid
]

best_global_thr, best_global_f1 = 0.5, 0
for candidate, candidate_f1 in zip(threshold_grid, grid_f1):
    if candidate_f1 > best_global_f1:
        best_global_f1, best_global_thr = candidate_f1, candidate

print("Best threshold:", best_global_thr)


Best threshold: 0.49999999999999994


In [42]:
# Score the test split with the fold ensemble and binarise at the threshold tuned on OOF scores.
sub = predict_unified(test_df, threshold=best_global_thr)


SBERT batches: 100%|██████████| 94/94 [00:02<00:00, 36.55it/s]
NLI batches: 100%|██████████| 94/94 [00:21<00:00,  4.32it/s]


In [43]:
# Ground-truth binary labels for the test split. `y_true` was never defined in
# the notebook, so this cell failed on a fresh kernel; derive it from `y_test`.
y_true = y_test.astype(int).values

y_pred = sub["pred"].astype(int).values

y_score = sub["score"].astype(float).values

if len(y_true) != len(y_pred):
    raise ValueError(f"Label/prediction length mismatch: {len(y_true)} vs {len(y_pred)}")

# Metrics
f1 = f1_score(y_true, y_pred, average="macro")
acc = accuracy_score(y_true, y_pred)
# Rank correlation between the binary labels and the continuous scores.
rho = spearmanr(y_true, y_score).correlation

print(f"Macro F1      : {f1:.4f}")
print(f"Accuracy      : {acc:.4f}")
print(f"Spearman Rho  : {rho:.4f}")



===== UNIFIED MODEL METRICS =====
Macro F1      : 0.7379
Accuracy      : 0.7633
Spearman Rho  : 0.5328



In [44]:
# Keep only the id and the ensemble score (exported as `catboost_prob`) and write them to CSV.
df_save = pd.DataFrame({
    "id": sub["id"],
    "catboost_prob": sub["score"],
})
df_save.to_csv("submission_catboost.csv", index=False)

print("Saved prediction file:")
print(df_save.head())


Saved prediction file:
   id  catboost_prob
0   0       0.353216
1   1       0.326115
2   2       0.536144
3   3       0.606929
4   4       0.307855
