# NLI base results: ConvBERT Turkish Cased (dbmdz/convbert-base-turkish-cased)

Loads [yilmazzey/sdp2-nli](https://huggingface.co/datasets/yilmazzey/sdp2-nli) (snli_tr_1_1, multinli_tr_1_1, trglue_mnli) and runs **test-only** evaluation with this model.

**No prompts:** BERT NLI is sequence-pair classification (premise [SEP] hypothesis → label).

**Splits:** Test only where available: snli → `test`; multinli → `validation_matched`/`validation_mismatched` (no test); trglue → `test_matched`/`test_mismatched`.

**Metrics:** Accuracy, macro F1, per-class F1, confusion matrix (CSV + plot). Model is raw pretrained ConvBERT for Turkish (~110M params, cased); random classification head (~33% expected). Efficient alternative to standard BERT.

In [1]:
REPO_ID = "yilmazzey/sdp2-nli"
CONFIGS = ["snli_tr_1_1", "multinli_tr_1_1", "trglue_mnli"]
MODEL_ID = "dbmdz/convbert-base-turkish-cased"
NUM_LABELS = 3  # entailment, neutral, contradiction
RESULTS_DIR = "results"
# Lower to 16 or 8 if CPU is slow
BATCH_SIZE = 32
EVAL_SPLITS = {
    "snli_tr_1_1": ["test"],
    "multinli_tr_1_1": ["validation_matched", "validation_mismatched"],
    "trglue_mnli": ["test_matched", "test_mismatched"],
}

In [2]:
import json
import random
from collections import Counter
from pathlib import Path

import numpy as np
import torch
from datasets import load_dataset
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding

try:
    import matplotlib.pyplot as plt
    import seaborn as sns
    HAS_PLOT = True
except ImportError:
    HAS_PLOT = False

LABEL_NAMES = ["entailment", "neutral", "contradiction"]

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load all three dataset configs
datasets = {}
for cfg in CONFIGS:
    print(f"Loading {REPO_ID} :: {cfg} ...")
    datasets[cfg] = load_dataset(REPO_ID, cfg)
    print("  splits:", list(datasets[cfg].keys()))

Loading yilmazzey/sdp2-nli :: snli_tr_1_1 ...
  splits: ['train', 'validation', 'test']
Loading yilmazzey/sdp2-nli :: multinli_tr_1_1 ...
  splits: ['train', 'validation_matched', 'validation_mismatched']
Loading yilmazzey/sdp2-nli :: trglue_mnli ...
  splits: ['train', 'validation_matched', 'validation_mismatched', 'test_matched', 'test_mismatched']


In [4]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID, num_labels=NUM_LABELS, ignore_mismatched_sizes=True)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
model.eval()
print(f"Using device: {device}")
print("Model loaded successfully")

Loading weights: 100%|██████████| 281/281 [00:00<00:00, 1982.20it/s, Materializing param=convbert.encoder.layer.11.output.dense.weight]                                
[1mConvBertForSequenceClassification LOAD REPORT[0m from: dbmdz/convbert-base-turkish-cased
Key                        | Status     | 
---------------------------+------------+-
embeddings.position_ids    | UNEXPECTED | 
classifier.dense.weight    | MISSING    | 
classifier.dense.bias      | MISSING    | 
classifier.out_proj.bias   | MISSING    | 
classifier.out_proj.weight | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Using device: cpu
Model loaded successfully


In [5]:
def tokenize_fn(examples):
    return tokenizer(
        examples["premise"],
        examples["hypothesis"],
        truncation=True,
        max_length=256,
    )


def run_inference(ds):
    remove_cols = [c for c in ds.column_names if c != "label"]
    ds = ds.map(
        tokenize_fn,
        batched=True,
        remove_columns=remove_cols,
        desc="Tokenize",
    )
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    def collate_fn(examples):
        labels = torch.tensor([ex["label"] for ex in examples])
        batch = data_collator([{k: v for k, v in ex.items() if k != "label"} for ex in examples])
        batch["labels"] = labels
        return batch

    loader = torch.utils.data.DataLoader(ds, batch_size=BATCH_SIZE, collate_fn=collate_fn)
    preds_list, labels_list = [], []
    with torch.no_grad():
        for batch in tqdm(loader, desc="Inference"):
            out = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            )
            preds_list.append(out.logits.argmax(-1).cpu().numpy())
            labels_list.append(batch["labels"].numpy())
    y_pred = np.concatenate(preds_list)
    y_true = np.concatenate(labels_list)
    return y_true, y_pred

In [6]:
def compute_metrics(y_true, y_pred):
    acc = float(accuracy_score(y_true, y_pred))
    f1_macro = float(f1_score(y_true, y_pred, average="macro", zero_division=0))
    f1_per_class = f1_score(y_true, y_pred, average=None, zero_division=0)
    f1_per_class = {LABEL_NAMES[i]: float(f1_per_class[i]) for i in range(NUM_LABELS)}
    cm = confusion_matrix(y_true, y_pred)
    out = {"accuracy": acc, "f1_macro": f1_macro, "f1_per_class": f1_per_class}
    return out, cm


def save_confusion_plot(cm, path):
    if not HAS_PLOT:
        return
    fig, ax = plt.subplots(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt="d", xticklabels=LABEL_NAMES, yticklabels=LABEL_NAMES, ax=ax)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")
    plt.tight_layout()
    plt.savefig(path)
    plt.close()

In [7]:
Path(RESULTS_DIR).mkdir(parents=True, exist_ok=True)
all_metrics = {}

for config_name in CONFIGS:
    ds_dict = datasets[config_name]
    split_names = EVAL_SPLITS[config_name]
    all_metrics[config_name] = {}

    for split_name in split_names:
        if split_name not in ds_dict:
            print(f"  Skip {config_name}/{split_name} (missing)")
            continue
        ds = ds_dict[split_name]
        print(f"Evaluating {config_name} / {split_name} ...")
        y_true, y_pred = run_inference(ds)
        print("True label dist:", dict(Counter(y_true)))
        print("Pred label dist:", dict(Counter(y_pred)))
        metrics, cm = compute_metrics(y_true, y_pred)
        all_metrics[config_name][split_name] = metrics

        cm_path = Path(RESULTS_DIR) / f"confusion_{config_name}_{split_name}.csv"
        np.savetxt(cm_path, cm, fmt="%d", delimiter=",")
        save_confusion_plot(cm, Path(RESULTS_DIR) / f"confusion_{config_name}_{split_name}.png")

        print(f"  accuracy={metrics['accuracy']:.4f}, f1_macro={metrics['f1_macro']:.4f}")

with open(Path(RESULTS_DIR) / "metrics.json", "w") as f:
    json.dump(all_metrics, f, indent=2)
print(f"Saved {RESULTS_DIR}/metrics.json")

Evaluating snli_tr_1_1 / test ...


Tokenize: 100%|██████████| 9824/9824 [00:00<00:00, 49033.48 examples/s]
Inference: 100%|██████████| 307/307 [18:36<00:00,  3.64s/it]


True label dist: {np.int64(1): 3219, np.int64(0): 3368, np.int64(2): 3237}
Pred label dist: {np.int64(1): 9780, np.int64(2): 44}
  accuracy=0.3278, f1_macro=0.1676
Evaluating multinli_tr_1_1 / validation_matched ...


Tokenize: 100%|██████████| 9809/9809 [00:00<00:00, 45373.01 examples/s]
Inference: 100%|██████████| 307/307 [29:58<00:00,  5.86s/it]


True label dist: {np.int64(1): 3123, np.int64(2): 3211, np.int64(0): 3475}
Pred label dist: {np.int64(1): 9713, np.int64(2): 94, np.int64(0): 2}
  accuracy=0.3186, f1_macro=0.1678
Evaluating multinli_tr_1_1 / validation_mismatched ...


Tokenize: 100%|██████████| 9825/9825 [00:00<00:00, 35019.22 examples/s]
Inference: 100%|██████████| 308/308 [25:41<00:00,  5.00s/it]


True label dist: {np.int64(2): 3240, np.int64(0): 3456, np.int64(1): 3129}
Pred label dist: {np.int64(1): 9748, np.int64(2): 76, np.int64(0): 1}
  accuracy=0.3187, f1_macro=0.1666
Evaluating trglue_mnli / test_matched ...


Tokenize: 100%|██████████| 9008/9008 [00:00<00:00, 65797.36 examples/s]
Inference: 100%|██████████| 282/282 [20:22<00:00,  4.34s/it]


True label dist: {np.int64(1): 3138, np.int64(2): 2946, np.int64(0): 2924}
Pred label dist: {np.int64(1): 8993, np.int64(2): 15}
  accuracy=0.3485, f1_macro=0.1732
Evaluating trglue_mnli / test_mismatched ...


Tokenize: 100%|██████████| 9217/9217 [00:00<00:00, 60841.93 examples/s]
Inference: 100%|██████████| 289/289 [20:18<00:00,  4.22s/it]

True label dist: {np.int64(1): 3043, np.int64(0): 3101, np.int64(2): 3073}
Pred label dist: {np.int64(1): 9209, np.int64(2): 8}
  accuracy=0.3302, f1_macro=0.1659
Saved results/metrics.json





In [8]:
# Summary: per config/split
for config_name, splits in all_metrics.items():
    for split_name, m in splits.items():
        print(f"{config_name} / {split_name}: acc={m['accuracy']:.4f}, F1_macro={m['f1_macro']:.4f}, F1_per_class={m['f1_per_class']}")

snli_tr_1_1 / test: acc=0.3278, F1_macro=0.1676, F1_per_class={'entailment': 0.0, 'neutral': 0.4929609969997692, 'contradiction': 0.009753124047546479}
multinli_tr_1_1 / validation_matched: acc=0.3186, F1_macro=0.1678, F1_per_class={'entailment': 0.0005752085130859936, 'neutral': 0.481146774696167, 'contradiction': 0.02178517397881997}
multinli_tr_1_1 / validation_mismatched: acc=0.3187, F1_macro=0.1666, F1_per_class={'entailment': 0.0005785363031530228, 'neutral': 0.48163392094431934, 'contradiction': 0.017490952955367914}
trglue_mnli / test_matched: acc=0.3485, F1_macro=0.1732, F1_per_class={'entailment': 0.0, 'neutral': 0.5168576374577528, 'contradiction': 0.002701789935832489}
trglue_mnli / test_mismatched: acc=0.3302, F1_macro=0.1659, F1_per_class={'entailment': 0.0, 'neutral': 0.49640874959190334, 'contradiction': 0.0012982797792924375}
