# NLI base results: Turkish-Gemma-9b-T1 (ytu-ce-cosmos/Turkish-Gemma-9b-T1)

Loads [yilmazzey/sdp2-nli](https://huggingface.co/datasets/yilmazzey/sdp2-nli) (snli_tr_1_1, multinli_tr_1_1, trglue_mnli) and runs **test-only** evaluation with this model.

9B generative LLM (Gemma-2 based, Turkish instruction-tuned with reasoning/thinking). Zero-shot prompted NLI evaluation (no fine-tuning); performance may vary but could be strong thanks to the Turkish adaptation. Each generation is parsed to 0=entailment, 1=neutral, 2=contradiction; generations that cannot be parsed default to 1 (neutral).

**Splits:** snli → test; multinli → validation_matched/mismatched; trglue → test_matched/test_mismatched. **Metrics:** Accuracy, macro F1, per-class F1, confusion matrix (CSV + plot).

In [1]:
# --- Where the data and the model come from ---
REPO_ID = "yilmazzey/sdp2-nli"
MODEL_ID = "ytu-ce-cosmos/Turkish-Gemma-9b-T1"

# --- Task / output settings ---
NUM_LABELS = 3  # entailment, neutral, contradiction
RESULTS_DIR = "results"

# Number of prompts handed to the generator per call.
# Lower to 8-16 if GPU memory low (9B model is heavy). If CPU only, expect very slow run.
BATCH_SIZE = 32

# Splits evaluated for each dataset config; insertion order is the evaluation order.
EVAL_SPLITS = dict(
    snli_tr_1_1=["test"],
    multinli_tr_1_1=["validation_matched", "validation_mismatched"],
    trglue_mnli=["test_matched", "test_mismatched"],
)

# Dataset configs to load, in the same order as the keys of EVAL_SPLITS.
CONFIGS = list(EVAL_SPLITS)

In [2]:
# Colab: uncomment and run once to install/upgrade (Runtime -> Change runtime type -> GPU)
# !pip install -q -U transformers datasets accelerate scikit-learn tqdm

import json
import random
from collections import Counter
from pathlib import Path

import numpy as np
import torch
from datasets import load_dataset
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from tqdm import tqdm
from transformers import pipeline

# Plotting is optional: without matplotlib/seaborn the confusion-matrix PNGs are skipped.
try:
    import matplotlib.pyplot as plt
    import seaborn as sns
except ImportError:
    HAS_PLOT = False
else:
    HAS_PLOT = True

# Class names indexed by label id (0, 1, 2).
LABEL_NAMES = ["entailment", "neutral", "contradiction"]

# Colab: confirm GPU (e.g. Tesla T4 / A100)
if not torch.cuda.is_available():
    print("No GPU; 9B model will be very slow on CPU.")
else:
    print("GPU:", torch.cuda.get_device_name(0))

# Seed every RNG in use so repeated runs start from the same state.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
    # Disable cuDNN autotuning and request deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load all three dataset configs
# Fetch each dataset config from the Hub and keep it keyed by config name.
datasets = {}
for cfg in CONFIGS:
    print(f"Loading {REPO_ID} :: {cfg} ...")
    datasets[cfg] = load_dataset(REPO_ID, cfg)
    # Iterating the loaded object yields its split names.
    print("  splits:", list(datasets[cfg]))

Loading yilmazzey/sdp2-nli :: snli_tr_1_1 ...
  splits: ['train', 'validation', 'test']
Loading yilmazzey/sdp2-nli :: multinli_tr_1_1 ...
  splits: ['train', 'validation_matched', 'validation_mismatched']
Loading yilmazzey/sdp2-nli :: trglue_mnli ...
  splits: ['train', 'validation_matched', 'validation_mismatched', 'test_matched', 'test_mismatched']


In [4]:
print("Loading model and tokenizer (9B params; first run may download ~18GB and take several minutes)...")

# Build the pipeline arguments first: first CUDA device in bfloat16 when a GPU
# is present, otherwise CPU (-1) in float32.
# NOTE(review): the recorded run output reports `torch_dtype` as deprecated in
# favour of `dtype`; the argument is kept unchanged here.
pipeline_kwargs = dict(
    model=MODEL_ID,
    device=0 if torch.cuda.is_available() else -1,
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    trust_remote_code=True,
    model_kwargs={"low_cpu_mem_usage": True},
)
generator = pipeline("text-generation", **pipeline_kwargs)

# Fall back to the EOS token when the tokenizer ships without a pad token.
if hasattr(generator.tokenizer, "pad_token"):
    if generator.tokenizer.pad_token is None:
        generator.tokenizer.pad_token = generator.tokenizer.eos_token
print("Model loaded.")

`torch_dtype` is deprecated! Use `dtype` instead!


Loading model and tokenizer (9B params; first run may download ~18GB and take several minutes)...


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]Fetching 4 files: 100%|██████████| 4/4 [1:23:50<00:00, 1257.72s/it]  
The following generation flags are not valid and may be ignored: ['cache_implementation']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Loading weights: 100%|██████████| 464/464 [00:20<00:00, 22.34it/s, Materializing param=model.norm.weight]                                


Model loaded.


In [5]:
def nli_prompt(premise, hypothesis):
    """Build the three-line zero-shot NLI prompt for one premise/hypothesis pair.

    Line 1 carries the premise, line 2 the hypothesis, and line 3 the fixed
    instruction asking for a one-word answer.
    """
    instruction = (
        "Does the premise entail, is neutral to, or contradict the hypothesis? "
        "Answer with only one word: entailment, neutral, or contradiction."
    )
    return "\n".join((f"Premise: {premise}", f"Hypothesis: {hypothesis}", instruction))


# Lower-cased answer words (English and Turkish) mapped to label ids.
LABEL_WORD_TO_ID = {
    "entailment": 0,
    "neutral": 1,
    "contradiction": 2,
    "içerme": 0,
    "tarafsız": 1,
    "nötr": 1,
    "çelişki": 2,
}

# Characters removed from both ends of a token before lookup
# (sentence punctuation, quotes, brackets, markdown emphasis).
_LABEL_STRIP_CHARS = ".,;:!?\"'`*_()[]{}<>“”‘’-"


def _label_id_for_token(token):
    """Return the label id for one whitespace-delimited token, or None if it is not a label word."""
    core = token.strip(_LABEL_STRIP_CHARS)
    if not core:
        return None
    # Python lower-cases "İ" (U+0130) to "i" + U+0307, which would never match
    # the dictionary keys, so map it to a plain "i" first.
    dotted = core.replace("\u0130", "i")
    # Try default casing first, then Turkish casing where "I" lower-cases to
    # dotless "ı" (needed for e.g. "TARAFSIZ").
    for candidate in (dotted.lower(), dotted.replace("I", "\u0131").lower()):
        if candidate in LABEL_WORD_TO_ID:
            return LABEL_WORD_TO_ID[candidate]
    return None


def parse_generated_label(generated_text, prompt_text):
    """Map a model generation to 0=entailment, 1=neutral, 2=contradiction.

    If ``generated_text`` starts with ``prompt_text`` the prompt is removed
    first. The remaining text is split on whitespace and the first token that
    is a known label word (after stripping surrounding punctuation/markdown and
    normalising English or Turkish casing) decides the label, so answers such
    as ``"**Entailment**"``, ``"Answer: contradiction"`` or ``"İçerme."`` are
    recognised.

    Returns 1 (neutral) when no label word is found, including for an empty
    continuation.
    """
    continuation = generated_text
    if generated_text.startswith(prompt_text):
        continuation = generated_text[len(prompt_text):]
    for token in continuation.split():
        label_id = _label_id_for_token(token)
        if label_id is not None:
            return label_id
    return 1


def run_prompted_inference(ds):
    """Run zero-shot prompted NLI over one dataset split.

    Args:
        ds: a split exposing "premise", "hypothesis" and "label" columns.

    Returns:
        (y_true, y_pred): two int64 numpy arrays of equal length holding the
        gold label ids and the parsed predictions.

    Raises:
        RuntimeError: if the number of collected predictions differs from the
            number of labels.
    """
    premises = ds["premise"]
    hypotheses = ds["hypothesis"]
    labels = ds["label"]
    n = len(labels)
    prompts = [nli_prompt(p, h) for p, h in zip(premises, hypotheses)]

    tokenizer = generator.tokenizer
    # Explicit None check: `pad_token_id or eos_token_id` would throw away a
    # perfectly valid pad id of 0.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    # NOTE(review): prompts are sent in chunks of BATCH_SIZE, but the pipeline
    # call is not given `batch_size=`; per the Transformers pipeline docs the
    # chunk is then presumably still processed one prompt at a time. Confirm
    # padding side and memory headroom before enabling real batching.
    # NOTE(review): the model is described as a reasoning/thinking model; a
    # 10-token budget may be too small if it emits reasoning first - verify on
    # a few samples.
    y_pred = []
    for start in tqdm(range(0, n, BATCH_SIZE), desc="Inference"):
        batch_prompts = prompts[start : start + BATCH_SIZE]
        out = generator(
            batch_prompts,
            max_new_tokens=10,
            do_sample=False,
            # Ask for the continuation only, so parsing does not depend on the
            # prompt surviving a tokenise/decode round-trip unchanged.
            return_full_text=False,
            pad_token_id=pad_token_id,
        )
        for prompt_text, candidates in zip(batch_prompts, out):
            gen_text = candidates[0]["generated_text"]
            y_pred.append(parse_generated_label(gen_text, prompt_text))

    if len(y_pred) != n:
        raise RuntimeError(f"Expected {n} predictions but collected {len(y_pred)}")

    y_true = np.array(list(labels), dtype=np.int64)
    y_pred = np.array(y_pred, dtype=np.int64)
    # .tolist() yields plain Python ints so the printed dict keys stay readable.
    print("True label dist:", dict(Counter(y_true.tolist())))
    print("Pred label dist:", dict(Counter(y_pred.tolist())))
    return y_true, y_pred

In [6]:
def compute_metrics(y_true, y_pred):
    """Compute accuracy, macro F1, per-class F1 and the confusion matrix.

    The label set is pinned to ``0 .. NUM_LABELS - 1`` for the F1 scores and
    the confusion matrix. Without that, a class missing from both arrays (or an
    extra label value in ``y_true``) would shift the per-class scores relative
    to ``LABEL_NAMES`` and change the matrix shape.

    Args:
        y_true: gold label ids.
        y_pred: predicted label ids.

    Returns:
        (out, cm): ``out`` is a JSON-serialisable dict with "accuracy",
        "f1_macro" and "f1_per_class" (keyed by label name); ``cm`` is the
        NUM_LABELS x NUM_LABELS confusion matrix with rows = true labels and
        columns = predicted labels.
    """
    label_ids = list(range(NUM_LABELS))
    acc = float(accuracy_score(y_true, y_pred))
    f1_macro = float(f1_score(y_true, y_pred, labels=label_ids, average="macro", zero_division=0))
    per_class_scores = f1_score(y_true, y_pred, labels=label_ids, average=None, zero_division=0)
    # With labels= given, per_class_scores[i] is guaranteed to belong to label id i.
    f1_per_class = {LABEL_NAMES[i]: float(per_class_scores[i]) for i in label_ids}
    cm = confusion_matrix(y_true, y_pred, labels=label_ids)
    out = {"accuracy": acc, "f1_macro": f1_macro, "f1_per_class": f1_per_class}
    return out, cm


def save_confusion_plot(cm, path):
    """Render ``cm`` as an annotated heatmap and write it to ``path``.

    Does nothing when the optional plotting libraries were not importable
    (``HAS_PLOT`` is False).
    """
    if HAS_PLOT:
        fig, ax = plt.subplots(figsize=(6, 5))
        heatmap_options = dict(
            annot=True,
            fmt="d",
            xticklabels=LABEL_NAMES,
            yticklabels=LABEL_NAMES,
            ax=ax,
        )
        sns.heatmap(cm, **heatmap_options)
        # Columns are predictions, rows are gold labels.
        ax.set(xlabel="Predicted", ylabel="True")
        plt.tight_layout()
        plt.savefig(path)
        plt.close()

In [None]:
# Evaluate every configured split. For each one, write the confusion matrix
# (CSV + optional PNG) and checkpoint metrics.json immediately, so an
# interrupted long-running evaluation keeps the splits that already finished.
results_dir = Path(RESULTS_DIR)
results_dir.mkdir(parents=True, exist_ok=True)
metrics_path = results_dir / "metrics.json"
all_metrics = {}

for config_name in CONFIGS:
    ds_dict = datasets[config_name]
    split_names = EVAL_SPLITS[config_name]
    all_metrics[config_name] = {}

    for split_name in split_names:
        if split_name not in ds_dict:
            print(f"  Skip {config_name}/{split_name} (missing)")
            continue
        ds = ds_dict[split_name]
        print(f"Evaluating {config_name} / {split_name} ...")
        y_true, y_pred = run_prompted_inference(ds)
        metrics, cm = compute_metrics(y_true, y_pred)
        all_metrics[config_name][split_name] = metrics

        artifact_stem = f"confusion_{config_name}_{split_name}"
        cm_path = results_dir / f"{artifact_stem}.csv"
        np.savetxt(cm_path, cm, fmt="%d", delimiter=",")
        save_confusion_plot(cm, results_dir / f"{artifact_stem}.png")

        # Checkpoint: persist everything computed so far.
        with open(metrics_path, "w") as f:
            json.dump(all_metrics, f, indent=2)

        print(f"  accuracy={metrics['accuracy']:.4f}, f1_macro={metrics['f1_macro']:.4f}")

# Final write also covers the case where every split was skipped.
with open(metrics_path, "w") as f:
    json.dump(all_metrics, f, indent=2)
print(f"Saved {RESULTS_DIR}/metrics.json")

Evaluating snli_tr_1_1 / test ...


Inference:   0%|          | 0/307 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'cache_implementation']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Passing `generation_config` together with generation-related arguments=({'do_sample', 'pad_token_id', 'max_new_tokens'}) is deprecated and will be removed in future versions. Please pass either a `generation_config` object OR all generation parameters explicitly, but not both.
Both `max_new_tokens` (=10) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=10) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_toke

In [None]:
# Summary: per config/split
# Flatten the nested {config: {split: metrics}} mapping into one stream of
# rows, then print one summary line per evaluated split.
summary_rows = (
    (config_name, split_name, m)
    for config_name, splits in all_metrics.items()
    for split_name, m in splits.items()
)
for config_name, split_name, m in summary_rows:
    print(f"{config_name} / {split_name}: acc={m['accuracy']:.4f}, F1_macro={m['f1_macro']:.4f}, F1_per_class={m['f1_per_class']}")