# NLI base results: Qwen2-1.5B-Instruct (Qwen/Qwen2-1.5B-Instruct)

Loads [yilmazzey/sdp2-nli](https://huggingface.co/datasets/yilmazzey/sdp2-nli) (snli_tr_1_1, multinli_tr_1_1, trglue_mnli) and runs **test-only** evaluation with this model.

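Qwen2-1.5B-Instruct is a 1.5B-parameter generative LLM from the Qwen2 series, instruction-tuned with SFT/DPO. It offers multilingual support (improved tokenizer; strong on non-English benchmarks, e.g. TurkishMMLU ~66.85%). Here it is evaluated with zero-shot prompted NLI (no fine-tuning); given its size, roughly 50-65% is expected on reasoning/NLI benchmarks. Generated outputs are parsed to 0=entailment, 1=neutral, 2=contradiction; outputs that cannot be parsed default to 1 (neutral).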

**Splits:** snli → test; multinli → validation_matched/mismatched; trglue → test_matched/test_mismatched. **Metrics:** Accuracy, macro F1, per-class F1, confusion matrix (CSV + plot).

In [None]:
# Install dependencies (run once)
!pip install -q -U transformers datasets accelerate scikit-learn tqdm huggingface_hub[hf_transfer]

import json
import random
from collections import Counter
from pathlib import Path

import numpy as np
import torch
from datasets import load_dataset
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from tqdm import tqdm
from transformers import pipeline, AutoTokenizer

try:
    import matplotlib.pyplot as plt
    import seaborn as sns
    HAS_PLOT = True
except ImportError:
    HAS_PLOT = False

# Class names for reporting; the list index is the integer label id
# (0=entailment, 1=neutral, 2=contradiction).
LABEL_NAMES = ["entailment", "neutral", "contradiction"]

# Enable faster Hugging Face downloads, but only when the optional
# `hf_transfer` package is importable. Turning the flag on without the package
# installed can make hub downloads fail instead of falling back to the default
# downloader.
# NOTE(review): this runs after `datasets`/`transformers` were imported;
# huggingface_hub presumably reads the variable at import time, so the flag
# likely needs to be set before those imports to take effect - confirm.
import importlib.util
import os

if importlib.util.find_spec("hf_transfer") is not None:
    os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

# Report which accelerator (if any) is visible; MPS is checked first and CUDA
# is only probed when MPS is unavailable.
mps_ready = torch.backends.mps.is_available()
cuda_ready = (not mps_ready) and torch.cuda.is_available()

if mps_ready:
    print("Apple Silicon MPS available; using for acceleration.")
elif cuda_ready:
    print("GPU:", torch.cuda.get_device_name(0))
else:
    print("No GPU/MPS; running on CPU (1.5B very fast even on CPU).")

# Seed every random number generator used by the notebook with one value.
SEED = 42
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Ask cuDNN for deterministic kernels and disable its autotuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

zsh:1: no matches found: huggingface_hub[hf_transfer]


  from .autonotebook import tqdm as notebook_tqdm


Apple Silicon MPS available; using for acceleration.


In [None]:
# --- Run configuration -------------------------------------------------------
# Hub dataset repository and the model under evaluation.
REPO_ID = "yilmazzey/sdp2-nli"
MODEL_ID = "Qwen/Qwen2-1.5B-Instruct"

# Splits to evaluate for each dataset config.
EVAL_SPLITS = {
    "snli_tr_1_1": ["test"],
    "multinli_tr_1_1": ["validation_matched", "validation_mismatched"],
    "trglue_mnli": ["test_matched", "test_mismatched"],
}
# Dataset configs, in evaluation order (same order as the keys above).
CONFIGS = list(EVAL_SPLITS)

NUM_LABELS = 3  # entailment, neutral, contradiction
RESULTS_DIR = "results"

# Number of examples handed to the generator per call; lower to 4-8 if memory
# is tight.
BATCH_SIZE = 16

In [None]:
# Fetch each dataset config from the hub and keep it keyed by config name.
datasets = {}
for config_id in CONFIGS:
    print(f"Loading {REPO_ID} :: {config_id} ...")
    loaded = load_dataset(REPO_ID, config_id)
    datasets[config_id] = loaded
    # Show which splits this config provides.
    print("  splits:", list(loaded.keys()))

Loading yilmazzey/sdp2-nli :: snli_tr_1_1 ...
  splits: ['train', 'validation', 'test']
Loading yilmazzey/sdp2-nli :: multinli_tr_1_1 ...
  splits: ['train', 'validation_matched', 'validation_mismatched']
Loading yilmazzey/sdp2-nli :: trglue_mnli ...
  splits: ['train', 'validation_matched', 'validation_mismatched', 'test_matched', 'test_mismatched']


In [None]:
print("Loading Qwen2-1.5B-Instruct (text-generation pipeline)...")

# Pick the execution device: Apple MPS first, then the first CUDA GPU, else CPU
# (-1 is the pipeline's CPU device index).
has_mps = torch.backends.mps.is_available()
has_cuda = torch.cuda.is_available()
if has_mps:
    device = "mps"
elif has_cuda:
    device = 0
else:
    device = -1

# `trust_remote_code` is intentionally not passed: Qwen2 is a built-in
# transformers architecture, so there is no need to opt in to executing code
# shipped in the model repository.
# NOTE(review): recent transformers releases warn that `torch_dtype` is
# deprecated in favour of `dtype`; the old name is kept here so the cell still
# works on older releases - switch once the minimum version is pinned.
generator = pipeline(
    "text-generation",
    model=MODEL_ID,
    device=device,
    # Half-size bfloat16 weights on an accelerator, full precision on CPU.
    torch_dtype=torch.bfloat16 if (has_mps or has_cuda) else torch.float32,
    max_length=None,  # Silence max_length warning
)

# Separate tokenizer instance used for chat-template formatting and pad id lookup.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Fall back to EOS as the padding token when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Model loaded successfully.")

`torch_dtype` is deprecated! Use `dtype` instead!


Loading Qwen2-1.5B-Instruct (text-generation pipeline)...


Loading weights: 100%|██████████| 338/338 [00:00<00:00, 1729.05it/s, Materializing param=model.norm.weight]                              
Passing `generation_config` together with generation-related arguments=({'max_length'}) is deprecated and will be removed in future versions. Please pass either a `generation_config` object OR all generation parameters explicitly, but not both.


Model loaded successfully.


In [None]:
def nli_prompt(premise, hypothesis):
    """Build the chat messages for one premise/hypothesis pair.

    Args:
        premise: Premise text inserted into the user message.
        hypothesis: Hypothesis text inserted into the user message.

    Returns:
        A two-element list of ``{"role", "content"}`` dicts: the fixed system
        instruction followed by the user message ending in ``Label:``.
    """
    system_message = {
        "role": "system",
        "content": "You are a helpful assistant for natural language inference. Classify the relationship between premise and hypothesis as entailment, neutral, or contradiction. Respond with exactly one word only: entailment, neutral, or contradiction. No explanation, no other text.",
    }
    user_message = {
        "role": "user",
        "content": f"Premise: {premise}\nHypothesis: {hypothesis}\nLabel:",
    }
    return [system_message, user_message]

def parse_generated_label(gen_text, formatted_prompt):
    """Map generated text to an integer NLI label id.

    Args:
        gen_text: Text returned by the generator. Normally the formatted
            prompt followed by the model's continuation; text that does not
            start with the prompt is treated as a bare continuation.
        formatted_prompt: The exact prompt string that was sent to the model.

    Returns:
        0 for entailment, 1 for neutral, 2 for contradiction. Empty or
        unrecognised continuations fall back to 1 (neutral).
    """
    # Only cut the prompt off when it really is a prefix; slicing blindly by
    # length would chop into the answer if the text is not prompt-prefixed.
    if gen_text.startswith(formatted_prompt):
        continuation = gen_text[len(formatted_prompt):]
    else:
        continuation = gen_text

    words = continuation.lower().split()
    if not words:
        return 1  # neutral fallback

    # Judge the first word only, ignoring quotes/markdown/punctuation wrapped
    # around it (e.g. '**entailment**', '"neutral".').
    first_word = words[0].strip(".,!?;:\"'`*_()[]{}<>-#")

    # Prefix match so close variants such as "contradictory" or "entails" are
    # not silently counted as neutral.
    for stem, label_id in (("entail", 0), ("neutral", 1), ("contradict", 2)):
        if first_word.startswith(stem):
            return label_id
    return 1  # Default to neutral if unknown

def run_prompted_inference(ds):
    """Run zero-shot prompted NLI over one dataset split.

    Prompts are built with ``nli_prompt``, rendered with the tokenizer's chat
    template, generated greedily, and parsed with ``parse_generated_label``.
    A few sample generations and the true/predicted label distributions are
    printed for debugging.

    Args:
        ds: Dataset split exposing ``premise``, ``hypothesis`` and ``label``
            columns and supporting ``len()``.

    Returns:
        Tuple ``(y_true, y_pred)`` of int64 NumPy arrays, one entry per example.
    """
    premises = list(ds["premise"])
    hypotheses = list(ds["hypothesis"])
    labels = list(ds["label"])
    n = len(ds)

    # Explicit None check: `pad_token_id or eos_token_id` would wrongly discard
    # a legitimate pad id of 0.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    # Only these samples are echoed afterwards (first 5 + every 100th), so only
    # their generations are kept instead of every full prompt+answer string.
    debug_indices = list(range(min(5, n))) + list(range(100, n, 100))
    debug_wanted = set(debug_indices)
    debug_texts = {}
    y_pred = []

    # NOTE(review): no `batch_size` is passed to the pipeline, so each call
    # presumably runs the prompts with the pipeline's own default batch size
    # rather than BATCH_SIZE at the model level - confirm, and configure left
    # padding on the pipeline tokenizer before enabling real batching.
    for start in tqdm(range(0, n, BATCH_SIZE), desc="Inference"):
        end = min(start + BATCH_SIZE, n)
        batch_prompts = [
            nli_prompt(p, h)
            for p, h in zip(premises[start:end], hypotheses[start:end])
        ]

        formatted_prompts = tokenizer.apply_chat_template(
            batch_prompts, tokenize=False, add_generation_prompt=True
        )

        # Greedy decoding; `temperature` is deliberately not passed because it
        # is not a valid flag when do_sample=False and only triggers a warning.
        out = generator(
            formatted_prompts,
            max_new_tokens=5,           # Very strict to force single word
            do_sample=False,
            pad_token_id=pad_token_id,
            max_length=None,
        )

        for offset, (gen, prompt) in enumerate(zip(out, formatted_prompts)):
            # Each result is a list of candidates; only the first is used.
            gen_text = gen[0]["generated_text"]
            y_pred.append(parse_generated_label(gen_text, prompt))
            sample_idx = start + offset
            if sample_idx in debug_wanted:
                debug_texts[sample_idx] = gen_text

    y_true = np.array(labels, dtype=np.int64)
    y_pred = np.array(y_pred, dtype=np.int64)

    # Debug: first 5 + every 100th
    for i in debug_indices:
        print(f"Debug Sample {i}: Generated: {debug_texts[i]}, Parsed Label: {y_pred[i]}")

    # Count plain Python ints so the printed dict keys are not NumPy scalars.
    print("True label dist:", dict(Counter(y_true.tolist())))
    print("Pred label dist:", dict(Counter(y_pred.tolist())))

    return y_true, y_pred

In [None]:
def compute_metrics(y_true, y_pred):
    """Compute accuracy, macro F1, per-class F1 and the confusion matrix.

    The label set is pinned to ``range(NUM_LABELS)`` so the per-class scores
    and the confusion matrix always line up with ``LABEL_NAMES``, even when a
    class never appears in ``y_true`` or ``y_pred``.

    Args:
        y_true: Sequence/array of gold integer label ids.
        y_pred: Sequence/array of predicted integer label ids.

    Returns:
        Tuple ``(metrics, cm)`` where ``metrics`` is a dict with keys
        ``accuracy``, ``f1_macro`` and ``f1_per_class`` (label name -> F1), and
        ``cm`` is the NUM_LABELS x NUM_LABELS confusion matrix.
    """
    label_ids = list(range(NUM_LABELS))
    acc = float(accuracy_score(y_true, y_pred))
    f1_macro = float(
        f1_score(y_true, y_pred, labels=label_ids, average="macro", zero_division=0)
    )
    # Without `labels`, sklearn only scores classes that occur in the data, so
    # a missing class would shorten this array and break the name lookup below.
    per_class_scores = f1_score(
        y_true, y_pred, labels=label_ids, average=None, zero_division=0
    )
    f1_per_class = {
        LABEL_NAMES[label_id]: float(score)
        for label_id, score in zip(label_ids, per_class_scores)
    }
    cm = confusion_matrix(y_true, y_pred, labels=label_ids)
    out = {"accuracy": acc, "f1_macro": f1_macro, "f1_per_class": f1_per_class}
    return out, cm


def save_confusion_plot(cm, path):
    """Render a confusion matrix as an annotated heatmap and save it to *path*.

    Does nothing when the optional plotting libraries were not importable
    (``HAS_PLOT`` is False).

    Args:
        cm: Confusion matrix of integer counts (rows = true, columns = predicted).
        path: Destination image path.
    """
    if HAS_PLOT:
        figure, axes = plt.subplots(figsize=(6, 5))
        sns.heatmap(
            cm,
            annot=True,
            fmt="d",
            xticklabels=LABEL_NAMES,
            yticklabels=LABEL_NAMES,
            ax=axes,
        )
        axes.set(xlabel="Predicted", ylabel="True")
        figure.tight_layout()
        figure.savefig(path)
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(figure)

In [None]:
# Evaluate every configured split, writing a confusion matrix (CSV + optional
# PNG) per split and the collected metrics to RESULTS_DIR/metrics.json.
results_dir = Path(RESULTS_DIR)
results_dir.mkdir(parents=True, exist_ok=True)
metrics_path = results_dir / "metrics.json"
all_metrics = {}

for config_name in CONFIGS:
    ds_dict = datasets[config_name]
    split_names = EVAL_SPLITS[config_name]
    all_metrics[config_name] = {}

    for split_name in split_names:
        if split_name not in ds_dict:
            print(f"  Skip {config_name}/{split_name} (missing)")
            continue
        ds = ds_dict[split_name]
        print(f"Evaluating {config_name} / {split_name} ...")
        y_true, y_pred = run_prompted_inference(ds)
        metrics, cm = compute_metrics(y_true, y_pred)
        all_metrics[config_name][split_name] = metrics

        cm_path = results_dir / f"confusion_{config_name}_{split_name}.csv"
        np.savetxt(cm_path, cm, fmt="%d", delimiter=",")
        save_confusion_plot(cm, results_dir / f"confusion_{config_name}_{split_name}.png")

        print(f"  accuracy={metrics['accuracy']:.4f}, f1_macro={metrics['f1_macro']:.4f}")

        # Checkpoint after every split: each one takes a long time to run, and
        # a crash in a later split should not discard the finished results.
        with open(metrics_path, "w", encoding="utf-8") as f:
            json.dump(all_metrics, f, indent=2)

# Final write so the file exists even if every split was skipped.
with open(metrics_path, "w", encoding="utf-8") as f:
    json.dump(all_metrics, f, indent=2)
print(f"Saved {RESULTS_DIR}/metrics.json")

Evaluating snli_tr_1_1 / test ...


Inference:   0%|          | 0/614 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Passing `generation_config` together with generation-related arguments=({'pad_token_id', 'do_sample', 'temperature', 'max_new_tokens'}) is deprecated and will be removed in future versions. Please pass either a `generation_config` object OR all generation parameters explicitly, but not both.
Inference: 100%|██████████| 614/614 [21:42<00:00,  2.12s/it]


Debug Sample 0: Generated: <|im_start|>system
You are a helpful assistant for natural language inference. Classify the relationship between premise and hypothesis as entailment, neutral, or contradiction. Respond with exactly one word only: entailment, neutral, or contradiction. No explanation, no other text.<|im_end|>
<|im_start|>user
Premise: Bu kilise korosu, kilisedeki kitaptan neşeli şarkılar söylerken kitlelere şarkı söyler.
Hypothesis: Kilisenin tavanında çatlaklar var.
Label:<|im_end|>
<|im_start|>assistant
neutral, Parsed Label: 1
Debug Sample 1: Generated: <|im_start|>system
You are a helpful assistant for natural language inference. Classify the relationship between premise and hypothesis as entailment, neutral, or contradiction. Respond with exactly one word only: entailment, neutral, or contradiction. No explanation, no other text.<|im_end|>
<|im_start|>user
Premise: Bu kilise korosu, kilisedeki kitaptan neşeli şarkılar söylerken kitlelere şarkı söyler.
Hypothesis: Kilise 

Inference: 100%|██████████| 614/614 [20:39<00:00,  2.02s/it]


Debug Sample 0: Generated: <|im_start|>system
You are a helpful assistant for natural language inference. Classify the relationship between premise and hypothesis as entailment, neutral, or contradiction. Respond with exactly one word only: entailment, neutral, or contradiction. No explanation, no other text.<|im_end|>
<|im_start|>user
Premise: Yeni haklar yeterince güzel.
Hypothesis: Herkes gerçekten en yeni faydaları seviyor
Label:<|im_end|>
<|im_start|>assistant
neutral, Parsed Label: 1
Debug Sample 1: Generated: <|im_start|>system
You are a helpful assistant for natural language inference. Classify the relationship between premise and hypothesis as entailment, neutral, or contradiction. Respond with exactly one word only: entailment, neutral, or contradiction. No explanation, no other text.<|im_end|>
<|im_start|>user
Premise: Bu site, tüm ödül kazananların bir listesini ve Hükümet Yönetici makalelerinin aranabilir bir veritabanını içerir.
Hypothesis: Web sitesinde yer alan Hükümet 

Inference: 100%|██████████| 615/615 [27:15<00:00,  2.66s/it]


Debug Sample 0: Generated: <|im_start|>system
You are a helpful assistant for natural language inference. Classify the relationship between premise and hypothesis as entailment, neutral, or contradiction. Respond with exactly one word only: entailment, neutral, or contradiction. No explanation, no other text.<|im_end|>
<|im_start|>user
Premise: Katkınız, öğrencilerimize kaliteli bir eğitim sağlamamıza yardımcı oldu.
Hypothesis: Katkılarınızın öğrencilerimizin eğitimine faydası olmadı.
Label:<|im_end|>
<|im_start|>assistant
contradiction, Parsed Label: 2
Debug Sample 1: Generated: <|im_start|>system
You are a helpful assistant for natural language inference. Classify the relationship between premise and hypothesis as entailment, neutral, or contradiction. Respond with exactly one word only: entailment, neutral, or contradiction. No explanation, no other text.<|im_end|>
<|im_start|>user
Premise: Cevap onların nedeni ile ilgisi yoktur, ancak sözlükler bi-benzersiz ikame egzersizleri değil

Inference: 100%|██████████| 563/563 [24:55<00:00,  2.66s/it]


Debug Sample 0: Generated: <|im_start|>system
You are a helpful assistant for natural language inference. Classify the relationship between premise and hypothesis as entailment, neutral, or contradiction. Respond with exactly one word only: entailment, neutral, or contradiction. No explanation, no other text.<|im_end|>
<|im_start|>user
Premise: Herkese merhabalar! Az önce Türk Telekom Play Store üzerinden F1 2012 satın aldım fakat ülkenizde geçerli değil diyor? Ne yapmam lazım? Son düzenleyen: Moderatör: 16 Mayıs 2021.
Hypothesis: Su, hayat için önemlidir.
Label:<|im_end|>
<|im_start|>assistant
neutral, Parsed Label: 1
Debug Sample 1: Generated: <|im_start|>system
You are a helpful assistant for natural language inference. Classify the relationship between premise and hypothesis as entailment, neutral, or contradiction. Respond with exactly one word only: entailment, neutral, or contradiction. No explanation, no other text.<|im_end|>
<|im_start|>user
Premise: Savcının fitne - fesat içi

Inference: 100%|██████████| 577/577 [25:42<00:00,  2.67s/it]

Debug Sample 0: Generated: <|im_start|>system
You are a helpful assistant for natural language inference. Classify the relationship between premise and hypothesis as entailment, neutral, or contradiction. Respond with exactly one word only: entailment, neutral, or contradiction. No explanation, no other text.<|im_end|>
<|im_start|>user
Premise: Yaşamı boyunca pek çok zorluğun üstesinden gelen Ebisu sonunda çocukların, balıkçıların, refahın ve talihin koruyucu tanrısına dönüştü.
Hypothesis: Kiraz ağaçları Japonya'da yaygın olarak yetişir.
Label:<|im_end|>
<|im_start|>assistant
neutral, Parsed Label: 1
Debug Sample 1: Generated: <|im_start|>system
You are a helpful assistant for natural language inference. Classify the relationship between premise and hypothesis as entailment, neutral, or contradiction. Respond with exactly one word only: entailment, neutral, or contradiction. No explanation, no other text.<|im_end|>
<|im_start|>user
Premise: 2018 ve 2020 yılları arasında yavaş yavaş dev




In [None]:
# Summary: one line per evaluated config/split, in insertion order.
summary_rows = [
    (config_name, split_name, m)
    for config_name, splits in all_metrics.items()
    for split_name, m in splits.items()
]
for config_name, split_name, m in summary_rows:
    print(f"{config_name} / {split_name}: acc={m['accuracy']:.4f}, F1_macro={m['f1_macro']:.4f}, F1_per_class={m['f1_per_class']}")

snli_tr_1_1 / test: acc=0.4698, F1_macro=0.3897, F1_per_class={'entailment': 0.6842874219683889, 'neutral': 0.4510919017288444, 'contradiction': 0.03366396152690111}
multinli_tr_1_1 / validation_matched: acc=0.4933, F1_macro=0.4589, F1_per_class={'entailment': 0.6585723815877251, 'neutral': 0.4577833125778331, 'contradiction': 0.26044466161739555}
multinli_tr_1_1 / validation_mismatched: acc=0.4939, F1_macro=0.4588, F1_per_class={'entailment': 0.6706246773360868, 'neutral': 0.4378698224852071, 'contradiction': 0.26792635658914726}
trglue_mnli / test_matched: acc=0.6348, F1_macro=0.6143, F1_per_class={'entailment': 0.7308085977482088, 'neutral': 0.6385125396367829, 'contradiction': 0.4736966265628686}
trglue_mnli / test_mismatched: acc=0.6619, F1_macro=0.6454, F1_per_class={'entailment': 0.7805567330789883, 'neutral': 0.6244579358196011, 'contradiction': 0.5312007208830818}
