In [None]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
import evaluate
import numpy as np

# Pretrained checkpoint shared by the tokenizer and the classifier.
model_name = "microsoft/mdeberta-v3-base"

# Tokenizer plus a sequence-classification model with a 3-way output head,
# both loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
)

# MultiNLI dataset; its splits are tokenized and fed to the Trainer below.
mnli = load_dataset("nyu-mll/multi_nli")

# Preprocess
def preprocess(example):
    """Tokenize premise/hypothesis pairs with the module-level ``tokenizer``.

    This is passed to ``mnli.map(..., batched=True)``, so despite its name
    ``example`` is a batch: ``example["premise"]`` and
    ``example["hypothesis"]`` are read as parallel collections of texts.

    Args:
        example: Mapping with ``"premise"`` and ``"hypothesis"`` entries.

    Returns:
        Whatever ``tokenizer`` returns for the paired inputs, with truncation
        enabled and no padding requested here.
    """
    premises = example["premise"]
    hypotheses = example["hypothesis"]
    return tokenizer(premises, hypotheses, truncation=True)

# Tokenize the dataset in batches, then rename the target column from
# "label" to "labels".
encoded = mnli.map(preprocess, batched=True).rename_column("label", "labels")

# Expose only these three columns, formatted as torch tensors.
encoded.set_format(
    "torch",
    columns=["input_ids", "attention_mask", "labels"],
)

# Accuracy metric object consumed by compute_metrics.
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score a batch of evaluation outputs with the module-level ``accuracy``.

    Args:
        eval_pred: A two-element ``(logits, labels)`` pair, as handed over by
            the ``Trainer`` this function is registered with.

    Returns:
        The result of ``accuracy.compute`` for the arg-max class of each row
        of logits against the reference labels.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted_classes = np.argmax(scores, axis=-1)
    return accuracy.compute(predictions=predicted_classes, references=references)
# Collator that pads dynamically, batch by batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Hyperparameters and bookkeeping for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="./mdeberta-mnli",
    # --- optimisation ---
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    fp16=True,  # mixed precision; intended for GPU runs
    # --- evaluate and checkpoint once per epoch, keep at most 2 checkpoints ---
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # --- logging ---
    report_to="none",  # no external tracker such as wandb
)

# Train on the MNLI train split, evaluate on validation_matched.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=encoded["train"],
    eval_dataset=encoded["validation_matched"],
    compute_metrics=compute_metrics,
)

trainer.train()

# Write the fine-tuned weights and the tokenizer files to the same directory
# so that both can be reloaded with from_pretrained().
model.save_pretrained("mdeberta-v3-mnli-finetuned")
tokenizer.save_pretrained("mdeberta-v3-mnli-finetuned")



In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Reload the tokenizer and the fine-tuned classifier from the directory
# written by the training cell.
tokenizer = AutoTokenizer.from_pretrained("mdeberta-v3-mnli-finetuned")
model = AutoModelForSequenceClassification.from_pretrained(
    "mdeberta-v3-mnli-finetuned"
)

# Switch to inference mode before running predictions.
model.eval()

In [1]:
import os

import pandas as pd
import torch
from tqdm import tqdm

# Layout: <parent_folder>/<language_code>/test.csv, one sub-folder per language.
parent_folder = 'nli_dataset'
output_folder = 'nli_predicted_labels_mdeberta'
# Number of leading rows of each test.csv to score (length of devtest).
# The results-table cell divides by this same value.
num_eval_examples = 600

language_codes = sorted(
    name for name in os.listdir(parent_folder)
    if os.path.isdir(os.path.join(parent_folder, name))
)
languages_to_run = language_codes

# DataFrame.to_csv does not create missing directories, so make sure the
# destination exists before the (long) evaluation loop starts.
os.makedirs(output_folder, exist_ok=True)

labels = {}             # language code -> list of predicted class ids
result_accuracies = {}  # language code -> number of correct predictions

for language_code in languages_to_run:
    print(language_code)
    labels[language_code] = []
    accurate = 0

    df = pd.read_csv(os.path.join(parent_folder, language_code, "test.csv"))

    # Never index past the end of the file, and keep the saved frame the same
    # length as the prediction list so the DataFrame constructor cannot fail
    # on a length mismatch.
    n_eval = min(num_eval_examples, len(df))
    if n_eval < num_eval_examples:
        print("warning: {} has only {} rows (expected {})".format(
            language_code, n_eval, num_eval_examples))
    eval_df = df.iloc[:n_eval]

    examples = zip(eval_df['premise'], eval_df['hypothesis'], eval_df['label'])
    for premise, hypothesis, gold_label in tqdm(examples, total=n_eval):
        inputs = tokenizer(premise, hypothesis, return_tensors="pt", truncation=True)
        # Keep the inputs on the same device as the model's weights.
        inputs = inputs.to(model.device)
        with torch.no_grad():
            logits = model(**inputs).logits
            pred = torch.argmax(logits, dim=-1).item()

        labels[language_code].append(pred)
        # NOTE(review): assumes the CSV `label` column holds integer class ids
        # in the same order as the model's output indices -- confirm.
        if pred == gold_label:
            accurate += 1

    result_accuracies[language_code] = accurate

    # "gpt_label" holds this model's predictions; the column name is kept
    # unchanged in case downstream readers of these files rely on it.
    result_df = pd.DataFrame({
        "premise": eval_df['premise'],
        "hypothesis": eval_df['hypothesis'],
        "gpt_label": labels[language_code],
    })

    # Save to CSV
    result_df.to_csv(
        os.path.join(output_folder, "{}.csv".format(language_code)),
        index=False,
    )
    print(accurate)

print(result_accuracies)


In [7]:
# Correct-prediction counts per language code. The results-table cell divides
# each count by 600 to obtain an accuracy percentage.
# NOTE(review): presumably pasted from the evaluation cell's printed output so
# the table can be rebuilt without re-running inference -- confirm.
result_accuracies = {
    "amh": 442, "ara": 463, "asm": 418, "aym": 243, "ben": 463,
    "bul": 480, "bzd": 265, "cat": 488, "cni": 255, "deu": 482,
    "ell": 486, "eng": 530, "ewe": 231, "fra": 504, "grn": 260,
    "guj": 438, "hau": 414, "hch": 221, "hin": 468, "ibo": 403,
    "ind": 453, "jpn": 483, "kan": 467, "kin": 368, "kor": 474,
    "lin": 204, "lug": 284, "mal": 450, "mar": 430, "mya": 460,
    "nah": 254, "ori": 369, "orm": 300, "oto": 258, "pan": 451,
    "pat": 362, "pol": 459, "por": 542, "quy": 266, "ron": 320,
    "rus": 473, "shp": 281, "sna": 382, "sot": 358, "spa": 495,
    "swa": 418, "tam": 456, "tar": 220, "tel": 443, "tha": 446,
    "tur": 463, "twi": 276, "urd": 418, "vie": 474, "wol": 246,
    "xho": 390, "yor": 256, "zho": 473, "zul": 398,
}


In [None]:
# from transformers import AutoTokenizer, AutoModelForSequenceClassification
# import torch

# model = AutoModelForSequenceClassification.from_pretrained("mdeberta-v3-mnli-finetuned")
# tokenizer = AutoTokenizer.from_pretrained("mdeberta-v3-mnli-finetuned")
# model.eval()
# # Example premise/hypothesis pair
# premise = "The cat sat on the mat."
# hypothesis = "A cat is sitting."

# inputs = tokenizer(premise, hypothesis, return_tensors="pt", truncation=True)
# with torch.no_grad():
#     logits = model(**inputs).logits
#     pred = torch.argmax(logits, dim=-1).item()

# label_map = {0: "entailment", 1: "neutral", 2: "contradiction"}
# print(f"Prediction: {label_map[pred]}")


In [5]:
# Display name for each three-letter language code used as a key in
# `result_accuracies`; looked up when the results table is built.
lang_code_to_name = {
    "amh": "Amharic",
    "ara": "Arabic",
    "asm": "Assamese",
    "aym": "Aymara",
    "ben": "Bengali",
    "bul": "Bulgarian",
    "bzd": "Bribri",
    "cat": "Catalan",
    "cni": "Asháninka",
    "deu": "German",
    "ell": "Greek",
    "eng": "English",
    "ewe": "Ewe",
    "fra": "French",
    "grn": "Guarani",
    "guj": "Gujarati",
    "hau": "Hausa",
    "hch": "Wixarika",
    "hin": "Hindi",
    "ibo": "Igbo",
    "ind": "Indonesian",
    "jpn": "Japanese",
    "kan": "Kannada",
    "kin": "Kinyarwanda",
    "kor": "Korean",
    "lin": "Lingala",
    "lug": "Luganda",
    "mal": "Malayalam",
    "mar": "Marathi",
    "mya": "Burmese",
    "nah": "Nahuatl",
    "ori": "Odia (Oriya)",
    "orm": "Oromo",
    "oto": "Otomi",
    "pan": "Punjabi",
    "pat": "Jamaican Patois",
    "pol": "Polish",
    "por": "Portuguese",
    "quy": "Quechua",
    "ron": "Romanian",
    "rus": "Russian",
    "shp": "Shipibo-Conibo",
    "sna": "chiShona",
    "sot": "Sesotho",
    "spa": "Spanish",
    "swa": "Swahili",
    "tam": "Tamil",
    "tar": "Rarámuri",
    "tel": "Telugu",
    "tha": "Thai",
    "tur": "Turkish",
    "twi": "Twi",
    "urd": "Urdu",
    "vie": "Vietnamese",
    "wol": "Wolof",
    "xho": "isiXhosa",
    "yor": "Yoruba",
    "zho": "Chinese",
    "zul": "isiZulu",
}


In [8]:
import pandas as pd
# Build the three columns of the results table, one entry per language and in
# the order given by `language_codes`.
language_names = [lang_code_to_name[code] for code in language_codes]
# Correct-prediction counts converted to a percentage of the 600 evaluated
# examples, rounded to one decimal place.
accuracy_percentages = [
    round(result_accuracies[code] * 100 / 600, 1) for code in language_codes
]

df = pd.DataFrame({
    "Language name": language_names,
    "Language code": language_codes,
    "Accuracy": accuracy_percentages,
})

# Write the table to disk without the index column.
df.to_csv("nli_results_mDeBERTa.csv", index=False)