# Surface-level Metrics

## BLEU (Bilingual Evaluation Understudy)

In [7]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from math import exp, log

In [12]:
def compute_bleu(reference_tokens, hypothesis_tokens, weights=(0.5, 0.5)):
    """
    Score one hypothesis against a single reference with smoothed sentence-level BLEU.

    Args:
        reference_tokens (list): Tokens of the reference sentence (e.g., ['रामः', 'वनं', 'गच्छति'])
        hypothesis_tokens (list): Tokens of the hypothesis sentence
        weights (tuple): One weight per n-gram order, starting at unigrams;
            the default (0.5, 0.5) weights unigrams and bigrams equally

    Returns:
        float: BLEU score
    """
    # sentence_bleu takes a list of references; only one is available here.
    references = [reference_tokens]
    return sentence_bleu(
        references,
        hypothesis_tokens,
        weights=weights,
        smoothing_function=SmoothingFunction().method1,
    )

In [13]:
# Reference and Hypothesis tokens (Sanskrit)
reference = ['रामः', 'वनं', 'गच्छति']
hypothesis = ['गच्छति', 'रामः', 'वनं']

# Compute BLEU with bigram weights (unigram + bigram)
score = compute_bleu(reference, hypothesis, weights=(0.5, 0.5))

print(f"BLEU Score: {score:.3f}")


BLEU Score: 0.707


## METEOR

In [14]:
def compute_meteor(reference_tokens, hypothesis_tokens, gamma=0.5, beta=3.0):
    """
    Compute a simplified METEOR score for one reference-hypothesis pair.

    Only exact unigram matches are used (no stemming or synonyms). Each
    hypothesis token is aligned to at most one reference token - the earliest
    occurrence that has not been used yet - so a repeated token can never be
    matched more often than it occurs in the reference and precision/recall
    stay within [0, 1].

    Args:
        reference_tokens (list): Tokenized reference sentence (hashable tokens)
        hypothesis_tokens (list): Tokenized hypothesis sentence (hashable tokens)
        gamma (float): Penalty weight (default 0.5)
        beta (float): Penalty exponent (default 3.0)

    Returns:
        float: METEOR score rounded to 3 decimals (0.0 when nothing matches)
    """
    # Step 1: One-to-one unigram alignment.
    # Collect the positions of every reference token so duplicates can each be used once.
    free_positions = {}
    for ref_idx, token in enumerate(reference_tokens):
        free_positions.setdefault(token, []).append(ref_idx)
    # Reverse each list so pop() hands out the earliest unused occurrence first.
    for slots in free_positions.values():
        slots.reverse()

    aligned = []  # (hypothesis index, reference index) pairs in hypothesis order
    for hyp_idx, token in enumerate(hypothesis_tokens):
        slots = free_positions.get(token)
        if slots:
            aligned.append((hyp_idx, slots.pop()))

    m = len(aligned)
    if m == 0:
        return 0.0

    # Step 2: Precision and Recall
    precision = m / len(hypothesis_tokens)
    recall = m / len(reference_tokens)

    # Step 3: Recall-weighted harmonic mean (recall weighted 9:1)
    f_mean = (10 * precision * recall) / (9 * precision + recall)

    # Step 4: Chunks - a chunk continues only while matched tokens are adjacent
    # in BOTH the hypothesis and the reference.
    chunks = 1
    for (prev_hyp, prev_ref), (cur_hyp, cur_ref) in zip(aligned, aligned[1:]):
        if cur_hyp != prev_hyp + 1 or cur_ref != prev_ref + 1:
            chunks += 1

    # Step 5: Fragmentation penalty
    penalty = gamma * (chunks / m) ** beta

    # Step 6: Final METEOR
    return round(f_mean * (1 - penalty), 3)

In [16]:
reference = ['रामः', 'वनं', 'गच्छति']
hypothesis = ['गच्छति', 'रामः', 'वनं']

score = compute_meteor(reference, hypothesis)
print(f"METEOR Score: {score}")

METEOR Score: 0.333


## ROUGE

### ROUGE-N, ROUGE-L, ROUGE-W, ROUGE-S and ROUGE-SU

In [17]:
from itertools import combinations
from collections import Counter

In [18]:
def lcs(X, Y):
    """Return the length of the longest common subsequence of X and Y."""
    # Rolling-row dynamic programme: `previous` holds the finished row for the
    # prefix of X processed so far, `current` is the row being filled in.
    previous = [0] * (len(Y) + 1)
    for x_item in X:
        current = [0]
        for col, y_item in enumerate(Y, start=1):
            if x_item == y_item:
                current.append(previous[col - 1] + 1)
            else:
                current.append(max(previous[col], current[col - 1]))
        previous = current
    return previous[-1]

def compute_rouge_n(reference, hypothesis, n):
    """
    ROUGE-N recall: clipped n-gram overlap divided by the number of reference n-grams.

    Returns 0.0 when the reference has no n-grams of order n; otherwise the
    ratio rounded to 3 decimals.
    """
    def ngram_counts(tokens):
        # Multiset of all contiguous n-token windows.
        return Counter(tuple(tokens[start:start + n]) for start in range(len(tokens) - n + 1))

    ref_counts = ngram_counts(reference)
    hyp_counts = ngram_counts(hypothesis)

    reference_total = sum(ref_counts.values())
    if reference_total <= 0:
        return 0.0

    # Counter intersection keeps the minimum count per n-gram (clipping).
    overlap = sum((ref_counts & hyp_counts).values())
    return round(overlap / reference_total, 3)

def compute_rouge_l(reference, hypothesis, beta=1.2):
    """
    ROUGE-L: F-measure built from the longest common subsequence (LCS).

    Recall is LCS / len(reference), precision is LCS / len(hypothesis); beta
    sets the weight of recall in the F-measure. Returns 0.0 when either side
    is empty or nothing is shared, otherwise the score rounded to 3 decimals.
    """
    common = lcs(reference, hypothesis)
    ref_len = len(reference)
    hyp_len = len(hypothesis)

    recall = common / ref_len if ref_len else 0.0
    precision = common / hyp_len if hyp_len else 0.0
    if not (recall and precision):
        return 0.0

    beta_sq = beta**2
    f_score = (1 + beta_sq) * recall * precision / (recall + beta_sq * precision)
    return round(f_score, 3)

def compute_rouge_w(reference, hypothesis, beta=1.2):
    """
    Simplified ROUGE-W: favour consecutive matches by squaring run lengths.

    The reference is scanned left to right. For each reference token, the next
    equal token at or after the current hypothesis position is located and the
    run of consecutive equal tokens starting there is measured. The weighted
    score is the sum of squared run lengths, normalised by the squared length
    of the reference (recall) and of the hypothesis (precision).

    A reference token that does not occur in the remaining hypothesis is
    skipped WITHOUT consuming the hypothesis, so later reference tokens can
    still be matched.

    Args:
        reference (list): Tokenized reference sentence
        hypothesis (list): Tokenized hypothesis sentence
        beta (float): Weight of recall in the F-measure (default 1.2)

    Returns:
        float: Score rounded to 3 decimals, or 0.0 when nothing matches
    """
    ref_len = len(reference)
    hyp_len = len(hypothesis)
    run_lengths = []

    i = j = 0
    while i < ref_len:
        # Look ahead for reference[i] without moving j yet.
        probe = j
        while probe < hyp_len and hypothesis[probe] != reference[i]:
            probe += 1
        if probe == hyp_len:
            # Not found: leave j where it was so the rest of the reference
            # can still align against the unconsumed hypothesis.
            i += 1
            continue

        # Found: consume the whole run of consecutive matches.
        j = probe
        length = 0
        while i < ref_len and j < hyp_len and reference[i] == hypothesis[j]:
            length += 1
            i += 1
            j += 1
        run_lengths.append(length)

    wlcs = sum(length**2 for length in run_lengths)
    recall = wlcs / ref_len**2 if ref_len else 0.0
    precision = wlcs / hyp_len**2 if hyp_len else 0.0
    if recall == 0 or precision == 0:
        return 0.0
    f1 = (1 + beta**2) * recall * precision / (recall + beta**2 * precision)
    return round(f1, 3)

def skip_bigrams(tokens):
    """Return the set of ordered pairs (tokens[i], tokens[j]) with i < j."""
    items = list(tokens)
    pairs = set()
    for position, first in enumerate(items):
        for second in items[position + 1:]:
            pairs.add((first, second))
    return pairs

def compute_rouge_s(reference, hypothesis, beta=1.0):
    """
    ROUGE-S: F-measure over the sets of skip-bigrams of the two sentences.

    Returns 0.0 when either sentence has no skip-bigrams or none are shared,
    otherwise the score rounded to 3 decimals.
    """
    reference_pairs = skip_bigrams(reference)
    hypothesis_pairs = skip_bigrams(hypothesis)
    shared = len(reference_pairs & hypothesis_pairs)

    recall = shared / len(reference_pairs) if reference_pairs else 0.0
    precision = shared / len(hypothesis_pairs) if hypothesis_pairs else 0.0
    if not (recall and precision):
        return 0.0

    beta_sq = beta**2
    f_score = (1 + beta_sq) * precision * recall / (recall + beta_sq * precision)
    return round(f_score, 3)

def compute_rouge_su(reference, hypothesis, beta=1.0):
    """
    ROUGE-SU: like ROUGE-S, but unigrams are counted alongside skip-bigrams.

    Both skip-bigrams and unigrams are compared as sets. Returns 0.0 when
    precision or recall is zero, otherwise the F-measure rounded to 3 decimals.
    """
    reference_pairs = skip_bigrams(reference)
    hypothesis_pairs = skip_bigrams(hypothesis)
    reference_words = set(reference)
    hypothesis_words = set(hypothesis)

    # Shared units: skip-bigrams plus unigrams.
    hits = len(reference_pairs & hypothesis_pairs) + len(reference_words & hypothesis_words)
    hypothesis_total = len(hypothesis_pairs) + len(hypothesis_words)
    reference_total = len(reference_pairs) + len(reference_words)

    p = hits / hypothesis_total if hypothesis_total > 0 else 0.0
    r = hits / reference_total if reference_total > 0 else 0.0
    if not (p and r):
        return 0.0

    beta_sq = beta**2
    f_score = (1 + beta_sq) * p * r / (r + beta_sq * p)
    return round(f_score, 3)

In [19]:
reference = ['रामः', 'वनं', 'गच्छति']
hypothesis = ['गच्छति', 'रामः', 'वनं']

In [23]:
# Example: ROUGE-N
print("Unigrams",compute_rouge_n(reference, hypothesis, n=1))
print("Bigrams",compute_rouge_n(reference, hypothesis, n=2))

Unigrams 1.0
Bigrams 0.5


In [24]:
# Example: ROUGE-L
compute_rouge_l(reference, hypothesis)

0.667

In [25]:
# Example: ROUGE-W
compute_rouge_w(reference, hypothesis)

0.444

In [26]:
# Example: ROUGE-S
compute_rouge_s(reference, hypothesis)

0.333

In [27]:
# Example: ROUGE-SU
compute_rouge_su(reference, hypothesis)

0.667

## ChrF 

In [28]:
from itertools import combinations
from collections import Counter

In [29]:
def get_char_ngrams(text, n):
    """Return every contiguous length-n substring of text, in order of appearance."""
    return [text[i:i+n] for i in range(len(text) - n + 1)]

def compute_chrf(reference, hypothesis, n=3, beta=2):
    """
    Simplified chrF: character n-gram F-score for a single n-gram order.

    Tokens are concatenated without separators before n-grams are extracted.
    Matches are counted with clipping (each n-gram matches at most as often as
    it occurs on the other side), so repeated n-grams are counted consistently
    in the numerator and in the precision/recall denominators.

    Args:
        reference (list): Reference tokens (strings)
        hypothesis (list): Hypothesis tokens (strings)
        n (int): Character n-gram order (default 3)
        beta (float): Weight of recall relative to precision (default 2)

    Returns:
        float: chrF score rounded to 3 decimals, or 0.0 when nothing matches
    """
    ref_counts = Counter(get_char_ngrams(''.join(reference), n))
    hyp_counts = Counter(get_char_ngrams(''.join(hypothesis), n))

    # Counter intersection keeps the minimum count per n-gram (clipped matches).
    match = sum((ref_counts & hyp_counts).values())
    hyp_total = sum(hyp_counts.values())
    ref_total = sum(ref_counts.values())

    precision = match / hyp_total if hyp_total else 0.0
    recall = match / ref_total if ref_total else 0.0

    if precision == 0 or recall == 0:
        return 0.0

    chrf = (1 + beta**2) * precision * recall / (beta**2 * precision + recall)
    return round(chrf, 3)

In [32]:
reference = ['रामः', 'वनं', 'गच्छति।']
hypothesis = ['रामः', 'वनं', 'गतः।']

score = compute_chrf(reference, hypothesis, n=3, beta=2)
print(f"ChrF Score: {score}")

ChrF Score: 0.526


## ChrF++ 

In [33]:
from collections import Counter

In [34]:
def get_char_ngrams(text, n):
    """Return all contiguous length-n character windows of text, left to right."""
    window_count = len(text) - n + 1
    return [text[start:start + n] for start in range(window_count)]

def get_word_ngrams(tokens, n):
    """Return all contiguous n-token windows of tokens, each joined by a single space."""
    windows = []
    for start in range(len(tokens) - n + 1):
        windows.append(' '.join(tokens[start:start + n]))
    return windows

def compute_chrf_plus(reference_tokens, hypothesis_tokens, n_char=6, n_word=2, beta=2):
    """
    Simplified chrF++: F-score over one character n-gram order and one word n-gram order.

    Character n-grams are taken from the tokens concatenated without
    separators; word n-grams from the token lists. Precision and recall are
    computed separately for both levels with clipped counts, averaged across
    the two levels, and combined into an F-beta score.

    Returns:
        float: Score rounded to 3 decimals, or 0.0 when the averaged
        precision or recall is zero.
    """
    def clipped_precision_recall(reference_items, hypothesis_items):
        # Precision and recall of clipped (minimum-count) matches between two n-gram lists.
        reference_counts = Counter(reference_items)
        hypothesis_counts = Counter(hypothesis_items)
        matched = sum((reference_counts & hypothesis_counts).values())
        prec = matched / sum(hypothesis_counts.values()) if hypothesis_counts else 0
        rec = matched / sum(reference_counts.values()) if reference_counts else 0
        return prec, rec

    # Character level
    prec_char, rec_char = clipped_precision_recall(
        get_char_ngrams(''.join(reference_tokens), n_char),
        get_char_ngrams(''.join(hypothesis_tokens), n_char),
    )

    # Word level
    prec_word, rec_word = clipped_precision_recall(
        get_word_ngrams(reference_tokens, n_word),
        get_word_ngrams(hypothesis_tokens, n_word),
    )

    # Unweighted mean of the two levels
    precision = (prec_char + prec_word) / 2
    recall = (rec_char + rec_word) / 2
    if not (precision and recall):
        return 0.0

    beta_sq = beta**2
    return round((1 + beta_sq) * precision * recall / (beta_sq * precision + recall), 3)

In [37]:
reference = ['रामः', 'वनं', 'गच्छति।']
hypothesis = ['रामः', 'वनं', 'गतः।']

score = compute_chrf_plus(reference, hypothesis)
print(f"ChrF++ Score: {score}")

ChrF++ Score: 0.431


## Translation Edit Rate (TER)

In [41]:
def compute_ter(reference_tokens, hypothesis_tokens):
    """
    Simplified TER: token edits needed to turn the reference into the
    hypothesis, divided by the reference length.

    Edits are taken from difflib's alignment: every non-equal opcode costs
    max(reference span, hypothesis span) edits, i.e. insertions, deletions and
    substitutions cost 1 per token. Block shifts are NOT modelled - a moved
    word is charged as one deletion plus one insertion - so this is an upper
    bound on true TER whenever reordering is involved.

    Args:
        reference_tokens (list): Tokenized reference sentence
        hypothesis_tokens (list): Tokenized hypothesis sentence

    Returns:
        float: TER score rounded to 3 decimals. For an empty reference the
        result is 1.0 if the hypothesis is non-empty, else 0.0.
    """
    from difflib import SequenceMatcher

    ref_len = len(reference_tokens)
    if ref_len == 0:
        return 1.0 if len(hypothesis_tokens) > 0 else 0.0

    # autojunk=False: the default heuristic ignores "popular" tokens once the
    # second sequence has 200+ items, which hides real matches in long
    # sentences and inflates the edit count.
    matcher = SequenceMatcher(None, reference_tokens, hypothesis_tokens, autojunk=False)

    edits = sum(
        max(i2 - i1, j2 - j1)
        for tag, i1, i2, j1, j2 in matcher.get_opcodes()
        if tag != 'equal'
    )

    return round(edits / ref_len, 3)

In [42]:
ref = ['रामः', 'वनं', 'गच्छति']
hyp1 = ['रामः', 'वनं', 'गच्छति']  # identical to the reference -> TER = 0
hyp2 = ['वनं', 'रामः', 'गच्छति']  # swapped words: charged as 1 insertion + 1 deletion -> TER = 2/3 ≈ 0.667
hyp3 = ['रामः', 'गच्छति']         # one word deleted -> TER = 1/3 ≈ 0.333

print("TER (Perfect Match):", compute_ter(ref, hyp1))
print("TER (Shift):", compute_ter(ref, hyp2))
print("TER (Missing Word):", compute_ter(ref, hyp3))

TER (Perfect Match): 0.0
TER (Shift): 0.667
TER (Missing Word): 0.333


## TER-M (Translation Edit Rate with Morphology)

In [44]:
from difflib import SequenceMatcher

In [45]:
# For demonstration, we simulate a morphological normalization function.
def simple_lemmatizer(tokens):
    """
    Simulated lemmatizer for Sanskrit-like text.
    Replace this with an actual lemmatizer for production.

    Tokens found in the small built-in table are replaced by their lemma;
    every other token is returned unchanged.
    """
    lemma_map = {
        'रामः': 'राम',
        'रामं': 'राम',
        'गच्छति': 'गम्',
        'गतः': 'गम्',
        'वनं': 'वन',
        'गच्छामि': 'गम्'
    }
    return [lemma_map.get(token, token) for token in tokens]

def compute_ter_m(reference_tokens, hypothesis_tokens):
    """
    Computes TER-M (Translation Edit Rate with Morphology Awareness)
    by lemmatizing both reference and hypothesis before computing TER.

    Edits come from difflib's alignment of the lemma sequences: each non-equal
    opcode costs max(reference span, hypothesis span). Word order still
    matters - a moved word is charged as a deletion plus an insertion.

    Args:
        reference_tokens (list): Tokenized reference sentence
        hypothesis_tokens (list): Tokenized hypothesis sentence

    Returns:
        float: Score rounded to 3 decimals. For an empty reference the result
        is 1.0 if the hypothesis is non-empty, else 0.0.
    """
    norm_ref = simple_lemmatizer(reference_tokens)
    norm_hyp = simple_lemmatizer(hypothesis_tokens)

    ref_len = len(norm_ref)
    if ref_len == 0:
        return 1.0 if len(norm_hyp) > 0 else 0.0

    # autojunk=False: the default heuristic ignores "popular" tokens once the
    # second sequence has 200+ items, which hides real matches in long
    # sentences and inflates the edit count.
    matcher = SequenceMatcher(None, norm_ref, norm_hyp, autojunk=False)
    edits = sum(
        max(i2 - i1, j2 - j1)
        for tag, i1, i2, j1, j2 in matcher.get_opcodes()
        if tag != 'equal'
    )

    return round(edits / ref_len, 3)

In [47]:
ref = ['रामः', 'वनं', 'गच्छति']
hyp1 = ['वनं', 'गच्छति', 'रामः']  # Same words, different order
hyp2 = ['वनं', 'गच्छति', 'रामं']   # Different order, and one different surface form with the same lemma

# Lemmatization removes the surface-form difference, but reordering is still
# charged (one deletion + one insertion), so both cases score 2/3.
print("TER-M (Order Change):", compute_ter_m(ref, hyp1))  # 0.667
print("TER-M (Lemma Match):", compute_ter_m(ref, hyp2))  # 0.667

TER-M (Order Change): 0.667
TER-M (Lemma Match): 0.667


## Exact Match (EM)

In [48]:
def compute_exact_match(reference, hypothesis):
    """
    Computes Exact Match (EM) score for a single example.

    Token lists are joined with single spaces; both sides are stripped of
    leading/trailing whitespace before comparison.

    Args:
        reference (list or str): Reference tokens or sentence string
        hypothesis (list or str): Predicted tokens or sentence string

    Returns:
        int: 1 if exact match, else 0
    """
    if isinstance(reference, list):
        reference = ' '.join(reference)
    if isinstance(hypothesis, list):
        hypothesis = ' '.join(hypothesis)

    return int(reference.strip() == hypothesis.strip())

def compute_exact_match_score(references, hypotheses):
    """
    Computes Exact Match score for a dataset of predictions.

    Args:
        references (list): List of reference strings or token lists
        hypotheses (list): List of predicted strings or token lists

    Returns:
        float: EM score as a proportion (0 to 1), rounded to 3 decimals;
        0.0 for an empty dataset

    Raises:
        ValueError: If references and hypotheses differ in length. Pairing
            them with zip would otherwise silently drop the unpaired items
            and report a misleading score.
    """
    total = len(references)
    if total != len(hypotheses):
        raise ValueError("references and hypotheses must have the same length")
    if total == 0:
        return 0.0

    matches = sum(compute_exact_match(r, h) for r, h in zip(references, hypotheses))
    return round(matches / total, 3)

In [49]:
# Example
ref = ['रामः वनं गच्छति']
hyp1 = ['रामः वनं गच्छति']     # Exact match
hyp2 = ['गच्छति रामः वनं']     # Order mismatch
hyp3 = ['रामः गच्छति']         # Missing word

refs = ref * 3
hyps = [hyp1[0], hyp2[0], hyp3[0]]
print("Exact Match Score:", compute_exact_match_score(refs, hyps))  # Expected: 0.333

Exact Match Score: 0.333


# Embedding-based Metrics

## BERTScore

In [97]:
from bert_score import score

In [98]:
def compute_bertscore(reference_list, hypothesis_list, lang='en', model_type='xlm-roberta-base', verbose=False):
    """
    Corpus-average BERTScore precision, recall and F1.

    Args:
        reference_list (list): Reference sentences (strings)
        hypothesis_list (list): Hypothesis sentences (strings), aligned with reference_list
        lang (str): Language code forwarded to bert_score (e.g., 'en', 'hi', 'sa')
        model_type (str): HuggingFace model identifier forwarded to bert_score
        verbose (bool): Forwarded to bert_score

    Returns:
        tuple: (precision, recall, f1), each the mean over all sentence pairs
        rounded to 4 decimals
    """
    # bert_score.score expects candidates first, references second.
    per_sentence = score(
        hypothesis_list,
        reference_list,
        lang=lang,
        model_type=model_type,
        verbose=verbose,
    )
    precision, recall, f1 = (round(values.mean().item(), 4) for values in per_sentence)
    return precision, recall, f1

In [99]:
# Example usage
ref = ['रामः वनं गच्छति।']
hyp = ['रामः वनं गतः।']

p, r, f1 = compute_bertscore(ref, hyp, lang='sa', model_type='xlm-roberta-base')
print(f"BERTScore-P: {p}")
print(f"BERTScore-R: {r}")
print(f"BERTScore-F1: {f1}")

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

BERTScore-P: 0.964
BERTScore-R: 0.9264
BERTScore-F1: 0.9448


## COMET

In [3]:
from comet import download_model, load_from_checkpoint
import torch

In [4]:
# COMET Scorer
class COMETScorer:
    """Thin wrapper that downloads a COMET checkpoint once and scores translation triples."""

    def __init__(self, model_name="Unbabel/wmt22-comet-da"):
        checkpoint = download_model(model_name)
        self.model_path = checkpoint
        self.model = load_from_checkpoint(checkpoint)

    def score(self, sources, hypotheses, references):
        """Return one score per (source, hypothesis, reference) triple, rounded to 4 decimals."""
        samples = []
        for src, mt, ref in zip(sources, hypotheses, references):
            samples.append({"src": src, "mt": mt, "ref": ref})

        # Use a single GPU when CUDA is available, otherwise run on CPU.
        gpu_count = 1 if torch.cuda.is_available() else 0
        prediction = self.model.predict(samples, batch_size=1, gpus=gpu_count)
        return [round(value, 4) for value in prediction["scores"]]

# ==== Example Data ====
srcs = ["राम जंगल में गया।"]                    # Source in Hindi
hyps = ["रामः वनं गतः।"]                        # Hypothesis in Sanskrit (system output)
refs = ["रामः वनं गच्छति।"]                     # Reference in Sanskrit (human translation)

# ==== Compute COMET Score ====
comet = COMETScorer()
print("COMET Score:", comet.score(srcs, hyps, refs))

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.5.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\Joshuva\.cache\huggingface\hub\models--Unbabel--wmt22-comet-da\snapshots\2760a223ac957f30acfb18c8aa649b01cf1d75f2\checkpoints\model.ckpt`
Encoder model frozen.
C:\Users\Joshuva\AppData\Roaming\Python\Python312\site-packages\pytorch_lightning\core\saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Predicting DataLoader 0: 100%|███████████████████████████████████████████████████████████| 1/1 [00:02<00:

COMET Score: [0.8752]





## LaBSE (Language-agnostic BERT Sentence Embedding)

In [102]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

In [103]:
class LaBSEScorer:
    """Cosine similarity between sentence embeddings taken from a LaBSE encoder."""

    def __init__(self, model_name='sentence-transformers/LaBSE'):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        encoder = AutoModel.from_pretrained(model_name)
        encoder.eval()  # inference only
        self.model = encoder

    def embed(self, sentence):
        """Return the L2-normalised hidden state at the first token position."""
        with torch.no_grad():
            encoded = self.tokenizer(sentence, return_tensors='pt', padding=True, truncation=True)
            hidden_states = self.model(**encoded).last_hidden_state
            first_token = hidden_states[:, 0, :]
            return F.normalize(first_token, p=2, dim=1)

    def score(self, ref_sentence, hyp_sentence):
        """Return the cosine similarity of the two sentence embeddings, rounded to 4 decimals."""
        reference_vec = self.embed(ref_sentence)
        hypothesis_vec = self.embed(hyp_sentence)
        similarity = F.cosine_similarity(reference_vec, hypothesis_vec)
        return round(similarity.item(), 4)

In [104]:
# Example
scorer = LaBSEScorer()
reference = "रामः वनं गच्छति।"
hypothesis = "रामः वनं गतः।"
score = scorer.score(reference, hypothesis)
print(f"LaBSE Cosine Similarity: {score}")

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   6%|5         | 105M/1.88G [00:00<?, ?B/s]

LaBSE Cosine Similarity: 0.96


## YiSiScorer

In [1]:
import fasttext
import fasttext.util
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

In [2]:
class YiSiScorer:
    """
    Simplified YiSi-style similarity scores at four levels:

    - YiSi-0: exact token overlap (sets)
    - YiSi-1: best-match cosine similarity of static FastText word vectors
    - YiSi-2: best-match cosine similarity of contextual token embeddings
    - YiSi-3: cosine similarity of first-token sentence embeddings

    All token-level scores return 0.0 when either side is empty.
    """

    def __init__(self, contextual_model='sentence-transformers/LaBSE'):
        # Load contextual model (LaBSE)
        self.tokenizer = AutoTokenizer.from_pretrained(contextual_model)
        self.model = AutoModel.from_pretrained(contextual_model)
        self.model.eval()

        # Load FastText Sanskrit model
        fasttext.util.download_model('sa', if_exists='ignore')
        self.static_model = fasttext.load_model('cc.sa.300.bin')

    def cosine_similarity(self, vec1, vec2):
        """Cosine similarity of two vectors; the epsilon avoids division by zero."""
        return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2) + 1e-8)

    def compute_yisi_0(self, ref_tokens, hyp_tokens):
        """F1 over the sets of distinct tokens, rounded to 3 decimals."""
        ref_set = set(ref_tokens)
        hyp_set = set(hyp_tokens)
        match = len(ref_set & hyp_set)
        P = match / len(hyp_set) if hyp_set else 0.0
        R = match / len(ref_set) if ref_set else 0.0
        return round((2 * P * R) / (P + R), 3) if (P + R) else 0.0

    def compute_yisi_1(self, ref_tokens, hyp_tokens):
        """
        F1 of best-match cosine similarities between static word vectors,
        rounded to 3 decimals.
        """
        # An empty side has no best match; mirror compute_yisi_0 and score 0.0
        # instead of failing on max() of an empty sequence / division by zero.
        if not ref_tokens or not hyp_tokens:
            return 0.0

        # Look every vector up once instead of once per comparison.
        ref_vecs = [self.static_model.get_word_vector(w) for w in ref_tokens]
        hyp_vecs = [self.static_model.get_word_vector(w) for w in hyp_tokens]

        def max_sim(vec, candidates):
            return max(self.cosine_similarity(vec, other) for other in candidates)

        P = sum(max_sim(v, ref_vecs) for v in hyp_vecs) / len(hyp_vecs)
        R = sum(max_sim(v, hyp_vecs) for v in ref_vecs) / len(ref_vecs)
        return round((2 * P * R) / (P + R), 3) if (P + R) else 0.0

    def compute_yisi_2(self, ref_tokens, hyp_tokens):
        """
        F1 of best-match cosine similarities between contextual embeddings,
        rounded to 3 decimals.
        """
        # Stacking zero embeddings would raise; treat an empty side as no match.
        if not ref_tokens or not hyp_tokens:
            return 0.0

        with torch.no_grad():
            inputs_ref = self.tokenizer(ref_tokens, return_tensors='pt', is_split_into_words=True, padding=True, truncation=True)
            inputs_hyp = self.tokenizer(hyp_tokens, return_tensors='pt', is_split_into_words=True, padding=True, truncation=True)
            # Skip position 0 and keep as many positions as there are words.
            # NOTE(review): this assumes one model token per word; if the
            # tokenizer splits words into several subwords the slice covers only
            # the first len(tokens) subword positions - confirm, or pool by word.
            ref_embs = self.model(**inputs_ref).last_hidden_state[0][1:1+len(ref_tokens)]
            hyp_embs = self.model(**inputs_hyp).last_hidden_state[0][1:1+len(hyp_tokens)]

            P = torch.stack([F.cosine_similarity(h.unsqueeze(0), ref_embs).max() for h in hyp_embs]).mean().item()
            R = torch.stack([F.cosine_similarity(r.unsqueeze(0), hyp_embs).max() for r in ref_embs]).mean().item()
            return round((2 * P * R) / (P + R), 3) if (P + R) else 0.0

    def compute_yisi_3(self, ref_sentence, hyp_sentence):
        """Cosine similarity of the first-token hidden states of two sentences, rounded to 4 decimals."""
        with torch.no_grad():
            inputs_ref = self.tokenizer(ref_sentence, return_tensors='pt', padding=True, truncation=True)
            inputs_hyp = self.tokenizer(hyp_sentence, return_tensors='pt', padding=True, truncation=True)
            ref_emb = self.model(**inputs_ref).last_hidden_state[:, 0, :]
            hyp_emb = self.model(**inputs_hyp).last_hidden_state[:, 0, :]
            sim = F.cosine_similarity(ref_emb, hyp_emb).item()
            return round(sim, 4)

# ========== Test Example ==========
scorer = YiSiScorer()

# Token level
ref_tokens = ['रामः', 'वनं', 'गच्छति']
hyp_tokens = ['रामः', 'अरण्यं', 'याति']

# Sentence level
ref_sentence = "रामः वनं गच्छति।"
hyp_sentence = "Rama goes to the forest."

print("YiSi-0:", scorer.compute_yisi_0(ref_tokens, hyp_tokens))
print("YiSi-1:", scorer.compute_yisi_1(ref_tokens, hyp_tokens))
print("YiSi-2:", scorer.compute_yisi_2(ref_tokens, hyp_tokens))
print("YiSi-3:", scorer.compute_yisi_3(ref_sentence, hyp_sentence))

YiSi-0: 0.333
YiSi-1: 0.579
YiSi-2: 0.915
YiSi-3: 0.7757


## X(NLI)-R

In [108]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

In [109]:
class XNLIRScorer:
    """
    X(NLI)-R: share of hypotheses that an NLI classifier labels as entailed
    by the reference.
    """

    # Fallback entailment id. It matches the order used for this same
    # checkpoint elsewhere in this file (0: contradiction, 1: neutral,
    # 2: entailment); __init__ replaces it with the id read from the loaded
    # model's config whenever the config names its labels.
    entailment_id = 2

    def __init__(self, model_name="joeddav/xlm-roberta-large-xnli"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.model.eval()
        self.entailment_id = self._find_entailment_id(self.model.config, default=self.entailment_id)

    @staticmethod
    def _find_entailment_id(config, default=2):
        """Return the class id whose label name starts with 'entail', or default if none does."""
        id2label = getattr(config, "id2label", None) or {}
        for label_id, label_name in id2label.items():
            if str(label_name).lower().startswith("entail"):
                return int(label_id)
        return default

    def predict_label(self, premise, hypothesis):
        """Return the id of the most probable NLI class for (premise, hypothesis)."""
        inputs = self.tokenizer(premise, hypothesis, return_tensors='pt', truncation=True)
        with torch.no_grad():
            logits = self.model(**inputs).logits
        probs = F.softmax(logits, dim=1)
        label_id = torch.argmax(probs, dim=1).item()
        return label_id  # meaning of each id is defined by the model config (see entailment_id)

    def compute_xnli_r(self, reference, hypotheses):
        """
        Return the fraction of hypotheses predicted as entailment with the
        reference as premise, rounded to 3 decimals (0.0 for no hypotheses).
        """
        if not hypotheses:
            return 0.0
        entail_count = sum(
            1 for hyp in hypotheses
            if self.predict_label(reference, hyp) == self.entailment_id
        )
        return round(entail_count / len(hypotheses), 3)

In [110]:
# Example usage
if __name__ == "__main__":
    scorer = XNLIRScorer()
    reference = "रामः वनं गच्छति।"
    hypotheses = [
        "रामः वनं गतः।",       # Entailment
        "रामः वनं न गच्छति।",  # Contradiction
        "रामः पुष्पं पश्यति।"   # Neutral
    ]
    score = scorer.compute_xnli_r(reference, hypotheses)
    print(f"X(NLI)-R Score: {score}")

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


X(NLI)-R Score: 0.333


## X(NLI)-D

In [111]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

In [112]:
# Load pre-trained XNLI model (XLM-RoBERTa)
model_name = "joeddav/xlm-roberta-large-xnli"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Input: Reference (premise) and hypotheses
reference = "रामः वनं गच्छति।"
hypotheses = [
    "रामः वनं गतः।",        # Entailment
    "रामः पुष्पं पश्यति।",   # Neutral
    "रामः वनं न गच्छति।"     # Contradiction
]

# Class mapping from predicted class id to label name.
# NOTE(review): assumed to follow this checkpoint's id2label order - confirm
# against model.config.id2label before switching to another model.
label_map = {0: "Contradiction", 1: "Neutral", 2: "Entailment"}

# Count non-contradiction cases
non_contradictions = 0

for hypo in hypotheses:
    # Encode the pair with the reference as premise.
    inputs = tokenizer(reference, hypo, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = model(**inputs).logits
    # Single pair per call, so the flat argmax is the predicted class id.
    prediction = torch.argmax(F.softmax(logits, dim=1)).item()
    result = label_map[prediction]
    print(f"Hypothesis: {hypo}\n→ Prediction: {result}\n")
    
    if result != "Contradiction":
        non_contradictions += 1

# Compute X(NLI)-D Score: share of hypotheses NOT predicted as contradiction
xnli_d_score = non_contradictions / len(hypotheses)
print(f"✅ X(NLI)-D Score: {xnli_d_score:.3f}")

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Hypothesis: रामः वनं गतः।
→ Prediction: Entailment

Hypothesis: रामः पुष्पं पश्यति।
→ Prediction: Neutral

Hypothesis: रामः वनं न गच्छति।
→ Prediction: Contradiction

✅ X(NLI)-D Score: 0.667


# Structural Metrics

## UAS (Unlabeled Attachment Score)

In [None]:
def compute_uas(reference_heads, predicted_heads):
    """
    Unlabeled Attachment Score: share of tokens whose predicted head equals the gold head.

    Args:
        reference_heads (list): Gold head per token (indices or tokens)
        predicted_heads (list): Predicted head per token (indices or tokens)

    Returns:
        float: UAS rounded to 3 decimals; 0.0 for empty input

    Raises:
        ValueError: If the two lists differ in length
    """
    token_count = len(reference_heads)
    if token_count != len(predicted_heads):
        raise ValueError("Mismatch in reference and predicted lengths")
    if token_count == 0:
        return 0.0

    hits = 0
    for gold_head, predicted_head in zip(reference_heads, predicted_heads):
        if gold_head == predicted_head:
            hits += 1
    return round(hits / token_count, 3)

In [67]:
# Example
# Tokens: ['रामः', 'वनं', 'गच्छति']
# Head values can be actual tokens or their IDs
ref_heads = ['गच्छति', 'गच्छति', 'ROOT']
pred_heads = ['गच्छति', 'रामः', 'ROOT']

uas_score = compute_uas(ref_heads, pred_heads)
print("UAS Score:", uas_score)  # Expected: 2/3 = 0.667

UAS Score: 0.667


## Labeled Attachment Score (LAS)

In [68]:
def compute_las(reference_heads, predicted_heads, reference_labels, predicted_labels):
    """
    Labeled Attachment Score: share of tokens with both the correct head and the correct label.

    Args:
        reference_heads (list): Gold head per token (indices or tokens)
        predicted_heads (list): Predicted head per token (indices or tokens)
        reference_labels (list): Gold dependency label per token
        predicted_labels (list): Predicted dependency label per token

    Returns:
        float: LAS rounded to 3 decimals; 0.0 for empty input

    Raises:
        ValueError: If the four lists do not all have the same length
    """
    columns = (reference_heads, predicted_heads, reference_labels, predicted_labels)
    if len({len(column) for column in columns}) != 1:
        raise ValueError("All input lists must have the same length")

    token_count = len(reference_heads)
    if token_count == 0:
        return 0.0

    hits = 0
    for gold_head, predicted_head, gold_label, predicted_label in zip(*columns):
        if gold_head == predicted_head and gold_label == predicted_label:
            hits += 1
    return round(hits / token_count, 3)

In [69]:
# Example
ref_heads = ['गच्छति', 'गच्छति', 'ROOT']
pred_heads = ['गच्छति', 'गच्छति', 'ROOT']
ref_labels = ['nominal subject', 'obj', 'root']
pred_labels = ['nominal subject', 'obl', 'root']

las_score = compute_las(ref_heads, pred_heads, ref_labels, pred_labels)
print("LAS Score:", las_score)  # Expected: 2/3 = 0.667

LAS Score: 0.667


## Bits Per Character (BPC)

In [70]:
import math

In [71]:
def compute_bpc(probabilities):
    """
    Computes Bits Per Character (BPC): the mean of -log2(p) over the
    probabilities the model assigned to the correct characters.

    Args:
        probabilities (list): List of predicted probabilities for correct characters (0 <= p <= 1)

    Returns:
        float: BPC score rounded to 3 decimals; 0.0 for an empty list;
        math.inf if any probability is exactly 0 (the model ruled out a
        character that actually occurred, so its loss is unbounded)

    Raises:
        ValueError: If any probability is negative
    """
    if not probabilities:
        return 0.0

    total_log_loss = 0.0
    for p in probabilities:
        if p < 0:
            raise ValueError("probabilities must be non-negative")
        if p == 0:
            # Skipping this term while still dividing by the full length
            # would understate the loss; -log2(0) is infinite.
            return math.inf
        total_log_loss += -math.log2(p)

    return round(total_log_loss / len(probabilities), 3)


# Example
if __name__ == "__main__":
    predicted_probs = [
        0.8, 0.7, 0.6, 0.9, 0.7,
        0.65, 0.7, 0.8, 0.7, 0.6,
        0.7, 0.85, 0.6, 0.75, 0.9
    ]
    bpc_score = compute_bpc(predicted_probs)
    print("BPC Score:", bpc_score)  # Expected: ~0.467 (mean of -log2(p) over the 15 values)

BPC Score: 0.467


# Ranking & Retrieval-based Metrics

## Mean Reciprocal Rank (MRR)

In [95]:
def compute_mrr(rank_list):
    """
    Mean Reciprocal Rank over a set of queries.

    Args:
        rank_list (list): One integer per query: the rank position of the
                          first correct answer. Ranks that are not positive
                          contribute 0 to the sum but still count as a query.

    Returns:
        float: MRR rounded to 3 decimals; 0.0 for an empty list
    """
    if not rank_list:
        return 0.0

    reciprocal_total = 0.0
    for rank in rank_list:
        if rank > 0:
            reciprocal_total += 1.0 / rank
    return round(reciprocal_total / len(rank_list), 3)

In [96]:
# Example
ranks = [1, 2, 3]  # first relevant answer found at positions 1, 2, 3
mrr_score = compute_mrr(ranks)
print("MRR Score:", mrr_score)  

MRR Score: 0.611


## Mean Average Precision (MAP)

In [75]:
def _average_precision_unrounded(relevance_list):
    """Average Precision of one ranked relevance list, without rounding."""
    num_relevant = sum(relevance_list)
    if num_relevant == 0:
        return 0.0

    score = 0.0
    correct = 0
    for rank, rel in enumerate(relevance_list, start=1):
        if rel:
            correct += 1
            # Precision at this rank, accumulated only at relevant positions.
            score += correct / rank
    return score / num_relevant

def average_precision(relevance_list):
    """
    Computes Average Precision (AP) for a single query

    Args:
        relevance_list (list): List of 0s and 1s indicating relevance at each rank position

    Returns:
        float: Average Precision (AP) rounded to 3 decimals; 0.0 when nothing is relevant
    """
    return round(_average_precision_unrounded(relevance_list), 3)

def mean_average_precision(all_queries):
    """
    Computes Mean Average Precision (MAP) across multiple queries

    The mean is taken over unrounded per-query AP values and rounded once at
    the end, so per-query rounding errors do not accumulate in the result.

    Args:
        all_queries (list): List of relevance lists, one per query

    Returns:
        float: Mean Average Precision (MAP) rounded to 3 decimals; 0.0 for no queries
    """
    if not all_queries:
        return 0.0
    total = sum(_average_precision_unrounded(q) for q in all_queries)
    return round(total / len(all_queries), 3)

In [76]:
# rel(k) for 3 queries:
queries = [
        [1, 0, 1, 0],   # AP = (1 + 2/3)/2 = 0.833
        [1, 1, 0],      # AP = (1 + 2/2)/2 = 1.0
        [1, 0, 0]       # AP = 1.0
    ]

# MAP = (0.833 + 1.0 + 1.0) / 3 ≈ 0.944
map_score = mean_average_precision(queries)
print("MAP Score:", map_score)  

MAP Score: 0.944


# Statistical & Human Agreement Metrics

## Perplexity

In [77]:
import math

In [78]:
def compute_perplexity(probabilities):
    """
    Computes Perplexity for a given list of conditional probabilities.

    Perplexity is exp(-(1/N) * sum(log p_i)) over the N given probabilities.

    Args:
        probabilities (list): List of P(w_i | w_1^{i-1}) values (0 < p <= 1)

    Returns:
        float: Perplexity score rounded to 3 decimals. Returns 0.0 for an
               empty list and infinity when any probability is exactly 0.

    Raises:
        ValueError: If any probability is negative.
    """
    if not probabilities:
        return 0.0

    if any(p < 0 for p in probabilities):
        raise ValueError("Probabilities must be non-negative")

    # A zero-probability token has log-probability -inf, so the perplexity is
    # unbounded. Skipping such tokens while still dividing by the full length
    # would score them as if they had probability 1.
    if any(p == 0 for p in probabilities):
        return math.inf

    log_sum = sum(math.log(p) for p in probabilities)
    avg_log = log_sum / len(probabilities)
    return round(math.exp(-avg_log), 3)

In [79]:
# Example: per-token conditional probabilities for a three-token sentence.
model_probs = [0.4, 0.5, 0.2]  # रामः, वनं, गच्छति
result = compute_perplexity(model_probs)
print("Perplexity:", result)  # Expected: (0.4 * 0.5 * 0.2) ** (-1/3) ≈ 2.924

Perplexity: 2.924


## Token-level F1 Score for Segmentation

In [80]:
def compute_token_level_f1(gold_boundaries, predicted_boundaries):
    """
    Score predicted segmentation boundaries against gold boundaries.

    Both inputs are reduced to sets, so repeated indices count once.

    Args:
        gold_boundaries (list): Boundary indices of the gold segmentation.
        predicted_boundaries (list): Boundary indices produced by the system.

    Returns:
        dict: Keys "precision", "recall" and "f1_score", each rounded to
              3 decimals. A ratio whose denominator is zero is reported as 0.0.
    """
    reference = set(gold_boundaries)
    system = set(predicted_boundaries)
    hits = len(reference.intersection(system))

    # tp + fp equals the number of distinct predicted boundaries, and
    # tp + fn equals the number of distinct gold boundaries.
    precision = hits / len(system) if system else 0.0
    recall = hits / len(reference) if reference else 0.0

    pr_total = precision + recall
    if pr_total > 0:
        f1 = (2 * precision * recall) / pr_total
    else:
        f1 = 0.0

    metrics = (("precision", precision), ("recall", recall), ("f1_score", f1))
    return {name: round(value, 3) for name, value in metrics}

In [81]:
# Example
# Reference: रा | जा | अनु | गच्छति → boundaries at indices 2, 4, 7
# Predicted: राजा | नु | गच्छति → boundaries at indices 4, 6
gold = [2, 4, 7]
predicted = [4, 6]

# Only boundary 4 is shared: precision = 1/2, recall = 1/3.
result = compute_token_level_f1(gold, predicted)
print("Token-Level Segmentation Evaluation:")
print(result)  # Expected: {'precision': 0.5, 'recall': 0.333, 'f1_score': 0.4}

Token-Level Segmentation Evaluation:
{'precision': 0.5, 'recall': 0.333, 'f1_score': 0.4}


## Cohen’s Kappa (κ)

In [82]:
def compute_cohens_kappa(labels_a, labels_b):
    """
    Computes Cohen's Kappa between two annotators' labels.

    kappa = (p_o - p_e) / (1 - p_e), where p_o is the fraction of items on
    which the annotators agree and p_e is the agreement expected by chance
    from each annotator's own label distribution.

    Args:
        labels_a (list): Labels from annotator A
        labels_b (list): Labels from annotator B

    Returns:
        float: Cohen's Kappa score rounded to 3 decimals. Returns 0.0 when
               both lists are empty and 1.0 when the expected agreement is 1.

    Raises:
        ValueError: If the two lists differ in length.
    """
    from collections import Counter

    if len(labels_a) != len(labels_b):
        raise ValueError("Annotation lists must be of same length")

    total = len(labels_a)
    if total == 0:
        # Nothing to compare: return 0.0, as the other metrics in this
        # notebook do for empty input, instead of dividing by zero.
        return 0.0

    observed_agreement = sum(a == b for a, b in zip(labels_a, labels_b)) / total

    counter_a = Counter(labels_a)
    counter_b = Counter(labels_b)

    # A label used by only one annotator contributes 0, because Counter
    # returns 0 for a missing key.
    expected_agreement = sum((counter_a[label] / total) * (counter_b[label] / total)
                             for label in set(counter_a) | set(counter_b))

    # Expected agreement of 1 would make the denominator zero.
    if expected_agreement == 1:
        return 1.0

    kappa = (observed_agreement - expected_agreement) / (1 - expected_agreement)
    return round(kappa, 3)

In [83]:
# Example: two annotators label five tokens; they disagree only on the third.
labels_annotator_a = ['LOC', 'O', 'LOC', 'O', 'LOC']
labels_annotator_b = ['LOC', 'O', 'O', 'O', 'LOC']

# p_o = 4/5 = 0.8, p_e = 0.6*0.4 + 0.4*0.6 = 0.48 -> kappa = 0.32 / 0.52
kappa_score = compute_cohens_kappa(labels_annotator_a, labels_annotator_b)
print("Cohen's Kappa:", kappa_score)  # Expected ≈ 0.615

Cohen's Kappa: 0.615


## Krippendorff’s Alpha

In [84]:
def compute_cohens_kappa(labels_a, labels_b):
    """
    Computes Cohen's Kappa between two annotators' labels.

    kappa = (p_o - p_e) / (1 - p_e), where p_o is the fraction of items on
    which the annotators agree and p_e is the agreement expected by chance
    from each annotator's own label distribution.

    NOTE(review): this cell sits under the "Krippendorff's Alpha" heading but
    re-defines Cohen's Kappa; no Krippendorff's Alpha computation is performed
    here. Confirm whether a separate alpha implementation was intended.

    Args:
        labels_a (list): Labels from annotator A
        labels_b (list): Labels from annotator B

    Returns:
        float: Cohen's Kappa score rounded to 3 decimals. Returns 0.0 when
               both lists are empty and 1.0 when the expected agreement is 1.

    Raises:
        ValueError: If the two lists differ in length.
    """
    from collections import Counter

    if len(labels_a) != len(labels_b):
        raise ValueError("Annotation lists must be of same length")

    total = len(labels_a)
    if total == 0:
        # Nothing to compare: return 0.0, as the other metrics in this
        # notebook do for empty input, instead of dividing by zero.
        return 0.0

    observed_agreement = sum(a == b for a, b in zip(labels_a, labels_b)) / total

    counter_a = Counter(labels_a)
    counter_b = Counter(labels_b)

    # A label used by only one annotator contributes 0, because Counter
    # returns 0 for a missing key.
    expected_agreement = sum((counter_a[label] / total) * (counter_b[label] / total)
                             for label in set(counter_a) | set(counter_b))

    # Expected agreement of 1 would make the denominator zero.
    if expected_agreement == 1:
        return 1.0

    kappa = (observed_agreement - expected_agreement) / (1 - expected_agreement)
    return round(kappa, 3)

In [85]:
# Example: two annotators label five tokens; they disagree only on the third.
labels_annotator_a = ['LOC', 'O', 'LOC', 'O', 'LOC']
labels_annotator_b = ['LOC', 'O', 'O', 'O', 'LOC']

kappa_score = compute_cohens_kappa(labels_annotator_a, labels_annotator_b)
print("Cohen's Kappa:", kappa_score)  # Expected ≈ 0.615

Cohen's Kappa: 0.615


## Informedness (Bookmaker Informedness / Youden’s J Statistic)

In [86]:
def compute_informedness(tp, tn, fp, fn):
    """
    Return Bookmaker Informedness (Youden’s J): TPR + TNR - 1.

    Args:
        tp (int): True Positives
        tn (int): True Negatives
        fp (int): False Positives
        fn (int): False Negatives

    Returns:
        float: Informedness rounded to 3 decimals. A rate whose denominator
               is not positive is taken as 0.0.
    """
    actual_positives = tp + fn
    actual_negatives = tn + fp

    # True positive rate (sensitivity)
    if actual_positives > 0:
        sensitivity = tp / actual_positives
    else:
        sensitivity = 0.0

    # True negative rate (specificity)
    if actual_negatives > 0:
        specificity = tn / actual_negatives
    else:
        specificity = 0.0

    return round(sensitivity + specificity - 1, 3)

In [87]:
# Example: confusion-matrix counts for a binary classifier.
TP = 3
TN = 5
FP = 1
FN = 1

informed_score = compute_informedness(TP, TN, FP, FN)
print("Informedness (Youden’s J):", informed_score)  # Expected: 3/4 + 5/6 - 1 ≈ 0.583

Informedness (Youden’s J): 0.583


## Matthews Correlation Coefficient (MCC)

In [91]:
import math

def compute_mcc(tp, tn, fp, fn):
    """
    Return the Matthews Correlation Coefficient (MCC) of a confusion matrix.

    MCC = (tp*tn - fp*fn) / sqrt((tp+fp) * (tp+fn) * (tn+fp) * (tn+fn))

    Args:
        tp (int): True Positives
        tn (int): True Negatives
        fp (int): False Positives
        fn (int): False Negatives

    Returns:
        float: MCC rounded to 3 decimals; 0.0 when the denominator is zero
    """
    # The four marginal totals of the confusion matrix.
    predicted_positive = tp + fp
    actual_positive = tp + fn
    actual_negative = tn + fp
    predicted_negative = tn + fn

    covariance = tp * tn - fp * fn
    scale = math.sqrt(
        predicted_positive * actual_positive * actual_negative * predicted_negative
    )

    # An empty row or column makes the scale zero; report 0.0 in that case.
    return 0.0 if scale == 0 else round(covariance / scale, 3)

In [92]:
# Example: confusion-matrix counts for a binary classifier.
TP = 3
TN = 4
FP = 2
FN = 1

mcc_score = compute_mcc(TP, TN, FP, FN)
print("Matthews Correlation Coefficient:", mcc_score)  # Expected: (3*4 - 2*1) / sqrt(5*4*6*5) ≈ 0.408

Matthews Correlation Coefficient: 0.408


## LEPOR (Length Penalty, Precision, n-gram Position difference Penalty, Recall)

In [93]:
def compute_lepor(precision, recall, ref_len, hyp_len, ref_positions, hyp_positions, alpha=0.5, beta=1):
    """
    Computes LEPOR score for machine translation evaluation.

    The score is the product of three factors:
      * length penalty: min(hyp_len / ref_len, ref_len / hyp_len)
      * weighted harmonic mean of precision and recall, raised to ``beta``
      * 1 - NPP, where NPP is the summed absolute position difference of the
        aligned tokens divided by ref_len * hyp_len

    NOTE(review): this looks like a simplified form of LEPOR; the published
    metric appears to use exponential penalties. Confirm before comparing
    these scores with scores from other LEPOR implementations.

    Args:
        precision (float): token precision
        recall (float): token recall
        ref_len (int): length of reference sentence
        hyp_len (int): length of hypothesis sentence
        ref_positions (list): reference token positions (1-indexed)
        hyp_positions (list): hypothesis token positions (1-indexed, aligned to ref tokens)
        alpha (float): weight of the recall term in the harmonic mean; the
                       precision term gets 1 - alpha (default 0.5)
        beta (float): exponent applied to the harmonic mean (default 1)

    Returns:
        float: LEPOR score rounded to 3 decimals; 0.0 when either sentence
               length is not positive
    """
    # An empty reference or hypothesis would divide by zero in the length
    # penalty below, so report 0.0 for it.
    if ref_len <= 0 or hyp_len <= 0:
        return 0.0

    # Length Penalty: 1.0 for equal lengths, smaller the more they differ
    lp = min(hyp_len / ref_len, ref_len / hyp_len)

    # Weighted harmonic mean of precision and recall: 1 / (alpha/R + (1 - alpha)/P)
    if precision > 0 and recall > 0:
        harmonic = (precision * recall) / (alpha * precision + (1 - alpha) * recall)
    else:
        harmonic = 0.0

    # Normalized Position Penalty (NPP); zip stops at the shorter list, so
    # unpaired trailing positions are ignored.
    pos_diff_sum = sum(abs(h - r) for h, r in zip(hyp_positions, ref_positions))
    npp = pos_diff_sum / (ref_len * hyp_len)

    lepor = lp * (harmonic ** beta) * (1 - npp)
    return round(lepor, 3)

In [94]:
# Example: the hypothesis holds the same three tokens as the reference, reordered.
# The token lists are shown for illustration; compute_lepor is called with the
# derived precision/recall, lengths and positions below.
ref_tokens = ['रामः', 'वनं', 'गच्छति']
hyp_tokens = ['गच्छति', 'रामः', 'वनं']

precision = recall = 1.0
ref_len = hyp_len = 3
ref_positions = [1, 2, 3]
hyp_positions = [2, 3, 1]  # aligned to the same tokens in reference

# lp = 1, harmonic mean = 1, NPP = (1 + 1 + 2) / 9 -> score = 1 - 4/9
lepor_score = compute_lepor(precision, recall, ref_len, hyp_len, ref_positions, hyp_positions)
print("LEPOR Score:", lepor_score)  # Expected ≈ 0.556

LEPOR Score: 0.556
