# Chapter 12 — Evaluation Beyond Perplexity (Colab)

BLEU, ROUGE-L, METEOR (simplified), and diversity checks. Self-contained,
one metric per cell, with short docstrings and comments.


In [None]:
# Imports and style
import math
from collections import Counter
import matplotlib.pyplot as plt
plt.style.use('seaborn-v0_8')
%config InlineBackend.figure_format = 'svg'
'ok'


In [None]:
# Tiny BLEU (corpus)
def ngram_counts(tokens, n):
    """Count the contiguous n-grams of ``tokens``.

    Each n-gram is stored as a tuple key; the result is empty when
    ``tokens`` is shorter than ``n``.
    """
    counts = Counter()
    last_start = len(tokens) - n
    for start in range(last_start + 1):
        gram = tuple(tokens[start:start + n])
        counts[gram] += 1
    return counts
def bleu_corpus(references, hypotheses, max_n=4, smooth=True):
    """Educational corpus-level BLEU with add-1 smoothing and brevity penalty.

    Args:
        references: one entry per example, each a list of reference token
            lists (at least one reference per example).
        hypotheses: one token list per example, aligned with ``references``.
            The two lists are paired with ``zip``, so surplus entries in the
            longer one are ignored.
        max_n: highest n-gram order included in the geometric mean.
        smooth: if true, add 1 to the numerator and denominator of every
            n-gram precision; otherwise use the raw clipped precision.

    Returns:
        BLEU in [0, 1]. A corpus without any hypothesis tokens scores 0.0.

    Raises:
        ValueError: if an example has an empty list of references.
    """
    matched = [0] * max_n
    total = [0] * max_n
    ref_len = 0
    hyp_len = 0
    for refs, hyp in zip(references, hypotheses):
        hyp_len += len(hyp)
        # Closest reference length; ties go to the shorter reference.
        ref_len += min((len(r) for r in refs),
                       key=lambda n_tok: (abs(n_tok - len(hyp)), n_tok))
        for n in range(1, max_n + 1):
            hyp_counts = ngram_counts(hyp, n)
            # Counter union keeps, per n-gram, the max count over references.
            max_ref_counts = Counter()
            for r in refs:
                max_ref_counts |= ngram_counts(r, n)
            # Counter intersection clips each hypothesis count to that max.
            matched[n - 1] += sum((hyp_counts & max_ref_counts).values())
            # Every hypothesis adds at least 1 to the denominator, even when
            # it is too short to contain any n-gram of this order.
            total[n - 1] += max(1, sum(hyp_counts.values()))

    # Nothing was produced, so nothing can match. Without this guard the
    # brevity penalty below evaluates to exp(1) when ref_len is also 0 and
    # the score leaves the [0, 1] range.
    if hyp_len == 0:
        return 0.0

    # hyp_len > 0 means the loop ran, so every total[i] is at least 1.
    if smooth:
        prec = [(matched[i] + 1) / (total[i] + 1) for i in range(max_n)]
    else:
        prec = [matched[i] / total[i] for i in range(max_n)]
    # Geometric mean of the precisions; the floor avoids log(0).
    gm = math.exp(sum((1 / max_n) * math.log(max(p, 1e-16)) for p in prec))
    bp = 1.0 if hyp_len > ref_len else math.exp(1 - ref_len / hyp_len)
    return bp * gm
# Demo: a single example with one reference; the hypothesis swaps "is" for "sat".
# `refs` is nested one level deeper than `hyp` because every example carries a
# list of references.
refs = [["the cat is on the mat".split()]]
hyp = ["the cat sat on the mat".split()]
# Trailing expression of the cell: the rounded score (0.489 for this pair).
round(bleu_corpus(refs, hyp), 3)


In [None]:
# ROUGE-L via LCS
def lcs_length(a, b):
    """Return the length of the longest common subsequence of ``a`` and ``b``.

    Uses a single rolling DP row of size ``len(b) + 1``.
    """
    width = len(b)
    row = [0] * (width + 1)
    for i in range(len(a)):
        item_a = a[i]
        # Value of the previous row at column j - 1 (column 0 is always 0).
        diag = 0
        for j in range(1, width + 1):
            above = row[j]
            if item_a == b[j - 1]:
                row[j] = diag + 1
            else:
                row[j] = max(above, row[j - 1])
            diag = above
    return row[width]
def rouge_l(references, hypotheses, beta=1.2):
    """Mean ROUGE-L F-score, keeping the best-scoring reference per example.

    For every reference the LCS length with the hypothesis gives a precision
    (LCS / hypothesis length) and a recall (LCS / reference length), combined
    as ``(1 + beta^2) * P * R / (beta^2 * P + R)``. An example with no
    overlapping reference scores 0.0; an empty corpus returns 0.0.
    """
    beta_sq = beta * beta

    def f_score(ref, hyp):
        # F-measure of one reference/hypothesis pair; 0.0 without overlap.
        lcs = lcs_length(ref, hyp)
        if lcs == 0:
            return 0.0
        precision = lcs / max(1, len(hyp))
        recall = lcs / max(1, len(ref))
        return (1 + beta_sq) * precision * recall / max(beta_sq * precision + recall, 1e-12)

    per_example = []
    for refs, hyp in zip(references, hypotheses):
        # The leading 0.0 is the score when no reference is available.
        per_example.append(max([0.0, *(f_score(ref, hyp) for ref in refs)]))
    return sum(per_example) / max(1, len(per_example))
# Demo: same sentence pair as the BLEU cell; the LCS is "the cat on the mat"
# (5 of 6 tokens on both sides).
refs = [["the cat is on the mat".split()]]
hyp = ["the cat sat on the mat".split()]
# Trailing expression of the cell: the rounded score (0.833 for this pair).
round(rouge_l(refs, hyp), 3)


In [None]:
# Simplified METEOR: unigram F-mean with chunk penalty
def matching_chunks(h, r):
    """Greedily align hypothesis tokens to the reference and count chunks.

    Hypothesis tokens are scanned left to right; each one is matched to the
    earliest not-yet-used occurrence of the same token in the reference. A
    chunk is a maximal run of matches that is contiguous in BOTH sequences,
    so an unmatched hypothesis token between two matches starts a new chunk
    even when the matched reference positions are adjacent.

    Args:
        h: hypothesis tokens.
        r: reference tokens.

    Returns:
        ``(matches, chunks)``: number of matched tokens and number of chunks.
    """
    # Positions are stored in descending order so list.pop() hands out the
    # earliest unused reference position in O(1).
    free_positions = {}
    for j, tok in reversed(list(enumerate(r))):
        free_positions.setdefault(tok, []).append(j)

    matches = 0
    chunks = 0
    prev_h = prev_r = None
    for i, tok in enumerate(h):
        candidates = free_positions.get(tok)
        if not candidates:
            continue
        j = candidates.pop()
        matches += 1
        # A run continues only if this match directly follows the previous
        # one in the hypothesis and in the reference.
        if prev_h is None or i != prev_h + 1 or j != prev_r + 1:
            chunks += 1
        prev_h, prev_r = i, j
    return matches, chunks
def meteor_simple(references, hypotheses, alpha=0.9, beta=3.0, gamma=0.5):
    """Mean simplified METEOR, keeping the best-scoring reference per example.

    Per reference: unigram precision P and recall R come from clipped token
    overlap, combined as ``P * R / (alpha * P + (1 - alpha) * R)``, then
    scaled by ``1 - gamma * (chunks / matches) ** beta`` using the counts
    from ``matching_chunks``. Examples never score below 0.0; an empty
    corpus returns 0.0.
    """
    def candidate_score(hyp, ref):
        # Score of one hypothesis against one reference.
        shared = sum((Counter(hyp) & Counter(ref)).values())
        precision = shared / max(1, len(hyp))
        recall = shared / max(1, len(ref))
        if precision == 0 or recall == 0:
            return 0.0
        f_mean = (precision * recall) / max(alpha * precision + (1 - alpha) * recall, 1e-12)
        n_matches, n_chunks = matching_chunks(hyp, ref)
        penalty = gamma * ((n_chunks / n_matches) ** beta) if n_matches != 0 else 0.0
        return f_mean * (1 - penalty)

    per_example = []
    for refs, hyp in zip(references, hypotheses):
        # The leading 0.0 is the floor and the score when refs is empty.
        per_example.append(max([0.0, *(candidate_score(hyp, ref) for ref in refs)]))
    return sum(per_example) / max(1, len(per_example))
# Demo: same sentence pair as the previous cells; 5 of 6 tokens match and the
# unmatched "sat"/"is" splits the alignment into 2 chunks.
refs = [["the cat is on the mat".split()]]
hyp = ["the cat sat on the mat".split()]
# Trailing expression of the cell: the rounded score (0.807 for this pair).
round(meteor_simple(refs, hyp), 3)


In [None]:
# Diversity: distinct-1 / distinct-2
def distinct_n(hyps, n=1):
    """Return unique n-grams divided by total n-grams over all hypotheses.

    Returns 0.0 when no hypothesis is long enough to contain an n-gram.
    """
    unique = set()
    total = 0
    for tokens in hyps:
        for start in range(len(tokens) - n + 1):
            unique.add(tuple(tokens[start:start + n]))
            total += 1
    if total == 0:
        return 0.0
    return len(unique) / total
# Demo: two identical hypotheses, so every n-gram appears at least twice and
# the ratios are low (5 unique of 12 unigrams, 5 unique of 10 bigrams).
hyps = ["the cat sat on the mat".split(),
        "the cat sat on the mat".split()]
# Trailing expression of the cell: (distinct-1, distinct-2) = (0.417, 0.5).
round(distinct_n(hyps,1),3), round(distinct_n(hyps,2),3)
