In [10]:
import copy
import sys, math, re
from collections import defaultdict

def precook(s, n=4, out=False):
    words = s.split()
    counts = defaultdict(int)
    for k in range(1, n + 1):
        for i in range(len(words) - k + 1):
            ngram = tuple(words[i:i + k])
            counts[ngram] += 1
    return (len(words), counts)

def cook_refs(refs, eff=None, n=4):
    reflen = []
    maxcounts = {}
    for ref in refs:
        rl, counts = precook(ref, n)
        reflen.append(rl)
        for (ngram, count) in counts.items():
            maxcounts[ngram] = max(maxcounts.get(ngram, 0), count)
    if eff == "shortest":
        reflen = min(reflen)
    elif eff == "average":
        reflen = float(sum(reflen)) / len(reflen)
    return (reflen, maxcounts)

def cook_test(test, tup_obj, eff=None, n=4):
    (reflen, refmaxcounts) = tup_obj
    testlen, counts = precook(test, n, True)
    result = {}
    if eff == "closest":
        result["reflen"] = min((abs(l - testlen), l) for l in reflen)[1]
    else:
        result["reflen"] = reflen
    result["testlen"] = testlen
    result["guess"] = [max(0, testlen - k + 1) for k in range(1, n + 1)]
    result['correct'] = [0] * n
    for (ngram, count) in counts.items():
        result["correct"][len(ngram) - 1] += min(refmaxcounts.get(ngram, 0), count)
    return result

class BleuScorer(object):
    __slots__ = "n", "crefs", "ctest", "_score", "_ratio", "_testlen", "_reflen", "special_reflen"
    def copy(self):
        new = BleuScorer(n=self.n)
        new.ctest = copy.copy(self.ctest)
        new.crefs = copy.copy(self.crefs)
        new._score = None
        return new
    def __init__(self, test=None, refs=None, n=4, special_reflen=None):
        self.n = n
        self.crefs = []
        self.ctest = []
        self.cook_append(test, refs)
        self.special_reflen = special_reflen
    def cook_append(self, test, refs):
        if refs is not None:
            self.crefs.append(cook_refs(refs))
            if test is not None:
                cooked_test = cook_test(test, self.crefs[-1])
                self.ctest.append(cooked_test)
            else:
                self.ctest.append(None)
        self._score = None
    def ratio(self, option=None):
        self.compute_score(option=option)
        return self._ratio
    def score_ratio(self, option=None):
        return (self.fscore(option=option), self.ratio(option=option))
    def score_ratio_str(self, option=None):
        return "%.4f (%.2f)" % self.score_ratio(option)
    def reflen(self, option=None):
        self.compute_score(option=option)
        return self._reflen
    def testlen(self, option=None):
        self.compute_score(option=option)
        return self._testlen
    def retest(self, new_test):
        if type(new_test) is str:
            new_test = [new_test]
        assert len(new_test) == len(self.crefs), new_test
        self.ctest = []
        for t, rs in zip(new_test, self.crefs):
            self.ctest.append(cook_test(t, rs))
        self._score = None
        return self
    def rescore(self, new_test):
        return self.retest(new_test).compute_score()
    def size(self):
        assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest))
        return len(self.crefs)
    def __iadd__(self, other):
        if type(other) is tuple:
            self.cook_append(other[0], other[1])
        else:
            assert self.compatible(other), "incompatible BLEUs."
            self.ctest.extend(other.ctest)
            self.crefs.extend(other.crefs)
            self._score = None

        return self
    def compatible(self, other):
        return isinstance(other, BleuScorer) and self.n == other.n
    def single_reflen(self, option="average"):
        return self._single_reflen(self.crefs[0][0], option)
    def _single_reflen(self, reflens, option=None, testlen=None):
        if option == "shortest":
            reflen = min(reflens)
        elif option == "average":
            reflen = float(sum(reflens)) / len(reflens)
        elif option == "closest":
            reflen = min((abs(l - testlen), l) for l in reflens)[1]
        else:
            assert False, "unsupported reflen option %s" % option
        return reflen
    def recompute_score(self, option=None, verbose=0):
        self._score = None
        return self.compute_score(option, verbose)
    def compute_score(self, option=None, verbose=0):
        n = self.n
        small = 1e-9
        tiny = 1e-15
        bleu_list = [[] for _ in range(n)]
        if self._score is not None:
            return self._score
        if option is None:
            option = "average" if len(self.crefs) == 1 else "closest"
        self._testlen = 0
        self._reflen = 0
        totalcomps = {'testlen': 0, 'reflen': 0, 'guess': [0] * n, 'correct': [0] * n}
        for comps in self.ctest:
            testlen = comps['testlen']
            self._testlen += testlen
            if self.special_reflen is None:
                reflen = self._single_reflen(comps['reflen'], option, testlen)
            else:
                reflen = self.special_reflen
            self._reflen += reflen
            for key in ['guess', 'correct']:
                for k in range(n):
                    totalcomps[key][k] += comps[key][k]
            bleu = 1.
            for k in range(n):
                bleu *= (float(comps['correct'][k]) + tiny) / (float(comps['guess'][k]) + small)
                bleu_list[k].append(bleu ** (1. / (k + 1)))
            ratio = (testlen + tiny) / (reflen + small)  ## N.B.: avoid zero division
            if ratio < 1:
                for k in range(n):
                    bleu_list[k][-1] *= math.exp(1 - 1 / ratio)
            if verbose > 1:
                print(comps, reflen)
        totalcomps['reflen'] = self._reflen
        totalcomps['testlen'] = self._testlen
        bleus = []
        bleu = 1.
        for k in range(n):
            bleu *= float(totalcomps['correct'][k] + tiny) / (totalcomps['guess'][k] + small)
            bleus.append(bleu ** (1. / (k + 1)))
        ratio = (self._testlen + tiny) / (self._reflen + small)  ## N.B.: avoid zero division
        if ratio < 1:
            for k in range(n):
                bleus[k] *= math.exp(1 - 1 / ratio)
        if verbose > 0:
            print(totalcomps)
            print("ratio:", ratio)
        self._score = bleus
        return self._score, bleu_list

class Bleu:
    def __init__(self, n=4):
        self._n = n
        self._hypo_for_image = {}
        self.ref_for_image = {}
    def compute_score(self, gts, res):
        assert (gts.keys() == res.keys())
        imgIds = gts.keys()
        bleu_scorer = BleuScorer(n=self._n)
        for id in imgIds:
            hypo = res[id]
            ref = gts[id]
            assert (type(hypo) is list)
            assert (len(hypo) == 1)
            assert (type(ref) is list)
            assert (len(ref) >= 1)
            bleu_scorer += (hypo[0], ref)
        #score, scores = bleu_scorer.compute_score(option='shortest')
        score, scores = bleu_scorer.compute_score(option='closest', verbose=1)
        #score, scores = bleu_scorer.compute_score(option='average', verbose=1)
        # return (bleu, bleu_info)
        return score, scores
    def method(self):
        return "Bleu"

In [23]:
refs = {0: ["this is a test", "this is also a test"]}
hyps = {0: ["this is a good test"]}

for i in range(1,10):
    bleu = Bleu(i)
    score, scores = bleu.compute_score(refs, hyps)
    print(score)

{'testlen': 5, 'reflen': 4, 'guess': [5], 'correct': [4]}
ratio: 1.2499999996875002
[0.7999999998400001]
{'testlen': 5, 'reflen': 4, 'guess': [5, 4], 'correct': [4, 2]}
ratio: 1.2499999996875002
[0.7999999998400001, 0.6324555318913736]
{'testlen': 5, 'reflen': 4, 'guess': [5, 4, 3], 'correct': [4, 2, 1]}
ratio: 1.2499999996875002
[0.7999999998400001, 0.6324555318913736, 0.5108729547956411]
{'testlen': 5, 'reflen': 4, 'guess': [5, 4, 3, 2], 'correct': [4, 2, 1, 0]}
ratio: 1.2499999996875002
[0.7999999998400001, 0.6324555318913736, 0.5108729547956411, 9.036020033199396e-05]


IndexError: list index out of range