# Translation Evaluation

In [1]:
import os

import pandas as pd
from sacrebleu.metrics import BLEU, CHRF

## Initializing Translation Scores DataFrame

In [None]:
# Empty results table: one row per language, one column per metric
# (all metrics here are for beam size 1).
score_columns = [
    "lang",
    "BLEU_num_beams_1",
    "chrF++_num_beams_1",
    "COMET_num_beams_1",
]
scores = pd.DataFrame(columns=score_columns)

## Translation Quality Scores for Beam Size of 1

In [None]:
# Language table; its "lang" column drives the scoring loop.
lang_data_path = "../lang_data/lang_data.csv"
lang_data = pd.read_csv(lang_data_path)

In [None]:
# Language code -> `trg_lang` value passed to sacreBLEU's BLEU metric.
# Languages not listed here are looked up with `.get(lang, None)` and so
# are scored with trg_lang=None.
BLEU_LANG_MAP = dict(
    cmn_Hans="zh",
    cmn_Hant="zh",
    jpn_Jpan="ja",
    kor_Hang="ko",
)

In [None]:
# Compute corpus-level BLEU and chrF++ for every language that has a
# beam-size-1 hypothesis file, then add the results to `scores`.
hyp_dir = "translations/num_beams_1/"
# List the directory once instead of on every loop iteration.
available_hyps = set(os.listdir(hyp_dir))

score_rows = []
for lang in lang_data["lang"]:
    hyp_name = f"hyp.{lang}_1"
    if hyp_name not in available_hyps:
        # No translation was produced for this language; skip it.
        continue
    # Reference sentences (explicit UTF-8 so results do not depend on the
    # platform's default encoding).
    with open(f"../floresp-v2.0-rc.2/dev/dev.{lang}", encoding="utf-8") as ref:
        sents_ref = [line.strip() for line in ref]
    # Translated sentences
    with open(os.path.join(hyp_dir, hyp_name), encoding="utf-8") as hyp:
        sents_hyp = [line.strip() for line in hyp]
    # BLEU is built per language because `trg_lang` depends on the language;
    # languages without an entry in BLEU_LANG_MAP get trg_lang=None.
    trg_lang = BLEU_LANG_MAP.get(lang, None)
    bleu_score = BLEU(trg_lang=trg_lang).corpus_score(sents_hyp, [sents_ref]).score
    # word_order=2 turns chrF into chrF++.
    chrf_score = CHRF(word_order=2).corpus_score(sents_hyp, [sents_ref]).score
    score_rows.append(
        {
            "lang": lang,
            "BLEU_num_beams_1": bleu_score,
            "chrF++_num_beams_1": chrf_score,
        }
    )

# `DataFrame.append` was removed in pandas 2.0, and growing a frame row by
# row is quadratic anyway: build the new rows in one go. Columns missing
# from the records (e.g. the COMET column) are left as NaN.
new_scores = pd.DataFrame(score_rows, columns=scores.columns)
if scores.empty:
    scores = new_scores
else:
    scores = pd.concat([scores, new_scores], ignore_index=True)

In [None]:
sbatch comet.sh  # To be run on the cluster