# Translation Evaluation

In [1]:
import os

import pandas as pd
from sacrebleu.metrics import BLEU, CHRF

## Initializing Translation Scores DataFrame

In [None]:
# Start from an empty results table: one row per language, one column per metric
# (all measured with num_beams=1).
score_columns = ["lang", "BLEU_num_beams_1", "chrF++_num_beams_1", "COMET_num_beams_1"]
scores = pd.DataFrame(columns=score_columns)

In [None]:
# Load previously saved scores from scores.csv; this rebinds `scores`, replacing
# whatever an earlier cell assigned to it.
scores = pd.read_csv("scores.csv")

In [None]:
# Display the current scores table.
scores

## Translation Quality Scores for Beam Size of 1

In [None]:
# Language table; the scoring loop iterates over its "lang" column.
lang_data = pd.read_csv("../lang_data/lang_data.csv")

In [None]:
# Mapping from language code to the `trg_lang` value passed to sacrebleu's BLEU,
# which selects a language-specific tokenizer. Languages not listed here fall back
# to BLEU's default tokenizer.
BLEU_LANG_MAP = {
    "cmn_Hans": "zh",
    "cmn_Hant": "zh",
    "jpn_Jpan": "ja",
    "kor_Hang": "ko",
    # Cantonese is written in Han script without word spacing, like cmn_*. With the
    # default tokenizer its saved BLEU is ~0.78 (see the scores_2 table), so it is
    # routed to the Chinese tokenizer as well.
    # NOTE(review): yue_Hant BLEU already stored for other beam sizes must be
    # recomputed with the same tokenizer to stay comparable.
    "yue_Hant": "zh",
}

In [None]:
# Compute BLEU and chrF++ (num_beams=1) for every language that has a hypothesis
# file and does not yet have a row in `scores`.

# List the hypothesis directory once instead of on every iteration.
available_hyps = set(os.listdir("translations/num_beams_1/"))
# `lang in scores["lang"]` tests the Series *index* (row labels), not its values,
# so it never matched a language code. Compare against the actual values.
scored_langs = set(scores["lang"])

new_rows = []
for lang in lang_data["lang"]:
    if f"hyp.{lang}_1" not in available_hyps or lang in scored_langs:
        continue
    # Reference sentences (explicit UTF-8: the platform default may differ)
    with open(f"../floresp-v2.0-rc.2/dev/dev.{lang}", encoding="utf-8") as ref:
        sents_ref = [line.strip() for line in ref]
    # Translated sentences
    with open(f"translations/num_beams_1/hyp.{lang}_1", encoding="utf-8") as hyp:
        sents_hyp = [line.strip() for line in hyp]
    # None for languages without a dedicated tokenizer entry
    trg_lang = BLEU_LANG_MAP.get(lang, None)
    bleu_score = BLEU(trg_lang=trg_lang).corpus_score(sents_hyp, [sents_ref]).score
    # word_order=2 turns chrF into chrF++
    chrf_score = CHRF(word_order=2).corpus_score(sents_hyp, [sents_ref]).score
    new_rows.append({
        "lang": lang,
        "BLEU_num_beams_1": bleu_score,
        "chrF++_num_beams_1": chrf_score,
    })
    # Guard against the same language appearing twice in lang_data
    scored_langs.add(lang)

# DataFrame.append was removed in pandas 2.0; add all new rows with one concat.
# Columns missing from the new rows (COMET) are filled with NaN.
if new_rows:
    scores = pd.concat([scores, pd.DataFrame(new_rows)], ignore_index=True)

In [18]:
# Spot check: compute BLEU and chrF++ for fra_Latn only and print the result.
# Reference sentences (plain string paths: there is nothing to interpolate;
# explicit UTF-8 so the result does not depend on the platform default encoding)
with open("../floresp-v2.0-rc.2/dev/dev.fra_Latn", encoding="utf-8") as ref:
    sents_ref = [line.strip() for line in ref]
# Translated sentences
with open("translations/num_beams_1/hyp.fra_Latn_1", encoding="utf-8") as hyp:
    sents_hyp = [line.strip() for line in hyp]
bleu_score = BLEU().corpus_score(sents_hyp, [sents_ref]).score
# word_order=2 turns chrF into chrF++
chrf_score = CHRF(word_order=2).corpus_score(sents_hyp, [sents_ref]).score
lang_scores = {
    "lang": "fra_Latn",
    "BLEU_num_beams_1": bleu_score,
    "chrF++_num_beams_1": chrf_score
}
print(lang_scores)

{'lang': 'fra_Latn', 'BLEU_num_beams_1': 24.688650807328536, 'chrF++_num_beams_1': 48.61267581492919}


In [None]:
sbatch comet.sh  # To be run on the cluster

In [19]:
# Read scores_2.csv and keep only the first row for each language, renumbering
# the index from 0.
scores_2 = pd.read_csv("scores_2.csv").drop_duplicates(
    subset="lang", ignore_index=True
)

In [22]:
# Display the scores_2 table.
scores_2

Unnamed: 0,lang,BLEU_num_beams_1,chrF++_num_beams_1,COMET_num_beams_1
0,afr_Latn,37.496051,63.591824,
1,arb_Arab,26.069451,53.587915,
2,bul_Cyrl,38.570824,62.250001,
3,cat_Latn,40.476136,62.695251,
4,ces_Latn,29.600198,54.107049,
...,...,...,...,...
119,wol_Latn,6.147244,28.263424,
120,xho_Latn,13.017241,47.332211,
121,yor_Latn,5.639960,25.394067,
122,yue_Hant,0.775981,17.332046,


In [21]:
# Write scores_2 to scores.csv without the index column; later cells read this
# file back as `scores`.
scores_2.to_csv("scores.csv", index=False)

In [None]:
# Plain Python list of the language codes present in scores_2
scores_2_langs = list(scores_2["lang"])

In [None]:
# `translated_langs` is not assigned in any visible cell, so this cell raised a
# NameError on a fresh kernel.
# NOTE(review): it is rebuilt here from the hyp.<lang>_1 files in
# translations/num_beams_1/ - confirm this matches the original definition.
hyp_prefix, hyp_suffix = "hyp.", "_1"
translated_langs = [
    name[len(hyp_prefix):-len(hyp_suffix)]
    for name in os.listdir("translations/num_beams_1/")
    if name.startswith(hyp_prefix) and name.endswith(hyp_suffix)
]
# Languages that have a translation file but no row in scores_2
set(translated_langs) - set(scores_2_langs)

In [2]:
# Reload the saved scores table and the COMET results; the next cell joins them
# on "lang".
scores = pd.read_csv("scores.csv")
comet_scores = pd.read_csv("comet_scores.csv")

In [8]:
# Left join on "lang": every row of `scores` is kept; languages absent from
# comet_scores get NaN in the added columns.
scores = scores.merge(comet_scores, on="lang", how="left")

In [10]:
# Load the scores table stored under ../recovered/.
scores_recovered = pd.read_csv("../recovered/scores.csv")

In [12]:
# Left join on "lang". Columns present in both frames get pandas' default
# "_x" (left) / "_y" (right) suffixes.
scores = scores.merge(scores_recovered, on="lang", how="left")

In [16]:
# Remove the raw "score" column and the right-hand ("_y") duplicates left over
# from the merges.
# NOTE(review): the matching "_x" columns keep their suffix unless they are renamed
# elsewhere - confirm the saved file has the intended column names.
merge_leftovers = [
    "score",
    "BLEU_num_beams_1_y",
    "chrF++_num_beams_1_y",
    "COMET_num_beams_1_y",
]
scores = scores.drop(columns=merge_leftovers)

In [17]:
# Overwrite scores.csv with the merged table (index column omitted).
scores.to_csv("scores.csv", index=False)

In [34]:
# Load the two batch CSVs that are concatenated into `comet_scores`.
batch_1 = pd.read_csv("batch_1.csv")
batch_2 = pd.read_csv("batch_2.csv")

In [35]:
# Stack both batches into one frame; ignore_index=True renumbers the rows 0..n-1.
comet_scores = pd.concat([batch_1, batch_2], ignore_index=True)

In [37]:
# Sort by lang, in place. The original passed an undefined name
# (`truncated_cube_graph`) as `inplace`, which raises NameError on a fresh kernel;
# the displayed output (rows ordered by lang, original index kept) matches True.
comet_scores.sort_values("lang", inplace=True)

In [38]:
# Display the sorted COMET table.
comet_scores

Unnamed: 0,lang,score
33,ace_Arab,0.6170
34,ace_Latn,0.5926
35,acm_Arab,0.8089
0,afr_Latn,0.8660
36,amh_Ethi,0.8743
...,...,...
118,wol_Latn,0.6088
119,xho_Latn,0.7618
120,yor_Latn,0.6572
121,yue_Hant,0.8311


In [39]:
# Save the combined COMET table to comet_scores_final.csv without the index column.
comet_scores.to_csv("comet_scores_final.csv", index=False)

In [28]:
# Give the COMET column the same "<metric>_num_beams_1" name used in `scores`.
comet_scores = comet_scores.rename(columns={"score": "COMET_num_beams_1"})

In [30]:
# Reload the saved scores table from scores.csv.
scores = pd.read_csv("scores.csv")

In [31]:
# Left join on "lang": every row of `scores` is kept.
# NOTE(review): if scores.csv already has a COMET_num_beams_1 column, this merge
# yields COMET_num_beams_1_x / _y instead of filling the existing column - confirm.
scores = scores.merge(comet_scores, on="lang", how="left")

In [33]:
# NOTE(review): no visible cell reads this output file back - it looks like a
# scratch dump; consider removing the cell or giving the file a descriptive name.
scores.to_csv("bullshit.csv", index=False)

In [44]:
# Run the comet-score CLI for fra_Latn: -s source, -t translation, -r reference.
# ">>" appends to comet_fra.txt, so every re-run adds another entry to the file.
# NOTE(review): dev.eng_Latn / dev.fra_Latn are resolved relative to the notebook
# directory here, whereas other cells read references from ../floresp-v2.0-rc.2/dev/ - confirm.
! comet-score -s dev.eng_Latn -t translations/num_beams_1/hyp.fra_Latn_1 -r dev.fra_Latn --gpus 0 --quiet --only_system >> comet_fra.txt

Fetching 5 files: 100%|████████████████████████| 5/5 [00:00<00:00, 27776.85it/s]
/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pytorch_lightning/core/saving.py:188: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pytorch_lightning/trainer/setup.py:187: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.


In [45]:
# Reload the saved scores table from scores.csv.
scores = pd.read_csv("scores.csv")

In [47]:
# Order the rows alphabetically by language code
scores = scores.sort_values(by="lang")

In [49]:
# Overwrite scores.csv with the sorted table (index column omitted).
scores.to_csv("scores.csv", index=False)

## Translation Quality Gain per `num_beams`

In [7]:
# Load the combined scores table. The gain computation reads BLEU / chrF++ / COMET
# columns for num_beams 1, 3, 5 and 7 from it.
scores = pd.read_csv("scores/scores.csv")

In [4]:
# Add gain columns to `scores` for each metric.
metric_names = ("BLEU", "chrF++", "COMET")

# Step-wise gain between successive beam sizes: 1 -> 3, 3 -> 5, 5 -> 7
for prev_beams, next_beams in ((1, 3), (3, 5), (5, 7)):
    for metric in metric_names:
        scores[f"{metric}_num_beams_{next_beams}_gain"] = (
            scores[f"{metric}_num_beams_{next_beams}"]
            - scores[f"{metric}_num_beams_{prev_beams}"]
        )

# Total gain from num_beams=1 to num_beams=7
for metric in metric_names:
    scores[f"{metric}_gain_total"] = (
        scores[f"{metric}_num_beams_7"] - scores[f"{metric}_num_beams_1"]
    )

# Total gain expressed as a percentage of the num_beams=1 score
for metric in metric_names:
    scores[f"{metric}_gain_total_pct"] = (
        scores[f"{metric}_gain_total"] / scores[f"{metric}_num_beams_1"]
    ) * 100


In [11]:
# Rearrange columns and drop average gain columns: only the columns listed in
# `column_order` are kept, in this order.
metric_names = ("BLEU", "chrF++", "COMET")

column_order = ["lang"]

# num_beams=1: raw scores only
column_order += [f"{metric}_num_beams_1" for metric in metric_names]

# num_beams=3, 5, 7: each score followed by its gain over the previous beam size
for beams in (3, 5, 7):
    for metric in metric_names:
        column_order.append(f"{metric}_num_beams_{beams}")
        column_order.append(f"{metric}_num_beams_{beams}_gain")

# Total gain from num_beams=1 to num_beams=7
column_order += [f"{metric}_gain_total" for metric in metric_names]

# Total gain from num_beams=1 to num_beams=7 as a percentage of num_beams=1
column_order += [f"{metric}_gain_total_pct" for metric in metric_names]

scores = scores[column_order]

In [7]:
# Write the table with gain columns back to scores/scores.csv, overwriting the
# file it was loaded from (index column omitted).
scores.to_csv("scores/scores.csv", index=False)