<a href="https://colab.research.google.com/github/ymoslem/Adaptive-MT-LLM-Fine-tuning/blob/main/Evaluation-Adaptive-MT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Evaluation with BLEU, chrF++, TER, and COMET

This notebook is part of the repository [Adaptive-MT-LLM-Fine-tuning](https://github.com/ymoslem/Adaptive-MT-LLM-Fine-tuning).

## Load files

In [None]:
import os

# Root data folder (presumably on a mounted Google Drive, given the /content/drive prefix)
data_path = "/content/drive/MyDrive/data/"
# Sub-folder holding the files used in this notebook
directory = os.path.join(data_path, "spanish")

# Switch into the dataset folder: the cells below open their files by bare name
os.chdir(directory)
# Last expression of the cell: displays the working directory now in effect
os.getcwd()

In [None]:
# Load test datasets
#
# Later cells pair source_sentences[i] with target_sentences[i], so both files
# must contain the same number of lines.

source_test_file = "all-filtered.es.real.test"
target_test_file = "all-filtered.en.real.test"

with open(source_test_file, encoding="utf-8") as source, open(target_test_file, encoding="utf-8") as target:
  # A text file object yields its lines directly; no need to build readlines() first
  source_sentences = [sent.strip() for sent in source]
  target_sentences = [sent.strip() for sent in target]

# Fail here, with a clear message, rather than much later during scoring
assert len(source_sentences) == len(target_sentences), (
    f"Misaligned test set: {len(source_sentences)} source lines "
    f"vs {len(target_sentences)} target lines"
)

# Show the first segment pair as a sanity check
print(source_sentences[0])
print(target_sentences[0])

Período de validez después de abierto el envase: 10 horas.
Shelf life after first opening the container: 10 hours.


In [None]:
# Read the translations

translations_file_name = "translations.en"

# One hypothesis per line, with surrounding whitespace removed
with open(translations_file_name, encoding="utf-8") as hyp_file:
  translations = list(map(str.strip, hyp_file))

# Show which file was loaded, followed by its first five segments
print(translations_file_name, "\n")
print("\n".join(translations[:5]))

In [None]:
# Display how many translated segments were read
len(translations)

10000

In [None]:
# Check if there are missing translations
# Positions whose translation is empty or whitespace-only
missing_ids = [pos for pos, hyp in enumerate(translations) if not hyp.strip()]

for pos in missing_ids:
  # Print the position with its source and reference so the gap can be inspected
  report = (pos, source_sentences[pos].strip(), target_sentences[pos].strip())
  print(*report, sep="\n", end="\n\n")

count = len(missing_ids)
print("Missing translations:", count)

Missing translations: 0


## Evaluation

## Calculate BLEU, chrF++, and TER

In [None]:
!pip3 install sacrebleu sentencepiece -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.7/119.7 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import sacrebleu

# Hypotheses (translations) are scored against a single set of references
references = target_sentences

# Hypothesis i is scored against reference i, so the two lists must line up
assert len(translations) == len(references), (
    f"Misaligned inputs: {len(translations)} translations "
    f"vs {len(references)} references"
)

# Each metric keeps its result object and its rounded score under separate
# names, so `bleu`, `chrf` and `ter` are always plain numbers.

# Calculate BLEU
bleu_result = sacrebleu.corpus_bleu(translations, [references])  # for spBLEU: tokenize='flores200'
bleu = round(bleu_result.score, 2)
print("BLEU:", bleu)

# Calculate CHRF (word_order=2 gives chrF++)
chrf_result = sacrebleu.corpus_chrf(translations, [references], word_order=2)
chrf = round(chrf_result.score, 2)
print("CHRF:", chrf)

# Calculate TER
metric = sacrebleu.metrics.TER()
ter_result = metric.corpus_score(translations, [references])
ter = round(ter_result.score, 2)
print("TER:", ter)

## Calculate COMET

In [None]:
import os
os.environ['TRANSFORMERS_CACHE'] = "/content/drive/MyDrive/models/"

!pip3 install unbabel-comet -q

# !pip3 install git+https://github.com/Unbabel/COMET.git -q

In [None]:
from comet import download_model, load_from_checkpoint
import pandas as pd

references = target_sentences

# Calculate COMET
# One record per segment with the keys "src", "mt" and "ref". Building it through a
# DataFrame also raises if the three lists differ in length.
df = pd.DataFrame({"src":source_sentences, "mt":translations, "ref":references})
data = df.to_dict('records')
# model_path = download_model("wmt20-comet-da")  # to download the model if you did not yet
model_path = "/content/drive/MyDrive/models/wmt20-comet-da/checkpoints/model.ckpt"
model = load_from_checkpoint(model_path)

# Read the outputs by name. Unpacking `.values()` positionally only works while the
# prediction holds exactly two fields in this order.
# Key names follow the COMET 2.x prediction output - confirm for the installed version.
prediction = model.predict(data, batch_size=128, gpus=1)
seg_scores = prediction["scores"]
sys_score = prediction["system_score"]

# Report the system-level score scaled by 100 and rounded to two decimals
comet = round(sys_score*100, 2)
print("COMET:", comet)

In [None]:
# Gather the four system-level scores into a polars dataframe
import polars as pl

# Column name -> score computed in the cells above
metric_scores = {
    "BLEU": bleu,
    "ChrF++": chrf,
    "TER": ter,
    "COMET": comet,
}
df = pl.DataFrame(metric_scores)

df.head()