# Evaluating results of pre-trained models

In [1]:
!pip install evaluate sentence_transformers jiwer

[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[0m

In [2]:
from enum import Enum

from evaluate import load
from sentence_transformers import SentenceTransformer, util


class SimilarityMetric(Enum):
    """Enum for string similarity metrics. Each metric must implement the evaluate method."""
    SACREBLEU = 0
    WER = 1
    SEMANTIC_SIMILARITY = 2

    def evaluate(self, references: list[str], predictions: list[str]) -> float:
        """Evaluate the given similarity metric between two corpora.
        Performs simple string cleaning for whitespace and punctuation.
        :param references: list of references (official translations)
        :param predictions: list of candidates (model translations)
        :return: average similarity score when evaluating this specific metric on the corpora
        """
        if self == SimilarityMetric.SACREBLEU:
            # sacrebleu expects a list of references for each candidate
            references = [[ref] for ref in references]
            predictions = [cand for cand in predictions]
            sacrebleu = load("sacrebleu")
            results = sacrebleu.compute(predictions=predictions, references=references)
            return round(results["score"], 1)

        elif self == SimilarityMetric.WER:
            wer = load("wer")
            # Normally 0 is the best WER so we invert such that 1 is the best to match other metrics
            wer_score = 1 - wer.compute(predictions=predictions, references=references)
            return round(wer_score * 100, 1)

        else:  # semantic similarity
            similarity_model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")
            n = len(references)
            score = 0
            for reference, candidate in zip(references, predictions):
                reference_embedding = similarity_model.encode(reference)
                candidate_embedding = similarity_model.encode(candidate)
                cosine_similarity = util.cos_sim(reference_embedding, candidate_embedding)
                score += cosine_similarity[0].item()
            # Return as percentage
            return round(score / n * 100, 1)

In [3]:
import pandas as pd
import os
import csv


def evaluate_for_model(model_name: str):
    folder_path = f"translations/{model_name}/"
    csv_filepath = f"{model_name}-metrics.csv"
    with open(csv_filepath, 'w', newline='') as csv_file:
        field_names = ['model', 'filename', 'sacrebleu', 'wer', 'semsim']
        writer = csv.DictWriter(csv_file, fieldnames=field_names)
        if csv_file.tell() == 0:
            writer.writeheader()

        for root, dirs, files in os.walk(folder_path):
            for file_name in files:
                file_path = os.path.join(root, file_name)
                df = pd.read_csv(file_path)
                print(f"File: {file_path}")

                # file_path looks like translations/model/testset.csv
                test_set = file_path.split('/')[-1]
                line = {'model': model_name, 'filename': test_set}

                for metric in SimilarityMetric:
                    score = metric.evaluate(df["reference"].tolist(), df["actual"].tolist())
                    print(f"\t{metric.name}: {score}")

                    if metric == SimilarityMetric.SACREBLEU:
                        line['sacrebleu'] = score
                    elif metric == SimilarityMetric.SEMANTIC_SIMILARITY:
                        line['semsim'] = score
                    else:
                        line['wer'] = score

                writer.writerow(line)

In [4]:
evaluate_for_model("helsinki-nlp")

File: translations/helsinki-nlp/orphanet-definitions-te.csv
	SACREBLEU: 46.3
	WER: 54.3
	SEMANTIC_SIMILARITY: 93.9
File: translations/helsinki-nlp/hpo.csv
	SACREBLEU: 47.8
	WER: 57.9


KeyboardInterrupt: 

In [None]:
evaluate_for_model("madlad")

In [None]:
evaluate_for_model("nllb-3B")