# Evaluating results of final models

In [12]:
# !pip install evaluate sentence_transformers jiwer torchmetrics

In [14]:
from evaluations.corpus_similarity import CorpusSimilarity
import pandas as pd
import os
import csv


def evaluate_for_model(model_name: str) -> None:
    """Score every translation file of one model and store the results as CSV.

    Walks ``translations/<model_name>/`` recursively, loads each file found
    with ``pandas.read_csv`` and evaluates every member of
    ``CorpusSimilarity`` on its ``reference`` and ``actual`` columns. One row
    per input file is written to ``metrics/<model_name>.csv`` (the file is
    overwritten on every call) and the individual scores are printed.

    Args:
        model_name: Name of the sub-folder below ``translations/``. It is also
            used as the stem of the output CSV and as the value of the
            ``model`` column.

    Raises:
        KeyError: If a translation file lacks a ``reference`` or ``actual``
            column.
    """
    folder_path = f"translations/{model_name}/"
    csv_filepath = f"metrics/{model_name}.csv"

    # open() does not create missing parent folders, so make sure the output
    # folder exists before writing.
    os.makedirs(os.path.dirname(csv_filepath), exist_ok=True)

    with open(csv_filepath, 'w', newline='') as csv_file:
        # NOTE(review): the "'ter'" column name contains literal quote
        # characters. It is kept unchanged because existing consumers of the
        # CSV may look the column up under exactly this name - confirm.
        field_names = ['model', 'filename', 'sacrebleu', "'ter'", 'semsim']
        writer = csv.DictWriter(csv_file, fieldnames=field_names)
        # Mode 'w' truncates the file, so the header is always required.
        writer.writeheader()

        for root, _dirs, files in os.walk(folder_path):
            for file_name in files:
                file_path = os.path.join(root, file_name)
                df = pd.read_csv(file_path)
                print(f"File: {file_path}")

                # os.walk already yields the bare file name (e.g.
                # "testset.csv"), independent of the platform's separator.
                line = {'model': model_name, 'filename': file_name}

                # The column lists are identical for every metric; build them
                # once per file instead of once per metric.
                references = df["reference"].tolist()
                hypotheses = df["actual"].tolist()

                for metric in CorpusSimilarity:
                    score = metric.evaluate(references, hypotheses)
                    print(f"\t{metric.name}: {score}")

                    # Compare against the enum that is actually imported and
                    # iterated; assumes it defines SACREBLEU and TER members.
                    if metric == CorpusSimilarity.SACREBLEU:
                        line['sacrebleu'] = score
                    elif metric == CorpusSimilarity.TER:
                        line["'ter'"] = score
                    else:
                        # Any remaining metric is stored as semantic similarity.
                        line['semsim'] = score

                writer.writerow(line)

In [15]:
# Score the translations stored under translations/finetuned-cl/ and write
# the per-file metrics to metrics/finetuned-cl.csv.
evaluate_for_model("finetuned-cl")

In [16]:
# evaluate_for_model("finetuned-all")

In [17]:
# evaluate_for_model("helsinki-nlp")

In [18]:
# evaluate_for_model("madlad")

In [19]:
# evaluate_for_model("nllb-3B")