# Evaluating results of final models

In [12]:
# !pip install evaluate sentence_transformers jiwer torchmetrics

In [14]:
from evaluations.corpus_similarity import CorpusSimilarity
import pandas as pd
import os
import csv


def evaluate_for_model(folder_path: str, model_name: str):
    translations_path = f"{folder_path}/translations/"
    output_csv = f"{folder_path}/metrics/{model_name}.csv"

    with open(output_csv, 'w', newline='') as csv_file:
        field_names = ['model', 'filename', 'sacrebleu', "'ter'", 'semsim']
        writer = csv.DictWriter(csv_file, fieldnames=field_names)
        if csv_file.tell() == 0:
            writer.writeheader()

        for root, dirs, files in os.walk(translations_path):
            for file_name in files:
                file_path = os.path.join(root, file_name)
                df = pd.read_csv(file_path)
                print(f"File: {file_path}")

                # file_path looks like translations/model/dataset.csv
                dataset = file_path.split('/')[-1]
                line = {'model': model_name, 'filename': dataset}

                for metric in CorpusSimilarity:
                    score = metric.evaluate(df["reference"].tolist(), df["actual"].tolist())
                    print(f"\t{metric.name}: {score}")

                    match metric:
                        case CorpusSimilarity.SACREBLEU:
                            line['sacrebleu'] = score
                        case CorpusSimilarity.TER:
                            line["'ter'"] = score
                        case CorpusSimilarity.SEMANTIC_SIMILARITY:
                            line['semsim'] = score

                writer.writerow(line)

## Pre-trained experiments

In [None]:
# Experiment root for the pre-trained runs: evaluate_for_model reads
# <folder>/translations/ and writes <folder>/metrics/<model>.csv.
folder = "pretrained_experiments"

In [None]:
evaluate_for_model(folder, "helsinki-nlp")

In [None]:
evaluate_for_model(folder, "madlad")

In [None]:
evaluate_for_model(folder, "nllb-3B")

In [None]:
evaluate_for_model(folder, "nllb-600M")

## Test experiments

### No processing

In [15]:
# Rebinds `folder`, so the evaluate_for_model calls in the following cells
# read from and write to the test-data experiment root.
folder = "test_data/models"

In [16]:
evaluate_for_model(folder, "finetuned-all")

In [None]:
evaluate_for_model(folder, "finetuned-cl")

In [17]:
evaluate_for_model(folder, "helsinki-nlp")

In [18]:
evaluate_for_model(folder, "madlad")

In [19]:
evaluate_for_model(folder, "nllb-3B")

### Processing techniques

In [None]:
def evaluate(folder_path: str):
    translations_path = f"{folder_path}/translations/"
    output_csv = f"{folder_path}/metrics.csv"

    with open(output_csv, 'w', newline='') as csv_file:
        field_names = ['model', 'filename', 'sacrebleu', "'ter'", 'semsim']
        writer = csv.DictWriter(csv_file, fieldnames=field_names)
        if csv_file.tell() == 0:
            writer.writeheader()

        for root, dirs, files in os.walk(translations_path):
            for file_name in files:
                file_path = os.path.join(root, file_name)
                df = pd.read_csv(file_path)
                print(f"File: {file_path}")

                # file_path looks like translations/dataset.csv
                dataset = file_path.split('/')[-1]
                line = {'filename': dataset}

                for metric in CorpusSimilarity:
                    score = metric.evaluate(df["reference"].tolist(), df["actual"].tolist())
                    print(f"\t{metric.name}: {score}")

                    match metric:
                        case CorpusSimilarity.SACREBLEU:
                            line['sacrebleu'] = score
                        case CorpusSimilarity.TER:
                            line["'ter'"] = score
                        case CorpusSimilarity.SEMANTIC_SIMILARITY:
                            line['semsim'] = score

                writer.writerow(line)

#### Abbreviation expansion

#### Synonym replacement