# Evaluating results of final models

In [6]:
import csv
import os

import pandas as pd

from evaluation.similarity.corpus_similarity import CorpusSimilarity


def evaluate_for_model(
    folder_path: str,
    model_name: str,
    *,
    translation_column: str = "actual",
) -> None:
    """Score every translation CSV of one model and write a metrics CSV.

    Walks ``{folder_path}/translations/{model_name}/`` recursively, loads each
    ``.csv`` file with pandas and evaluates every ``CorpusSimilarity`` metric
    on the file's ``reference`` column against its translation column. Each
    score is printed, and one row per file is written to
    ``{folder_path}/metrics/{model_name}.csv``, which is overwritten on every
    call.

    Args:
        folder_path: Experiment folder that contains the ``translations``
            directory. The ``metrics`` directory is created inside it when
            missing.
        model_name: Name of the model sub-folder under ``translations``; also
            used as the metrics file name and as the ``model`` column value.
        translation_column: Column holding the text that is compared with
            ``reference``. Defaults to ``"actual"``.

    Raises:
        FileNotFoundError: If the translations directory does not exist.
        KeyError: If a CSV lacks the ``reference`` or translation column.
    """
    translations_path = f"{folder_path}/translations/{model_name}/"
    output_csv = f"{folder_path}/metrics/{model_name}.csv"

    # Fail before touching the output file: os.walk() silently yields nothing
    # for a missing directory, which would leave a header-only metrics CSV.
    if not os.path.isdir(translations_path):
        raise FileNotFoundError(
            f"Translations directory not found: {translations_path}"
        )

    os.makedirs(os.path.dirname(output_csv), exist_ok=True)

    # Metrics without an entry here are still evaluated and printed, but their
    # score is not written to the CSV.
    metric_columns = {
        CorpusSimilarity.SACREBLEU: 'sacrebleu',
        CorpusSimilarity.TER: "`ter`",
        CorpusSimilarity.SEMANTIC_SIMILARITY: 'semsim',
    }

    with open(output_csv, 'w', newline='', encoding='utf-8') as csv_file:
        field_names = ['model', 'filename', 'sacrebleu', "`ter`", 'semsim']
        writer = csv.DictWriter(csv_file, fieldnames=field_names)
        # Mode 'w' always starts from an empty file, so the header is
        # written unconditionally.
        writer.writeheader()

        for root, _dirs, files in os.walk(translations_path):
            for file_name in files:
                # Ignore stray non-CSV files (editor or OS metadata) that
                # pandas could not parse as a translation table.
                if not file_name.lower().endswith('.csv'):
                    continue

                file_path = os.path.join(root, file_name)
                df = pd.read_csv(file_path)
                print(f"File: {file_path}")

                # Build the column lists once per file instead of per metric.
                references = df["reference"].tolist()
                hypotheses = df[translation_column].tolist()

                # The dataset is identified by the bare file name.
                row = {'model': model_name, 'filename': file_name}

                for metric in CorpusSimilarity:
                    score = metric.evaluate(references, hypotheses)
                    print(f"\t{metric.name}: {score}")

                    column = metric_columns.get(metric)
                    if column is not None:
                        row[column] = score

                writer.writerow(row)

## Pre-trained experiments

In [7]:
folder = "pretrained_experiments"

In [8]:
evaluate_for_model(folder, "helsinki-nlp")

File: pretrained_experiments/translations/helsinki-nlp/orphanet-definitions-tr.csv
	SACREBLEU: 45.6
	TER: 61.6
	SEMANTIC_SIMILARITY: 93.6
File: pretrained_experiments/translations/helsinki-nlp/clinspen-tr.csv
	SACREBLEU: 39.6
	TER: 55.6
	SEMANTIC_SIMILARITY: 92.2
File: pretrained_experiments/translations/helsinki-nlp/orphanet-terms.csv
	SACREBLEU: 46.0
	TER: 61.4
	SEMANTIC_SIMILARITY: 94.5
File: pretrained_experiments/translations/helsinki-nlp/medline.csv
	SACREBLEU: 54.5
	TER: 70.4
	SEMANTIC_SIMILARITY: 93.7
File: pretrained_experiments/translations/helsinki-nlp/pubmed-tr.csv
	SACREBLEU: 42.9
	TER: 55.8
	SEMANTIC_SIMILARITY: 92.3
File: pretrained_experiments/translations/helsinki-nlp/khresmoi-tr.csv
	SACREBLEU: 48.0
	TER: 66.1
	SEMANTIC_SIMILARITY: 95.5
File: pretrained_experiments/translations/helsinki-nlp/snomed.csv
	SACREBLEU: 28.2
	TER: 41.1
	SEMANTIC_SIMILARITY: 81.4
File: pretrained_experiments/translations/helsinki-nlp/preferred-en2es.csv
	SACREBLEU: 27.1
	TER: 37.6
	SEMANTIC_S

In [9]:
evaluate_for_model(folder, "madlad")

File: pretrained_experiments/translations/madlad/orphanet-definitions-tr.csv
	SACREBLEU: 50.7
	TER: 64.3
	SEMANTIC_SIMILARITY: 93.9
File: pretrained_experiments/translations/madlad/clinspen-tr.csv
	SACREBLEU: 39.3
	TER: 50.1
	SEMANTIC_SIMILARITY: 90.7
File: pretrained_experiments/translations/madlad/orphanet-terms.csv
	SACREBLEU: 53.4
	TER: 66.7
	SEMANTIC_SIMILARITY: 95.5
File: pretrained_experiments/translations/madlad/medline.csv
	SACREBLEU: 56.4
	TER: 71.1
	SEMANTIC_SIMILARITY: 94.0
File: pretrained_experiments/translations/madlad/pubmed-tr.csv
	SACREBLEU: 45.7
	TER: 57.9
	SEMANTIC_SIMILARITY: 92.6
File: pretrained_experiments/translations/madlad/khresmoi-tr.csv
	SACREBLEU: 48.3
	TER: 66.5
	SEMANTIC_SIMILARITY: 95.5
File: pretrained_experiments/translations/madlad/snomed.csv
	SACREBLEU: 31.5
	TER: 42.5
	SEMANTIC_SIMILARITY: 82.1
File: pretrained_experiments/translations/madlad/preferred-en2es.csv
	SACREBLEU: 16.9
	TER: 15.2
	SEMANTIC_SIMILARITY: 84.7


In [10]:
evaluate_for_model(folder, "nllb-3B")

File: pretrained_experiments/translations/nllb-3B/orphanet-definitions-tr.csv
	SACREBLEU: 45.3
	TER: 59.7
	SEMANTIC_SIMILARITY: 93.8
File: pretrained_experiments/translations/nllb-3B/clinspen-tr.csv
	SACREBLEU: 36.6
	TER: 48.0
	SEMANTIC_SIMILARITY: 89.5
File: pretrained_experiments/translations/nllb-3B/orphanet-terms.csv
	SACREBLEU: 38.4
	TER: 53.1
	SEMANTIC_SIMILARITY: 93.2
File: pretrained_experiments/translations/nllb-3B/medline.csv
	SACREBLEU: 51.9
	TER: 67.2
	SEMANTIC_SIMILARITY: 93.2
File: pretrained_experiments/translations/nllb-3B/pubmed-tr.csv
	SACREBLEU: 43.5
	TER: 55.1
	SEMANTIC_SIMILARITY: 92.2
File: pretrained_experiments/translations/nllb-3B/khresmoi-tr.csv
	SACREBLEU: 46.7
	TER: 65.0
	SEMANTIC_SIMILARITY: 95.4
File: pretrained_experiments/translations/nllb-3B/snomed.csv
	SACREBLEU: 18.7
	TER: 12.6
	SEMANTIC_SIMILARITY: 79.7
File: pretrained_experiments/translations/nllb-3B/preferred-en2es.csv
	SACREBLEU: 12.7
	TER: 0.5
	SEMANTIC_SIMILARITY: 81.3


In [11]:
evaluate_for_model(folder, "nllb-600M")

File: pretrained_experiments/translations/nllb-600M/orphanet-definitions-tr.csv
	SACREBLEU: 35.7
	TER: 51.3
	SEMANTIC_SIMILARITY: 92.5
File: pretrained_experiments/translations/nllb-600M/clinspen-tr.csv
	SACREBLEU: 36.0
	TER: 49.5
	SEMANTIC_SIMILARITY: 91.0
File: pretrained_experiments/translations/nllb-600M/orphanet-terms.csv
	SACREBLEU: 38.4
	TER: 53.1
	SEMANTIC_SIMILARITY: 93.2
File: pretrained_experiments/translations/nllb-600M/medline.csv
	SACREBLEU: 51.9
	TER: 67.2
	SEMANTIC_SIMILARITY: 93.2
File: pretrained_experiments/translations/nllb-600M/pubmed-tr.csv
	SACREBLEU: 43.5
	TER: 55.1
	SEMANTIC_SIMILARITY: 92.2
File: pretrained_experiments/translations/nllb-600M/khresmoi-tr.csv
	SACREBLEU: 45.2
	TER: 63.9
	SEMANTIC_SIMILARITY: 95.1
File: pretrained_experiments/translations/nllb-600M/snomed.csv
	SACREBLEU: 18.7
	TER: 12.6
	SEMANTIC_SIMILARITY: 79.7
File: pretrained_experiments/translations/nllb-600M/preferred-en2es.csv
	SACREBLEU: 12.7
	TER: 0.5
	SEMANTIC_SIMILARITY: 81.3


## Test experiments

### No processing

In [12]:
folder = "test_data/models"

In [14]:
evaluate_for_model(folder, "finetuned-all")

File: test_data/models/translations/finetuned-all/pubmed-te.csv
	SACREBLEU: 46.3
	TER: 59.7
	SEMANTIC_SIMILARITY: 93.5
File: test_data/models/translations/finetuned-all/clinspen-te.csv
	SACREBLEU: 55.4
	TER: 67.7
	SEMANTIC_SIMILARITY: 94.1
File: test_data/models/translations/finetuned-all/hpo.csv
	SACREBLEU: 49.2
	TER: 59.7
	SEMANTIC_SIMILARITY: 92.6
File: test_data/models/translations/finetuned-all/orphanet-definitions-te.csv
	SACREBLEU: 61.6
	TER: 73.6
	SEMANTIC_SIMILARITY: 96.0
File: test_data/models/translations/finetuned-all/khresmoi-te.csv
	SACREBLEU: 47.9
	TER: 65.7
	SEMANTIC_SIMILARITY: 95.7


In [15]:
evaluate_for_model(folder, "finetuned-cl")

File: test_data/models/translations/finetuned-cl/pubmed-te.csv
	SACREBLEU: 48.5
	TER: 61.6
	SEMANTIC_SIMILARITY: 94.0
File: test_data/models/translations/finetuned-cl/clinspen-te.csv
	SACREBLEU: 41.9
	TER: 56.6
	SEMANTIC_SIMILARITY: 92.8
File: test_data/models/translations/finetuned-cl/hpo.csv
	SACREBLEU: 50.2
	TER: 61.9
	SEMANTIC_SIMILARITY: 92.6
File: test_data/models/translations/finetuned-cl/orphanet-definitions-te.csv
	SACREBLEU: 57.7
	TER: 71.5
	SEMANTIC_SIMILARITY: 95.4
File: test_data/models/translations/finetuned-cl/khresmoi-te.csv
	SACREBLEU: 48.6
	TER: 66.6
	SEMANTIC_SIMILARITY: 95.8


In [16]:
evaluate_for_model(folder, "helsinki-nlp")

File: test_data/models/translations/helsinki-nlp/pubmed-te.csv
	SACREBLEU: 48.7
	TER: 62.0
	SEMANTIC_SIMILARITY: 94.0
File: test_data/models/translations/helsinki-nlp/clinspen-te.csv
	SACREBLEU: 40.2
	TER: 55.1
	SEMANTIC_SIMILARITY: 92.1
File: test_data/models/translations/helsinki-nlp/hpo.csv
	SACREBLEU: 48.2
	TER: 60.1
	SEMANTIC_SIMILARITY: 92.2
File: test_data/models/translations/helsinki-nlp/orphanet-definitions-te.csv
	SACREBLEU: 46.4
	TER: 62.2
	SEMANTIC_SIMILARITY: 93.9
File: test_data/models/translations/helsinki-nlp/khresmoi-te.csv
	SACREBLEU: 49.6
	TER: 67.6
	SEMANTIC_SIMILARITY: 95.9


In [17]:
evaluate_for_model(folder, "madlad")

File: test_data/models/translations/madlad/pubmed-te.csv
	SACREBLEU: 51.8
	TER: 63.8
	SEMANTIC_SIMILARITY: 94.3
File: test_data/models/translations/madlad/clinspen-te.csv
	SACREBLEU: 38.8
	TER: 48.5
	SEMANTIC_SIMILARITY: 90.5
File: test_data/models/translations/madlad/hpo.csv
	SACREBLEU: 54.2
	TER: 63.0
	SEMANTIC_SIMILARITY: 92.8
File: test_data/models/translations/madlad/orphanet-definitions-te.csv
	SACREBLEU: 50.8
	TER: 64.0
	SEMANTIC_SIMILARITY: 93.8
File: test_data/models/translations/madlad/khresmoi-te.csv
	SACREBLEU: 50.1
	TER: 68.2
	SEMANTIC_SIMILARITY: 96.1


In [18]:
evaluate_for_model(folder, "nllb-3B")

File: test_data/models/translations/nllb-3B/pubmed-te.csv
	SACREBLEU: 49.3
	TER: 61.8
	SEMANTIC_SIMILARITY: 93.9
File: test_data/models/translations/nllb-3B/clinspen-te.csv
	SACREBLEU: 36.1
	TER: 45.4
	SEMANTIC_SIMILARITY: 90.0
File: test_data/models/translations/nllb-3B/hpo.csv
	SACREBLEU: 45.1
	TER: 51.4
	SEMANTIC_SIMILARITY: 90.5
File: test_data/models/translations/nllb-3B/orphanet-definitions-te.csv
	SACREBLEU: 45.9
	TER: 60.6
	SEMANTIC_SIMILARITY: 93.7
File: test_data/models/translations/nllb-3B/khresmoi-te.csv
	SACREBLEU: 49.3
	TER: 67.4
	SEMANTIC_SIMILARITY: 96.0


### Processing techniques

In [21]:
def evaluate(folder_path: str, *, translation_column: str = "translation") -> None:
    """Score every translation CSV in a folder and write a metrics CSV.

    Walks ``{folder_path}/translations/`` recursively, loads each ``.csv``
    file with pandas and evaluates every ``CorpusSimilarity`` metric on the
    file's ``reference`` column against its translation column. Each score is
    printed, and one row per file is written to ``{folder_path}/metrics.csv``,
    which is overwritten on every call.

    Args:
        folder_path: Experiment folder that contains the ``translations``
            directory; the metrics file is written directly inside it.
        translation_column: Column holding the text that is compared with
            ``reference``. Defaults to ``"translation"``.

    Raises:
        FileNotFoundError: If the translations directory does not exist.
        KeyError: If a CSV lacks the ``reference`` or translation column.
    """
    translations_path = f"{folder_path}/translations/"
    output_csv = f"{folder_path}/metrics.csv"

    # Fail before touching the output file: os.walk() silently yields nothing
    # for a missing directory, which would leave a header-only metrics CSV.
    if not os.path.isdir(translations_path):
        raise FileNotFoundError(
            f"Translations directory not found: {translations_path}"
        )

    # Metrics without an entry here are still evaluated and printed, but their
    # score is not written to the CSV.
    metric_columns = {
        CorpusSimilarity.SACREBLEU: 'sacrebleu',
        CorpusSimilarity.TER: "`ter`",
        CorpusSimilarity.SEMANTIC_SIMILARITY: 'semsim',
    }

    with open(output_csv, 'w', newline='', encoding='utf-8') as csv_file:
        field_names = ['filename', 'sacrebleu', "`ter`", 'semsim']
        writer = csv.DictWriter(csv_file, fieldnames=field_names)
        # Mode 'w' always starts from an empty file, so the header is
        # written unconditionally.
        writer.writeheader()

        for root, _dirs, files in os.walk(translations_path):
            for file_name in files:
                # Ignore stray non-CSV files (editor or OS metadata) that
                # pandas could not parse as a translation table.
                if not file_name.lower().endswith('.csv'):
                    continue

                file_path = os.path.join(root, file_name)
                df = pd.read_csv(file_path)
                print(f"File: {file_path}")

                # Build the column lists once per file instead of per metric.
                references = df["reference"].tolist()
                hypotheses = df[translation_column].tolist()

                # The dataset is identified by the bare file name.
                row = {'filename': file_name}

                for metric in CorpusSimilarity:
                    score = metric.evaluate(references, hypotheses)
                    print(f"\t{metric.name}: {score}")

                    column = metric_columns.get(metric)
                    if column is not None:
                        row[column] = score

                writer.writerow(row)

#### Abbreviation expansion

In [22]:
evaluate("test_data/abbreviation_expansion")

File: test_data/abbreviation_expansion/translations/pubmed-te.csv
	SACREBLEU: 45.4
	TER: 58.4
	SEMANTIC_SIMILARITY: 92.2
File: test_data/abbreviation_expansion/translations/clinspen-te.csv
	SACREBLEU: 50.2
	TER: 63.4
	SEMANTIC_SIMILARITY: 93.1
File: test_data/abbreviation_expansion/translations/hpo.csv
	SACREBLEU: 47.7
	TER: 58.3
	SEMANTIC_SIMILARITY: 92.0
File: test_data/abbreviation_expansion/translations/orphanet-definitions-te.csv
	SACREBLEU: 61.2
	TER: 73.2
	SEMANTIC_SIMILARITY: 95.7
File: test_data/abbreviation_expansion/translations/khresmoi-te.csv
	SACREBLEU: 47.1
	TER: 64.9
	SEMANTIC_SIMILARITY: 94.7


#### Synonym replacement

In [23]:
evaluate("test_data/synonym_replacement")

File: test_data/synonym_replacement/translations/pubmed-te.csv
	SACREBLEU: 45.8
	TER: 59.2
	SEMANTIC_SIMILARITY: 93.4
File: test_data/synonym_replacement/translations/clinspen-te.csv
	SACREBLEU: 54.3
	TER: 66.2
	SEMANTIC_SIMILARITY: 93.5
File: test_data/synonym_replacement/translations/hpo.csv
	SACREBLEU: 46.8
	TER: 56.9
	SEMANTIC_SIMILARITY: 91.6
File: test_data/synonym_replacement/translations/orphanet-definitions-te.csv
	SACREBLEU: 58.9
	TER: 71.7
	SEMANTIC_SIMILARITY: 95.3
File: test_data/synonym_replacement/translations/khresmoi-te.csv
	SACREBLEU: 46.1
	TER: 64.4
	SEMANTIC_SIMILARITY: 95.0
