# Translating biomedical text

## Loading and evaluation functions

In [None]:
import json
from pathlib import Path

import pandas as pd
from tqdm import tqdm

from evaluations.models import TranslationModel


def load_all_sentences(test_dataset: str) -> pd.DataFrame:
    """Load a JSON Lines parallel corpus into a dataframe.

    Every non-blank line of the file is parsed as a JSON object with an 'en'
    and an 'es' field. Leading and trailing double-quote characters are
    stripped from both values.

    :param test_dataset: path to the .jsonl file
    :return: a dataframe with one row per sentence pair and columns 'en' and 'es'
    :raises json.JSONDecodeError: if a non-blank line is not valid JSON
    :raises KeyError: if a record lacks the 'en' or the 'es' field
    """
    data = []
    # Decode explicitly as UTF-8: the Spanish side may contain non-ASCII
    # characters and the platform default encoding is not UTF-8 everywhere.
    with open(test_dataset, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. an extra newline at the end of the file).
            if not line.strip():
                continue
            loaded = json.loads(line)
            data.append({"en": loaded["en"].strip('"'), "es": loaded["es"].strip('"')})
    # Naming the columns keeps the 'en'/'es' schema even when the file is empty.
    return pd.DataFrame(data, columns=["en", "es"])


def load_sample_sentences(test_dataset: str, num_rows=500, seed=42) -> pd.DataFrame:
    """Load a reproducible random sample of a parallel corpus.

    :param test_dataset: path to the .jsonl file
    :param num_rows: maximum number of sentence pairs to sample
    :param seed: random state passed to the sampler, so repeated runs pick the same rows
    :return: a dataframe with the sampled rows; when the corpus has fewer than
        num_rows rows, all of its rows are returned (in sampled order)
    """
    df = load_all_sentences(test_dataset)
    # DataFrame.sample draws without replacement and raises ValueError when n
    # exceeds the number of rows, so cap the request at the corpus size.
    return df.sample(n=min(num_rows, len(df)), random_state=seed)


def run_model(model: TranslationModel, test_sentences: pd.DataFrame):
    """Runs a model on the test sentences.
    Creates a dataframe for the results with two columns: 'reference' and 'candidate'
    :param model: the MT model to evaluate
    :param test_sentences: a dataframe for a parallel corpus, with the source text in
        column 'en' and the reference translation in column 'es'
    :return: a dataframe with one row per input sentence, pairing the reference
        with the model's translation of the English text"""
    sentence_pairs = zip(test_sentences['en'], test_sentences['es'])
    # Collect the rows in a list and build the dataframe once: growing a dataframe
    # with df.loc[len(df)] copies the data on every insertion.
    rows = []
    for english, reference in tqdm(sentence_pairs, total=test_sentences.shape[0]):
        rows.append((reference, model.translate(english)))
    return pd.DataFrame(rows, columns=['reference', 'candidate'])


def run_model_on_datasets(translation_model: TranslationModel, dataset_names: list[str], corpus_directory: str,
                          sample: bool, output_directory: str):
    """Translate several datasets with one model and save each result as a CSV file.

    Each dataset is read from ``{corpus_directory}/{name}``, translated with
    ``run_model`` and written to ``{output_directory}/translations/{translation_model}/``
    under the dataset's name with ``.jsonl`` replaced by ``.csv``.

    :param translation_model: the MT model to run; its string form names the output subdirectory
    :param dataset_names: file names of the .jsonl datasets inside corpus_directory
    :param corpus_directory: directory that contains the dataset files
    :param sample: if True, translate the random sample returned by
        load_sample_sentences (with its defaults) instead of every sentence
    :param output_directory: root directory for the results
    """
    target_dir = Path(output_directory) / "translations" / f"{translation_model}"
    load_sentences = load_sample_sentences if sample else load_all_sentences

    for test_dataset in dataset_names:
        print(f"Test dataset: {test_dataset}")
        test_sentences = load_sentences(f"{corpus_directory}/{test_dataset}")

        results = run_model(translation_model, test_sentences)
        out_path = target_dir / test_dataset.replace(".jsonl", ".csv")
        # DataFrame.to_csv does not create missing directories, so make sure the
        # destination exists before writing (a no-op when it is already there).
        out_path.parent.mkdir(parents=True, exist_ok=True)
        results.to_csv(out_path, index=False)

## Preliminary experiments

In [None]:
from evaluations.models import HelsinkiNLP, Madlad, NLLB3B, NLLB600M

In [None]:
# Preliminary experiments read their datasets from the corpus' train directory.
corpus = "../corpus/train"
output_dir = "pretrained_experiments"
# Dataset file names, built from their stems by appending the .jsonl extension.
filenames = [
    f"{stem}.jsonl"
    for stem in (
        "clinspen-tr",
        "khresmoi-tr",
        "medline",
        "orphanet-definitions-tr",
        "orphanet-terms",
        "preferred-en2es",
        "pubmed-tr",
        "snomed",
    )
]

In [None]:
# HelsinkiNLP on a random sample of every configured dataset.
run_model_on_datasets(
    translation_model=HelsinkiNLP(),
    dataset_names=filenames,
    corpus_directory=corpus,
    sample=True,
    output_directory=output_dir,
)

In [None]:
# NLLB600M on a random sample of every configured dataset.
run_model_on_datasets(
    translation_model=NLLB600M(),
    dataset_names=filenames,
    corpus_directory=corpus,
    sample=True,
    output_directory=output_dir,
)

In [None]:
# NLLB3B on a random sample of every configured dataset.
run_model_on_datasets(
    translation_model=NLLB3B(),
    dataset_names=filenames,
    corpus_directory=corpus,
    sample=True,
    output_directory=output_dir,
)

In [None]:
# Madlad on a random sample of every configured dataset.
run_model_on_datasets(
    translation_model=Madlad(),
    dataset_names=filenames,
    corpus_directory=corpus,
    sample=True,
    output_directory=output_dir,
)

## Test data

### All models

In [None]:
from evaluations.models import FineTuned

In [None]:
# The test-data runs read their datasets from the corpus' test directory.
corpus = "../corpus/test"
output_dir = "test_data/models"
# Dataset file names, built from their stems by appending the .jsonl extension.
filenames = [
    f"{stem}.jsonl"
    for stem in (
        "clinspen-te",
        "hpo",
        "khresmoi-te",
        "orphanet-definitions-te",
        "pubmed-te",
    )
]

In [None]:
# FineTuned on every sentence of each configured dataset (no sampling).
run_model_on_datasets(
    translation_model=FineTuned(),
    dataset_names=filenames,
    corpus_directory=corpus,
    sample=False,
    output_directory=output_dir,
)

In [None]:
# HelsinkiNLP on every sentence of each configured dataset (no sampling).
run_model_on_datasets(
    translation_model=HelsinkiNLP(),
    dataset_names=filenames,
    corpus_directory=corpus,
    sample=False,
    output_directory=output_dir,
)

In [None]:
# NLLB3B on every sentence of each configured dataset (no sampling).
run_model_on_datasets(
    translation_model=NLLB3B(),
    dataset_names=filenames,
    corpus_directory=corpus,
    sample=False,
    output_directory=output_dir,
)

In [None]:
# Madlad on every sentence of each configured dataset (no sampling).
run_model_on_datasets(
    translation_model=Madlad(),
    dataset_names=filenames,
    corpus_directory=corpus,
    sample=False,
    output_directory=output_dir,
)