# Running pre-trained models on biomedical texts to evaluate them
## Infrastructure

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive so the notebook can reach files stored there.
drive.mount('/content/drive')
%cd /content/drive/MyDrive/UPM/Thesis/Experiments/Train

In [None]:
!pip install transformers tqdm sentencepiece sacremoses accelerate ipywidgets protobuf
# PyTorch (`torch`) is also required by the cells below.

In [None]:
from abc import ABC, abstractmethod


class TranslationModel(ABC):
    """Common interface for the machine-translation models evaluated in this notebook.

    Deriving from ``ABC`` makes ``@abstractmethod`` effective: the base class cannot be
    instantiated, and every subclass has to provide its own ``translate``.
    """

    def __init__(self, checkpoint_name: str):
        """Stores the checkpoint name
        :param checkpoint_name: identifier of the checkpoint; also used as the string form of the model"""
        self.checkpoint_name = checkpoint_name

    @abstractmethod
    def translate(self, source: str) -> str:
        """Translates a source text with the model
        :param source: the text to translate
        :return: str - the translation"""

    def __str__(self) -> str:
        """Returns the checkpoint name so the model prints in a readable form"""
        return self.checkpoint_name

### Madlad (Google T5)

In [None]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Show which device was selected.
print(device)


class MadladModel(TranslationModel):
    """MADLAD-400 translation model (T5 architecture) wrapped as a ``TranslationModel``.

    The tokenizer and the weights are loaded with ``from_pretrained`` and the model is moved
    to the module-level ``device``.
    """

    def __init__(self, target_language: str = 'es', checkpoint_name: str = 'jbochi/madlad400-3b-mt'):
        """Loads the tokenizer and the model
        :param target_language: language code inserted into the ``<2xx>`` prefix token that tells
            the model which language to translate into (defaults to Spanish)
        :param checkpoint_name: checkpoint to load (defaults to the 3B MADLAD-400 MT checkpoint)"""
        super().__init__(checkpoint_name)
        self.target_language = target_language
        self.tokenizer = T5Tokenizer.from_pretrained(self.checkpoint_name)
        self.model = T5ForConditionalGeneration.from_pretrained(self.checkpoint_name).to(device)

    def translate(self, source: str) -> str:
        """Translates a source text into the configured target language
        :param source: the text to translate
        :return: str - the translation"""
        # The target language is selected by prefixing the source with a "<2xx>" token.
        prompt = f"<2{self.target_language}> {source}"
        # Inputs longer than 1024 tokens are truncated rather than rejected.
        input_ids = self.tokenizer(prompt, max_length=1024, truncation=True,
                                   return_tensors="pt").input_ids.to(device)
        outputs = self.model.generate(input_ids=input_ids, max_new_tokens=1024, num_beams=4, early_stopping=True)
        # Only one sequence is passed in, so the first output is the translation.
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)


## Loading and evaluation functions

In [None]:
import json

import pandas as pd
from tqdm import tqdm


def load_sentences(test_dataset: str, num_rows: int = 500, seed: int = 42) -> pd.DataFrame:
    """Loads a test dataset in .jsonl format into a dataframe and randomly selects up to n rows
    :param test_dataset: filename of the test dataset; every non-blank line is a JSON object
        with an 'en' and an 'es' field
    :param num_rows: number of rows to select; when the dataset is smaller, all rows are returned
        (in shuffled order)
    :param seed: random seed for reproducibility
    :return pd.Dataframe: the parallel corpus as a dataframe with the columns 'en' and 'es'"""
    data = []
    # Explicit UTF-8 so non-ASCII characters do not depend on the platform's default encoding.
    with open(test_dataset, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            if not line.strip():
                continue
            loaded = json.loads(line)
            # Some entries carry literal quote characters around the sentence; drop them.
            data.append({"en": loaded['en'].strip('"'), "es": loaded['es'].strip('"')})

    df = pd.DataFrame(data, columns=["en", "es"])
    if df.empty:
        return df
    # Never ask for more rows than exist: DataFrame.sample raises ValueError in that case.
    return df.sample(n=min(num_rows, len(df)), random_state=seed)


def run_model(model: TranslationModel, test_sentences: pd.DataFrame) -> pd.DataFrame:
    """Runs a model on the test sentences and collects its output next to the reference translations
    :param model: the MT model to evaluate
    :param test_sentences: a dataframe for a parallel corpus with the columns 'en' and 'es'
    :return pd.DataFrame: one row per test sentence with the columns 'reference' (the 'es' text)
        and 'actual' (the model's translation of the 'en' text)"""
    rows = []
    sentence_pairs = zip(test_sentences['en'], test_sentences['es'])
    # zip() has no length, so the total is passed explicitly for tqdm to show real progress.
    for english, spanish in tqdm(sentence_pairs, total=len(test_sentences)):
        rows.append({'reference': spanish, 'actual': model.translate(english)})

    # Build the frame once at the end instead of growing it row by row.
    return pd.DataFrame(rows, columns=['reference', 'actual'])


def evaluate_on_all_test_data(translation_model: TranslationModel, dataset_names: list[str],
                              corpus_directory: str, output_directory: str = "madlad/") -> None:
    """Runs a model on a sample of every test dataset and writes each result to a CSV file
    :param translation_model: the MT model to evaluate
    :param dataset_names: filenames of the test datasets inside corpus_directory
    :param corpus_directory: directory holding the test datasets (with or without a trailing slash)
    :param output_directory: directory for the result files; created when it does not exist"""
    import os  # local import keeps this function independent of the notebook's other cells

    if output_directory:
        os.makedirs(output_directory, exist_ok=True)

    for test_dataset in dataset_names:
        test_path = os.path.join(corpus_directory, test_dataset)
        print(f"Test dataset: {test_dataset}")
        test_sentences = load_sentences(test_path, num_rows=500)
        results = run_model(translation_model, test_sentences)
        # Same base name as the dataset, with the extension swapped for .csv.
        output_name = os.path.splitext(test_dataset)[0] + ".csv"
        results.to_csv(os.path.join(output_directory, output_name), index=False)

## Evaluating the models

In [None]:
# Directory holding the test corpora, relative to the working directory.
test_directory = "data/"

# Test sets to evaluate, one .jsonl file each.
filenames = [
    "abstracts-tr.jsonl",
    "clinspen-tr.jsonl",
    "khresmoi-tr.jsonl",
    "medline.jsonl",
    "orphanet-definitions-tr.jsonl",
    "orphanet-terms.jsonl",
    "preferred-en2es.jsonl",
    "snomed.jsonl",
]

In [None]:
# Load the MADLAD model and translate a sample from every test set; results are written as CSV files under madlad/.
evaluate_on_all_test_data(MadladModel(), filenames, test_directory)