# Running pre-trained models on biomedical texts to evaluate them
## Infrastructure

In [8]:
# !pip install transformers
# !pip install tqdm
# !pip install sentencepiece
# !pip install sacremoses
# !pip install accelerate
# !pip install ipywidgets
# !pip install protobuf
# !pip install torch

In [9]:
from abc import abstractmethod


class TranslationModel:
    """Common interface for the translation checkpoints compared in this file.

    Subclasses load their own model and tokenizer and override ``translate``.
    """

    def __init__(self, checkpoint_name: str):
        """Stores the checkpoint identifier
        :param checkpoint_name: name of the checkpoint; also used as the string form of the model"""
        self.checkpoint_name = checkpoint_name

    @abstractmethod
    def translate(self, source: str) -> str:
        """Translates a source text with the model
        :param source: the text to translate
        :return: str - the translation
        :raises NotImplementedError: if a subclass did not override this method"""
        # This class does not derive from ABC, so @abstractmethod alone is not
        # enforced at instantiation time. Fail loudly instead of returning None.
        raise NotImplementedError(f"{type(self).__name__} must implement translate()")

    def __str__(self) -> str:
        return self.checkpoint_name

### Helsinki-NLP/opus-mt-en-es

In [10]:
from transformers import MarianMTModel, MarianTokenizer


class HelsinkiNLPModel(TranslationModel):
    """Translation wrapper around the Helsinki-NLP/opus-mt-en-es Marian checkpoint."""

    def __init__(self):
        """Loads the Marian model and its tokenizer from the checkpoint name."""
        super().__init__("Helsinki-NLP/opus-mt-en-es")
        self.model = MarianMTModel.from_pretrained(self.checkpoint_name)
        self.tokenizer = MarianTokenizer.from_pretrained(self.checkpoint_name)

    def translate(self, source: str) -> str:
        """Translates a source text with beam search (4 beams)
        :param source: the text to translate
        :return: str - the translation"""
        # truncation=True clips over-long inputs to the tokenizer's configured
        # maximum length instead of feeding the model more tokens than it supports.
        input_ids = self.tokenizer.encode(source, return_tensors="pt", truncation=True)
        translated_tokens = self.model.generate(input_ids, num_beams=4, early_stopping=True)
        translated_text = self.tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
        return translated_text

### facebook/nllb-200-distilled-600M

In [11]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


class NLLBModel(TranslationModel):
    """Translation wrapper around the facebook/nllb-200-distilled-600M checkpoint."""

    def __init__(self):
        """Loads the tokenizer and the model; device placement is left to device_map="auto"."""
        super().__init__("facebook/nllb-200-distilled-600M")
        self.tokenizer = AutoTokenizer.from_pretrained(self.checkpoint_name)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(self.checkpoint_name, device_map="auto")

    def translate(self, source: str) -> str:
        """Translates a source text, forcing "spa_Latn" as the first generated token
        :param source: the text to translate
        :return: str - the translation"""
        # device_map="auto" may place the model on an accelerator, so the encoded
        # inputs are moved to the model's device before generation.
        inputs = self.tokenizer(source, return_tensors="pt", truncation=True).to(self.model.device)
        # convert_tokens_to_ids works on tokenizer versions with and without the
        # older lang_code_to_id mapping.
        target_lang_id = self.tokenizer.convert_tokens_to_ids("spa_Latn")
        # A 30-token output cap cuts off translations of longer sentences, so the
        # generation budget matches the 512 new tokens used by the Madlad wrapper.
        translated_tokens = self.model.generate(
            **inputs, forced_bos_token_id=target_lang_id, max_new_tokens=512
        )
        translated_text = self.tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
        return translated_text

### Madlad (Google T5)

In [12]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Use the first CUDA device when one is available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
print(device)


class MadladModel(TranslationModel):
    """Translation wrapper around the jbochi/madlad400-3b-mt T5 checkpoint."""

    def __init__(self):
        """Loads the tokenizer and the model, then moves the model to the module-level ``device``."""
        super().__init__('jbochi/madlad400-3b-mt')
        name = self.checkpoint_name
        self.tokenizer = T5Tokenizer.from_pretrained(name)
        loaded_model = T5ForConditionalGeneration.from_pretrained(name)
        self.model = loaded_model.to(device)

    def translate(self, source: str) -> str:
        """Translates a source text prefixed with the ``<2es>`` tag, using beam search (4 beams)
        :param source: the text to translate
        :return: str - the translation"""
        prompt = f"<2es> {source}"
        # The input is clipped to 512 tokens; generation may add up to 512 new tokens.
        encoded = self.tokenizer(prompt, max_length=512, truncation=True, return_tensors="pt")
        generated = self.model.generate(
            input_ids=encoded.input_ids.to(device),
            max_new_tokens=512,
            num_beams=4,
            early_stopping=True,
        )
        return self.tokenizer.decode(generated[0], skip_special_tokens=True)


## Loading and evaluation functions

In [13]:
import json
import os

import pandas as pd
from tqdm import tqdm

from pretrained_models.preliminary_experiments.translation_model import TranslationModel, NLLBModel


def load_sentences(test_dataset: str, num_rows: int = 1000, seed: int = 42) -> pd.DataFrame:
    """Loads a test dataset in .jsonl format into a dataframe and randomly selects up to n rows
    :param test_dataset: filename of the test dataset
    :param num_rows: number of rows to select; capped at the number of rows in the file
    :param seed: random seed for reproducibility
    :return pd.Dataframe: the sampled rows of the parallel corpus as a dataframe"""
    # Explicit UTF-8 so accented characters are decoded the same way on every
    # platform, regardless of the locale's default encoding.
    with open(test_dataset, "r", encoding="utf-8") as f:
        # Blank lines (e.g. a trailing newline at the end of the file) are not JSON records.
        data = [json.loads(line) for line in f if line.strip()]
    df = pd.DataFrame(data)
    if df.empty:
        return df
    # Asking for more rows than exist would make sample() raise; return all of them instead.
    return df.sample(n=min(num_rows, len(df)), random_state=seed)


def run_models(model: TranslationModel, test_sentences: pd.DataFrame, dataset_name: str):
    """Runs a model on the test sentences and writes the results to a CSV file.
    The CSV has two columns: 'reference' (the 'es' value of each row) and 'actual' (the model's
    translation of the 'en' value). It is saved under translations/<dataset name without .jsonl>/.
    :param model: the MT model to evaluate
    :param test_sentences: a dataframe for a parallel corpus with 'en' and 'es' columns
    :param dataset_name: filename of the dataset; used to name the output folder"""

    dataset_without_extension = dataset_name.split(".jsonl")[0]
    folder = f"translations/{dataset_without_extension}/"

    print(f"\tModel: {model}")

    # Collect rows in a list and build the dataframe once; growing a dataframe
    # row by row re-allocates on every insertion.
    rows = []
    # total= gives the progress bar a known length (iterrows() is a generator).
    for _, row in tqdm(test_sentences.iterrows(), total=len(test_sentences)):
        english, spanish = row['en'], row['es']
        rows.append([spanish, model.translate(english)])
    df = pd.DataFrame(rows, columns=['reference', 'actual'])

    # The file is named after the part of the checkpoint name before the first "/".
    # NOTE(review): two checkpoints sharing that prefix would write to the same
    # file - confirm this is intended.
    result_filename = folder + model.checkpoint_name.split("/")[0] + ".csv"
    os.makedirs(os.path.dirname(result_filename), exist_ok=True)
    df.to_csv(result_filename)


def evaluate_on_all_test_data(translation_model: TranslationModel, dataset_names: list[str],
                              corpus_directory: str, num_rows: int = 3):
    """Runs one model on a random sample of every listed dataset
    :param translation_model: the MT model to evaluate
    :param dataset_names: filenames of the .jsonl datasets inside corpus_directory
    :param corpus_directory: directory containing the datasets (with or without a trailing separator)
    :param num_rows: number of rows to sample from each dataset"""
    for test_dataset in dataset_names:
        # os.path.join yields the right path whether or not the directory ends with a separator.
        test_path = os.path.join(corpus_directory, test_dataset)
        print(f"Test dataset: {test_dataset}")
        test_sentences = load_sentences(test_path, num_rows=num_rows)
        run_models(translation_model, test_sentences, test_dataset)

## Evaluating the models

In [None]:
# Dataset files to evaluate on; each name is looked up inside test_directory.
filenames = [
    "abstracts.jsonl",
    "khresmoi.jsonl",
    "medline.jsonl",
    "scielo.jsonl",
    "snomed.jsonl",
]

# Directory that holds the .jsonl files listed above (machine-specific absolute path).
test_directory = "/home/zakiamin/PycharmProjects/hpo_translation/corpus/train/"
# Alternative location on another machine:
# "/Users/zaki/PycharmProjects/hpo_translation/corpus/train/"

In [None]:
# Evaluate the Helsinki-NLP checkpoint on every listed dataset.
evaluate_on_all_test_data(
    translation_model=HelsinkiNLPModel(),
    dataset_names=filenames,
    corpus_directory=test_directory,
)

In [None]:
# Evaluate the NLLB checkpoint on every listed dataset.
evaluate_on_all_test_data(
    translation_model=NLLBModel(),
    dataset_names=filenames,
    corpus_directory=test_directory,
)

In [None]:
# Evaluate the Madlad checkpoint on every listed dataset.
evaluate_on_all_test_data(
    translation_model=MadladModel(),
    dataset_names=filenames,
    corpus_directory=test_directory,
)