# Running pre-trained models on biomedical texts to evaluate them
## Infrastructure

In [1]:
from abc import abstractmethod

using_gpu = False
device = "gpu" if using_gpu else "cpu"


class TranslationModel:
    def __init__(self, checkpoint_name: str):
        self.checkpoint_name = checkpoint_name

    @abstractmethod
    def translate(self, source: str) -> str:
        """Translates a source text with the model
        :param source: the text to translate
        :return: str - the translation"""
        pass

In [2]:
from enum import Enum

!pip install nltk
!pip install sentence_transformers
!pip install sacrebleu

import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from sentence_transformers import SentenceTransformer, util
import sacrebleu

similarity_model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")


class SimilarityMetric(Enum):
    """Enum for string similarity metrics. Each metric must implement the evaluate method."""
    BLEU = 0
    SIMPLE = 1
    EDIT_DISTANCE = 2
    SEMANTIC_SIMILARITY = 3
    SACREBLEU = 4

    def evaluate(self, reference: str, candidate: str) -> float:
        """Evaluate the given string similarity metric between two strings.
        Performs simple string cleaning for whitespace and punctuation.
        :param reference: reference and official term
        :param candidate: model-produced translated term
        :return: similarity score when evaluating this specific metric
        """
        if self == SimilarityMetric.BLEU:
            reference_tokens = nltk.word_tokenize(reference.lower())
            candidate_tokens = nltk.word_tokenize(candidate.lower())
            return sentence_bleu([reference_tokens], candidate_tokens,
                                 smoothing_function=SmoothingFunction().method1)
        elif self == SimilarityMetric.SIMPLE:
            return 1 if reference == candidate else 0
        elif self == SimilarityMetric.EDIT_DISTANCE:
            return 1 - nltk.edit_distance(reference, candidate) / max(len(reference), len(candidate))
        elif self == SimilarityMetric.SEMANTIC_SIMILARITY:
            query_embedding = similarity_model.encode(reference)
            passage_embedding = similarity_model.encode(candidate)
            cosine_similarity = util.cos_sim(query_embedding, passage_embedding)
            return cosine_similarity[0].item()
        else:
            bleu = sacrebleu.raw_corpus_bleu(candidate, [reference])
            return bleu.score

[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[0m[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linu

  from .autonotebook import tqdm as notebook_tqdm


### Helsinki-NLP/opus-mt-en-es

In [3]:
!pip install transformers
!pip install tqdm
!pip install sentencepiece
!pip install sacremoses
from transformers import MarianMTModel, MarianTokenizer
import sacremoses


class HelsinkiNLPModel(TranslationModel):
    def __init__(self):
        super().__init__("Helsinki-NLP/opus-mt-en-es")
        self.model = MarianMTModel.from_pretrained(self.checkpoint_name)
        self.tokenizer = MarianTokenizer.from_pretrained(self.checkpoint_name)

    def translate(self, source: str) -> str:
        input_ids = self.tokenizer.encode(source, return_tensors="pt")
        translated_tokens = self.model.generate(input_ids, num_beams=4, early_stopping=True)
        translated_text = self.tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
        return translated_text

[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[0m[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linu

### facebook/nllb-200-distilled-600M

In [4]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


class NLLBModel(TranslationModel):
    def __init__(self):
        super().__init__("facebook/nllb-200-distilled-600M")
        self.tokenizer = AutoTokenizer.from_pretrained(self.checkpoint_name)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(self.checkpoint_name)

    def translate(self, source: str) -> str:
        inputs = self.tokenizer(source, return_tensors="pt")
        translated_tokens = self.model.generate(
            **inputs, forced_bos_token_id=self.tokenizer.lang_code_to_id["spa_Latn"], max_length=30
        )
        return self.tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]

### google-t5

In [5]:
from transformers import T5Tokenizer, T5ForConditionalGeneration


class T5Model(TranslationModel):
    def __init__(self):
        super().__init__("google-t5/t5-small")
        self.tokenizer = T5Tokenizer.from_pretrained(self.checkpoint_name)
        self.model = T5ForConditionalGeneration.from_pretrained(self.checkpoint_name)

    def translate(self, source: str) -> str:
        input_text = "translate English to Spanish: " + source
        input_ids = self.tokenizer.encode(input_text, return_tensors="pt", max_length=512)
        outputs = self.model.generate(input_ids=input_ids, num_beams=4, early_stopping=True)
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

## Loading test data

In [6]:
import json

import pandas as pd
from tqdm import tqdm

from pretrained_models.preliminary_experiments.translation_model import TranslationModel, NLLBModel


def load_sentences(test_dataset: str) -> pd.DataFrame:
    """Loads a test dataset in .jsonl format into a dataframe
    :param test_dataset: filename of the test dataset
    :return pd.Dataframe: the parallel corpus as a dataframe"""
    data = []
    with open(test_dataset, "r") as f:
        for line in f:
            data.append(json.loads(line))
    return pd.DataFrame(data)


def evaluate_models_on(mt_models: list[TranslationModel], test_sentences: pd.DataFrame) -> dict[
    TranslationModel, dict[SimilarityMetric, float]]:
    """For each model, obtains the average for all metrics over all test sentences.
    :param mt_models: the list of translation models to evaluate
    :param test_sentences: a dataframe for a parallel corpus"""

    results = {model: {metric: 0 for metric in SimilarityMetric} for model in mt_models}

    for model in mt_models:
        print(f"Model: {model}")
        for _, row in tqdm(test_sentences.iterrows()):
            english, spanish = row['en'], row['es']
            for metric in SimilarityMetric:
                similarity = metric.evaluate(spanish, model.translate(english))
                results[model][metric] += similarity

    n = test_sentences.shape[0]
    for model in mt_models:
        for metric in results[model]:
            results[model][metric] /= n

    return results


def evaluate_on_all_test_data(translation_models: list[TranslationModel], test_datasets: list[str]):
    for test_dataset in test_datasets:
        print(f"Test dataset: {test_dataset}")
        test_sentences = load_sentences(test_dataset)
        model_metrics = evaluate_models_on(translation_models, test_sentences)
        print(model_metrics)


directory_prefix = "/Users/zaki/PycharmProjects/hpo_translation/corpus/test/"
filenames = ["abstract5.jsonl"]  # + ["abstracts.jsonl", "clinspen.jsonl", "khresmoi.jsonl"]
all_test_datasets = [directory_prefix + filename for filename in filenames]

## Evaluating the models

In [7]:
all_models = [HelsinkiNLPModel(), NLLBModel(), T5Model()]
evaluate_on_all_test_data(all_models, all_test_datasets)

  return self.fget.__get__(instance, owner)()
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Test dataset: /Users/zaki/PycharmProjects/hpo_translation/corpus/test/abstract5.jsonl
Model: <__main__.HelsinkiNLPModel object at 0x1302f0af0>


5it [01:03, 12.71s/it]


Model: <pretrained_models.preliminary_experiments.translation_model.NLLBModel object at 0x1319298e0>


0it [00:00, ?it/s]the `lang_code_to_id` attribute is deprecated. The logic is natively handled in the `tokenizer.adder_tokens_decoder` this attribute will be removed in `transformers` v4.38
5it [01:28, 17.76s/it]


Model: <__main__.T5Model object at 0x1318ece50>


0it [00:00, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
5it [00:16,  3.37s/it]

{<__main__.HelsinkiNLPModel object at 0x1302f0af0>: {<SimilarityMetric.BLEU: 0>: 0.31283084407223, <SimilarityMetric.SIMPLE: 1>: 0.0, <SimilarityMetric.EDIT_DISTANCE: 2>: 0.6284120710816267, <SimilarityMetric.SEMANTIC_SIMILARITY: 3>: 0.951473867893219, <SimilarityMetric.SACREBLEU: 4>: 18.920882472626918}, <pretrained_models.preliminary_experiments.translation_model.NLLBModel object at 0x1319298e0>: {<SimilarityMetric.BLEU: 0>: 0.1778587805216802, <SimilarityMetric.SIMPLE: 1>: 0.0, <SimilarityMetric.EDIT_DISTANCE: 2>: 0.4411531474040361, <SimilarityMetric.SEMANTIC_SIMILARITY: 3>: 0.8756338953971863, <SimilarityMetric.SACREBLEU: 4>: 24.861797534566787}, <__main__.T5Model object at 0x1318ece50>: {<SimilarityMetric.BLEU: 0>: 0.003911432380992995, <SimilarityMetric.SIMPLE: 1>: 0.0, <SimilarityMetric.EDIT_DISTANCE: 2>: 0.17933786188169604, <SimilarityMetric.SEMANTIC_SIMILARITY: 3>: 0.7251034379005432, <SimilarityMetric.SACREBLEU: 4>: 3.4863833608273813}}



