# Running pre-trained models on biomedical texts to evaluate them
## Infrastructure

In [8]:
from google.colab import drive

# Mount Google Drive into the Colab VM so project files can be read and written.
drive.mount('/content/drive')
# Switch the working directory to the experiments folder; the relative paths used
# later in this notebook ("train/", "nllb/") are resolved against it.
%cd /content/drive/MyDrive/Year abroad/Thesis/Experiments

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Year abroad/Thesis/Experiments


In [9]:
!pip install transformers tqdm sentencepiece sacremoses accelerate ipywidgets protobuf
# pytorch

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sacremoses
Successfully installed sacremoses-0.1.1
Collecting accelerate
  Downloading accelerate-0.29.3-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia

In [10]:
from abc import ABC, abstractmethod


class TranslationModel(ABC):
    """Common interface for the translation models evaluated in this notebook.

    Subclasses load a concrete checkpoint and implement :meth:`translate`.
    Deriving from ``ABC`` is what makes ``@abstractmethod`` effective: the base
    class, or a subclass that does not override ``translate``, cannot be
    instantiated.
    """

    def __init__(self, checkpoint_name: str):
        """Stores the name of the checkpoint the model is built from
        :param checkpoint_name: identifier of the pre-trained checkpoint"""
        self.checkpoint_name = checkpoint_name

    @abstractmethod
    def translate(self, source: str) -> str:
        """Translates a source text with the model
        :param source: the text to translate
        :return: str - the translation"""
        # Fail loudly if a subclass delegates here via super() instead of
        # silently returning None.
        raise NotImplementedError

    def __str__(self) -> str:
        """Returns the checkpoint name as the printable form of the model"""
        return self.checkpoint_name

### facebook/nllb-200-distilled-600M

In [11]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


class NLLBModel(TranslationModel):
    """Translation model backed by the facebook/nllb-200-distilled-600M checkpoint.

    With the default arguments it translates English input into Spanish.
    """

    def __init__(self, source_lang: str = "eng_Latn", target_lang: str = "spa_Latn"):
        """Loads the tokenizer and the seq2seq model for the checkpoint
        :param source_lang: NLLB language code of the texts to translate
        :param target_lang: NLLB language code to translate into"""
        super().__init__("facebook/nllb-200-distilled-600M")
        self.target_lang = target_lang
        self.tokenizer = AutoTokenizer.from_pretrained(self.checkpoint_name, src_lang=source_lang)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(self.checkpoint_name)
        # Resolve the target-language token once through the vocabulary API.
        # The `lang_code_to_id` mapping is deprecated in newer transformers
        # releases, so it is not relied upon here.
        self.target_lang_id = self.tokenizer.convert_tokens_to_ids(target_lang)

    def translate(self, source: str) -> str:
        """Translates a source text into the configured target language
        :param source: the text to translate
        :return: str - the translation"""
        # Keep the inputs on the same device as the model so the class keeps
        # working if the model is moved (e.g. to a GPU) after construction.
        inputs = self.tokenizer(source, return_tensors="pt").to(self.model.device)
        translated_tokens = self.model.generate(
            **inputs, forced_bos_token_id=self.target_lang_id, max_length=1000
        )
        # A single input string yields a batch of one; return its only element.
        return self.tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]

## Loading and evaluation functions

In [25]:
import json

import pandas as pd
from tqdm import tqdm


def load_sentences(test_dataset: str, num_rows=500, seed=42) -> pd.DataFrame:
    """Loads a test dataset in .jsonl format into a dataframe and randomly selects n rows
    :param test_dataset: filename of the test dataset; every non-blank line must be a JSON
        object with an 'en' and an 'es' string
    :param num_rows: number of rows to select; if the dataset has fewer rows, all of them
        are returned (in shuffled order)
    :param seed: random seed for reproducibility
    :return pd.Dataframe: the sampled parallel corpus with the columns 'en' and 'es'"""
    data = []
    # The corpora contain accented characters, so the encoding is fixed explicitly
    # instead of depending on the platform default.
    with open(test_dataset, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at the end of the file).
                continue
            loaded = json.loads(line)
            # Remove quote characters that wrap some of the sentences.
            english, spanish = loaded['en'].strip('"'), loaded['es'].strip('"')
            data.append({"en": english, "es": spanish})
    # Naming the columns keeps the schema stable even for an empty file.
    df = pd.DataFrame(data, columns=["en", "es"])
    # DataFrame.sample raises when asked for more rows than exist, so cap the request.
    sample_size = min(num_rows, len(df))
    return df.sample(n=sample_size, random_state=seed)


def run_models(model: TranslationModel, test_sentences: pd.DataFrame) -> pd.DataFrame:
    """Runs a model on the test sentences and pairs every translation with its reference
    :param model: the MT model to evaluate
    :param test_sentences: a dataframe for a parallel corpus with the columns 'en' and 'es'
    :return pd.DataFrame: one row per sentence with the columns 'reference' (the 'es' text)
        and 'actual' (the model's translation of the 'en' text)"""
    sentence_pairs = zip(test_sentences['en'], test_sentences['es'])
    records = []
    # Passing the total lets tqdm show a percentage and an ETA instead of a bare counter.
    for english, spanish in tqdm(sentence_pairs, total=len(test_sentences)):
        records.append({'reference': spanish, 'actual': model.translate(english)})

    # Build the frame once from the collected records; appending rows to a
    # DataFrame inside the loop makes every append more expensive than the last.
    return pd.DataFrame(records, columns=['reference', 'actual'])


def evaluate_on_all_test_data(translation_model: TranslationModel, dataset_names: list[str],
                              corpus_directory: str, output_directory: str = "nllb/",
                              num_rows: int = 500):
    """Translates a sample of every test dataset and writes the results to one CSV per dataset
    :param translation_model: the MT model to evaluate
    :param dataset_names: filenames of the .jsonl datasets inside corpus_directory
    :param corpus_directory: directory that contains the datasets
    :param output_directory: directory the result CSVs are written to; created if missing
    :param num_rows: number of sentences sampled from each dataset"""
    import os  # local import: the cell defining this function does not import os

    # Create the output directory up front so a missing folder cannot make the
    # run fail only after the first dataset has been fully translated.
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)

    for test_dataset in dataset_names:
        # os.path.join works whether or not corpus_directory ends with a separator.
        test_path = os.path.join(corpus_directory, test_dataset)
        print(f"Test dataset: {test_dataset}")
        test_sentences = load_sentences(test_path, num_rows=num_rows)
        results = run_models(translation_model, test_sentences)
        output_path = os.path.join(output_directory, test_dataset.replace(".jsonl", ".csv"))
        results.to_csv(output_path, index=False)

## Evaluating the models

In [26]:
# Directory (relative to the working directory) that holds the .jsonl datasets.
# NOTE(review): the folder is called "train/" although its files are used as
# test data here — confirm this is the intended location.
test_directory = "train/"
# One .jsonl file per dataset to evaluate on.
filenames = ["abstracts.jsonl", "khresmoi.jsonl", "medline.jsonl", "scielo.jsonl", "snomed.jsonl"]

In [None]:
# Translate a random sample of every dataset with the NLLB model and save one
# results CSV per dataset.
evaluate_on_all_test_data(NLLBModel(), filenames, test_directory)

Test dataset: abstracts.jsonl


16it [00:39,  2.61s/it]