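If you're opening this notebook on Colab, you will probably need to install 🤗 Transformers and 🤗 Datasets, and to mount Google Drive, because the data and output paths used below point to `/content/drive`. Uncomment the following cell and run it to mount Drive, then run the install cell after it.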

In [None]:
# # If running in Colab
# from google.colab import drive
# drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
! pip install datasets transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import transformers
from transformers import AutoTokenizer

# Record the installed Transformers version in the cell output so the run's
# environment can be reproduced later.
print(transformers.__version__)

4.28.1


# Fine-tuning a language model

## Preparing the dataset

In [None]:
from datasets import load_dataset

In [None]:
from datasets import DatasetDict

# Plain-text training corpus stored on the mounted Google Drive.
path_to_train = '/content/drive/MyDrive/AI/CLEF2023/pretraining/train.txt'

# Request two slices of the single "train" file in one call: rows 0%-90% and
# rows 90%-100%. The result is indexed positionally below, in the same order
# as the entries of `split`.
dataset_split = load_dataset(
    "text",
    data_files={"train": path_to_train},
    split=['train[0%:90%]', 'train[90%:100%]'],
)

# Expose the two slices under the split names used by the rest of the notebook.
train_part, validation_part = dataset_split[0], dataset_split[1]
datasets = DatasetDict({'train': train_part, 'validation': validation_part})



  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Display the DatasetDict: split names, columns and row counts.
datasets

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 277002
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 30778
    })
})

In [None]:
# Look at a single training row (position 10) to see the raw record format.
datasets["train"][10]

{'text': 'implante percutáneo de los electrodos de un neuroestimulador en un componente neuromuscular '}

To get a sense of what the data looks like, the following function shows a few examples picked at random from the dataset.

In [None]:
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Render a random sample of distinct rows of ``dataset`` as an HTML table.

    Args:
        dataset: Object that supports ``len()``, indexing with a list of row
            positions (the result is passed to ``pd.DataFrame``) and exposes a
            ``features`` mapping of column name to feature type.
        num_examples: Number of distinct rows to show. Must not exceed
            ``len(dataset)``.

    Raises:
        AssertionError: If ``num_examples`` is larger than the dataset.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement, so the picks are unique without
    # a retry loop and without repeated linear membership checks on a list.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            # Show class names rather than integer ids; `names` is bound as a
            # default so the lambda does not depend on the loop variable.
            df[column] = df[column].transform(lambda i, names=typ.names: names[i])
    display(HTML(df.to_html()))

In [None]:
# Show ten (the default) randomly chosen training rows.
show_random_elements(datasets["train"])

Unnamed: 0,text
0,ejercicio de la extremidad isquémica con realización de electromiograma y determinación de ácido láctico
1,"Suplemento en axila, izquierda, con sustituto sintético, abordaje abierto"
2,inyección de cuello con guía ecográfica
3,artrectomía de la rodilla
4,"Suplemento en hueso lagrimal, derecho, con sustituto de tejido no autólogo, abordaje endoscópico percutáneo"
5,derivación por enfermera psicoterapeuta
6,Talent Cnverter - buscar Dispsitiv
7,"Reparación de tendón tobillo, derecho, abordaje endoscópico percutáneo"
8,"Escisión de túnica vaginal, derecha, abordaje abierto"
9,"Oclusión de válvula ileocecal, con dispositivo extraluminal, abordaje endoscópico percutáneo"


## Masked language modeling

In [None]:
# Checkpoint identifier passed to from_pretrained() for both the tokenizer and the model.
model_checkpoint = "PlanTL-GOB-ES/roberta-base-biomedical-clinical-es"
# Base directory (on the mounted Drive) under which the training output directory is created.
# Note the trailing slash: it is concatenated with another "/" when the output directory is built.
output_path = '/content/drive/MyDrive/AI/CLEF2023/'

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of raw text rows with the notebook-level ``tokenizer``.

    Args:
        examples: Batch mapping that holds the raw strings under ``"text"``.

    Returns:
        Whatever ``tokenizer`` returns for the batch.
    """
    # truncation=True caps every sequence at the tokenizer's maximum length, so
    # a single over-long line cannot yield inputs longer than the model accepts.
    # padding='longest' pads to the longest sequence in the batch passed in here.
    return tokenizer(examples["text"], padding='longest', truncation=True)

In [None]:
# Load the tokenizer that belongs to the checkpoint, asking for the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

# Tokenize every split and drop the raw "text" column so that only the
# tokenizer's output columns remain.
tokenized_datasets = datasets.map(
    tokenize_function,
    batched=True,
    batch_size=-1,
    remove_columns=["text"],
)



In [None]:
# Display the tokenized splits to confirm the resulting columns and row counts.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 277002
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 30778
    })
})

In [None]:
# NOTE(review): `group_texts` is not defined in any cell visible in this
# notebook, so this cell raises NameError on a fresh kernel (Restart & Run All).
# Its definition must be restored in a cell above this one. TODO confirm what
# it is meant to do; the output directory name used for training says
# "no-concat".
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=-1
)



In [None]:
from transformers import AutoModelForMaskedLM
# Load the starting checkpoint through the masked-LM auto class; this is the model the Trainer updates.
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

In [None]:
from transformers import Trainer, TrainingArguments

# Use the last path component of the checkpoint id to name the run directory.
model_name = model_checkpoint.split("/")[-1]
training_args = TrainingArguments(
    # output_path ends with "/", so strip it before joining to avoid a "//"
    # in the output directory.
    f"{output_path.rstrip('/')}/multibatch-{model_name}-finetuned-mlm-no-concat-v1",
    evaluation_strategy="epoch",
    # The recorded run shows "No log" for Training Loss at every epoch,
    # presumably because it has fewer optimizer steps than the default step-based
    # logging interval. Logging once per epoch puts the training loss next to the
    # validation loss in the progress table.
    logging_strategy="epoch",
    learning_rate=6e-4,
    weight_decay=0.01,
    adam_epsilon=1e-6,
    adam_beta1=0.9,
    adam_beta2=0.98,
    warmup_ratio=0.048,
    num_train_epochs=10,
    # 80 sequences per device x 100 accumulation steps = 8000 sequences per
    # optimizer step per device.
    per_device_train_batch_size=80,
    per_device_eval_batch_size=80,
    gradient_accumulation_steps=100,
    # NOTE(review): presumably chosen to be roughly one epoch of optimizer
    # steps - confirm against the actual number of training rows.
    save_steps=34,
    push_to_hub=False,
)

In [None]:
from transformers import DataCollatorForLanguageModeling
# Batch collator handed to the Trainer below; mlm_probability=0.15 is the
# masking probability passed to it.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [None]:
# Wire the model, hyperparameters, the two splits and the collator together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    # Evaluated according to evaluation_strategy in training_args and by trainer.evaluate().
    eval_dataset=lm_datasets["validation"],
    data_collator=data_collator,
)

In [None]:
# Start training. To resume an interrupted run from a saved checkpoint instead,
# pass its path here via the `resume_from_checkpoint` argument of train().
trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
0,No log,2.335603
1,No log,2.02151
2,No log,1.913015


KeyboardInterrupt: ignored

In [None]:
import math

# Evaluate on the validation split and report perplexity, i.e. exp(eval loss).
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results['eval_loss'])
print(f"Perplexity: {perplexity:.2f}")

Epoch,Training Loss,Validation Loss
0,No log,2.335603
1,No log,2.02151
2,No log,1.913015
3,No log,1.929285


Perplexity: 6.88
