In [None]:
import os

# --- Run configuration -------------------------------------------------------
task = "ner"
# Allowed characters in the run name: letters, numbers, dashes, underscores, dots.
run_name = "pretrained-roberta-base-biomedical-clinical-es-crf_v1"
model_checkpoint = "PlanTL-GOB-ES/roberta-base-biomedical-clinical-es"
# Directory that contains this notebook.
notebooks_dir_path = "notebook_dir"
output_path = "output_path"

# Have W&B save the model once training ends; combined with the Trainer option
# load_best_model_at_end=True this saves the best model.
# Use "false" instead of "end" to prevent saving.
os.environ.update(WANDB_LOG_MODEL="end")

In [None]:
# Mount Google Drive when running in Colab. Outside Colab the google.colab
# package is not importable, so skip the mount instead of aborting a
# Restart & Run All with an ImportError.
try:
    from google.colab import drive
except ImportError:
    drive = None  # not running in Colab: nothing to mount
else:
    drive.mount('/content/drive')

In [None]:
%%capture
# Install the third-party packages for this notebook; the %%capture cell magic
# above hides pip's output.
!pip install datasets transformers accelerate seqeval wandb

In [None]:
!wandb login # Click the authorization link first. The login input box then appears at the very end of the last output line, right after the colon.

In [None]:
import wandb
from datetime import datetime
import pytz

# Stamp the run name with the current Europe/Sofia wall-clock time so that
# repeated runs of the same configuration get distinct names.
sofia_now = datetime.now(pytz.timezone('Europe/Sofia'))
timestamp = '{date:%Y-%m-%d-%H_%M_%S}'.format(date=sofia_now)
full_run_name = f"{run_name}_{timestamp}"

# Open the W&B run; the handle is kept so the run can be finished explicitly later.
wandb_run = wandb.init(
    project="SympTEMIST 2023 - Pre-training",
    name=full_run_name,
    save_code=True,
)

In [None]:
import transformers
from transformers import AutoTokenizer

# Print the installed transformers version (useful when reproducing the run).
print(transformers.__version__)

# Fine-tuning a language model

## Preparing the dataset

In [None]:
from datasets import load_dataset

In [None]:
from datasets import DatasetDict

# Plain-text pre-training corpus on the mounted Drive.
path_to_train = '/content/drive/MyDrive/AI/SympTEMIST 2023/data/pretraining/train.txt'

# Load the single text file twice-sliced: first 90% and last 10% of the rows.
dataset_split = load_dataset(
    "text",
    data_files={"train": path_to_train},
    split=['train[0%:90%]', 'train[90%:100%]'],
)

# The first slice becomes the training split, the second the validation split.
train_part, validation_part = dataset_split
datasets = DatasetDict(train=train_part, validation=validation_part)

In [None]:
# Display the DatasetDict holding the train and validation splits.
datasets

In [None]:
# Peek at a single training example (row index 10).
datasets["train"][10]

To get a sense of what the data looks like, the following function shows a few examples picked at random from the dataset.

In [None]:
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Render a random sample of distinct rows of ``dataset`` as an HTML table.

    Args:
        dataset: Object that supports ``len()``, indexing with a list of row
            positions, and exposes a ``features`` mapping of column name to
            feature type.
        num_examples: Number of distinct rows to show; must not exceed
            ``len(dataset)``.

    Raises:
        AssertionError: If ``num_examples`` is larger than the dataset.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement in a single call, so no retry
    # loop is needed (a retry loop can never terminate if the assert above is
    # stripped and more rows are requested than exist). max(..., 0) keeps the
    # "no rows" outcome for a non-positive request.
    picks = random.sample(range(len(dataset)), max(num_examples, 0))

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            # Replace integer class ids with their names; the names are bound
            # as a default argument so the lambda does not depend on the loop
            # variable.
            df[column] = df[column].transform(lambda i, names=typ.names: names[i])
    display(HTML(df.to_html()))

In [None]:
# Display random training rows (num_examples is left at its default of 10).
show_random_elements(datasets["train"])

## Masked language modeling

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of raw text lines with the notebook-level ``tokenizer``.

    Args:
        examples: Batch mapping whose ``"text"`` entry holds the strings to
            encode.

    Returns:
        The tokenizer's output for the batch, padded to the longest sequence
        in the batch.
    """
    # truncation=True caps every sequence at the tokenizer's configured
    # maximum length; without it a single over-long line produces more tokens
    # than the model is expected to accept. Padding is left unchanged.
    return tokenizer(examples["text"], padding='longest', truncation=True)

In [None]:
# Load the tokenizer that belongs to the checkpoint, then encode every split,
# dropping the raw "text" column afterwards.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
# NOTE(review): batch_size=-1 presumably hands each split to tokenize_function
# as one single batch, so its padding='longest' would pad to the longest line
# of the whole split - confirm against the datasets documentation.
tokenized_datasets = datasets.map(
    tokenize_function,
    batched=True,
    batch_size=-1,
    remove_columns=["text"],
)

In [None]:
# Inspect the tokenized splits.
tokenized_datasets

In [None]:
def group_texts(examples):
    """Add a ``labels`` column that duplicates ``input_ids``.

    Despite its name, this does not concatenate or regroup texts: the batch is
    modified in place by storing a shallow copy of ``examples['input_ids']``
    under ``'labels'``, and the same batch object is returned.
    """
    token_ids = examples['input_ids']
    examples['labels'] = token_ids.copy()
    return examples

In [None]:
# Run group_texts over every tokenized split (same batching settings as the
# tokenization step) to attach the labels column.
lm_datasets = tokenized_datasets.map(group_texts, batched=True, batch_size=-1)

In [None]:
from transformers import AutoModelForMaskedLM
# Load the pretrained checkpoint through the masked-language-modeling auto class.
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

In [None]:
from transformers import Trainer, TrainingArguments

import inspect

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# and later dropped the old keyword. Pick whichever name the installed version
# accepts, so this cell works with both old and new releases.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    f"{output_path}/{full_run_name}",
    learning_rate=6e-4,
    # Evaluate and checkpoint once per epoch; the two strategies are kept
    # identical because load_best_model_at_end=True is set below.
    **{_eval_strategy_key: "epoch"},
    save_strategy="epoch",
    weight_decay=0.01,
    adam_epsilon=1e-6,
    adam_beta1=0.9,
    adam_beta2=0.98,
    warmup_ratio=0.048,
    num_train_epochs=5,
    per_device_train_batch_size=80,
    per_device_eval_batch_size=80,
    gradient_accumulation_steps=100,
    push_to_hub=False,
    report_to="wandb",
    load_best_model_at_end=True,
    run_name=full_run_name # this is just to have a decent model name in WANDB, instead of an auto-generated one
)

In [None]:
from transformers import DataCollatorForLanguageModeling
# Language-modeling collator built on the notebook tokenizer, with the masking
# probability set to 0.15.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [None]:
# Wire the model, the training arguments, both data splits and the collator
# into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
)

In [None]:
# Start training. To resume from a checkpoint instead, specify its path in
# this call (NOTE(review): presumably via the resume_from_checkpoint argument
# of Trainer.train - confirm).
trainer.train()

In [None]:
import math
# Evaluate on the validation split and report perplexity, i.e. the exponential
# of the evaluation loss.
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results['eval_loss'])
print(f"Perplexity: {perplexity:.2f}")

In [None]:
# End the W&B run that was started with wandb.init.
wandb_run.finish()