In [1]:
from transformers import AutoModelForMaskedLM, AutoTokenizer, DataCollatorForLanguageModeling, Trainer, \
    TrainingArguments, T5ForConditionalGeneration
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from datasets import DatasetDict, Dataset
from pathlib import Path
from pynvml import *
import torch
import argparse
import sys
import random
import pandas as pd
import numpy as np
import itertools
import evaluate
import math

def print_gpu_utilization(device_index=0):
    """Print the amount of memory currently in use on one GPU, in MB.

    Args:
        device_index: NVML index of the GPU to query. Defaults to 0, the
            device this function has always reported on.
    """
    nvmlInit()
    try:
        handle = nvmlDeviceGetHandleByIndex(device_index)
        info = nvmlDeviceGetMemoryInfo(handle)
        # info.used is reported in bytes; integer-divide down to MB.
        print(f"GPU memory occupied: {info.used // 1024 ** 2} MB.")
    finally:
        # Pair every nvmlInit() with a shutdown so repeated calls do not
        # accumulate NVML initialisations, even if the query above fails.
        nvmlShutdown()


def print_summary(result):
    """Print training runtime, throughput and current GPU memory usage.

    Args:
        result: Object exposing a ``metrics`` mapping that contains the keys
            ``'train_runtime'`` and ``'train_samples_per_second'``.
    """
    metrics = result.metrics
    print(f"Time: {metrics['train_runtime']:.2f}")
    print(f"Samples/second: {metrics['train_samples_per_second']:.2f}")
    # Finish with the GPU memory report.
    print_gpu_utilization()


def tokenize_function(examples):
    """Tokenize the ``'text'`` entry of ``examples`` with the global ``tokenizer``.

    Args:
        examples: Mapping with a ``'text'`` entry holding the text(s) to encode.

    Returns:
        Whatever ``tokenizer`` returns when called with truncation enabled and
        ``max_length=128``.
    """
    texts = examples['text']
    encode_options = {'truncation': True, 'max_length': 128}
    return tokenizer(texts, **encode_options)



In [2]:
# Command-line configuration, currently disabled:
# parser = argparse.ArgumentParser(description='Task-adaptive model')
# parser.add_argument('--model', help='Path to BERT-like model')
# parser.add_argument('--model_name',
#                     help='Name of the BERT-like model. Default = "" which corresponds to ClinicalBERT',
#                     default='')
# config = parser.parse_args(sys.argv[1:])

# One seed shared by every RNG seeded in this cell.
SEED = 42

# Seed *before* the model is instantiated so that any random initialisation
# performed while loading also happens under the fixed seed.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# NOTE: MODEL holds an instantiated model object, not a path or hub identifier.
# from_flax=True loads the checkpoint from Flax weights. The recorded run of
# this cell stopped with a jaxlib RuntimeError (jaxlib built with AVX, which the
# host did not support) -- presumably raised by the jax import this option
# needs; confirm a compatible jaxlib build is installed before re-running.
MODEL = T5ForConditionalGeneration.from_pretrained("luqh/ClinicalT5-base", from_flax=True)

#MODEL = f"cardiffnlp/twitter-roberta-base-{task}" #minerva: download model from hugging face and put in folder, update to path

RuntimeError: This version of jaxlib was built using AVX instructions, which your CPU and/or operating system do not support. You may be able work around this issue by building jaxlib from source.

In [None]:
# Create task Dataset
task_dt_path = 'task_adapt_sentences.csv'
raw_sentences = pd.read_csv(task_dt_path, header=None)

# Lower-case the first column and split every row on '.'.
# dropna() keeps NaN rows (empty CSV lines/cells) out of the chain below, where
# a float NaN would raise "TypeError: 'float' object is not iterable".
fragments = raw_sentences[0].dropna().astype(str).str.lower().str.split('.')

# Flatten the per-row lists and discard empty / whitespace-only pieces: splitting
# on '.' leaves an empty string after every trailing period, and those would
# otherwise become content-free training examples.
sentences = [
    fragment.strip()
    for fragment in itertools.chain.from_iterable(fragments)
    if fragment.strip()
]

# 80/20 train/dev split; no separate test split is created.
train, dev = train_test_split(sentences, test_size=0.2, random_state=42)

task_dt = DatasetDict({
    'train': Dataset.from_dict({'text': train}),
    'dev': Dataset.from_dict({'text': dev}),
})
# flatten() returns a new DatasetDict rather than modifying in place, so keep
# the result (the original call discarded it).
task_dt = task_dt.flatten()

tokenizer = AutoTokenizer.from_pretrained("luqh/ClinicalT5-base")
# Register the placeholder tokens as special tokens so they are kept intact.
tokenizer.add_tokens(['[DATE]', '[TIME]'], special_tokens=True)
tkn_dt = task_dt.map(tokenize_function, batched=True, num_proc=4)

# NOTE(review): DataCollatorForLanguageModeling with masking enabled requires
# tokenizer.mask_token to be set; T5 tokenizers normally define none. Confirm
# this tokenizer has one -- otherwise this line raises ValueError and a
# T5-style (span-corruption) collator is needed instead.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [None]:
use_cuda = torch.cuda.is_available()
if use_cuda:
    torch.cuda.manual_seed_all(42)

# MODEL may be a checkpoint path / hub identifier or an already-instantiated
# model (the loading cell stores a T5ForConditionalGeneration object in it).
# from_pretrained() only accepts the former, so an instantiated model is used
# directly instead of being passed through from_pretrained().
if isinstance(MODEL, (str, Path)):
    model = AutoModelForMaskedLM.from_pretrained(
        MODEL,
        from_tf=False)
else:
    model = MODEL

if use_cuda:
    model = model.to('cuda')
    print_gpu_utilization()

# The tokenizer was extended with extra tokens, so resize the embedding matrix
# to the tokenizer's current vocabulary size.
model.resize_token_embeddings(len(tokenizer))

batch_size = 8 # [4, 8, 16]
# Log roughly once per epoch; clamp to 1 so a training set smaller than one
# batch does not produce logging_steps=0.
logging_steps = max(1, len(task_dt["train"]) // batch_size)

training_args = TrainingArguments(
    output_dir="runs/ta_pretraining",
    evaluation_strategy="epoch",
    learning_rate=1e-4, # [5e-6, 1e-5]
    num_train_epochs=5,
    weight_decay=0.01, # [1e-6, 1e-4, 1e-8]
    warmup_ratio=0.01, #[1e-6, 1e-4, 1e-8]
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    save_strategy='epoch',
    save_total_limit=2,
    logging_steps=logging_steps,
    adam_epsilon=1e-6, # try to add AdamW optimizer post changing the batch size
    seed=42
) # seed = 42, data_seed = 42

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tkn_dt['train'],
    eval_dataset=tkn_dt['dev'],
    data_collator=data_collator
)

In [None]:
def _print_perplexity(metrics):
    """Print the perplexity, i.e. exp of the ``'eval_loss'`` entry of ``metrics``."""
    perplexity = math.exp(metrics['eval_loss'])
    print(f">>> Perplexity: {perplexity:.2f}")


# Perplexity before any training step.
eval_results = trainer.evaluate()
_print_perplexity(eval_results)
print("\n\n")

# Run training (pass resume_from_checkpoint=True to continue a previous run).
result = trainer.train()
print(result)
print("\n\n")

# Perplexity after training.
eval_results = trainer.evaluate()
_print_perplexity(eval_results)